In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# notebook will reload external python modules
%load_ext autoreload
%autoreload 2

In [2]:
# Read ./data/processed.cleveland.data; the file has no header row and uses '?' for missing values.
names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'goal',
]
data_raw = pd.read_csv(
    './data/processed.cleveland.data',
    names=names,
    header=None,
    na_values=['?'],
)
# Keep complete rows only (data_raw.fillna(values) would impute instead of dropping).
data_used = data_raw.dropna()
data_used.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,goal
count,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0
mean,54.542088,0.676768,3.158249,131.693603,247.350168,0.144781,0.996633,149.599327,0.326599,1.055556,1.602694,0.676768,4.73064,0.946128
std,9.049736,0.4685,0.964859,17.762806,51.997583,0.352474,0.994914,22.941562,0.469761,1.166123,0.618187,0.938965,1.938629,1.234551
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,3.0,130.0,243.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,276.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,4.0


In [3]:
# One fifth of the rows each for validation and test; the remainder is the training set.
all_num = len(data_used)
val_num = test_num = all_num // 5
train_num = all_num - (val_num + test_num)
print('all_num:{}, val_num:{}, test_num:{}, train_num:{}'.format(all_num, val_num, test_num, train_num))

all_num:297, val_num:59, test_num:59, train_num:179


In [4]:
# shuffle the set
# Seed the global NumPy RNG so the split (and every accuracy below) is the
# same after Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)
# NOTE: the integer cast truncates fractional values (e.g. `oldpeak`), so every
# column is treated as a discrete category by the naive Bayes classes below.
data_shuffled = np.array(data_used).astype(int)
np.random.shuffle(data_shuffled)  # in place, permutes rows only

# mask: contiguous row ranges for each split
train_mask = range(train_num)
val_mask = range(train_num,train_num+val_num)
test_mask = range(train_num + val_num, train_num + val_num + test_num)
# divide set into 3 parts (train, val, test); the last column is the label
x_train = data_shuffled[train_mask, :-1]
y_train = data_shuffled[train_mask, -1]
x_val = data_shuffled[val_mask, :-1]
y_val = data_shuffled[val_mask, -1]
x_test = data_shuffled[test_mask, :-1]
y_test = data_shuffled[test_mask, -1]

# sanity check: the three splits partition the shuffled rows
assert len(y_train) + len(y_val) + len(y_test) == data_shuffled.shape[0]

In [5]:
# naive bayes
class NB():
    '''
    Baseline naive Bayes over discrete feature values with additive smoothing.

    Conditional probabilities are stored only for (feature, value, class)
    combinations that occur in the training data, so pred() raises KeyError
    when it meets a combination that was never observed.
    '''
    def __init__(self, X, Y, la=1):
        '''
        X: 2-D array of training features (rows are samples)
        Y: training labels, one per row of X
        la: smoothing constant (lambda)
        '''
        self.X, self.Y = X, Y
        self.la = la # lambda
        self.N = len(Y) # number of rows
        self.A = X.shape[1] # number of attributes
        self.p = {} # class -> prior probability
        self.p_x_y = {} # (attribute index, value, class) -> conditional probability

    def train(self):
        '''Estimate priors and smoothed conditionals from self.X / self.Y.'''
        labels, label_counts = np.unique(self.Y, return_counts=True)
        for label, n_label in zip(labels, label_counts):
            self.p[label] = n_label / self.N
            rows = self.X[self.Y == label]
            for col in range(self.A):
                values, value_counts = np.unique(rows[:, col], return_counts=True)
                # the smoothing term counts only the values seen within this class
                denom = n_label + len(values) * self.la
                for value, n_value in zip(values, value_counts):
                    self.p_x_y[(col, value, label)] = (n_value + self.la) / denom

    def _best_goal(self, item):
        '''Return the class with the largest prior * product of conditionals (-1 if none is positive).'''
        best_goal, best_score = -1, 0
        for goal, prior in self.p.items():
            score = prior
            for col in range(self.A):
                score = score * self.p_x_y[(col, item[col], goal)]
            if score > best_score:
                best_goal, best_score = goal, score
        return best_goal

    def pred(self, x):
        '''Return a list with the predicted class for every row of x.'''
        return [self._best_goal(item) for item in x]

In [6]:
def check_accuracy(Y, fx):
    '''
    Print the number and percentage of predictions that match the labels.

    Y: ground-truth labels (array-like)
    fx: predicted labels (array-like) with the same shape as Y

    Raises ValueError when the two inputs differ in shape or are empty, so a
    truncated prediction list cannot be scored silently.
    '''
    y_true = np.asarray(Y)
    y_pred = np.asarray(fx)
    if y_true.shape != y_pred.shape:
        raise ValueError('labels and predictions differ in shape: {} vs {}'.format(y_true.shape, y_pred.shape))
    num_samples = y_true.size
    if num_samples == 0:
        raise ValueError('cannot compute accuracy on an empty set')
    num_correct = int(np.sum(y_true == y_pred))
    acc = float(num_correct) / num_samples
    print('Got {} / {} correct {:.2%}'.format(num_correct, num_samples, acc))

In [7]:
# Baseline NB: prediction fails with KeyError when the validation split holds a
# (feature, value, class) combination that never occurred in the training split.
model = NB(x_train, y_train)
model.train()
try:
    preds = model.pred(x_val)
except KeyError as err:
    # Show the failure instead of aborting "Run All".
    print('KeyError: {}'.format(err))
else:
    check_accuracy(y_val, preds)

KeyError: (4, 254, 0)

## About the Error

理论上需要知道每个特征的所有取值，但是对于实际的数据集，还是存在一些问题，比如你不能保证训练集中包含了所有的取值。

解决方法：

1.对数据进行预处理，保证训练集中包含所有特征的取值

2.先把所有数据集传进去，收集所有特征，再使用训练集训练

3.对于不存在的组合，直接使用拉普拉斯平滑公式 $\frac{0 + \lambda}{N_{c} + \lambda S_j}$，其中 $N_{c}$ 是训练集中该类别的样本数，$S_j$ 是第 $j$ 个特征的取值个数

In [8]:
# naive bayes with modification
class NB_mdf():
    '''
    Naive Bayes over discrete feature values with Laplace (additive) smoothing.

    The constructor enumerates every class label and every per-feature value in
    the data it is given, so train() assigns a smoothed probability to each
    (feature, value, class) combination even when the combination is missing
    from the training split.
    '''
    def __init__(self, X, Y, la=1):
        '''
        X, Y: data with all available features
        la: smoothing constant (lambda)
        '''
        self.X = X # all data
        self.Y = Y # all data's label
        self.p = {} # class -> prior probability
        self.p_x_y = {} # (feature index, value, class) -> conditional probability
        self.goal_counts = {} # class -> number of training rows, filled by train()
        self.N = len(Y) # number of rows
        self.A = X.shape[1] # number of attributes
        self.la = la # lambda
        # collecting features
        self.goals = np.unique(self.Y)
        self.features = [np.unique(X[:, idx_f]) for idx_f in range(self.A)]

    def train(self, x, y):
        '''
        x:x_train
        y:y_train

        Fills self.p, self.p_x_y and self.goal_counts.
        Raises ValueError when y is empty.
        '''
        x = np.asarray(x)
        y = np.asarray(y)
        n_rows = len(y)
        if n_rows == 0:
            raise ValueError('cannot train on an empty dataset')
        for goal in self.goals:
            y_ck_mask = y == goal
            # keep the exact integer count instead of rebuilding it from the prior
            n_goal = int(np.sum(y_ck_mask))
            self.goal_counts[goal] = n_goal
            self.p[goal] = n_goal / n_rows
            x_goal = x[y_ck_mask]
            for idx_f, features in enumerate(self.features):
                x_j = x_goal[:, idx_f]
                denom = n_goal + len(features) * self.la
                for feat in features:
                    numer = np.sum(x_j == feat) + self.la
                    # denom is 0 only when the class is absent and la == 0
                    self.p_x_y[(idx_f, feat, goal)] = numer / denom if denom > 0 else 0.0

    def _cond_prob(self, idx_f, value, goal):
        '''Smoothed P(feature idx_f == value | goal); values outside the vocabulary get the zero-count estimate.'''
        key = (idx_f, value, goal)
        if key in self.p_x_y:
            return self.p_x_y[key]
        denom = self.goal_counts.get(goal, 0) + len(self.features[idx_f]) * self.la
        return self.la / denom if denom > 0 else 0.0

    def pred(self, x):
        '''
        Return a list with the most probable class for every row of x.

        Scores are summed in log space so that rows with many features do not
        underflow to zero. A row for which no class has positive probability
        is predicted as -1.
        '''
        preds = []
        for item in x:
            max_score = -np.inf
            max_goal = -1
            for goal in self.p:
                prior = self.p[goal]
                if prior <= 0:
                    # class never seen in training: it cannot be the prediction
                    continue
                score = np.log(prior)
                for idx_a in range(self.A):
                    prob = self._cond_prob(idx_a, item[idx_a], goal)
                    if prob <= 0:
                        score = -np.inf
                        break
                    score = score + np.log(prob)
                if score > max_score:
                    max_score = score
                    max_goal = goal
            preds.append(max_goal)
        return preds

In [9]:
# The constructor receives the full shuffled dataset so it can enumerate every
# feature value and class label.
model = NB_mdf(data_shuffled[:,:-1], data_shuffled[:,-1])
# Fit on the training split only (not on the full data_shuffled).
model.train(x_train, y_train)
# Score the fitted model on the validation split.
preds = model.pred(x_val)
check_accuracy(y_val, preds)

Got 29 / 59 correct 49.15%


In [10]:
# Score the same fitted model on the test split.
preds = model.pred(x_test)
check_accuracy(y_test, preds)

Got 31 / 59 correct 52.54%


感觉正确率挺低的所以用sklearn验证一下

In [11]:
from sklearn.naive_bayes import MultinomialNB
# Cross-check against scikit-learn, fitted on the same training split.
# NOTE(review): MultinomialNB models count-valued features, while the classes
# above treat each column as categorical; CategoricalNB may be the closer
# comparison - confirm.
clf = MultinomialNB()
clf.fit(x_train, y_train)
# validation split first, then test split
for x_split, y_split in ((x_val, y_val), (x_test, y_test)):
    clf_preds = clf.predict(x_split)
    check_accuracy(y_split, clf_preds)

Got 30 / 59 correct 50.85%
Got 27 / 59 correct 45.76%
