In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# notebook will reload external python modules
%load_ext autoreload
%autoreload 2

In [2]:
# Read the raw Iris table; the file carries no header row.
data_raw = pd.read_csv('./data/iris.data', header=None)
# Keep only the first 100 rows: the notebook works on a two-class problem.
data = np.array(data_raw.iloc[:100])

In [3]:
# Split sizes: one fifth each for validation and test, the remainder for training.
all_num = len(data)
val_num = test_num = all_num // 5
train_num = all_num - (val_num + test_num)
print('all_num:{}, val_num:{}, test_num:{}, train_num:{}'.format(all_num, val_num, test_num, train_num))

all_num:100, val_num:20, test_num:20, train_num:60


In [4]:
# change the label into {-1, 1}
# The conversion is idempotent: rows whose label was already converted are
# left untouched, so re-running this cell does not raise ValueError.
label_to_target = {'Iris-setosa': 1, 'Iris-versicolor': -1}
for idx in range(all_num):
    item = data[idx][4]
    if item in label_to_target:
        data[idx][4] = label_to_target[item]
    elif item not in label_to_target.values():
        # neither a known class name nor an already-converted target
        raise ValueError("data_label unmatched")

# shuffle the set (rows are permuted; each row keeps its own label)
# A dedicated, seeded generator makes the split reproducible across runs
# without touching NumPy's global random state.
SPLIT_SEED = 0
split_rng = np.random.RandomState(SPLIT_SEED)
data_shuffled = data.copy().astype(float)
split_rng.shuffle(data_shuffled)

# mask: consecutive row ranges of the shuffled data
train_mask = range(train_num)
val_mask = range(train_num, train_num + val_num)
test_mask = range(train_num + val_num, train_num + val_num + test_num)
# divide set into 3 parts (train, val, test); columns 0-3 are features, column 4 is the label
x_train = data_shuffled[train_mask, 0:4]
y_train = data_shuffled[train_mask, 4]
x_val = data_shuffled[val_mask, 0:4]
y_val = data_shuffled[val_mask, 4]
x_test = data_shuffled[test_mask, 0:4]
y_test = data_shuffled[test_mask, 4]

In [5]:
# Logistic Regression
class LR():
    '''
    Binary logistic regression trained with mini-batch gradient descent.

    The model is p(positive | x) = sigmoid(x . w). There is no intercept
    term; append a constant column to x if one is needed.

    lr: learning rate (step size of each gradient update)
    batchsize: number of samples per gradient update
    epoch: number of passes over the training data
    shuffle: if True, visit the samples in a new random order every epoch
    '''
    def __init__(self, lr=0.001, batchsize=16, epoch=1, shuffle=False):
        self.X = None  # training features, set by train()
        self.Y = None  # training labels (as given by the caller), set by train()
        self.w = None  # weight vector of length F, set by train()
        # (negative label, positive label); replaced by the labels seen in train()
        self.classes = np.array([0, 1])
        self.batchsize = batchsize
        self.epoch = epoch
        self.lr = lr # learning rate
        self.shuffle = shuffle

    def train(self, x, y, mu=0, sigma=1):
        '''
        Fit the weights on the training data.

        x: N x F
        y: N labels with exactly two distinct values (e.g. {0,1} or {-1,1});
           the larger value is treated as the positive class
        mu, sigma: mean / std of the normal distribution used to initialise w

        Raises ValueError on malformed input.
        '''
        x = np.asarray(x, dtype=float)
        y = np.asarray(y)
        if x.ndim != 2 or len(x) != len(y):
            raise ValueError("x must be N x F and y must have N entries")
        if self.batchsize < 1:
            raise ValueError("batchsize must be at least 1")
        classes = np.unique(y)
        if len(classes) != 2:
            raise ValueError("y must contain exactly two distinct labels")

        self.X, self.Y = x, y
        self.classes = classes
        # internal targets are always in {0, 1}, whatever the caller's labels are
        targets = (y == classes[1]).astype(float)

        self.w = np.random.normal(mu, sigma, x.shape[1])
        num_samples = len(x)
        # true division before ceil so a final partial batch is not dropped
        iterations = int(np.ceil(num_samples / self.batchsize))
        for _ in range(self.epoch):
            if self.shuffle:
                order = np.random.permutation(num_samples)
            else:
                order = np.arange(num_samples)
            for i in range(iterations):
                batch = order[i * self.batchsize:(i + 1) * self.batchsize]
                _, cache = self.forward(x[batch], targets[batch])
                self.w = self.w - self.lr * self.backward(cache)

    def forward(self, x, y):
        '''
        Negative log-likelihood of the targets under the current weights.

        x: N x F
        y: N targets in {0,1}
        Returns (loss, cache); cache holds what backward() needs.
        '''
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        logits = np.dot(x, self.w)
        # logaddexp(0, z) == log(1 + exp(z)) without overflow for large z
        loss = np.sum(np.logaddexp(0, logits) - y * logits)
        cache = (x, y, self.w)
        return loss, cache

    def backward(self, cache):
        '''
        Gradient of the forward() loss with respect to the weights.

        cache: the (x, y, w) tuple returned by forward()
        Returns an array of length F.
        '''
        x, y, w = cache
        logits = np.dot(x, w)
        # numerically stable sigmoid: exp(-log(1 + exp(-z)))
        probs = np.exp(-np.logaddexp(0, -logits))
        return np.dot(x.T, probs - y)

    def pred(self, x):
        '''
        Predict a label for every row of x.

        x: M x F (a single sample of length F is also accepted)
        Returns a list of M labels taken from the labels seen in train().
        Raises RuntimeError if the model has no weights yet.
        '''
        if self.w is None:
            raise RuntimeError("call train() before pred()")
        x = np.asarray(x, dtype=float)
        if x.size == 0:
            return []
        logits = np.dot(np.atleast_2d(x), self.w)
        negative, positive = self.classes
        # sigmoid(z) >= 0.5 exactly when z >= 0
        preds = [positive if score >= 0 else negative for score in logits]
        return preds

In [6]:
def check_accuracy(Y, fx):
    '''
    Print how many predictions match the true labels.

    Y: true labels
    fx: predicted labels, compared element-wise against Y
    '''
    total = len(Y)
    hits = np.sum(Y == fx)
    ratio = float(hits) / total
    print('Got {} / {} correct {:.2%}'.format(hits, total, ratio))

In [7]:
# NOTE(review): `NB` is not defined in any cell visible in this notebook, so
# on a fresh kernel this cell would fail with NameError before reaching the
# KeyError recorded in its saved output. Presumably the cell defining the
# original naive Bayes class was removed -- confirm and restore it, or drop
# this cell.
model = NB(x_train, y_train)
model.train()
preds = model.pred(x_val)
check_accuracy(y_val, preds)

KeyError: (1, 2, 2)

## About the Error

理论上需要知道每个特征的所有取值，但是对于实际的数据集，还是存在一些问题，比如你不能保证训练集中包含了所有的取值。

解决方法：

1. 对数据进行预处理，保证训练集中包含所有特征的取值

2. 先把所有数据集传进去，收集所有特征，再使用训练集训练

3. 对于不存在的组合，直接使用公式 $\frac{0 + \lambda}{0 + \lambda \times S_j}$

In [8]:
# naive bayes with modification
class NB_mdf():
    '''
    Categorical naive Bayes with additive (Laplace) smoothing.

    The constructor receives the complete data set only to collect every
    value each feature can take; the probabilities themselves are estimated
    in train() from the training split.
    '''
    def __init__(self, X, Y, la=1):
        '''
        X, Y: data with all available features
        la: smoothing constant (lambda)
        '''
        self.X = X # all data
        self.Y = Y # all data's label
        self.p = {} # goal -> prior probability
        self.p_x_y = {} # (feature index, feature value, goal) -> conditional probability
        self.class_count = {} # goal -> number of training rows with that goal
        self.N = len(Y) # number of rows
        self.A = X.shape[1] # number of attributes
        self.la = la # lambda
        # collecting features
        self.goals = np.unique(self.Y)
        self.features = [np.unique(X[:, idx_f]) for idx_f in range(self.A)]

    def train(self, x, y):
        '''
        Estimate priors and smoothed conditional probabilities.

        x:x_train
        y:y_train

        Raises ValueError if the training set is empty.
        '''
        num_samples = len(y)
        if num_samples == 0:
            raise ValueError("cannot train on an empty data set")
        # start from scratch so a second call does not mix in stale estimates
        self.p = {}
        self.p_x_y = {}
        self.class_count = {}
        for goal in self.goals:
            y_ck_mask = y == goal
            goal_count = int(np.sum(y_ck_mask))
            self.class_count[goal] = goal_count
            self.p[goal] = goal_count / num_samples
            for idx_f, features in enumerate(self.features):
                x_j = x[:, idx_f][y_ck_mask]
                # exact integer class count instead of p * len(y), which
                # can pick up floating point error
                denominator = goal_count + len(features) * self.la
                for feat in features:
                    if denominator == 0:
                        # only possible with la == 0 and a goal absent from y
                        prob = 0.0
                    else:
                        prob = (np.sum(x_j == feat) + self.la) / denominator
                    self.p_x_y[(idx_f, feat, goal)] = prob

    def _likelihood(self, idx_f, feat, goal):
        '''Smoothed P(feature idx_f == feat | goal); also covers values never collected.'''
        key = (idx_f, feat, goal)
        if key in self.p_x_y:
            return self.p_x_y[key]
        # value unknown to the model: use the zero-count smoothed estimate
        denominator = self.class_count.get(goal, 0) + len(self.features[idx_f]) * self.la
        if denominator == 0:
            return 0.0
        return self.la / denominator

    def pred(self, x):
        '''
        Predict the most probable goal for every row of x.

        Returns a list with one goal per row. Ties are resolved in favour of
        the goal that comes first in the trained priors.
        Raises RuntimeError if train() has not been called.
        '''
        if not self.p:
            raise RuntimeError("call train() before pred()")
        preds = []
        # Scores are summed in log space so long products cannot underflow;
        # log(0) is allowed and simply yields -inf for impossible goals.
        with np.errstate(divide='ignore'):
            for item in x:
                best_goal = None
                best_score = None
                for goal in self.p:
                    score = np.log(self.p[goal])
                    for idx_a in range(self.A):
                        score = score + np.log(self._likelihood(idx_a, item[idx_a], goal))
                    if best_score is None or score > best_score:
                        best_score = score
                        best_goal = goal
                preds.append(best_goal)
        return preds

In [9]:
# Build the model from the complete data set so that every feature value is
# known to it, then estimate the probabilities from the training split only.
# NOTE(review): the saved output of this cell reports 59 samples, while the
# split cell above prints val_num:20 -- the recorded outputs look stale;
# re-run the notebook from a fresh kernel to confirm.
model = NB_mdf(data_shuffled[:,:-1], data_shuffled[:,-1])
# model.train(data_shuffled[:,:-1], data_shuffled[:,-1])
model.train(x_train, y_train)
# accuracy on the validation split
preds = model.pred(x_val)
check_accuracy(y_val, preds)

Got 33 / 59 correct 55.93%


In [10]:
# Evaluate the same trained model on the held-out test split.
# Note: this rebinds `preds`, replacing the validation predictions.
preds = model.pred(x_test)
check_accuracy(y_test, preds)

Got 34 / 59 correct 57.63%


感觉正确率挺低的所以用sklearn验证一下

In [11]:
# Sanity check: fit scikit-learn's multinomial naive Bayes on the same split
# and report its accuracy with the same helper.
# NOTE(review): MultinomialNB is documented for count-like features, whereas
# these columns look like continuous measurements -- confirm that it is the
# intended baseline for this comparison.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train, y_train)
# validation split
clf_preds = clf.predict(x_val)
check_accuracy(y_val, clf_preds)
# test split (rebinds clf_preds)
clf_preds = clf.predict(x_test)
check_accuracy(y_test, clf_preds)

Got 31 / 59 correct 52.54%
Got 32 / 59 correct 54.24%
