# Naive Bayes Algorithm Implementation

### Assumption: the features are conditionally independent given the class

## $p(Y=c_k|X=x)=\frac{p(X=x|Y=c_k)p(Y=c_k)}{\sum_k{p(X=x|Y=c_k)p(Y=c_k)}}=\frac{p(Y=c_k)\prod_j{p(X^j=x^j|Y=c_k)}}{\sum_k{p(Y=c_k)\prod_j{p(X^j=x^j|Y=c_k)}}}$ 


### The denominator $\sum_k{p(Y=c_k)\prod_j{p(X^j=x^j|Y=c_k)}}$ is the same for every class $c_k$, so we have:
### $\arg\max_{c_k}{p(Y=c_k|X=x)}=\arg\max_{c_k}{p(Y=c_k)\prod_j{p(X^j=x^j|Y=c_k)}}$

In [1]:
import numpy as np
import pandas as pd

In [2]:
class NaiveBayes():
    """Categorical naive Bayes classifier with optional Laplace smoothing.

    The model is "lazy": ``dataProcessing`` only stores the training data and
    all probabilities are estimated from it inside ``predict``.
    """

    def __init__(self, laplas=0):
        """Create an untrained model.

        Args:
            laplas: additive (Laplace) smoothing constant; 0 disables
                smoothing and yields plain maximum-likelihood estimates.
        """
        self.__x = None
        self.__y = None
        self.__laplas = laplas
        self.__predicts = None

    def dataProcessing(self, data):
        """Store the training set.

        Args:
            data: 2-D array or ``pandas.DataFrame`` whose last column is the
                class label and whose remaining columns are the features.
        """
        if isinstance(data, pd.DataFrame):
            data = data.to_numpy()
        self.__x = data[:, :-1]
        self.__y = data[:, -1]

    def predict(self, x):
        """Predict the class of every row of ``x``.

        Args:
            x: iterable of feature rows (2-D array, list of rows, or
                ``pandas.DataFrame``) with the same column order as the
                training features.

        Returns:
            A list with one predicted label per row. The result is also
            remembered for a later ``accuracy`` call.
        """
        # Iterating a DataFrame yields column labels, not rows, so convert it
        # the same way dataProcessing does.
        if isinstance(x, pd.DataFrame):
            x = x.to_numpy()

        laplas = self.__laplas
        values, counts = np.unique(self.__y, return_counts=True)
        # priors = p(Y=ck) with Laplace smoothing
        priors = (counts + laplas) / (len(self.__y) + laplas * len(values))

        # Everything below depends only on the training data, so compute it
        # once instead of once per predicted row.
        # Training rows belonging to each class, in the order of `values`.
        classSets = [self.__x[self.__y == v] for v in values]
        # Number of distinct values each feature takes over the WHOLE training
        # set. Using the per-class count here would leave the smoothed
        # conditional probabilities un-normalised and favour classes that
        # happen to contain fewer distinct values.
        attrClsNums = [len(np.unique(self.__x[:, j]))
                       for j in range(self.__x.shape[1])]

        preds = []
        for entry in x:
            clsProbs = []
            for i, vset in enumerate(classSets):
                # number of training rows in this class
                dataNum = counts[i]
                # smoothed p(X^j = x^j | Y = ck) for every feature j
                conditProbs = [
                    ((attr == vset[:, j]).sum() + laplas)
                    / (dataNum + attrClsNums[j] * laplas)
                    for j, attr in enumerate(entry)
                ]
                # np.prod of an empty list is 1.0, so a row without features
                # falls back to the prior instead of raising.
                clsProbs.append(priors[i] * np.prod(conditProbs))
            # argmax returns the first maximum, i.e. ties go to the smallest
            # label, matching the sorted order of `values`.
            preds.append(values[int(np.argmax(clsProbs))])
        self.__predicts = preds
        return preds

    def accuracy(self, y):
        """Return the fraction of the latest predictions that equal ``y``.

        Args:
            y: true labels in the same order as the rows last passed to
                ``predict``; may be a list, a 1-D array or a column vector.
        """
        # Compare as flat arrays: list == list would collapse to one bool and
        # a column vector would broadcast into an n-by-n comparison.
        truth = np.asarray(y).ravel()
        return np.mean(np.asarray(self.__predicts) == truth)

In [3]:
# Training ("history") set: 100 rows of 5 categorical features in {0..4}
# followed by a label column drawn from {-1, 1}; seed 42 keeps it reproducible.
np.random.seed(42)
hx = np.random.randint(0, 5, size=(100, 5))
hy = np.random.choice([-1, 1], size=(100, 1), replace=True)
hdata = np.concatenate((hx, hy), axis=1)

# Test set with the same layout, drawn under a different seed.
np.random.seed(0)
tx = np.random.randint(0, 5, size=(100, 5))
ty = np.random.choice([-1, 1], size=(100, 1), replace=True)
tdata = np.concatenate((tx, ty), axis=1)

In [4]:
# Build an unsmoothed model and hand it the history data; nothing is fitted
# here, the data is only stored for use at prediction time.
NB = NaiveBayes()
NB.dataProcessing(hdata)

# Classify the test rows, then score them against the true labels, which are
# passed as a flat 1-D array rather than the (100, 1) column they were drawn as.
predicts = NB.predict(tx)
accuracy = NB.accuracy(ty.ravel())

print(f'predicts={predicts[:5]}\naccuracy={accuracy}')

predicts=[1, -1, 1, 1, 1]
accuracy=0.52
