In [1]:
import numpy as np
import pandas as pd

In [2]:
def fit(X_train,Y_train):
    """Build the count tables for a categorical Naive Bayes classifier.

    Parameters
    ----------
    X_train : 2-D array-like, one row per sample and one column per feature.
    Y_train : 1-D array-like of class labels, one per row of ``X_train``.

    Returns
    -------
    dict
        ``result["total_data"]`` is the number of training samples.
        For every class label ``c`` found in ``Y_train``:
        ``result[c]["total_count"]`` is the number of samples of class ``c``
        and ``result[c][j][v]`` is how many samples of class ``c`` have the
        value ``v`` in feature ``j``.  Features are numbered from 1.

    Notes
    -----
    The string ``"total_data"`` shares the top-level dict with the class
    labels, so it must not be used as a class label itself.
    """
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)
    feature_number = X_train.shape[1]

    # Distinct values are collected over ALL rows (not per class) so that
    # every class gets an explicit, possibly zero, count for each value.
    feature_values = [set(X_train[:, j]) for j in range(feature_number)]

    result = {"total_data": len(Y_train)}
    for class_val in set(Y_train):
        # Select this class's rows once instead of once per (feature, value).
        class_rows = X_train[Y_train == class_val]
        class_counts = {"total_count": len(class_rows)}

        for j, values in enumerate(feature_values, start=1):
            column = class_rows[:, j - 1]
            class_counts[j] = {k: (column == k).sum() for k in values}

        result[class_val] = class_counts

    return result

In [3]:
def probability(result,x,ai):
    """Return the unnormalised log-probability of class ``ai`` for point ``x``.

    Parameters
    ----------
    result : dict with the layout read below: ``result["total_data"]``,
        ``result[ai]["total_count"]`` and ``result[ai][j][value]`` counts,
        with features numbered from 1.
    x : iterable of feature values, in feature order.
    ai : class label to score.

    Returns
    -------
    float
        ``log P(ai) + sum_j log P(x_j | ai)``, where every conditional is
        Laplace-smoothed: ``(count + 1) / (class_count + n_values_j)``.

    A feature value that has no entry in the count table is treated as a
    count of zero, so it receives the smoothed minimum probability instead
    of raising ``KeyError``.
    """
    class_count = result[ai]["total_count"]
    log_prior = np.log(class_count) - np.log(result["total_data"])

    log_likelihood = 0
    for feature_number, f in enumerate(x, start=1):
        value_counts = result[ai][feature_number]
        # .get(): a value missing from the table counts as zero occurrences.
        count = value_counts.get(f, 0)
        log_likelihood = (
            log_likelihood
            + np.log(count + 1)
            - np.log(class_count + len(value_counts))
        )

    return log_prior + log_likelihood

In [4]:
def PredictSinglePoint(result,x):
    """Return the class label with the highest ``probability`` score for ``x``.

    Every key of ``result`` except ``"total_data"`` is treated as a candidate
    class.  On equal scores the candidate met first is kept.  When there is
    no candidate at all, ``-1`` is returned.
    """
    best_class = -1
    best_p = -1000
    have_candidate = False

    for label in result.keys():
        # "total_data" is bookkeeping, not a class.
        if label == "total_data":
            continue

        score = probability(result, x, label)

        # Keep the current best unless this is the first candidate or the
        # new score is strictly higher.
        if have_candidate and not (best_p < score):
            continue

        best_p = score
        best_class = label
        have_candidate = True

    return best_class

In [5]:
def predict(result, X_test):
    """Classify every row of ``X_test`` and return the labels as a list.

    ``result`` is handed unchanged to ``PredictSinglePoint`` for each row;
    the output list has one entry per row, in the same order.
    """
    return [PredictSinglePoint(result, row) for row in X_test]

In [6]:
from sklearn import datasets
# NOTE: despite the variable name `iris`, the dataset loaded here is the
# breast-cancer dataset.
iris = datasets.load_breast_cancer()
X = iris.data    # features; same array object as iris.data, not a copy
Y = iris.target  # class labels

In [7]:
def LabelData(col):
    """Discretise a numeric column into four bins relative to its mean.

    With ``m = mean(col)`` the tests are applied in this order and the first
    one that holds decides the label:

    * ``value < 0.5 * m`` -> 0
    * ``value < m``       -> 1
    * ``value < 1.5 * m`` -> 2
    * otherwise           -> 3

    Parameters
    ----------
    col : 1-D numeric sequence (typically a NumPy column view).

    Returns
    -------
    The same object ``col``: the labels are written back into it in place,
    so a view into a larger array modifies that array.
    """
    mean = np.mean(col)
    first = 0.5*mean
    second = mean
    third = 1.5*mean

    values = np.asarray(col)
    # np.select takes the first matching condition, which mirrors the
    # if/elif order above (this matters when the mean is negative and the
    # thresholds are therefore not ascending).  All labels are computed
    # before anything is written back.
    labels = np.select(
        [values < first, values < second, values < third],
        [0, 1, 2],
        default=3,
    )
    col[:] = labels
    return col

In [8]:
# Discretise every feature column with LabelData.
# Start from a fresh copy of the raw data: X was bound to iris.data itself,
# so writing into it would overwrite the loaded dataset, and re-running this
# cell would bin values that had already been binned.
X = iris.data.copy()
for i in range(X.shape[1]):
    X[:,i] = LabelData(X[:,i])

In [9]:
from sklearn.model_selection import train_test_split
# Split features and labels into train and test parts; random_state=0 keeps
# the split the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,random_state=0)

In [10]:
# Build the per-class count tables from the training split.
result = fit(X_train, Y_train)

In [11]:
# Classify every test row with the hand-written model (returns a list).
Y_pred = predict(result, X_test)

In [12]:
from sklearn.metrics import confusion_matrix, classification_report
# Evaluate the hand-written model against the held-out labels.
print(classification_report(Y_test,Y_pred))
confusion_matrix(Y_test,Y_pred)

              precision    recall  f1-score   support

           0       0.86      0.92      0.89        53
           1       0.95      0.91      0.93        90

    accuracy                           0.92       143
   macro avg       0.91      0.92      0.91       143
weighted avg       0.92      0.92      0.92       143



array([[49,  4],
       [ 8, 82]], dtype=int64)

In [13]:
# Scratch check of the counting expression used inside fit(): keep the rows
# where b == 1, take column 1, and count how many entries equal 2.
a=np.array([[1,2,3,4],[5,2,7,8],[9,2,11,12],[13,14,15,16]])
b = np.array([1,1,1,0])
(a[b==1][:,1]==2).sum()

3

In [14]:
from sklearn.naive_bayes import GaussianNB,BernoulliNB, MultinomialNB
# Baseline for comparison: scikit-learn's MultinomialNB trained and evaluated
# on the same train/test split as the hand-written model.
clf = MultinomialNB()
clf.fit(X_train,Y_train)
Y_pre_NB = clf.predict(X_test)
print(classification_report(Y_test,Y_pre_NB))
confusion_matrix(Y_test,Y_pre_NB)

              precision    recall  f1-score   support

           0       0.83      0.94      0.88        53
           1       0.96      0.89      0.92        90

    accuracy                           0.91       143
   macro avg       0.90      0.92      0.90       143
weighted avg       0.92      0.91      0.91       143



array([[50,  3],
       [10, 80]], dtype=int64)