In [20]:
import numpy as np

In [21]:
def fit(x_train, y_train):
    """Build the count tables used by the categorical naive Bayes classifier.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Discrete (already labelled/binned) feature values.
    y_train : array-like of shape (n_samples,)
        Class label of every training row.

    Returns
    -------
    dict
        ``result["total_data"]`` is the number of training rows. For every
        class label ``c`` found in ``y_train``, ``result[c]["total_count"]``
        is the number of rows of that class and ``result[c][j][v]`` is the
        number of those rows whose feature ``j`` equals ``v``. Every value
        that feature ``j`` takes anywhere in ``x_train`` gets an entry, so a
        value never seen together with class ``c`` is stored with count 0.
    """
    # Accept plain lists as well as arrays: the boolean masks and column
    # slices below need numpy semantics.
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    # Set once, up front, so the key exists even when there are no classes.
    result = {"total_data": len(y_train)}
    if len(y_train) == 0:
        return result

    # Neither of these depends on the class, so compute them a single time.
    num_features = x_train.shape[1]
    possible_values = [set(x_train[:, j]) for j in range(num_features)]

    for current_class in set(y_train):
        current_class_rows = (y_train == current_class)
        x_train_current = x_train[current_class_rows]

        class_table = {"total_count": int(current_class_rows.sum())}
        for j in range(num_features):
            feature_column = x_train_current[:, j]
            class_table[j] = {
                curr_val: (feature_column == curr_val).sum()
                for curr_val in possible_values[j]
            }
        result[current_class] = class_table
    return result

In [22]:
def predict(x_test, result):
    """Classify every row of ``x_test``.

    ``result`` is the count dictionary produced by ``fit``. Returns a list
    with one predicted class label per row, in the same order as ``x_test``.
    """
    return [predictSinglePoint(row, result) for row in x_test]

In [23]:
def predictSinglePoint(x, result):
    """Return the class label with the highest log-probability for one row.

    ``x`` is a single feature vector and ``result`` is the count dictionary
    produced by ``fit``. Ties keep the class encountered first; ``-1`` is
    returned when ``result`` contains no class entries.
    """
    # "total_data" is bookkeeping stored next to the class tables, not a class.
    candidates = [label for label in result.keys() if not (label == "total_data")]

    winner = -1
    winner_score = -1000
    for position, label in enumerate(candidates):
        score = log_probability(x, result, label)
        # The first candidate is always accepted, whatever its score.
        if position == 0 or score > winner_score:
            winner, winner_score = label, score
    return winner

In [24]:
# Work with logarithms of probabilities: the per-feature terms are then summed
# instead of multiplied, which avoids underflow from many small factors.
def log_probability(x, result, current_class):
    """Return the unnormalised log-probability of ``current_class`` for ``x``.

    Parameters
    ----------
    x : sequence
        One feature vector, indexable by feature position.
    result : dict
        Count dictionary produced by ``fit``.
    current_class : hashable
        Class label whose score is wanted; must be a key of ``result``.

    Returns
    -------
    float
        ``log P(class)`` plus, for every feature ``j``, the log of the
        Laplace-smoothed estimate of ``P(x[j] | class)``.
    """
    class_table = result[current_class]
    class_count = class_table["total_count"]

    # Log prior of the class.
    output = np.log(class_count) - np.log(result["total_data"])

    # Every key except "total_count" is a feature index.
    num_features = len(class_table) - 1
    for j in range(num_features):
        value_counts = class_table[j]
        # A value never seen in training has no entry; treat it as a zero
        # count (the +1 smoothing keeps the probability non-zero) instead of
        # failing with KeyError.
        count_current_feature = value_counts.get(x[j], 0) + 1
        # Laplace smoothing: add one pseudo-count per known feature value.
        current_class_prob = class_count + len(value_counts)
        output += np.log(count_current_feature) - np.log(current_class_prob)
    return output

In [25]:
# The hand-written naive Bayes works on categorical features, so the
# continuous iris measurements are converted to a small set of labels.
def makelabelled(column):
    """Bin a continuous column into the labels 0, 1, 2, 3 -- in place.

    The thresholds are derived from the column mean ``m``:

    * value < 0.5 * m  -> 0
    * value < m        -> 1
    * value < 1.5 * m  -> 2
    * otherwise        -> 3

    Parameters
    ----------
    column : numpy.ndarray
        One-dimensional array of values. It is overwritten with the labels,
        so pass a copy if the original values are still needed.

    Returns
    -------
    numpy.ndarray
        The same ``column`` object, now holding the labels.
    """
    second_limit = column.mean()
    first_limit = 0.5 * second_limit
    third_limit = 1.5 * second_limit

    # Compute every label from the original values before writing anything
    # back. The nested where() mirrors an if/elif chain: first match wins.
    labels = np.where(
        column < first_limit,
        0,
        np.where(column < second_limit, 1, np.where(column < third_limit, 2, 3)),
    )
    column[:] = labels
    return column

In [26]:
from sklearn import datasets

# Load the iris dataset and keep its feature matrix and class labels at hand.
iris = datasets.load_iris()
x, y = iris.data, iris.target

In [27]:
# Bin every feature column into the labels 0-3. makelabelled overwrites its
# argument, so each column is copied out of iris.data first. Building x from
# the untouched iris.data keeps this cell idempotent: re-running it no longer
# re-bins values that were already turned into labels.
x = np.column_stack(
    [makelabelled(iris.data[:, i].copy()) for i in range(iris.data.shape[-1])]
)

In [28]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for evaluation; the fixed random_state keeps
# the split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [29]:
# Train the hand-written naive Bayes: build the per-class count tables.
dictionary = fit(x_train, y_train)

In [30]:
# Predict a class label for every held-out row from the fitted count tables.
y_pred = predict(x_test, dictionary)

In [31]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision/recall followed by the confusion matrix for the
# hand-written classifier's predictions on the held-out rows.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       0.94      1.00      0.97        16
          2       1.00      0.89      0.94         9

avg / total       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


In [32]:
# How to apply naive Bayes to continuous-valued data:
# binning the values into labels will not always work, so instead
# assume each feature follows a Gaussian (normal) distribution within a class
# and use the Gaussian probability density function as the likelihood.
# scikit-learn's built-in GaussianNB uses this approach.

In [33]:
from sklearn.naive_bayes import GaussianNB

# Comparison: scikit-learn's Gaussian naive Bayes on the same split.
# fit() returns the estimator itself, so creation and training are chained.
clf = GaussianNB().fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

             precision    recall  f1-score   support

          0       1.00      0.85      0.92        13
          1       0.76      1.00      0.86        16
          2       1.00      0.67      0.80         9

avg / total       0.90      0.87      0.87        38

[[11  2  0]
 [ 0 16  0]
 [ 0  3  6]]


In [36]:
# Prior probability of each class, as estimated by the fitted classifier `clf`.
clf.class_prior_

array([0.33035714, 0.30357143, 0.36607143])

In [37]:
# Multinomial naive Bayes performs well on text classification; here it is
# tried on the same split for comparison.
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

             precision    recall  f1-score   support

          0       1.00      0.85      0.92        13
          1       0.00      0.00      0.00        16
          2       0.36      1.00      0.53         9

avg / total       0.43      0.53      0.44        38

[[11  2  0]
 [ 0  0 16]
 [ 0  0  9]]
