In [2]:
import numpy as np
from sklearn.datasets import load_iris

# Naive Bayes Classifier

In [43]:
class NBC:
    """Gaussian naive Bayes classifier (NBC).

    Every feature is modelled, per class, by an independent univariate
    Gaussian whose parameters are the empirical mean and variance of that
    feature among the training rows of the class. Class priors are the
    empirical class frequencies.

    Class labels are expected to be the integers ``0 .. num_classes - 1``.
    """

    def __init__(self, feature_types, num_classes):
        """Store the model configuration.

        Parameters
        ----------
        feature_types : list
            One entry per feature; so far only 'r' ('real') is supported.
            The list is stored but not otherwise used by this class.
        num_classes : int
            Number of classes to classify.
        """
        self.feature_types = feature_types
        self.num_classes = num_classes

    def fit(self, Xtrain, ytrain):
        """Estimate the class priors and per-class Gaussian parameters.

        Parameters
        ----------
        Xtrain : array-like of shape (N, D)
            Training feature matrix, one row per example.
        ytrain : array-like of shape (N,)
            Class label of every row, each in ``range(num_classes)``.

        Raises
        ------
        ValueError
            If ``Xtrain`` is not a non-empty 2-D array or the number of
            labels differs from the number of rows.

        After fitting, the following attributes are set:
        ``N`` (number of examples), ``N_c`` (examples per class),
        ``y_params`` (class priors ``N_c / N``), ``means`` and ``variances``
        (both of shape ``(num_classes, D)``).
        """
        # Work on the arguments, never on notebook-level globals.
        Xtrain = np.asarray(Xtrain, dtype=float)
        ytrain = np.asarray(ytrain)
        if Xtrain.ndim != 2 or Xtrain.shape[0] == 0:
            raise ValueError("Xtrain must be a non-empty 2-D array of shape (N, D)")
        if ytrain.shape[0] != Xtrain.shape[0]:
            raise ValueError("Xtrain and ytrain must contain the same number of examples")

        self.N = Xtrain.shape[0]  # total number of examples = number of rows
        num_features = Xtrain.shape[1]

        counts, means, variances = [], [], []
        for c in range(self.num_classes):
            # TODO: adapt this so that c can be arbitrary, not in range(num_classes)
            X_c = Xtrain[ytrain == c]
            counts.append(X_c.shape[0])
            if X_c.shape[0] == 0:
                # No examples of this class: its prior is 0, so it can never
                # win; placeholder parameters keep NaNs out of the model.
                means.append(np.zeros(num_features))
                variances.append(np.ones(num_features))
                continue
            var_c = X_c.var(axis=0)
            means.append(X_c.mean(axis=0))
            variances.append(np.where(var_c == 0, 1e-6, var_c))  # avoid division by 0

        self.N_c = np.array(counts)          # number of examples for each class c
        self.y_params = self.N_c / self.N    # empirical prior #(y=c)/N
        self.means = np.array(means)         # mean of every feature, by class
        self.variances = np.array(variances) # variance of every feature, by class

    def predict(self, x_new):
        """Predict the class of a single feature vector.

        Parameters
        ----------
        x_new : array-like of shape (D,)
            Feature vector to classify.

        Returns
        -------
        (winning_class, probability)
            The most probable class index and its posterior probability
            ``p(y = winning_class | x_new)``. If no class has a finite score
            (for example because ``x_new`` contains NaN or inf), returns
            ``(-1, nan)``.
        """
        x_new = np.asarray(x_new, dtype=float)

        # Log likelihood of x_new given each class, computed directly in log
        # space so that tiny densities do not underflow to log(0).
        log_p_x_given_y = self._log_gaussian(x_new, self.means, self.variances).sum(axis=1)
        with np.errstate(divide='ignore'):  # a class with prior 0 yields -inf on purpose
            log_p_y = np.log(self.y_params)
        log_p_xy = log_p_x_given_y + log_p_y  # numerators of the posterior, in log space

        finite = np.isfinite(log_p_xy)
        if not finite.any():
            return -1, np.nan

        scores = np.where(finite, log_p_xy, -np.inf)
        winning_class = int(np.argmax(scores))  # first maximum wins on ties
        # Log-sum-exp trick: dividing by the best score before exponentiating
        # keeps the denominator of the posterior away from 0.
        probability = 1.0 / np.sum(np.exp(scores - scores[winning_class]))
        return winning_class, probability

    def _log_gaussian(self, val, mean, var):
        # Natural log of the univariate Gaussian density, elementwise.
        return -0.5 * (np.log(2 * np.pi * var) + (val - mean) ** 2 / var)

    def gaussian(self, val, mean, var):
        """Return the univariate Gaussian density N(val; mean, var), elementwise."""
        return np.exp(-0.5 * (val - mean) ** 2 / var) / np.sqrt(2 * np.pi * var)
        

In [44]:
# load dataset and fit model
iris = load_iris()
X, y = iris['data'], iris['target']

# feature_types needs one 'r' (real-valued) entry per feature column;
# derive the length from the data instead of hard-coding it
nbc = NBC(feature_types=['r'] * X.shape[1], num_classes=3)
nbc.fit(X, y)

In [45]:
# quick sanity check: classify a handful of rows and compare each
# prediction with the true label
test_sample = [1,8,12,17,25,32,60,80,90,120]
labels = y[test_sample]
samples = X[test_sample,:]
print(y)
print(samples)
for i in range(len(test_sample)):
    x = samples[i]
    c, _ = nbc.predict(x)  # the posterior probability is not needed here
    print("Prediction: " + str(c) + "; actual: " + str(labels[i]))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
[[4.9 3.  1.4 0.2]
 [4.4 2.9 1.4 0.2]
 [4.8 3.  1.4 0.1]
 [5.1 3.5 1.4 0.3]
 [5.  3.  1.6 0.2]
 [5.2 4.1 1.5 0.1]
 [5.  2.  3.5 1. ]
 [5.5 2.4 3.8 1.1]
 [5.5 2.6 4.4 1.2]
 [6.9 3.2 5.7 2.3]]
Prediction: 0; actual: 0
Prediction: 0; actual: 0
Prediction: 0; actual: 0
Prediction: 0; actual: 0
Prediction: 0; actual: 0
Prediction: 0; actual: 0
Prediction: 1; actual: 1
Prediction: 1; actual: 1
Prediction: 1; actual: 1
Prediction: 2; actual: 2
