<a href="https://colab.research.google.com/github/KayvanShah1/usc-dsci-552-lab-assignments-hw/blob/main/assignment-4/Assignment%204%20-%20Naive%20Bayes%20-%20solution-2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 4 Naive Bayes Classifier

For this assignment you will implement a Naive Bayes Classifier that implements the SKlearn classifier API with `fit`, `predict` and `score` methods.

The Naive Bayes Classifer takes as parameter the density function used in the likelihood calcuation: 
* `normal`: Normal density function
* `knn`: K nearest neighbor density function

Most of the code already has been written for you. You only need to fill in the missing part between 
```
## Insert your code BEGIN

## Insert your code END
```

In [2]:
from functools import partial

import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from scipy.stats import norm

In [86]:
class NaiveBayesClassifier:
    def __init__(self, likelihood='normal', k=None):
        self.likelihood = likelihood
        
        # Let
        #  K = number of unique classes
        #  N = number of test instances
        #  d = number of inputs (input dimensionality)

        # Numpy array unique classes, shape = (K,)
        self.classes = None
        
        # Numpy array of class priors, P(C), shape = (K,)
        self.priors = None
       
        # Numpy array of likelihoods, P(x|C), shape = (N, K),
        self.likelihoods = None

        # Numpy array of posterior probabilities, P(C|x), shape = (N, K)
        self.posteriors = None
        
        ## For the Guassian Density 
        # means, shape = (K, d)
        self.avgs = None
        # variances, shape = (K, d)
        self.vars = None
        
        ## For the knn Density
        # number of neighbors to use
        self.k = k
        # store training X
        self.X_train = None
        # store training y
        self.y_train = None

    
    def generate_classes(self, y):
        """
        Generate the classes based on y, and store in self.classes

        :param y: array of class targets
        """
        self.classes = np.unique(y)
        
    def generate_priors(self, y):
        """
        Compute the prior probabilities and store self.priors

        :param y: array of class targets
 
        """
        ## Insert your code BEGIN
        self.class_count_ = np.zeros(len(self.classes), dtype=np.float64)
        classes = self.classes

        for y_i in classes:
            i = classes.searchsorted(y_i)
            N_i = y[y==y_i].shape[0]
            self.class_count_[i] += N_i

        self.priors =  self.class_count_ / self.class_count_.sum()
        ## Insert your code END
    

    def knn_density_function(self, x_train, x_predict): 
        """
        Implements k-nearest neighbor density estimate (Alpaydin Eq 8.8)

        :param x_train 1d numpy array
        :param x_predict 1d numpy array
        :returns probabilities at x_prdict, shape = x_predict.shape
        """
        # Find the distance to kth nearest neighbor
        result = []
        for x0 in x_predict:
            dist = np.abs(x_train - x0)
            index = np.argsort(dist)
            result.append(dist[index[self.k - 1]])
        dist_k = np.array(result)
        
        # Find the probability at x using knn density
        # Note: Equation 8.8 may return probabilites greater than 1.
        #       For probabilities greater than 1, set it equal to 1.
        ## Insert your code BEGIN
        N = x_predict.shape[0]
        predict_proba = []
        for i in range(N):
            if dist_k[i]==0:
                prob = 1
            else:
                prob = min(self.k/(2 * N * dist_k[i]), 1)
            predict_proba.append(prob)
        return np.array(predict_proba)
        # Return ...
        ## Insert your code END
    
    # Gaussian part
    def generate_avgs(self, X, y):
        """
        Return mean for each class and for each attribute
        """
        ## Insert your code BEGIN
        classes = self.classes
        avgs = []
        for y_i in self.classes:
            X_i = X[y == y_i, :]
            avgs.append(np.mean(X_i, axis=0).tolist())
        return np.array(avgs)
        ## Insert your code END
    
    def generate_vars(self, X, y):
        """
        Return variance for each class and for each attribute
        """
        ## Insert your code BEGIN
        classes = self.classes
        vars = []
        for y_i in self.classes:
            X_i = X[y == y_i, :]
            vars.append(np.var(X_i, axis=0).tolist())
        return np.array(vars)
        ## Insert your code END
    
    ## Insert your code BEGIN
    # Place any method you need here
    # def ...
    ## Insert your code END

    def generate_guassian_likelihoods(self, X):
        ## Insert your code BEGIN
        log_likelihood = []
        for i in range(np.size(self.classes)):
            n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.vars[i, :]))
            n_ij -= 0.5 * np.sum(((X - self.avgs[i, :]) ** 2) / (self.vars[i, :]), 1)
            log_likelihood.append(n_ij)
        return np.array(log_likelihood).T
        ## Insert your code END

    def generate_knn_likelihoods(self, X):
        likelihoods = np.ones([len(self.classes), X.shape[0]])
        for i, aclass in enumerate(self.classes):
            index = self.y_train == aclass
            for attr in range(X.shape[1]):
                ## Insert your code BEGIN
                likelihoods[i, :] *= self.knn_density_function(
                    X[index, attr], X[:, attr]
                )
                ## Insert your code END
        return np.log(likelihoods.T)
    
    def fit(self, X, y):
        # define the classes with ascending order
        self.generate_classes(y)
        # compute the Priori probability
        self.generate_priors(y)
        
        # different likelihood function
        if self.likelihood == 'normal':
            # calculate the avg and var based on X and y
            self.avgs = self.generate_avgs(X, y)
            self.vars = self.generate_vars(X, y)
        elif self.likelihood == 'knn':
            self.X_train = X
            self.y_train = y
        else:
            raise ValueError('Invalid value for likelihood. Must be "normal" or "knn".')
        return self

    def generate_likelihoods(self, X):
        """
        :param ndarray x 
        :returns probabilities at X (like X.shape[0] * Number of classes -> {Poss for each class} )
        """
        # Gussian
        if self.likelihood == "normal":
            self.likelihoods = self.generate_guassian_likelihoods(X)
        elif self.likelihood == "knn":
            self.likelihoods = self.generate_knn_likelihoods(X)
        else:
            raise ValueError('Invalid value for likelihood Must be "normal" or "knn".')
        return self.likelihoods

    def predict(self, X):
        """
        :param ndarray x 
        :returns prediction
        """
        self.likelihoods = self.generate_likelihoods(X)
        ## Insert your code BEGIN
        self.posteriors = np.log(self.priors) + self.likelihoods
        prediction = [self.classes[np.argmax(i)] for i in self.posteriors]
        ## Insert your code END
        return prediction

    def score(self, X, y, sample_weight=None):
        return accuracy_score(self.predict(X), y, sample_weight=sample_weight)

In [81]:
iris = load_iris()
x = iris['data']
y = iris['target']

In [82]:
# Create an instance of the classifier with a normal likelihood distribution
clf = NaiveBayesClassifier(likelihood='normal')

# # Fit the classifier to the training data
clf.fit(x, y)

# # Use the classifier to make predictions on new data
y_pred = clf.predict(x)

# # Evaluate the accuracy of the classifier
accuracy = clf.score(x, y)
print('Accuracy:', accuracy)

Accuracy: 0.96


In [83]:
np.set_printoptions(precision=3)

print("\nmeans:\n", clf.avgs)

print("\nvariances:\n", clf.vars)

print('\nprior probability:\n', clf.priors)

print('\nlikelihoods:')
print(clf.likelihoods[:5, :])
print(clf.likelihoods[50:55, :])
print(clf.likelihoods[100:105, :])

print('\nposteriors:')
print(clf.posteriors[:5, :])
print(clf.posteriors[50:55, :])
print(clf.posteriors[100:105, :])

print('\npredictions:')
print(y_pred[:5])
print(y_pred[50:55])
print(y_pred[100:105])


means:
 [[5.006 3.428 1.462 0.246]
 [5.936 2.77  4.26  1.326]
 [6.588 2.974 5.552 2.026]]

variances:
 [[0.122 0.141 0.03  0.011]
 [0.261 0.097 0.216 0.038]
 [0.396 0.102 0.298 0.074]]

prior probability:
 [0.333 0.333 0.333]

likelihoods:
[[  2.161 -38.979 -55.744]
 [  1.519 -37.209 -55.192]
 [  1.268 -40.108 -57.749]
 [  1.198 -37.564 -55.283]
 [  2.111 -40.127 -56.697]]
[[-252.68    -3.084   -4.495]
 [-234.161   -1.337   -4.185]
 [-284.931   -3.124   -2.948]
 [-163.146   -1.113  -11.486]
 [-247.01    -0.715   -3.712]]
[[-5.863e+02 -2.613e+01 -2.646e+00]
 [-3.517e+02 -5.429e+00 -1.766e+00]
 [-5.074e+02 -1.634e+01 -7.383e-01]
 [-4.061e+02 -6.860e+00 -6.451e-01]
 [-5.012e+02 -1.577e+01 -4.852e-01]]

posteriors:
[[  1.063 -40.078 -56.843]
 [  0.421 -38.308 -56.29 ]
 [  0.169 -41.207 -58.848]
 [  0.099 -38.662 -56.382]
 [  1.012 -41.226 -57.796]]
[[-253.779   -4.182   -5.594]
 [-235.259   -2.436   -5.283]
 [-286.029   -4.223   -4.047]
 [-164.245   -2.211  -12.585]
 [-248.109   -1.814   

In [87]:
# Create an instance of the classifier with a normal likelihood distribution
clf = NaiveBayesClassifier(likelihood='knn', k=3)

# # Fit the classifier to the training data
clf.fit(x, y)

# # Use the classifier to make predictions on new data
y_pred = clf.predict(x)

# # Evaluate the accuracy of the classifier
accuracy = clf.score(x, y)
print('Accuracy:', accuracy)

Accuracy: 0.96


In [88]:
np.set_printoptions(precision=3)

print('prior probability:\n', clf.priors)

print('\nlikelihoods:')
print(clf.likelihoods[:5, :])
print(clf.likelihoods[50:55, :])
print(clf.likelihoods[100:105, :])

print('\nposteriors:')
print(clf.posteriors[:5, :])
print(clf.posteriors[50:55, :])
print(clf.posteriors[100:105, :])

print('\npredictions:')
print(y_pred[:5])
print(y_pred[50:55])
print(y_pred[100:105])

prior probability:
 [0.333 0.333 0.333]

likelihoods:
[[  0.    -15.333 -17.093]
 [  0.    -11.932 -15.079]
 [ -2.303 -13.082 -15.331]
 [  0.    -13.264 -15.367]
 [  0.    -15.621 -17.941]]
[[-15.176  -2.996  -7.601]
 [-14.584  -2.303  -5.704]
 [-15.256  -4.605  -2.303]
 [-16.489   0.    -13.775]
 [-17.748  -2.303  -5.298]]
[[-17.808 -11.503  -2.303]
 [-16.543  -6.397   0.   ]
 [-16.118 -11.918  -4.605]
 [-17.305  -7.244  -2.303]
 [-15.591 -10.897   0.   ]]

posteriors:
[[ -1.099 -16.431 -18.192]
 [ -1.099 -13.03  -16.177]
 [ -3.401 -14.18  -16.429]
 [ -1.099 -14.362 -16.466]
 [ -1.099 -16.719 -19.039]]
[[-16.275  -4.094  -8.7  ]
 [-15.682  -3.401  -6.802]
 [-16.355  -5.704  -3.401]
 [-17.588  -1.099 -14.873]
 [-18.847  -3.401  -6.397]]
[[-18.906 -12.601  -3.401]
 [-17.642  -7.496  -1.099]
 [-17.216 -13.017  -5.704]
 [-18.403  -8.343  -3.401]
 [-16.69  -11.995  -1.099]]

predictions:
[0, 0, 0, 0, 0]
[1, 1, 2, 1, 1]
[2, 2, 2, 2, 2]
