# Homework 02: Naïve Bayes’ Classifier
## Hamza Dehidi KU 0077989
### October 24, 2021

In [1]:
import numpy as np
import pandas as pd
import math


## Importing Data

In [3]:
# read data into memory
images = np.genfromtxt(r'C:\Users\Hamza\Desktop\Koç\2- DASC 521\4- Assignments\02-Naïve Bayes’ Classifier\Initial\hw02_images.csv', delimiter = ",")
labels = np.genfromtxt(r'C:\Users\Hamza\Desktop\Koç\2- DASC 521\4- Assignments\02-Naïve Bayes’ Classifier\Initial\hw02_labels.csv', delimiter = ",")

In [4]:
# Split data to train and test
images_train = images[:30000,:].astype(int)
images_test = images[30000:35001,:].astype(int)

y_truth_train = labels[:30000].astype(int)
y_truth_test = labels[30000:35001].astype(int)


In [5]:
# number of dimensions
d = images_train.shape[1]
# number of labels
k = np.max(y_truth_train)

## Parameter Estimation

### Calculate sample means

$\widehat{\mu_{c}} = \dfrac{\sum\limits_{i = 1}^{N} x_{i} \mathbb{1}(y_{i} = c)}{\sum\limits_{i = 1}^{N} \mathbb{1}(y_{i} = c)}$

In [6]:
# Calculate sample means for each dataset along each dimension

sample_means = np.reshape([np.mean((images_train[y_truth_train == (c + 1)]),axis=0)
                           for c in range(k)],(k,d))
                           
                    

In [7]:
print(sample_means)

[[254.99866667 254.98416667 254.85616667 ... 254.679      254.87816667
  254.95933333]
 [254.99733333 254.99733333 254.9965     ... 254.96883333 254.99216667
  254.98866667]
 [254.99933333 254.99933333 254.99233333 ... 251.52483333 254.4725
  254.97483333]
 [254.99666667 254.98983333 254.91416667 ... 252.39516667 254.44166667
  254.93666667]
 [254.999      254.98433333 254.93783333 ... 250.673      253.23333333
  254.79083333]]


### Calculate sample standard deviation

$\widehat{\sigma_{c}} = \dfrac{\sum\limits_{i = 1}^{N} \sqrt{(x_{i} - \widehat{\mu_{c}})^{2}} \mathbb{1}(y_{i} = c)} {\sum\limits_{i = 1}^{N} \mathbb{1}(y_{i} = c)}$


In [8]:
# Compute the standard deviation along the specified dimension for specific label.
sample_std = np.reshape([(np.std(images_train[y_truth_train == (c + 1)], axis=0))
                                        for c in range(k)],(k,d))

In [9]:
images_train.shape

(30000, 784)

In [10]:
print(sample_std)

[[ 0.09127736  0.25609108  1.31090756 ...  5.29826629  3.9117332
   1.93959091]
 [ 0.2065419   0.2065419   0.2163818  ...  1.04076669  0.47057267
   0.70062226]
 [ 0.05163547  0.04081939  0.16002465 ... 18.43665868  6.7881694
   1.1061344 ]
 [ 0.18436076  0.21617116  1.81046936 ... 15.67799977  6.34549162
   1.79971911]
 [ 0.04471018  0.64582342  3.03248555 ... 23.62576428 13.9167006
   4.4727787 ]]


### Calculate prior probabilities

$\widehat{P}(y_{i} = c) = \dfrac{\sum\limits_{i = 1}^{N} \mathbb{1}(y_{i} = c)}{N}$

In [11]:
# calculate prior probabilities
class_priors = [np.mean(y_truth_train == (c + 1)) for c in range(k)]

In [12]:
print(class_priors)

[0.2, 0.2, 0.2, 0.2, 0.2]


## Parametric Classification

\begin{align*}
g_{c}(x) &= \log p(x | y = c) + \log P(y = c)\\
&= -\dfrac{d}{2} \log(2 \pi) - \sum_{n=1}^{d}(\log\sigma_{c})-\dfrac{1}{2}(x - \mu_{c})^{T}\sigma_{c}^{-2}(x - \mu_{c}) + \log P(y = c)
\end{align*}

## Creating score function


In [22]:
#I'm using the first method not the Ws.
# for i in range(images_test.shape[0]) produces 30,000 itirations,the number of observations in the test_data. 
# for c in range(k) gives 5 itirations, the number of classes.
# remove -.5 with **2 from the second term.

def score_func(data_test):
    total_scores = np.zeros((data_test.shape[0], 0))
    for c in range(k):
        scores = np.vstack([- 0.5 * d * np.log(2 * math.pi) - np.sum(np.log(sample_std[c]))
                         - 0.5 * np.inner(((data_test[j] - sample_means[c]).T * (sample_std[c]**-2)),(data_test[j] - sample_means[c])) 
                         + np.log(class_priors[c]) for j in range(data_test.shape[0])])
        print(scores.shape)
        total_scores = np.hstack((total_scores,scores))
        
    return total_scores


In [23]:
# This loop will assign the highest score for observation.

def classifier(data_test):
    
    scores = score_func(data_test)
    print(scores.shape)
    y_predicted = np.zeros((data_test.shape[0])).astype(int)
    for r in range(data_test.shape[0]):
            if scores[r,0] == np.max(scores[r,:]): y_predicted[r] = 1
            elif scores[r,1] == np.max(scores[r,:]): y_predicted[r] = 2
            elif scores[r,2] == np.max(scores[r,:]): y_predicted[r] = 3
            elif scores[r,3] == np.max(scores[r,:]): y_predicted[r] = 4        
            elif scores[r,4] == np.max(scores[r,:]): y_predicted[r] = 5            
    
         
    return y_predicted

## Calculate the confusion matrix

In [24]:
class_predicted_training = classifier(images_train)

(30000, 1)
(30000, 1)
(30000, 1)
(30000, 1)
(30000, 1)
(30000, 5)


In [19]:
images_train.shape[0]

30000

In [None]:
# Calculate the confusion matrix for the data points in training set
confusion_matrix = pd.crosstab(class_predicted_training, y_truth_train, rownames = ['y_pred'], colnames = ['y_truth'])
print(confusion_matrix)

In [None]:
class_predicted_test = classifier(images_test)

In [None]:
# Calculate the confusion matrix for the data points in test set
confusion_matrix = pd.crosstab(class_predicted_test, y_truth_test, rownames = ['y_pred'], colnames = ['y_truth'])
print(confusion_matrix)