In [7]:
import numpy as np
import pandas as pd
from keras.datasets import mnist

In [8]:
class NaiveBayes(object):

    def train_data(self, x_train, y_train, smoothing=1000):
        '''
        This function will train the data
        param: x_train -> features
        param: y_train -> labels
        params: smoothing -> to add into variance

        '''
        self.smoothing = smoothing
        self.no_rows, self.x_features = x_train.shape #getting no of samples and no of features(here features are 28*28=784)
        X_train = np.zeros([self.no_rows, self.x_features+1]) #making a mask for merging the features and label in one numpy array
        X_train[:,:-1] = x_train 
        X_train[:,-1]=y_train

        self.unique_classes=np.unique(X_train[:,-1]) # getting no of unique classes
        self.no_classes=len(self.unique_classes)            
        np.random.shuffle(X_train) # shuffling the data to avoid biasness

        '''
        Creating the mask to calculate mean and variance
        of individual pixels and individual classes.
        It will generate 10x784 size of array
        '''
        self.pixel_mean_value=np.zeros([self.no_classes,self.x_features]) 
        self.pixel_variance_value=np.zeros([self.no_classes,self.x_features])
        self.prior_prob=[] # for storing the individual prior probability of classes

        self.prior = {}
        for e in self.unique_classes:
            class_occurance=X_train[X_train[:,-1]==e] # getting all the arrays where class == e
            class_occurance=class_occurance[:,:-1]
            self.prior[e] = len(class_occurance)/len(X_train)
            self.prior_prob.append(len(class_occurance)/len(X_train))
            self.pixel_mean_value[int(e),:]=class_occurance.mean(axis=0)
            self.pixel_variance_value[int(e),:]=class_occurance.var(axis=0)

        '''
        Here we are adding smoothing to variance as variance
        will be in denominator so to avoid infinte error

        I take it thousand as it gives good accuracy on it
        '''
        self.pixel_variance_value=self.pixel_variance_value+self.smoothing

        return (self.prior) #it returns the prior probability of individual class in dictionary
    def predict_data(self, x_test):
        '''
        param: x_test -> feature set to predict

        This function as the name says predicts the data
        it takes testing data as input.
        Gaussian Distribution is used as we have multiple classes
        You can look mathematical formula her:
        https://www.gstatic.com/education/formulas2/397133473/en/normal_distribution.svg
        '''
        self.predicted = []
        for i in range(x_test.shape[0]):
            posteriors = []
            for j in range(self.no_classes):
                #>>>>>>>>>>>>>>>> Gausian Distribution <<<<<<<<<<<<<
                numerator=np.exp(-((x_test[i]-self.pixel_mean_value[j])**2)/(2*self.pixel_variance_value[j])) 
                denominator=np.sqrt(2*np.pi*(self.pixel_variance_value[j]))
                prob_xc=numerator/denominator
                #>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<
                posterior=np.sum(np.log(prob_xc)+np.log(self.prior_prob[j]))
                posteriors.append(posterior)
            self.predicted.append(np.argmax(posteriors)) # Position with highest ratio will be the predicted class
        return self.predicted
    
    def accuracy(self, predicted, y_test):
        
        '''
        param: predicted -> It is the list that will be generated by predict function
        param: y_test -> labels of testing data

        The function will give the accuracy on basis correctly predicted divided by total data in test
        '''
        count = 0
        if(len(predicted)==len(y_test)):
            for i in range(len(predicted)):
                if(predicted[i]==y_test[i]):
                    count=count+1
            return count/len(y_test)
        else:
            raise Exception("Length of two arrays did not match")

# MNIST Data

In [9]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train=x_train.reshape(60000,784)
x_test=x_test.reshape(10000,784)
classifier = NaiveBayes()
clf = classifier.train_data(x_train, y_train)

In [10]:
pred = classifier.predict_data(x_test)
accuracy = classifier.accuracy(pred, y_test)

In [11]:
print(f"Accuracy by NaiveBayes classifier on MNIST is: {accuracy}")

Accuracy by NaiveBayes classifier on MNIST is: 0.739


In [12]:
print(f"Prior Probability of each class is:\n {clf}")

Prior Probability of each class is:
 {0.0: 0.09871666666666666, 1.0: 0.11236666666666667, 2.0: 0.0993, 3.0: 0.10218333333333333, 4.0: 0.09736666666666667, 5.0: 0.09035, 6.0: 0.09863333333333334, 7.0: 0.10441666666666667, 8.0: 0.09751666666666667, 9.0: 0.09915}


# MNIST Data by Kaggle

In [7]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json


mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [67]:
!kaggle competitions download -c digit-recognizer


Downloading test.csv.zip to /content
  0% 0.00/6.09M [00:00<?, ?B/s]
100% 6.09M/6.09M [00:00<00:00, 55.6MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/235k [00:00<?, ?B/s]
100% 235k/235k [00:00<00:00, 74.8MB/s]
Downloading train.csv.zip to /content
 55% 5.00M/9.16M [00:00<00:00, 47.8MB/s]
100% 9.16M/9.16M [00:00<00:00, 55.3MB/s]


In [68]:
!unzip test.csv.zip
!unzip train.csv.zip

Archive:  test.csv.zip
  inflating: test.csv                
Archive:  train.csv.zip
  inflating: train.csv               


In [13]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [14]:
y_train = np.array(train['label'])
train.drop('label',axis='columns',inplace=True)
x_train = np.array(train)
x_test = np.array(test)

In [15]:
classifier = NaiveBayes()
clf = classifier.train_data(x_train, y_train)
pred = classifier.predict_data(x_test)

In [16]:
print(f"Prior Probability of each class is:\n {clf}")

Prior Probability of each class is:
 {0.0: 0.09838095238095237, 1.0: 0.11152380952380953, 2.0: 0.09945238095238096, 3.0: 0.1035952380952381, 4.0: 0.09695238095238096, 5.0: 0.09035714285714286, 6.0: 0.0985, 7.0: 0.10478571428571429, 8.0: 0.09673809523809523, 9.0: 0.09971428571428571}


In [17]:
'''
Submission csv
'''
submission = pd.DataFrame({
    "ImageId": list(range(1,len(pred)+1)),
    "Label": pred
})
submission.to_csv("submission.csv")