In [49]:
import numpy as np
import pandas as pd
import matplotlib as plt

Loading the Dataset

In [50]:
dataSet = pd.read_csv('./iris_csv.csv')
dataSet

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


Normalizing Dataset 

In [51]:
x = dataSet.drop(['class'],axis='columns')
y = dataSet['class']

In [52]:
x = (x-np.min(x))/(np.max(x) - np.min(x))
x

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth
0,0.641026,0.435897,0.166667,0.012821
1,0.615385,0.371795,0.166667,0.012821
2,0.589744,0.397436,0.153846,0.012821
3,0.576923,0.384615,0.179487,0.012821
4,0.628205,0.448718,0.166667,0.012821
...,...,...,...,...
145,0.846154,0.371795,0.653846,0.282051
146,0.794872,0.307692,0.628205,0.230769
147,0.820513,0.371795,0.653846,0.243590
148,0.782051,0.423077,0.679487,0.282051


Spliting The dataset into train and test

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
X_train,X_test,Y_train,Y_test = train_test_split(x,y,random_state=42,test_size=0.30)

In [55]:
from sklearn.naive_bayes import GaussianNB

In [56]:
model = GaussianNB()
model.fit(X_train,Y_train)

Score

In [57]:
print("Score: ", model.score(X_test,Y_test))

Score:  0.9777777777777777


Implementing NavieBayes from Scratch

In [58]:
class NaiveBayes:
    def __init__(self):
        self.features = list
        self.likelihoods = {}
        self.class_priors = {}
        self.pred_priors={}

        self.X_train = np.array
        self.y_train = np.array
        self.train_size = int
        self.num_feats = int
    
    def fit(self,X,y):
        self.features = list(X.columns)
        self.X_train = X
        self.y_train = y
        self.train_size = X.shape[0]
        self.num_feats = X.shape[1]

        for feature in self.features:
            self.likelihoods[feature]={}
            self.pred_priors[feature]={}

            for feat_val in np.unique(self.X_train[feature]):
                self.pred_priors[feature].update({feat_val:0})
                for outcome in np.unique(self.y_train):
                    self.likelihoods[feature].update({str(feat_val)+'_'+str(outcome):0})
                    self.class_priors.update({outcome: 0})
        
        self._calc_class_prior()
        self._calc_likelihoods()
        self._calc_predictor_prior()
    
    def _calc_class_prior(self):
        for outcome in np.unique(self.y_train):
            outcome_count = sum(self.y_train == outcome)
            self.class_priors[outcome] = outcome_count/self.train_size
    def _calc_likelihoods(self):
        for feature in self.features:
            for outcome in np.unique(self.y_train):
                outcome_count = sum(self.y_train == outcome)
                feat_likelihood = self.X_train[feature][self.y_train[self.y_train==outcome].index.values.tolist()].value_counts().to_dict()
                for feat_val,count in feat_likelihood.items():
                    self.likelihoods[feature][str(feat_val)+'_'+str(outcome)] = count/outcome_count
    def _calc_predictor_prior(self):
        for feature in self.features:
            feat_vals = self.X_train[feature].value_counts().to_dict()

            for feat_val, count in feat_vals.items():
                self.pred_priors[feature][feat_val] = count/self.train_size

    def predict(self, X):
        results = []
        X = np.array(X)

        for query in X:
            probs_outcome = {}
            for outcome in np.unique(self.y_train):
                prior = self.class_priors[outcome]
                likelihood = 1
                evidence = 1

                for feat, feat_val in zip(self.features, query):
                    likelihood *= self.likelihoods[feat][str(feat_val)+"_" +str(outcome)]
                    evidence *= self.pred_priors[feat][feat_val]

                posterior = (likelihood * prior) / (evidence)

                probs_outcome[outcome] = posterior

            result = max(probs_outcome, key = lambda x: probs_outcome[x])
            results.append(result)

        return np.array(results)

In [59]:
def accuracy_score(y_true, y_pred):

	"""	score = (y_true - y_pred) / len(y_true) """

	return round(float(sum(y_pred == y_true))/float(len(y_true)) * 100 ,2)


In [60]:
nb_clf = NaiveBayes()
nb_clf.fit(X_train,Y_train)

In [61]:
print("Train Accuracy: {}".format(accuracy_score(Y_test, nb_clf.predict(X_test))))

KeyError: '0.8717948717948718_Iris-setosa'