In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
class Gaussian(object):
    """
    Multivariate normal distribution with a diagonal covariance matrix,
    i.e. every attribute is modelled as independent of the others.
    """
    def __init__(self):
        # Both stay None until fit() is called (or they are assigned by hand).
        self.mean = None  # 1-D array of per-attribute means
        self.var  = None  # 2-D matrix with per-attribute variances on the diagonal
    
    def fit(self, df):
        """
        Input
        df: Pandas DataFrame where each row is a random variable vector.
        Output
        Sets the distribution parameters to their estimates from df.
        """
        # axis = 0 reduces over the first dimension (rows), giving one value
        # per column. np.var uses ddof = 0 here, i.e. the population variance.
        self.mean = np.asarray(np.mean(df, axis = 0), dtype = float)
        self.var  = np.diag(np.asarray(np.var(df, axis = 0), dtype = float))
    
    def _logDensity(self, X):
        """
        Natural log of the probability density at attribute vector X.
        Shared by pdf and logLH so both use the same, correctly normalised
        formula. Raises an error if the distribution parameters are not set.
        """
        if self.mean is None or self.var is None:
            raise Exception('Distribution Parameters are not set')
        
        mu = np.asarray(self.mean, dtype = float)
        sigma = np.asarray(self.var, dtype = float)
        diff = np.asarray(X, dtype = float) - mu
        k = mu.size
        
        # slogdet / solve avoid forming det(sigma) and inv(sigma) explicitly;
        # solve raises LinAlgError for a singular matrix (zero variance).
        _, logDet = np.linalg.slogdet(sigma)
        maha = np.matmul(diff, np.linalg.solve(sigma, diff))
        
        # The normalising constant is (2*pi)^k * det(sigma): the power k matters
        # as soon as there is more than one attribute.
        return -0.5 * (k * np.log(2 * np.pi) + logDet + maha)
    
    def pdf(self, X):
        """
        Function takes attribute vector X as input and returns the
        probability density value corresponding to it.
        If distribution parameters are not defined it will raise an error.
        """
        return np.exp(self._logDensity(X))
    
    def logLH(self, X):
        """
        Function takes attribute vector X as input and returns the
        base-10 log of the probability density value corresponding to it.
        If distribution parameters are not defined it will raise an error.
        """
        # Computed directly in the log domain so that points far from the mean
        # give a finite value instead of log10(0) = -inf.
        return self._logDensity(X) / np.log(10)

In [3]:
class NaiveBayes(object):
    def __init__(self):
        """
        Creates an unfitted classifier; every attribute is None until fit runs.

        self.categories - list of the distinct labels seen during fit
        self.attributes - list of the column names of the training frame
        self.condP - dictionary mapping each label to the distribution object
                     fitted on the training rows carrying that label
        self.catgP - dictionary mapping each label to the fraction of training
                     rows carrying that label
        """
        self.categories, self.attributes = None, None
        self.condP, self.catgP = None, None

    def fit(self, X, Y, dist):
        """
        Input:
        X - Pandas dataframe, one row of attributes per example
        Y - Pandas series holding the label of each row of X
        dist - distribution class; it is instantiated once per label and must
               provide fit(dataframe) and logLH(attribute_vector)
        
        Fills categories, attributes, condP and catgP from the given data.
        """
        self.categories = list(np.unique(Y))
        self.attributes = list(X.columns)

        def fittedOn(label):
            # One fresh distribution per label, trained on that label's rows only.
            perLabel = dist()
            perLabel.fit(X.loc[Y == label])
            return perLabel

        self.condP = {label: fittedOn(label) for label in self.categories}
        self.catgP = {label: (Y == label).sum()/len(Y) for label in self.categories}
    
    def model(self, X):
        """
        Input:
        X - pandas series holding the attributes of a single example
        Output:
        the label with the highest score, where the score of a label is
        log10(prior of the label) + log-likelihood of X under its distribution
        """
        scores = {
            label: np.log10(self.catgP[label]) + self.condP[label].logLH(X)
            for label in self.categories
        }
        bestLabel, _ = max(scores.items(), key = lambda pair: pair[1])
        return bestLabel
    
    def predict(self, Xlist):
        """
        Input:
        Xlist - Pandas dataframe, one row of attributes per example
        Output:
        list with the predicted label of every row, in row order
        """
        nRows, _ = Xlist.shape
        return [self.model(Xlist.iloc[i]) for i in range(nRows)]

In [4]:
# Load the data and shuffle its rows once. The shuffle is seeded so that a
# fresh run of the notebook reproduces the same row order, and therefore the
# same split and accuracy, every time.
SEED = 42
df = pd.read_csv('iris.csv').sample(frac = 1, random_state = SEED)

In [5]:
# Hold out the first fifth of the rows for testing and train on the rest.
# Note that `frac` is a row count (row // 5), not a fraction.
row = df.shape[0]
col = df.shape[1]
frac = row // 5
test  = df.iloc[:frac]
train = df.iloc[frac:]

In [6]:
# The last column is used as the label; every column before it is an attribute.
X_train, Y_train = train.iloc[:, :col - 1], train.iloc[:, col - 1]
X_test,  Y_test  = test.iloc[:, :col - 1],  test.iloc[:, col - 1]

In [7]:
# Build the classifier and fit it on the training split, using the Gaussian
# class as the per-label distribution.
NB = NaiveBayes()
NB.fit(X = X_train, Y = Y_train, dist = Gaussian)

In [8]:
# Predict a label for every row of the test split.
Y_pred = NB.predict(X_test)

# Accuracy is the fraction of test rows whose predicted label equals the true one.
Accuracy = (Y_pred == Y_test).mean()
print('Accuracy of prediction is ', round(Accuracy*100,2),'%' )

Accuracy of prediction is  100.0 %
