In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
class Normal(object):
    """
    Univariate normal (Gaussian) distribution described by its mean and variance.
    """

    def __init__(self, mean = None, var = None):
        """
        Store the distribution parameters.

        mean - mean of the distribution, or None if not known yet
        var  - variance of the distribution, or None if not known yet

        Parameters left as None have to be set (for example with fit)
        before pdf, logLH or plot can be used.
        """
        self.mean = mean
        self.var  = var

    def _require_params(self):
        """
        Raise an error if either distribution parameter is still None.
        """
        if self.mean is None or self.var is None:
            raise Exception('Distribution Parameters are not set')

    def pdf(self, x):
        """
        Return the probability density at x.

        x - real number, or array-like of real numbers (evaluated elementwise)

        Raises an error if the distribution parameters are not defined.
        """
        self._require_params()

        sigma = np.sqrt(self.var)
        z = (x - self.mean) / sigma
        return 1/(sigma * np.sqrt(2 * np.pi)) * np.exp(-0.5 * z**2)

    def logLH(self, x):
        """
        Return the base-10 logarithm of the probability density at x.

        x - real number, or array-like of real numbers (evaluated elementwise)

        Raises an error if the distribution parameters are not defined.
        """
        self._require_params()

        # The log-density is evaluated in closed form instead of taking
        # log10(pdf(x)): far from the mean pdf(x) underflows to 0.0 and its
        # logarithm would be -inf, which makes the scores of competing
        # distributions indistinguishable.
        sigma = np.sqrt(self.var)
        z = (x - self.mean) / sigma
        natural_log = -np.log(sigma) - 0.5 * np.log(2 * np.pi) - 0.5 * z**2
        # Convert from natural logarithm to base 10.
        return natural_log / np.log(10)

    def fit(self, X):
        """
        Estimate the distribution parameters from data.

        X - array of realisations of the random variable

        Sets self.mean to np.mean(X) and self.var to np.var(X).
        """
        self.mean = np.mean(X)
        self.var  = np.var(X)

    def plot(self):
        """
        Plot the density function on 100 evenly spaced points covering
        three standard deviations on either side of the mean.

        Raises an error if the distribution parameters are not defined.
        """
        self._require_params()

        sigma = np.sqrt(self.var)
        mu  = self.mean
        x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
        y = self.pdf(x)
        plt.plot(x, y)
        plt.show()

    def __str__(self):
        """
        Return a one-line description of the distribution, or a notice
        that the parameters are not set.
        """
        if self.mean is None or self.var is None:
            return 'Distribution Parameters are not set'
        return "Normal distribution with mean {:.2e} and varience {:.2e}".format(self.mean, self.var)

In [3]:
class NaiveBayes(object):
    """
    Naive Bayes classifier: every attribute is modelled independently within
    each category by its own distribution object, and a sample is assigned to
    the category with the highest log prior plus summed log-likelihoods.
    """

    def __init__(self):
        """
        self.categories - list of output categories / labels
        self.attributes - list of attributes
        self.condP - conditional probability dataframe (rows: attributes,
                     columns: categories) whose cells hold the fitted
                     distribution for each attribute given each category
        self.catgP - dictionary of probability of each category / label

        All four are None until fit is called.
        """
        self.categories = None
        self.attributes = None
        self.condP = None
        self.catgP = None

    def fit(self, X, Y, dist):
        """
        Input:
        X - Pandas dataframe where each column gives different attribute values
        Y - Pandas series with labels corresponding to the rows of X
        dist - distribution class; it is instantiated with no arguments and
               must provide fit(data) and logLH(x)

        With the given inputs this function fills the attributes of the
        NaiveBayes class.
        """
        self.categories = list(np.unique(Y))
        self.attributes = list(X.columns)

        # The cells hold distribution instances, so the grid is created with
        # object dtype up front; writing objects into a float table depends on
        # pandas silently changing the column dtype.
        fitted = np.empty((len(self.attributes), len(self.categories)), dtype=object)
        catgP = {}
        for j, category in enumerate(self.categories):
            # The row mask only depends on the category, so build it once.
            in_category = (Y == category)
            catgP[category] = float(np.sum(in_category)) / len(Y)
            for i, attribute in enumerate(self.attributes):
                distribution = dist()
                distribution.fit(X.loc[in_category, attribute])
                fitted[i, j] = distribution

        self.condP = pd.DataFrame(fitted, index = self.attributes, columns = self.categories)
        self.catgP = catgP

    def model(self, X):
        """
        Input:
        X - pandas series with set of attributes
        Output:
        function returns predicted label based on given set of attributes

        Raises an error if the model has not been fitted.
        """
        if self.condP is None or self.catgP is None:
            raise Exception('Model is not fitted')

        scores = {}
        for category in self.categories:
            # The prior belongs to the category, not to each attribute, so it
            # enters the score exactly once; the attribute log-likelihoods are
            # then added on top of it.
            score = np.log10(self.catgP[category])
            for attribute in self.attributes:
                score = score + self.condP.loc[attribute, category].logLH(X[attribute])
            scores[category] = score
        # On ties the category that comes first in self.categories wins.
        return max(scores, key = scores.get)

    def predict(self, X):
        """
        Input:
        X - Pandas dataframe where each row is one set of attributes
        Output:
        list with the predicted label for each row of X, in row order
        """
        return [self.model(X.iloc[i]) for i in range(X.shape[0])]

In [4]:
# Fixed seed so the shuffle -- and with it the train/test split and the
# accuracy computed from it -- is identical on every run of the notebook.
SEED = 42

# Read the data set and shuffle all of its rows (frac = 1 keeps every row).
df = pd.read_csv('iris.csv').sample(frac = 1, random_state = SEED)

In [5]:
# Hold out the first fifth of the rows of df for testing and keep the
# remaining rows for training.
row, col = df.shape
frac = row // 5          # number of rows in the test split
test = df.head(frac)
train = df.iloc[frac:, :]

In [6]:
# Every column except the last is an attribute; the last column is the label.
X_train, Y_train = train.iloc[:, :-1], train.iloc[:, -1]
X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]

In [7]:
# Build the classifier and fit it on the training split, modelling each
# attribute within each category with a Normal distribution.
NB = NaiveBayes()
NB.fit(X = X_train, Y = Y_train, dist = Normal)

In [8]:
# Predict a label for every row of the test split
Y_pred = NB.predict(X_test)

# Accuracy: share of test rows whose predicted label equals the true label
n_correct = sum(Y_pred == Y_test)
Accuracy = n_correct / len(Y_test)
accuracy_pct = round(Accuracy * 100, 2)
print('Accuracy of prediction is ', accuracy_pct, '%')

Accuracy of prediction is  96.67 %
