In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
class Normal(object):
    """
    Univariate Normal (Gaussian) distribution described by its mean and variance.

    The parameters can be passed to the constructor or estimated from data
    with `fit`. Methods that need the parameters raise `ValueError` while
    either of them is still None.
    """
    def __init__(self, mean = None, var = None):
        """
        mean and variance, which are the parameters of the Normal distribution,
        are the attributes of this class. Both default to None (not set).
        """
        self.mean = mean
        self.var  = var

    def _require_params(self):
        """
        Return (mu, sigma) for the current parameters.
        Raises ValueError if the mean or the variance is not set.
        """
        if self.mean is None or self.var is None:
            # ValueError is a subclass of Exception, so callers that catch
            # Exception keep working.
            raise ValueError('Distribution Parameters are not set')
        return self.mean, np.sqrt(self.var)

    def pdf(self, x):
        """
        Function takes a real number x (or a numpy array of them) as input and
        returns the probability density value corresponding to it.
        If distribution parameters are not defined it raises ValueError.

        Note: the density divides by sigma, so a variance of 0 does not
        produce a finite value.
        """
        mu, sigma = self._require_params()

        f = 1/(sigma * np.sqrt(2 * np.pi)) * np.exp(-1/2 * ((x - mu) / sigma)**2)
        return f

    def fit(self, X):
        """
        Function takes an array of realisations of the random variable and sets
        the distribution parameters to their estimates based on the data
        (np.mean for the mean, np.var with its default ddof for the variance).
        Raises ValueError if X contains no observations.
        """
        # Without this guard empty data would silently store NaN parameters.
        if np.size(X) == 0:
            raise ValueError('Cannot fit distribution parameters to empty data')

        self.mean = np.mean(X)
        self.var  = np.var(X)

    def plot(self):
        """
        Function plots the density function on 100 evenly spaced points
        spanning three standard deviations either side of the mean.
        If distribution parameters are not defined it raises ValueError.
        """
        mu, sigma = self._require_params()

        x = np.linspace( (mu - 3*sigma) , (mu + 3*sigma), 100)
        y = self.pdf(x)

        plt.plot(x,y)
        plt.show()

    def __str__(self):
        """
        Short description with both parameters in scientific notation,
        or a notice that the parameters are not set.
        """
        if self.mean is None or self.var is None:
            return 'Distribution Parameters are not set'
        return "Normal distribution with mean {:.2e} and varience {:.2e}".format(self.mean, self.var)

In [3]:
class NaiveBayes(object):
    """
    Naive Bayes classifier whose per-attribute likelihoods are modelled by a
    user-supplied distribution class.

    The distribution class must be constructible without arguments and
    provide `fit(data)` and `pdf(value)`.
    """
    def __init__(self):
        """
        self.categories - list of output categories
        self.attributes - list of attributes
        self.condP - dataframe, which contains the conditional probability distribution 
                     for each attribute with each category
                     condP.loc[attribute,category] = P(attribute|category)
        self.catgP - dictionary of probability of each category

        All four are None until fit() is called.
        """
        self.categories = None
        self.attributes = None
        self.condP = None
        self.catgP = None

    def _check_fitted(self):
        """
        Raise ValueError if fit() has not populated the model attributes yet.
        """
        if (self.categories is None or self.attributes is None
                or self.condP is None or self.catgP is None):
            raise ValueError('Model is not fitted, call fit() first')

    def fit(self, X, Y, dist):
        """
        Input:
        X - Pandas dataframe where each row is the set of attributes for an example.
        Y - Pandas series with labels corresponding to X
        dist - distribution class

        With the given inputs this function fills the attributes of the NaiveBayes class
        """
        categories = list(np.unique(Y))
        attributes = list(X.columns)

        # The cells hold distribution instances, not numbers, so the frame is
        # created with object dtype instead of being filled with float zeros
        # and overwritten with objects afterwards.
        condP = pd.DataFrame(index = attributes, columns = categories, dtype = object)

        # dictionary with probability for each category
        catgP = {}

        for category in categories:
            # Row mask of the examples in this category; it does not depend on
            # the attribute, so it is computed once per category.
            in_category = (Y == category)
            catgP[category] = np.mean(in_category)

            for attribute in attributes:
                # data vector of specific attribute corresponding to specific category
                dataVec = X.loc[in_category, attribute]

                # Fit the distribution first, then store the fitted instance
                fitted = dist()
                fitted.fit(dataVec)
                condP.at[attribute, category] = fitted

        self.categories = categories
        self.attributes = attributes
        self.condP = condP
        self.catgP = catgP

    def model(self, X):
        """
        Input:
        X - pandas series with set of attributes
        Output:
        function returns predicted label based on given set of attributes

        Raises ValueError if the model has not been fitted.
        """
        self._check_fitted()

        # Dictionary with the log10 score of each class given the set of attributes:
        # log10 of the category probability plus the log10 density of every attribute
        predProb = {}
        for category in self.categories:
            score = np.log10(self.catgP[category])
            for attribute in self.attributes:
                # probability density value of each attribute, given the category
                density = self.condP.loc[attribute,category].pdf(X[attribute])
                score = score + np.log10(density)
            predProb[category] = score

        return max(predProb, key = predProb.get)

    def predict(self, Xlist):
        """
        Input:
        Xlist - pandas dataframe, one example per row
        Output:
        list with the predicted label of every row, in row order

        Raises ValueError if the model has not been fitted.
        """
        self._check_fitted()
        n_rows, _ = Xlist.shape
        return [self.model(Xlist.iloc[i]) for i in range(n_rows)]

In [4]:
# Load the data set and shuffle its rows (frac = 1 keeps every row).
# The fixed random_state makes the shuffle identical on every run, so
# everything derived from the row order is reproducible after
# Restart & Run All.
df = pd.read_csv('iris.csv')
df = df.sample(frac = 1, random_state = 42)

In [5]:
# Hold out the first fifth of the rows (integer division) for testing
# and keep the remaining rows for training.
row, col = len(df.index), len(df.columns)
frac = row // 5
test, train = df.iloc[:frac], df.iloc[frac:]

In [6]:
# Every column except the last one is an attribute; the last column is the label.
X_train, Y_train = train.iloc[:, :col - 1], train.iloc[:, col - 1]
X_test,  Y_test  = test.iloc[:, :col - 1],  test.iloc[:, col - 1]

In [7]:
# Build the classifier ...
NB = NaiveBayes()

# ... and fit it on the training split, modelling every attribute
# within every category with a Normal distribution
NB.fit(X = X_train, Y = Y_train, dist = Normal)

In [8]:
# Predicted label for every row of the test split
Y_pred = NB.predict(X_test)

# Accuracy = share of test rows whose predicted label equals the true label
is_correct = (Y_pred == Y_test)
Accuracy = sum(is_correct) / len(Y_test)
print('Accuracy of prediction is ', round(Accuracy*100,2),'%' )

Accuracy of prediction is  93.33 %
