# In this we will apply Naive Bayes to Categorical Data

In [1]:
import numpy as np
import pandas as pd
from Gaussian_Naive_Bayes import GaussianNaiveBayes
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [2]:
data = pd.read_csv('mushrooms.csv')

In [3]:
training_data_len = int(0.7*data.shape[0])
cv_data_len = int(0.2*data.shape[0])
testing_data_len = data.shape[0] - (training_data_len + cv_data_len)
print(training_data_len, cv_data_len, testing_data_len)

5686 1624 814


In [4]:
data_p = data[data['class'] == 'p']
data_e = data[data['class'] == 'e']

training_data = pd.concat([data_e.iloc[:training_data_len//2,:],data_p.iloc[:training_data_len//2,:]],axis=0)

remaining_data = pd.concat([data_e.iloc[training_data_len//2:,:],data_p.iloc[training_data_len//2:,:]],axis=0)

X_cv, X_test, Y_cv, Y_test = train_test_split(remaining_data.iloc[:,1:], remaining_data['class'], test_size=1/3)

cv_data = pd.DataFrame(X_cv, columns=data.columns[1:])
cv_data['class'] = Y_cv

testing_data = pd.DataFrame(X_test, columns=data.columns[1:])
testing_data['class'] = Y_test

del data_e, data_p, training_data_len, cv_data_len, testing_data_len, remaining_data, X_cv, X_test, Y_cv, Y_test

In [5]:
def fit(features, label):
    unique_in_columns = {}
    probabilities = {}
    prior = {}
    unique = label.unique()
    
    for categories in unique:
        df = features[label == categories]
        probabilities[categories] = {'Size': df.shape[0], 'prior':df.shape[0]/features.shape[0]}
        unique_in_columns[categories] = {}
        
        for column_name in df.columns:
            unique_in_columns[categories][column_name] = len(features[column_name].unique())
            probabilities[categories][column_name] = {}
            unique_values_column = df[column_name].unique()
            
            for value in unique_values_column:
                probabilities[categories][column_name][value] = df[df[column_name] == value].shape[0]/df.shape[0]
                
    
    return unique_in_columns, probabilities, prior, unique

In [6]:
def lidstone_smoothing(category_size,gamma,unique_count):
    return gamma/(category_size + (gamma * unique_count))

In [7]:
u, p, prior, unique_lables = fit(training_data.iloc[:,1:], training_data['class'])

In [8]:
def predict(features):
    predicted = []
    for category in unique_lables:
        l = []
        for row in range(features.shape[0]):
            posterior = 1
            for column in features:
                try:
                    posterior *= p[category][column][features.iloc[row,:][column]]
                except KeyError:
                    posterior *= lidstone_smoothing(p[category]['Size'],0.1,u[category][column])
            l.append(posterior)
        predicted.append(l)
    predicted = np.reshape(np.array(predicted), (len(predicted[0]), -1))
    predicted = pd.DataFrame(np.concatenate([predicted], axis=1), columns=unique_lables)
    return np.array(predicted.idxmax(axis=1))

In [9]:
p = predict(cv_data.iloc[:,:-1])

In [10]:
accuracy = np.count_nonzero(p == cv_data['class'])/cv_data.shape[0]

In [11]:
accuracy

0.49046153846153845