In [152]:
import numpy as np # for numerical operations 
import pandas as pd # for handling input data 
import matplotlib.pyplot as plt # for data visualization 
import seaborn as sns # for data visualization 
from sklearn.model_selection import train_test_split 

In [153]:
from sklearn.datasets import load_iris

In [160]:
# Load the iris dataset bundle; its .data, .feature_names and .target fields are used below.
dat=load_iris()

In [165]:
# Feature matrix as a DataFrame with named columns; class labels kept as a separate array.
df = pd.DataFrame(dat.data, columns=dat.feature_names)
df_targets = dat.target

In [166]:
# Display the iris feature table as this cell's output.
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [167]:
# Features and labels for the Gaussian Naive Bayes experiment.
X, y = df, df_targets

In [188]:
# Stratified 80/20 split with a fixed seed so class proportions and results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [189]:
# Sorted array of the distinct class labels present in y.
classes=np.unique(y)

In [190]:
# Training rows grouped by class label.
X_dict = {label: X_train[y_train == label] for label in classes}

In [191]:
# Per-class, per-feature mean and standard deviation of the training rows.
mean_dict = {label: np.mean(rows, axis=0) for label, rows in X_dict.items()}
std_dict = {label: np.std(rows, axis=0) for label, rows in X_dict.items()}

In [192]:
def gaussian_likelihood(x, mean, std, eps=1e-6):
    """Evaluate a Gaussian probability density element-wise.

    Computes ``1 / (s * sqrt(2*pi)) * exp(-0.5 * ((x - mean) / s) ** 2)``
    with ``s = std + eps``.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate the density.
    mean, std : scalar or array-like
        Mean and standard deviation. They are combined with ``x`` using
        ordinary arithmetic, so they must broadcast (or, for pandas objects,
        align) with it.
    eps : float, default 1e-6
        Added to ``std`` so that a zero standard deviation does not cause a
        division by zero.

    Returns
    -------
    Density values, with the shape produced by combining the inputs.
    """
    sigma = std + eps  # regularised standard deviation, computed once
    coef = 1 / (sigma * np.sqrt(2 * np.pi))
    P = coef * np.exp(-0.5 * np.square((x - mean) / sigma))
    return P

In [193]:
# Per-class Gaussian density of every test sample, evaluated feature by feature.
likelihood_dict = {
    label: gaussian_likelihood(X_test, mean_dict[label], std_dict[label])
    for label in mean_dict
}

In [194]:
# Prior probability of each class: its share of the training rows.
prior_dict = {label: len(rows) / len(X_train) for label, rows in X_dict.items()}

In [195]:
# Unnormalised posterior per class: product of the per-feature likelihoods times the class prior.
posterior_dict = {
    label: np.prod(lik, axis=1) * prior_dict[label]
    for label, lik in likelihood_dict.items()
}

In [196]:
# Stack the per-class posteriors into a table: one row per class, one column per test sample.
res = pd.DataFrame(np.array(list(posterior_dict.values())))

In [197]:
# Show the posterior table (one row per class, one column per test sample).
res

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.1683157,5.398268e-144,1.8708310000000002e-39,9.784669e-40,0.2525403,8.359290999999999e-124,0.01523723,0.5125325,6.604023e-176,2.644603e-105,...,1.042173,2.380221e-230,9.325784e-109,4.1748319999999995e-172,7.479885e-215,6.135798e-149,6.8292599999999994e-102,0.05321493,7.002318e-185,2.3923
1,3.5428529999999996e-21,0.005561621,0.0007170759,0.0007258282,1.669041e-21,0.01973565,2.870489e-24,1.676359e-17,2.723962e-05,0.1462396,...,7.952119e-20,3.224978e-11,0.1698788,0.004802072,4.461178e-08,0.003819331,0.1624792,1.670242e-17,0.0001465001,1.376308e-19
2,2.0884879999999998e-26,0.05781974,3.658621e-10,3.139421e-10,1.715803e-26,0.01070744,1.7946920000000002e-28,6.617549e-23,0.1844719,0.0009915572,...,1.393765e-24,0.08805413,0.001069746,0.005373684,0.1646003,0.06128621,0.001498717,2.211909e-22,0.1533922,8.899662e-25


In [198]:
# Winning row for every test sample, translated back to the actual class label.
# The rows of `res` follow the order of `classes`, so indexing `classes` with the argmax
# stays correct even when the labels are not simply 0..k-1.
res_final = classes[np.argmax(res.to_numpy(), axis=0)]

In [199]:
# Prediction for each test sample (one entry per column of res).
res_final

array([0, 2, 1, 1, 0, 1, 0, 0, 2, 1, 2, 2, 2, 1, 0, 0, 0, 1, 1, 2, 0, 2,
       1, 2, 2, 2, 1, 0, 2, 0])

In [200]:
from sklearn.metrics import accuracy_score

In [201]:
# accuracy_score takes (y_true, y_pred); pass the ground truth first.
print('Test Accuracy', accuracy_score(y_test, res_final))

Test Accuracy 0.9666666666666667


# Using scikit-learn (GaussianNB)

In [203]:
from sklearn.naive_bayes import GaussianNB

In [204]:
# Reference run: scikit-learn's Gaussian Naive Bayes on the same train/test split.
cl = GaussianNB()
cl.fit(X_train, y_train)

y_pred = cl.predict(X_test)

# accuracy_score takes (y_true, y_pred); pass the ground truth first.
print('Test Accuracy', accuracy_score(y_test, y_pred))

Test Accuracy 0.9666666666666667


# Categorical Naive Bayes

In [208]:
# Car dataset from the UCI repository; the file has no header row, so column names are given here.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
df = pd.read_csv(
    url,
    header=None,
    names=[
        'buying',
        'maint',
        'doors',
        'persons',
        'lug_boot',
        'safety',
        'class',
    ],
)

In [209]:
# Every column except 'class' is a feature; 'class' is the target.
y = df['class']
X = df.drop(columns=['class'])

In [210]:
# Stratified 80/20 split with a fixed seed, mirroring the split used for the iris data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [211]:
# Class priors: relative frequency of each label in the training targets, as {label: fraction}.
class_probs_dict=y_train.value_counts(normalize=True).to_dict()

In [212]:
# Nested mapping {column: {class: {value: relative frequency}}}, filled in by the training loop below.
likelihood_dict={}

In [213]:
import warnings
# Blanket filter: every warning raised after this point is silenced, including ones that
# may point at real problems. NOTE(review): consider narrowing this to a specific category.
warnings.filterwarnings('ignore')

In [214]:
# training: estimate P(feature value | class) for every column from the training rows
# Split the training rows by class once. The mask is built from y_train, whose index matches
# X_train, rather than from the full label Series y (which also contains the test rows and
# would have to be reindexed by pandas).
rows_by_class = {c: X_train[y_train == c] for c in y_train.unique()}
for col in X_train.columns:
    likelihood_dict[col] = {}
    for c, subset in rows_by_class.items():
        # relative frequency of each value of `col` among the rows of class `c`
        props = subset[col].value_counts(normalize=True)
        likelihood_dict[col][c] = props.to_dict()

In [215]:
# prediction: for each test row, pick the class with the highest log-posterior score
results = []

for _, row in X_test.iterrows():
    class_scores = {}
    for c, prior in class_probs_dict.items():
        score = np.log(prior)  # start from the log prior of the class
        for col, value in row.items():
            # a value never seen for this class during training gets a tiny fallback probability
            prop = likelihood_dict.get(col).get(c).get(value, 1e-6)
            score += np.log(prop)
        class_scores[c] = score
    results.append(max(class_scores, key=class_scores.get))

In [216]:
# accuracy_score takes (y_true, y_pred); pass the ground truth first.
print('Test Accuracy', accuracy_score(y_test, results))

Test Accuracy 0.8641618497109826


# Using scikit-learn (CategoricalNB)

In [240]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder

In [241]:
# Rebuild features and target from the car DataFrame for the scikit-learn run.
y = df['class']
X = df.drop(columns=['class'])

In [242]:
# Map every categorical column to ordinal codes so the features are numeric.
encoder = OrdinalEncoder().fit(X)
X_encoded = encoder.transform(X)


In [243]:
# Same stratified 80/20 split and seed as before, now on the ordinal-encoded features.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=42,
)

In [244]:
# Fit scikit-learn's categorical Naive Bayes on the encoded training split.
cl=CategoricalNB()
cl.fit(X_train,y_train)

In [245]:
# Predict the held-out rows and report accuracy.
y_pred = cl.predict(X_test)

# accuracy_score takes (y_true, y_pred); pass the ground truth first.
print('Test Accuracy', accuracy_score(y_test, y_pred))

Test Accuracy 0.8641618497109826
