# Naive Bayes - Mushroom Dataset

The goal is to predict the class of each mushroom (the `type` column). We will use a Naive Bayes model.

In [59]:
import numpy as np
import pandas as pd

## Load the Dataset

In [60]:
# Load the mushroom table from a CSV file given by a path relative to the working directory.
df=pd.read_csv("mushrooms.csv")
df # Preview the frame: the cells hold single-letter category codes, not numbers

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


## Encode categorical data to numerical data

In [61]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [62]:
# One encoder instance is shared by every column.
le=LabelEncoder()

# DataFrame.apply calls le.fit_transform once per column, so each column is
# mapped to integer codes independently. Because the same encoder is refit on
# every call, `le` afterwards only holds the mapping of the column that was
# processed last and cannot be used to decode the other columns.
ds=df.apply(le.fit_transform)
ds

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


In [63]:
# Column 0 is used as the target; every remaining column is a feature.
dfy=ds.iloc[:,0]
dfx=ds.iloc[:,1:]

# Continue with plain NumPy arrays rather than pandas objects.
X,Y=dfx.to_numpy(),dfy.to_numpy()

## Split

In [64]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=1)

# Show the shapes of the four resulting arrays.
tuple(part.shape for part in (X_train,Y_train,X_test,Y_test))

((6499, 22), (6499,), (1625, 22), (1625,))

In [65]:
# List the distinct label values present in the training split.
np.unique(Y_train)

array([0, 1])

## Building a classifier

In [66]:
# Toy example of the counting trick used by the classifier below: comparing an
# array with a scalar yields a boolean mask, and summing the mask counts the
# positions where the comparison holds.
a=np.array([1,1,1,1,0,0,0,1,1,0,1,0,1,0,0,0,1,1,0,1,0])

(a==0).sum()

10

In [67]:
def prior_prob(Y_train, label):
    """Return the prior P(label): the fraction of training labels equal to `label`.

    Args:
        Y_train: Array of training labels.
        label: The class value whose relative frequency is wanted.

    Returns:
        Number of entries of Y_train equal to `label`, divided by the number
        of entries along the first axis of Y_train.
    """
    is_label = Y_train == label
    total = float(Y_train.shape[0])
    return np.sum(is_label) / total

def cond_prob(X_train, Y_train, feature_col, feature_val, label):
    """Estimate P(feature == feature_val | label) from raw training counts.

    Args:
        X_train: 2-D array of training features, one row per sample.
        Y_train: Array of training labels aligned with the rows of X_train.
        feature_col: Index of the feature column to inspect.
        feature_val: Feature value whose conditional frequency is wanted.
        label: Class value to condition on.

    Returns:
        Among the rows whose label equals `label`, the fraction whose value in
        column `feature_col` equals `feature_val`. No smoothing is applied, so
        a value never seen with this label yields 0.
    """
    in_class = Y_train == label
    class_size = np.sum(in_class)
    # Keep only the rows of the requested class, then look at one column.
    class_column = X_train[in_class][:, feature_col]
    hits = np.sum(class_column == feature_val)
    return hits / float(class_size)

def predict(X_train, Y_train, x_test):
    """Classify a single sample with a categorical Naive Bayes model.

    For every class present in Y_train the score is
    log P(class) + sum over features of log P(feature == x_test[f] | class),
    with all probabilities estimated from raw counts in the training data
    (no smoothing).

    Args:
        X_train: 2-D array of encoded training features, one row per sample.
        Y_train: 1-D array of training labels aligned with the rows of X_train.
        x_test: A single sample holding one value per feature column.

    Returns:
        The class label (an element of np.unique(Y_train)) with the highest
        score. Ties go to the smallest label. A class that never showed one
        of the sample's feature values scores -inf (zero probability).
    """
    classes = np.unique(Y_train)
    n_samples = Y_train.shape[0]
    x_test = np.asarray(x_test)
    log_posteriors = []

    # log(0) = -inf is the intended score for an unseen feature value, so the
    # divide-by-zero warning numpy would emit for it is silenced here.
    with np.errstate(divide="ignore"):
        for label in classes:
            # Select the rows of this class once instead of once per feature.
            X_label = X_train[Y_train == label]
            label_count = X_label.shape[0]

            # P(feature_f == x_test[f] | label) for every column f at once.
            cond = np.sum(X_label == x_test, axis=0) / float(label_count)

            # Add logs instead of multiplying probabilities: a product of
            # many small factors underflows to 0.0 for every class.
            log_likelihood = np.sum(np.log(cond))
            log_prior = np.log(label_count / float(n_samples))
            log_posteriors.append(log_likelihood + log_prior)

    # argmax gives a position in `classes`; map it back to the actual label so
    # label sets other than 0..k-1 are handled correctly.
    return classes[np.argmax(np.array(log_posteriors))]


def score(X_train, Y_train, X_test, Y_test):
    """Evaluate the classifier on a test set.

    Args:
        X_train: Training features passed through to predict.
        Y_train: Training labels passed through to predict.
        X_test: 2-D array of test features, one row per sample.
        Y_test: Array of true labels aligned with the rows of X_test.

    Returns:
        A pair (accuracy, pred): the percentage of test rows whose prediction
        equals the true label, and the array of predictions.
    """
    n_test = X_test.shape[0]
    # One prediction per test row, in row order.
    pred = np.array([predict(X_train, Y_train, X_test[row]) for row in range(n_test)])

    accuracy = np.sum(Y_test == pred) / pred.shape[0]
    return accuracy * 100, pred
    

In [68]:
# Spot-check one test row: show the predicted label next to the true label.
i=90
pred=predict(X_train,Y_train,X_test[i])
pred,Y_test[i]

(0, 0)

In [69]:
# Store the result under its own name. Assigning it to `score` would replace
# the function with a float, so re-running this cell (or calling score again)
# would fail with "TypeError: ... object is not callable".
accuracy,Y_pred=score(X_train,Y_train,X_test,Y_test)

print(accuracy)

99.87692307692308


In [77]:
from collections import defaultdict

# defaultdict builds a fresh LabelEncoder the first time a column name is
# looked up, so every column gets (and keeps) its own fitted encoder in `d`.
d=defaultdict(LabelEncoder)

# Fit and encode each column with the encoder stored under that column's name.
ds1=df.apply(lambda x: d[x.name].fit_transform(x))

ds1

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


In [78]:
# Round trip: decode each encoded column with its own encoder from `d` to get the original category codes back.
ds1.apply(lambda x:d[x.name].inverse_transform(x))

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [80]:
# Use the dictionary of already fitted encoders to label data: transform()
# (unlike fit_transform) re-uses the stored mapping, so the same call can
# encode future rows that have the same column names.
df.apply(lambda x:d[x.name].transform(x))

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


In [82]:
# Y_pred holds encoded labels. Wrapping it in a one-column frame named 'type'
# lets d[x.name] pick the encoder fitted on the 'type' column, which turns the
# integer predictions back into the original category codes.
pd.DataFrame(Y_pred,columns=['type']).apply(lambda x:d[x.name].inverse_transform(x))

Unnamed: 0,type
0,e
1,p
2,p
3,p
4,e
...,...
1620,p
1621,e
1622,p
1623,e
