In [7]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [8]:
# Load the mushroom data from 'mushrooms.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv('mushrooms.csv')

In [9]:
# Preview the first rows of the raw (still letter-coded) data.
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [10]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(8124, 23)

# Encoding

In [11]:
# Fit one LabelEncoder per column and keep all of them. A single shared encoder
# that is re-fitted on every column only remembers the last column's mapping,
# which makes it impossible to translate the integer codes back to the
# original category letters later (encoders[col].inverse_transform(...)).
encoders = {column: LabelEncoder().fit(df[column]) for column in df.columns}

# Replace every column by its integer codes (same result as fit_transform per column).
df = df.apply(lambda column: encoders[column.name].transform(column))

# Keep the original `encoder` name bound, as before, to the encoder fitted on
# the last column, so any code that still refers to it keeps working.
encoder = encoders[df.columns[-1]]
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


# Splitting the dataset

In [12]:
# Features are every column except the target; the target is the 'class' column.
X = df.drop(columns = ['class'])
y = df['class']

# Hold out 30% of the rows for testing. The shuffle is seeded so that
# Restart Kernel -> Run All reproduces the same split (and therefore the same
# predictions and accuracy) on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
print("X_train = ", X_train.shape)
print("y_train = ", y_train.shape)
print("X_test = ", X_test.shape)
print("y_test = ", y_test.shape)

X_train =  (5686, 22)
y_train =  (5686,)
X_test =  (2438, 22)
y_test =  (2438,)


In [15]:
def prior(y_train, label):
    """Return the empirical prior of `label`.

    This is the fraction of entries in `y_train` that are equal to `label`,
    i.e. count(y_train == label) / len(y_train).
    """
    n_matching = np.sum(y_train == label)
    n_samples = y_train.shape[0]
    return n_matching / float(n_samples)

In [16]:
def cond_prob(X_train, y_train, feat_col, feat_val, label, alpha=0.0):
    """Estimate P(feat_col == feat_val | class == label) from the training data.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain the column `feat_col`.
    y_train : array-like
        Training labels, row-aligned with `X_train`.
    feat_col : column label
        Feature whose conditional probability is requested.
    feat_val : scalar
        Value of that feature to look up.
    label : scalar
        Class to condition on.
    alpha : float, optional
        Additive (Laplace) smoothing strength. With the default ``0.0`` the
        plain frequency ``count / n_label`` is returned, exactly as before.
        With ``alpha > 0`` the estimate is
        ``(count + alpha) / (n_label + alpha * n_values)`` where ``n_values``
        is the number of distinct values `feat_col` takes in `X_train`, so a
        value never seen together with `label` no longer yields a hard zero.

    Returns
    -------
    float
        The (optionally smoothed) conditional relative frequency.

    Raises
    ------
    ValueError
        If `alpha` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Build the class mask once; the filtered row count is the denominator.
    X_filtered = X_train[y_train == label]
    numerator = np.sum(X_filtered[feat_col] == feat_val)
    denominator = X_filtered.shape[0]

    if alpha:
        # Distinct categories observed for this feature across all classes.
        n_values = X_train[feat_col].nunique()
        return (numerator + alpha) / float(denominator + alpha * n_values)

    return numerator / float(denominator)

In [17]:
def predict(X_train, y_train, xtest):
    """Predict the class of a single sample with a categorical Naive Bayes rule.

    For every class present in `y_train`, the unnormalised posterior
    ``P(class) * prod_f P(x_f == xtest[f] | class)`` is estimated from raw
    frequency counts in the training data (no smoothing). The class with the
    largest score is returned; on a tie the first class in the sorted order of
    ``np.unique(y_train)`` wins.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features, one column per feature.
    y_train : array-like
        Training labels, row-aligned with `X_train`.
    xtest : pandas.Series or mapping
        The sample to classify, indexable by the column labels of `X_train`.

    Returns
    -------
    The predicted class label, taken from ``np.unique(y_train)``. The label
    itself is returned (not its position), so the result is correct for any
    label set, not only for labels that happen to be 0..k-1.
    """
    classes = np.unique(y_train)
    features = list(X_train.columns)
    n_total = y_train.shape[0]

    post_probs = []
    for label in classes:
        # Filter the training rows of this class once per class. The frequency
        # counts are computed inline (instead of via cond_prob) so this filter
        # is not repeated for every feature.
        X_label = X_train[y_train == label]
        n_label = X_label.shape[0]

        likelihood = 1.0
        for f in features:
            likelihood *= np.sum(X_label[f] == xtest[f]) / float(n_label)

        prior_prob = n_label / float(n_total)
        post_probs.append(prior_prob * likelihood)

    # argmax yields a position in `classes`; map it back to the actual label.
    return classes[np.argmax(post_probs)]
        

In [18]:
# Spot-check the classifier on one held-out row.
# `rand_example` is a fixed positional index into X_test / y_test.
rand_example = 6

# Classify that single row using frequencies estimated from the training split.
output = predict(X_train, y_train, X_test.iloc[rand_example])

print("Naive Bayes Classifier predicts ", output)
# Prints the true label of the same row for comparison.
# NOTE(review): the text "Current Answer" presumably was meant to read
# "Correct Answer" — confirm before changing the printed string.
print("Current Answer ", y_test.iloc[rand_example])

Naive Bayes Classifier predicts  1
Current Answer  1


In [19]:
def accuracy_score(X_train, y_train, xtest, ytest):
    """Return the fraction of rows in `xtest` that `predict` labels correctly.

    Each row of `xtest` is classified independently with
    ``predict(X_train, y_train, row)``; the predictions are then compared
    position by position with `ytest`, and the number of matches is divided by
    the number of entries in `ytest`.
    """
    n_rows = xtest.shape[0]

    # One predict() call per test row, collected into an array for comparison.
    predicted = np.array(
        [predict(X_train, y_train, xtest.iloc[row]) for row in range(n_rows)]
    )

    n_correct = np.sum(predicted == ytest)
    return n_correct / ytest.shape[0]

In [None]:
# Evaluate on the full test split. This can take a while: accuracy_score calls
# predict() once per test row, and each call re-scans the training data.
print("Accuracy Score for our classifier == ", accuracy_score(X_train, y_train, X_test, y_test))