# <u>Naive Bayes - Mushroom Classifier</u>

The goal is to predict the class of a mushroom, given some of its features. We will use a Naive Bayes model for this classification.

### Load Dataset

In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder   # For encoding categorical data as numerical codes
from sklearn.model_selection import train_test_split   # For splitting arrays into train and test partitions

In [11]:
# Read the mushroom table from a CSV file located in the working directory
df = pd.read_csv("Mushroom.csv")
# Preview the first rows of the raw table
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [12]:
# (number of rows, number of columns) of the raw table
df.shape

(8124, 23)

### Encode the Categorical Data into Numerical Data using scikit-learn

In [13]:
le = LabelEncoder()

# DataFrame.apply calls le.fit_transform once per column, so this one encoder is
# re-fitted on each column in turn and every column gets its own integer codes.
# After this line `le` only holds the fit for the last column processed.
ds = df.apply(le.fit_transform) 


In [14]:
# Check what kind of object apply() handed back
print(type(ds))

<class 'pandas.core.frame.DataFrame'>


In [15]:
# Preview the first rows of the encoded table
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


The job is done: every column now holds numerical codes instead of letters.

In [18]:
# Drop from the DataFrame down to a plain NumPy array for the hand-written classifier
data = ds.values
print(data.shape)
print(type(data))

# Peek at the first five encoded rows
print(data[:5])

# Column 0 holds the encoded class label ("type"); every remaining column is a feature
data_y = data[:, 0]
data_x = data[:, 1:]

(8124, 23)
<class 'numpy.ndarray'>
[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


### Break the data into Training & Testing Data

In [19]:
# Hold out 20% of the rows for testing. A fixed random_state pins the shuffle so
# that re-running the notebook reproduces the same split and the same results.
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=42)

In [21]:
# Shapes of the (features, labels) pair for the training and the testing partition
print(x_train.shape, y_train.shape) 
print(x_test.shape, y_test.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


In [25]:
# How many classes of mushroom do we have?

np.unique(y_train)

array([0, 1], dtype=int64)

In [26]:
# There are only 2 types of mushroom in our data, Class-0 and Class-1

### Building our Naive Bayes Classifier

In [29]:
# prior
def prior_probability(y_train, label):  # P(Y = c)
    """Return the fraction of entries in ``y_train`` that equal ``label``.

    y_train : array of class labels (must expose ``.shape``)
    label   : the class c whose prior P(Y = c) is wanted
    """
    n_total = y_train.shape[0]
    # Summing the boolean mask counts the examples that belong to `label`
    n_in_class = np.sum(y_train == label)
    return n_in_class / float(n_total)

In [36]:
# How does the prior_probability function above behave? Try it on a toy label vector.

a = np.array([1,0,1,0,0,0,1,1,1,1])
print(prior_probability(a, 1)) # fraction of entries in 'a' that are class 1
print(prior_probability(a, 0)) # fraction of entries in 'a' that are class 0

0.6
0.4


In [42]:
# Likelihood
# will be done for all the feature
def cond_probability(x_train, y_train, feature_col, feature_val, label): # P(Xi/Y = c)
    """Estimate P(X_i = feature_val | Y = label) from the training data.

    x_train     : 2-D array of training features (one row per example)
    y_train     : array of labels aligned with the rows of x_train
    feature_col : column index i of the feature being examined
    feature_val : the value X_i takes for the example under consideration
    label       : the class c that is conditioned on

    The estimate is the share of class-`label` rows whose column
    `feature_col` equals `feature_val`.
    """
    in_class = (y_train == label)
    class_size = np.sum(in_class)

    # Restrict to the rows of this class, then count matches in the chosen column
    class_rows = x_train[in_class]
    n_matches = np.sum(class_rows[:, feature_col] == feature_val)

    return n_matches / float(class_size)

### Next Step: Compute the Posterior Probability for Each Test Example and Make Predictions

In [38]:
# Distinct labels present in the training split
np.unique(y_train)

array([0, 1], dtype=int64)

In [40]:
def predict(x_train, y_train, xtest):
    """Predict the class label of a single test point with Naive Bayes.

    For every class c found in ``y_train`` the unnormalised posterior
        P(Y = c) * product over i of P(X_i = xtest[i] | Y = c)
    is computed, and the class with the largest value wins.

    x_train : 2-D array of training features (one row per example)
    y_train : array of training labels aligned with the rows of x_train
    xtest   : a single test point, one value per feature column

    Returns the winning label, i.e. an element of ``np.unique(y_train)``.
    """

    classes = np.unique(y_train)
    n_features = x_train.shape[1]
    posterior_probs = [] # One unnormalised posterior per class, in the order of `classes`

    # Compute posterior for each class
    for label in classes:
        # post_prob = likelihood * prior
        # likelihood = product(P(Xi/Y = c))
        likelihood = 1.0
        for f in range(n_features):
            cond = cond_probability(x_train, y_train, f, xtest[f], label)
            likelihood *= cond

        prior = prior_probability(y_train, label)
        posterior_probs.append(likelihood * prior)

    # np.argmax gives a position in `classes`, not a label. Map it back so the
    # result is correct even when the labels are not exactly 0..k-1.
    best = np.argmax(posterior_probs)
    return classes[best]

### Making Predictions for Test Data

In [50]:
# Classify a single test row (index 1) as a quick smoke test
output = predict(x_train, y_train, x_test[1])

In [51]:
# Predicted label, followed by the true label of the same test row
print(output)
print(y_test[1])

1
1


In [52]:
# We are getting the correct prediction

### Calculating the score 

In [49]:
def score(x_train,y_train,x_test,y_test):
    """Return the accuracy of the Naive Bayes predictions on a test set.

    x_train, y_train : training features and labels handed to ``predict``
    x_test           : 2-D array of test features (one row per example)
    y_test           : true labels aligned with the rows of x_test

    Accuracy is the number of test rows whose predicted label equals the
    true label, divided by the number of test labels.
    """
    # Classify the test rows one at a time
    predicted = np.array(
        [predict(x_train, y_train, x_test[row]) for row in range(x_test.shape[0])]
    )

    n_correct = np.sum(predicted == y_test)
    return n_correct / y_test.shape[0]

In [53]:
# Fraction of test rows whose predicted label matches the true label
score(x_train, y_train, x_test, y_test)

0.9963076923076923

**Our classifier labels about 99.6% of the test examples correctly.**