### Naive Bayes (Classifier) - Multinomial Naive Bayes

In [1]:
# Import useful libraries used for data management

import numpy as np
import pandas as pd

# Load the weather dataset from 'Weather.csv' (path is relative to the working directory)
data = pd.read_csv('Weather.csv')

In [2]:
# Display the loaded DataFrame to inspect the raw categorical columns
data

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,hot,high,No,no
1,sunny,hot,high,Yes,no
2,overcast,hot,high,No,yes
3,rainy,mild,high,No,yes
4,rainy,cool,normal,No,yes
5,rainy,cool,normal,Yes,no
6,overcast,cool,normal,Yes,yes
7,sunny,mild,high,No,no
8,sunny,cool,normal,No,yes
9,rainy,mild,normal,No,yes


In [3]:
# Summarize column names, non-null counts and dtypes of the raw data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Outlook      14 non-null     object
 1   Temperature  14 non-null     object
 2   Humidity     14 non-null     object
 3   Windy        14 non-null     object
 4   Play         14 non-null     object
dtypes: object(5)
memory usage: 688.0+ bytes


In [4]:
# Per-column summary: count, number of unique values, most frequent value and its frequency
data.describe()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
count,14,14,14,14,14
unique,3,3,2,2,2
top,sunny,mild,high,No,yes
freq,5,6,7,8,9


In [5]:
# One-hot encode every categorical column (one 0/1 indicator column per category).
# The dtype is pinned to uint8 because the get_dummies default changed from uint8
# to bool in pandas 2.0; pinning it keeps the encoded frame numeric and the
# downstream results identical regardless of the installed pandas version.
data = pd.get_dummies(data, dtype=np.uint8)

In [6]:
# Display the DataFrame after one-hot encoding
data

Unnamed: 0,Outlook_overcast,Outlook_rainy,Outlook_sunny,Temperature_cool,Temperature_hot,Temperature_mild,Humidity_high,Humidity_normal,Windy_No,Windy_Yes,Play_no,Play_yes
0,0,0,1,0,1,0,1,0,1,0,1,0
1,0,0,1,0,1,0,1,0,0,1,1,0
2,1,0,0,0,1,0,1,0,1,0,0,1
3,0,1,0,0,0,1,1,0,1,0,0,1
4,0,1,0,1,0,0,0,1,1,0,0,1
5,0,1,0,1,0,0,0,1,0,1,1,0
6,1,0,0,1,0,0,0,1,0,1,0,1
7,0,0,1,0,0,1,1,0,1,0,1,0
8,0,0,1,1,0,0,0,1,1,0,0,1
9,0,1,0,0,0,1,0,1,1,0,0,1


In [7]:
# Check the indicator columns produced by the encoding and their dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Outlook_overcast  14 non-null     uint8
 1   Outlook_rainy     14 non-null     uint8
 2   Outlook_sunny     14 non-null     uint8
 3   Temperature_cool  14 non-null     uint8
 4   Temperature_hot   14 non-null     uint8
 5   Temperature_mild  14 non-null     uint8
 6   Humidity_high     14 non-null     uint8
 7   Humidity_normal   14 non-null     uint8
 8   Windy_No          14 non-null     uint8
 9   Windy_Yes         14 non-null     uint8
 10  Play_no           14 non-null     uint8
 11  Play_yes          14 non-null     uint8
dtypes: uint8(12)
memory usage: 296.0 bytes


In [8]:
# Define independent attributes: every encoded column except the target's
# 'Play_*' indicators. Selecting by name (rather than a positional 0:10 slice)
# keeps the target out of X even if the number of encoded columns changes.
features = [column for column in data.columns if not column.startswith('Play_')]
# Assign values for independent variables and target variable ('Play_yes')
X = data[features]
y = data['Play_yes']

In [9]:
# List the feature column names; this is also the column order of X
features

['Outlook_overcast',
 'Outlook_rainy',
 'Outlook_sunny',
 'Temperature_cool',
 'Temperature_hot',
 'Temperature_mild',
 'Humidity_high',
 'Humidity_normal',
 'Windy_No',
 'Windy_Yes']

In [10]:
# import cross-validation helpers used to evaluate the classification model
from sklearn.model_selection import cross_val_score,cross_val_predict

In [11]:
# Import the Multinomial Naive Bayes model from sklearn
from sklearn.naive_bayes import MultinomialNB

# Create a Multinomial Naive Bayes classifier with its default hyperparameters
mnb = MultinomialNB()

In [12]:
# Out-of-fold class predictions from 3-fold cross-validation
pred_y = cross_val_predict(estimator=mnb, X=X, y=y, cv=3)

In [13]:
# Display the cross-validated predictions for the 'Play_yes' target (one per row)
pred_y

array([1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0], dtype=uint8)

In [14]:
# Out-of-fold class probabilities from 3-fold cross-validation
pred_proba = cross_val_predict(
    estimator=mnb,
    X=X,
    y=y,
    cv=3,
    method='predict_proba',
)

In [15]:
# Display the cross-validated probabilities (one row per sample, one column per class)
pred_proba

array([[0.41625425, 0.58374575],
       [0.51681762, 0.48318238],
       [0.21098496, 0.78901504],
       [0.70645448, 0.29354552],
       [0.32220992, 0.67779008],
       [0.19204424, 0.80795576],
       [0.08183961, 0.91816039],
       [0.46111674, 0.53888326],
       [0.1762241 , 0.8237759 ],
       [0.08682188, 0.91317812],
       [0.60931822, 0.39068178],
       [0.46422639, 0.53577361],
       [0.16314735, 0.83685265],
       [0.56515877, 0.43484123]])

In [16]:
# Score the classifier on each of the 3 cross-validation folds
score = cross_val_score(estimator=mnb, X=X, y=y, cv=3)

In [17]:
# The mean cross-validated score is barely above the majority-class baseline
# (always predicting "yes" is right for 9 of 14 rows, ~0.64), so the model adds little.
score.mean()

0.65

In [18]:
# Train the model on the full dataset (all rows of X and y; no hold-out split is used here)
mnb.fit(X, y)

MultinomialNB()

In [19]:
# Number of training samples per class (Play_yes = 0, then Play_yes = 1)
mnb.class_count_

array([5., 9.])

In [20]:
# Log of the prior probability of each class
mnb.class_log_prior_

array([-1.02961942, -0.44183275])

In [21]:
# Exponentiate the log priors to recover the class priors (5/14 and 9/14)
np.exp(mnb.class_log_prior_)

array([0.35714286, 0.64285714])

In [22]:
# Per-class feature counts: one row per class, columns in the same order as `features`
mnb.feature_count_

array([[0., 2., 3., 1., 2., 2., 4., 1., 2., 3.],
       [4., 3., 2., 3., 2., 4., 3., 6., 6., 3.]])

In [23]:
# Re-display the feature names to match them against the columns of the fitted arrays
features

['Outlook_overcast',
 'Outlook_rainy',
 'Outlook_sunny',
 'Temperature_cool',
 'Temperature_hot',
 'Temperature_mild',
 'Humidity_high',
 'Humidity_normal',
 'Windy_No',
 'Windy_Yes']

In [24]:
# Log of the smoothed conditional probability of each feature given the class
mnb.feature_log_prob_

array([[-3.40119738, -2.30258509, -2.01490302, -2.7080502 , -2.30258509,
        -2.30258509, -1.79175947, -2.7080502 , -2.30258509, -2.01490302],
       [-2.21920348, -2.44234704, -2.73002911, -2.44234704, -2.73002911,
        -2.21920348, -2.44234704, -1.88273125, -1.88273125, -2.44234704]])

In [25]:
# Exponentiate to get the conditional probabilities P(feature | class)
np.exp(mnb.feature_log_prob_)

array([[0.03333333, 0.1       , 0.13333333, 0.06666667, 0.1       ,
        0.1       , 0.16666667, 0.06666667, 0.1       , 0.13333333],
       [0.10869565, 0.08695652, 0.06521739, 0.08695652, 0.06521739,
        0.10869565, 0.08695652, 0.15217391, 0.15217391, 0.08695652]])

In [26]:
# Predict class for new instance with "Outlook = overcast, Temperature = cool, humidity = high, Windy = No".
# The vector follows the column order of `features`, with a 1 for each active category:
# Outlook_overcast, Temperature_cool, Humidity_high and Windy_No.
# (An all-zeros vector encodes no attribute values at all, so it does not describe this instance.)
mnb.predict([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0]])

array([1], dtype=uint8)

In [29]:
# Predict class for new instance with "Outlook = rainy, Temperature = hot, humidity = high, Windy = Yes"
# (ones at Outlook_rainy, Temperature_hot, Humidity_high and Windy_Yes, in the order of `features`)
mnb.predict([[0,1,0,0,1,0,1,0,0,1]])

array([0], dtype=uint8)