                                                     Applying the Naive Bayes Classifier

In [61]:
import pandas as pd
import numpy as np

In [62]:
# Read the user records from a CSV file resolved relative to the current
# working directory, then display the resulting frame.
df = pd.read_csv(filepath_or_buffer='User_Data.csv')
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [63]:
# One-hot encode the categorical "Gender" column; every resulting indicator
# column is named with the "Gender_" prefix.
dummy_gender = pd.get_dummies(df['Gender'], prefix='Gender')

# Append the indicator columns and remove the original text column in one
# chained expression, so no frame is mutated in place.
df = pd.concat([df, dummy_gender], axis=1).drop(columns='Gender')
df

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Female,Gender_Male
0,15624510,19,19000,0,False,True
1,15810944,35,20000,0,False,True
2,15668575,26,43000,0,True,False
3,15603246,27,57000,0,True,False
4,15804002,19,76000,0,False,True
...,...,...,...,...,...,...
395,15691863,46,41000,1,True,False
396,15706071,51,23000,1,False,True
397,15654296,50,20000,1,True,False
398,15755018,36,33000,0,False,True


In [64]:
# Convert the True/False gender indicators into 1/0 integers.
# An explicit dtype cast states the intended conversion directly, instead of
# relying on replace({True: 1, False: 0}) to infer the resulting dtype.
# The cast is also safe to re-run: columns that are already int64 are unchanged.
df = df.astype({'Gender_Male': 'int64', 'Gender_Female': 'int64'})
df

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Female,Gender_Male
0,15624510,19,19000,0,0,1
1,15810944,35,20000,0,0,1
2,15668575,26,43000,0,1,0
3,15603246,27,57000,0,1,0
4,15804002,19,76000,0,0,1
...,...,...,...,...,...,...
395,15691863,46,41000,1,1,0
396,15706071,51,23000,1,0,1
397,15654296,50,20000,1,1,0
398,15755018,36,33000,0,0,1


In [65]:
# Keep only the modelling columns, features first and the target last.
# Any column not listed here (e.g. "User ID") is left out of the new frame.
column_order = [
    'Age',
    'EstimatedSalary',
    'Gender_Male',
    'Gender_Female',
    'Purchased',
]
df = df.loc[:, column_order]
df

Unnamed: 0,Age,EstimatedSalary,Gender_Male,Gender_Female,Purchased
0,19,19000,1,0,0
1,35,20000,1,0,0
2,26,43000,0,1,0
3,27,57000,0,1,0
4,19,76000,1,0,0
...,...,...,...,...,...
395,46,41000,0,1,1
396,51,23000,1,0,1
397,50,20000,0,1,1
398,36,33000,1,0,0


In [66]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [67]:
# Input (feature) columns passed to the classifier.
ip_columns = [
    'Age',
    'EstimatedSalary',
    'Gender_Male',
    'Gender_Female',
]
# Output (target) column, kept as a one-element list so that df[op_columns]
# selects a single-column DataFrame rather than a Series.
op_columns = ['Purchased']

In [68]:
# Split features and target into train and test partitions.
# test_size=0.3 holds out 30% of the rows; random_state=0 fixes the shuffle
# so the same partition is produced on every run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    df[ip_columns],
    df[op_columns],
    test_size=0.3,
    random_state=0,
)

In [69]:
# Display the training feature frame returned by train_test_split.
Xtrain

Unnamed: 0,Age,EstimatedSalary,Gender_Male,Gender_Female
92,26,15000,1,0
223,60,102000,1,0
234,38,112000,0,1
232,40,107000,1,0
377,42,53000,0,1
...,...,...,...,...
323,48,30000,0,1
192,29,43000,1,0
117,36,52000,1,0
47,27,54000,0,1


In [70]:
# Display the training target frame (single "Purchased" column).
Ytrain

Unnamed: 0,Purchased
92,0
223,1
234,0
232,1
377,0
...,...
323,1
192,0
117,0
47,0


In [71]:
# Display the held-out test feature frame returned by train_test_split.
Xtest

Unnamed: 0,Age,EstimatedSalary,Gender_Male,Gender_Female
132,30,87000,1,0
309,38,50000,0,1
341,35,75000,1,0
196,30,79000,0,1
246,35,50000,0,1
...,...,...,...,...
216,49,65000,1,0
259,45,131000,0,1
49,31,89000,0,1
238,46,82000,0,1


In [72]:
# Display the held-out test target frame (single "Purchased" column).
Ytest

Unnamed: 0,Purchased
132,0
309,0
341,0
196,0
246,0
...,...
216,0
259,1
49,0
238,0


In [73]:
# Instantiate an unfitted multinomial Naive Bayes classifier with default settings.
# NOTE(review): MultinomialNB is designed for count-like features, while Age and
# EstimatedSalary are continuous magnitudes — GaussianNB is probably a better
# fit for this data; confirm before relying on the scores below.
imnb = MultinomialNB()

In [74]:
# Preview the first 40 rows (all columns) of the training features.
Xtrain.head(40)

Unnamed: 0,Age,EstimatedSalary,Gender_Male,Gender_Female
92,26,15000,1,0
223,60,102000,1,0
234,38,112000,0,1
232,40,107000,1,0
377,42,53000,0,1
142,35,59000,1,0
22,48,41000,1,0
252,48,134000,0,1
350,38,113000,0,1
168,29,148000,1,0


In [75]:
# Preview the first 40 rows of the training target.
Ytrain.head(40)

Unnamed: 0,Purchased
92,0
223,1
234,0
232,1
377,0
142,0
22,1
252,1
350,1
168,1


In [76]:

# Train the classifier on the four feature columns against the "Purchased"
# target taken as a 1-D Series.
# Despite its name, pred_mnb holds the object returned by fit(), which later
# cells use as the model (calling .predict / .predict_proba on it); it is not
# an array of predictions.
pred_mnb = imnb.fit(Xtrain[ip_columns], Ytrain[op_columns[0]])

In [77]:
# Predict class labels for the same rows the model was trained on
# (used below to measure the training-set fit).
Y_train_pred = pred_mnb.predict(Xtrain[ip_columns])
Y_train_pred

array([0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int64)

In [78]:
# Per-class probability estimates for the training rows:
# one row per sample, one column per class.
pred_mnb.predict_proba(Xtrain[ip_columns])

array([[0.80438434, 0.19561566],
       [0.65344203, 0.34655797],
       [0.41153145, 0.58846855],
       [0.49953102, 0.50046898],
       [0.71091937, 0.28908063],
       [0.68973142, 0.31026858],
       [0.81753815, 0.18246185],
       [0.37354413, 0.62645587],
       [0.40682037, 0.59317963],
       [0.25028049, 0.74971951],
       [0.77020427, 0.22979573],
       [0.8585689 , 0.1414311 ],
       [0.78280892, 0.21719108],
       [0.31723952, 0.68276048],
       [0.54230336, 0.45769664],
       [0.64801818, 0.35198182],
       [0.546256  , 0.453744  ],
       [0.50642814, 0.49357186],
       [0.66976425, 0.33023575],
       [0.8470013 , 0.1529987 ],
       [0.58556579, 0.41443421],
       [0.43590688, 0.56409312],
       [0.59840482, 0.40159518],
       [0.64064184, 0.35935816],
       [0.58444944, 0.41555056],
       [0.22911125, 0.77088875],
       [0.66811557, 0.33188443],
       [0.33847536, 0.66152464],
       [0.66937662, 0.33062338],
       [0.6194145 , 0.3805855 ],
       [0.

In [79]:
# Confusion matrix on the training set (rows: actual class, columns: predicted class).
confusion_matrix(y_true=Ytrain, y_pred=Y_train_pred)

array([[158,  20],
       [ 60,  42]], dtype=int64)

In [80]:
# Same training-set counts as a labelled table: actual "Purchased" values as
# rows, predicted labels as columns.
pd.crosstab(index=Ytrain.iloc[:, 0], columns=Y_train_pred)

col_0,0,1
Purchased,Unnamed: 1_level_1,Unnamed: 2_level_1
0,158,20
1,60,42


In [81]:
# Fraction of training rows whose predicted label matches the actual label.
accuracy_score(y_true=Ytrain.iloc[:, 0], y_pred=Y_train_pred)

0.7142857142857143

In [82]:
# Predict class labels for the held-out test rows.
Y_pred = pred_mnb.predict(Xtest[ip_columns])
Y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0], dtype=int64)

In [83]:
# Per-class probability estimates for the test rows:
# one row per sample, one column per class.
pred_mnb.predict_proba(Xtest[ip_columns])

array([[0.52960363, 0.47039637],
       [0.70069036, 0.29930964],
       [0.6194145 , 0.3805855 ],
       [0.51749108, 0.48250892],
       [0.68347571, 0.31652429],
       [0.79305178, 0.20694822],
       [0.79316756, 0.20683244],
       [0.30350931, 0.69649069],
       [0.49029377, 0.50970623],
       [0.80750038, 0.19249962],
       [0.70246809, 0.29753191],
       [0.6186516 , 0.3813484 ],
       [0.71721286, 0.28278714],
       [0.63548418, 0.36451582],
       [0.46722697, 0.53277303],
       [0.35053959, 0.64946041],
       [0.62291819, 0.37708181],
       [0.46537428, 0.53462572],
       [0.43768385, 0.56231615],
       [0.82052401, 0.17947599],
       [0.69598555, 0.30401445],
       [0.32325581, 0.67674419],
       [0.71338804, 0.28661196],
       [0.55616789, 0.44383211],
       [0.70585377, 0.29414623],
       [0.34829944, 0.65170056],
       [0.46511393, 0.53488607],
       [0.55676266, 0.44323734],
       [0.64332046, 0.35667954],
       [0.49686118, 0.50313882],
       [0.

In [84]:
# Confusion matrix on the test set (rows: actual class, columns: predicted class).
confusion_matrix(y_true=Ytest, y_pred=Y_pred)

array([[64, 15],
       [27, 14]], dtype=int64)

In [85]:
# Same test-set counts as a labelled table: actual "Purchased" values as rows,
# predicted labels as columns.
pd.crosstab(index=Ytest.iloc[:, 0], columns=Y_pred)

col_0,0,1
Purchased,Unnamed: 1_level_1,Unnamed: 2_level_1
0,64,15
1,27,14


In [86]:
# Training-set accuracy. This repeats a computation made in an earlier cell,
# presumably so it can be read next to the test accuracy in the following cell.
accuracy_score(y_true=Ytrain.iloc[:, 0], y_pred=Y_train_pred)

0.7142857142857143

In [87]:
# Fraction of held-out test rows whose predicted label matches the actual label.
accuracy_score(y_true=Ytest.iloc[:, 0], y_pred=Y_pred)

0.65