In [504]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn import preprocessing

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

In [505]:
ads = pd.read_csv('Social_Network_Ads.csv')
ads.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


# Preprocessing

In [506]:
le = preprocessing.LabelEncoder()
le.fit(ads["Gender"])
ads["Gender"] = le.transform(ads["Gender"])
gender_labels = dict(zip(le.classes_, le.transform(le.classes_)))
print(gender_labels)

{'Female': 0, 'Male': 1}


In [507]:
x = ads.iloc[:,:4]
x

Unnamed: 0,User ID,Gender,Age,EstimatedSalary
0,15624510,1,19,19000
1,15810944,1,35,20000
2,15668575,0,26,43000
3,15603246,0,27,57000
4,15804002,1,19,76000
...,...,...,...,...
395,15691863,0,46,41000
396,15706071,1,51,23000
397,15654296,0,50,20000
398,15755018,1,36,33000


In [508]:
y = ads.iloc[:,4]
y

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

# Feature Selection

In [519]:
bestfeatures = SelectKBest(score_func = chi2, k = 2)
fit = bestfeatures.fit(x,y)
adsscores = pd.DataFrame(fit.scores_)
adscolumns = pd.DataFrame(x.columns)

featureScore = pd.concat([adscolumns, adsscores], axis = 1)
featureScore.columns = ['Attr', 'Score']
print(featureScore.nlargest(2,'Score'))

              Attr          Score
1  EstimatedSalary  872013.169231
0              Age     451.155226


In [510]:
ads.drop(columns = ['User ID', 'Gender'], inplace = True)
ads.dropna(inplace = True)
ads

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0
...,...,...,...
395,46,41000,1
396,51,23000,1
397,50,20000,1
398,36,33000,0


In [511]:
fitur = ads.iloc[:,:2]
fitur

Unnamed: 0,Age,EstimatedSalary
0,19,19000
1,35,20000
2,26,43000
3,27,57000
4,19,76000
...,...,...
395,46,41000
396,51,23000
397,50,20000
398,36,33000


In [512]:
x = fitur
x

Unnamed: 0,Age,EstimatedSalary
0,19,19000
1,35,20000
2,26,43000
3,27,57000
4,19,76000
...,...,...
395,46,41000
396,51,23000
397,50,20000
398,36,33000


In [513]:
y = ads.iloc[:,2]
y

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

# Split Data

In [514]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

# Modelling

In [515]:
nb = GaussianNB()
nb = nb.fit(x_train, y_train)

In [516]:
y_pred = nb.predict(x_test)
y_pred

array([0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0], dtype=int64)

# Model Evaluation

In [517]:
print("Confusuon matrix : \n", confusion_matrix(y_pred, y_test))
print("Classification report matrix : \n", classification_report(y_pred, y_test))

Confusuon matrix : 
 [[57  2]
 [ 2 19]]
Classification report matrix : 
               precision    recall  f1-score   support

           0       0.97      0.97      0.97        59
           1       0.90      0.90      0.90        21

    accuracy                           0.95        80
   macro avg       0.94      0.94      0.94        80
weighted avg       0.95      0.95      0.95        80



In [518]:
print(accuracy_score(y_pred, y_test))

0.95
