# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load Data Set

In [2]:
# The file carries an .xls extension but is parsed here as comma-separated text.
data = pd.read_csv('suv_data.xls')

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [3]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(400, 5)

In [4]:
# Column labels available for feature/target selection.
data.columns

Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [5]:
# Count of missing values in each column.
data.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

# Define X and Y

In [8]:
# Select features and target by column label instead of by position, so the
# selection stays correct if the column order of the source file changes.
X = data.loc[:, ['Age', 'EstimatedSalary']]  # feature matrix (DataFrame)
y = data.loc[:, 'Purchased']                 # target labels (Series)

In [9]:
# Peek at the first two rows of the feature matrix.
X.head(2)

Unnamed: 0,Age,EstimatedSalary
0,19,19000
1,35,20000


In [10]:
# Peek at the first two target values.
y.head(2)

0    0
1    0
Name: Purchased, dtype: int64

# Train/Test Split

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=91,
)

In [13]:
# Dimensions of the training feature matrix as (rows, columns).
X_train.shape

(300, 2)

# Naive Bayes

In [14]:
from sklearn.naive_bayes import GaussianNB

In [15]:
# Baseline Gaussian Naive Bayes classifier with default settings.
model = GaussianNB()

In [16]:
# Fit the baseline classifier on the training split.
model.fit(X=X_train, y=y_train)

GaussianNB()

In [17]:
# Score the baseline classifier on the held-out test split.
model.score(X=X_test, y=y_test)

0.84

In [18]:
from sklearn.metrics import confusion_matrix

In [19]:
# Predicted labels for the held-out test split.
y_predict = model.predict(X=X_test)

In [20]:
# Compare true test labels against the predictions.
confusion_matrix(y_true=y_test, y_pred=y_predict)

array([[58,  7],
       [ 9, 26]], dtype=int64)

# Hyperparameter Tuning

In [25]:
# GridSearchCV is not imported anywhere else in the notebook; import it here so
# this cell runs on a fresh kernel instead of raising NameError.
from sklearn.model_selection import GridSearchCV

nb_classifier = GaussianNB()

# Candidate var_smoothing values: 100 points log-spaced from 1e0 down to 1e-9.
params_NB = {'var_smoothing': np.logspace(0, -9, num=100)}

# 5-fold cross-validated search scored by accuracy, fitted on the training
# split only so the test split stays untouched for the final evaluation.
gs_NB = GridSearchCV(estimator=nb_classifier,
                     param_grid=params_NB,
                     cv=5,
                     verbose=1,
                     scoring='accuracy')
gs_NB.fit(X_train, y_train)

# Best parameter setting found by the search (displayed as the cell output).
gs_NB.best_params_

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    3.3s finished


{'var_smoothing': 2.310129700083158e-09}

# Check Model Score

In [28]:
from sklearn.naive_bayes import GaussianNB

# Take var_smoothing from the fitted grid search (gs_NB) rather than pasting the
# number from an earlier output, so it stays in sync if the data, split, or
# parameter grid changes. float() keeps the estimator repr a plain Python float.
model = GaussianNB(var_smoothing=float(gs_NB.best_params_['var_smoothing']))
model.fit(X_train, y_train)

GaussianNB(var_smoothing=2.310129700083158e-09)

In [29]:
# Score the tuned classifier on the held-out test split.
model.score(X=X_test, y=y_test)

0.85

# Save/Dump Model

In [32]:
import pickle

filename = 'finalized_NB_model'

# Write the fitted model inside a context manager so the file handle is flushed
# and closed deterministically, rather than left to garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)