# Classification Codes Using Naive Bayes Classifier and Grid Independence

In [1]:
# Import libraries
import sklearn
import pandas as pd
import numpy as np
import pickle
import warnings

In [2]:
# Ignore warnings
warnings.filterwarnings("ignore")

In [5]:
# Import the dataset from the table in csv format
dataset = pd.read_csv("Social_Network_Ads.csv")
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [7]:
# Convert alphabetical values in the dataset into numerical values
dataset = pd.get_dummies(dataset, dtype = int, drop_first = True)
dataset

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
0,15624510,19,19000,0,1
1,15810944,35,20000,0,1
2,15668575,26,43000,0,0
3,15603246,27,57000,0,0
4,15804002,19,76000,0,1
...,...,...,...,...,...
395,15691863,46,41000,1,0
396,15706071,51,23000,1,1
397,15654296,50,20000,1,0
398,15755018,36,33000,0,1


In [9]:
# Exclude the colum named user ID because it is a redundant variable and does not contribute to the solution 
dataset = dataset.drop('User ID', axis = 1)

In [11]:
# Find the number of output variables classified as O and 1
dataset['Purchased'].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

In [13]:
# Get the headings of the dataset
dataset.columns

Index(['Age', 'EstimatedSalary', 'Purchased', 'Gender_Male'], dtype='object')

In [15]:
# Specify the input values or independent variables in the dataset
Input = dataset[['Age', 'EstimatedSalary', 'Gender_Male']]
Input

Unnamed: 0,Age,EstimatedSalary,Gender_Male
0,19,19000,1
1,35,20000,1
2,26,43000,0
3,27,57000,0
4,19,76000,1
...,...,...,...
395,46,41000,0
396,51,23000,1
397,50,20000,0
398,36,33000,1


In [17]:
# Find the number of different input variables and the maximum count of the input variables
Input.shape

(400, 3)

In [19]:
# Specify the output values or dependent variables in the dataset
Output = dataset[['Purchased']]
Output

Unnamed: 0,Purchased
0,0
1,0
2,0
3,0
4,0
...,...
395,1
396,1
397,1
398,0


In [21]:
# Split the variables in the dataset into training set and test set and also assign the size of the test set
from sklearn.model_selection import train_test_split as tts
X_train, X_test, Y_train, Y_test = tts(Input, Output, test_size = 0.3, random_state = 0)

In [23]:
# Get the values of the input training set
X_train

Unnamed: 0,Age,EstimatedSalary,Gender_Male
92,26,15000,1
223,60,102000,1
234,38,112000,0
232,40,107000,1
377,42,53000,0
...,...,...,...
323,48,30000,0
192,29,43000,1
117,36,52000,1
47,27,54000,0


In [25]:
# Transform model and Fit model to standard scalar
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
X_train
X_test

array([[-0.77101313,  0.49720103,  0.99288247],
       [ 0.0133054 , -0.57280368, -1.00716855],
       [-0.28081405,  0.15017248,  0.99288247],
       [-0.77101313,  0.26584866, -1.00716855],
       [-0.28081405, -0.57280368, -1.00716855],
       [-1.06513258, -1.44037507,  0.99288247],
       [-0.67297331, -1.5849703 , -1.00716855],
       [-0.18277423,  2.14558666,  0.99288247],
       [-1.94749093, -0.05226085, -1.00716855],
       [ 0.89566375, -0.775237  ,  0.99288247],
       [-0.77101313, -0.60172273,  0.99288247],
       [-0.96709276, -0.42820845, -1.00716855],
       [-0.08473441, -0.42820845,  0.99288247],
       [ 0.11134522,  0.20801057,  0.99288247],
       [-1.7514113 ,  0.46828198,  0.99288247],
       [-0.5749335 ,  1.36477242, -1.00716855],
       [-0.08473441,  0.20801057,  0.99288247],
       [-1.84945111,  0.43936294,  0.99288247],
       [ 1.67998229,  1.74072002, -1.00716855],
       [-0.28081405, -1.38253697,  0.99288247],
       [-0.28081405, -0.65956082, -1.007

In [27]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB 
from sklearn.model_selection import GridSearchCV as GCV
classifiers = {'BNB': BernoulliNB(), 'GNB': GaussianNB(), 'MNB': MultinomialNB()}
Grid_param = {'BNB': {'alpha' : [0.1, 0.5, 1.0], 'binarize': [0.0, 0.5, 1.0]}, 'GNB': {'var_smoothing' : [1e-9, 1e-8, 1e-7]}, 'MNB': {'alpha' : [0.1, 0.5, 1.0]}}

In [29]:
best_models = {}

In [31]:
# Loop through each classifier and perform GridSearchCV
for name, clf in classifiers.items():
    print(f"Running GridSearchCV for {name}...")
    grid = GCV(clf, Grid_param[name], n_jobs=-1, cv=5, verbose=3, scoring='f1_weighted')
    grid.fit(X_train, Y_train)
    best_models[name] = grid.best_estimator_
    print(f"Best params for {name}: {grid.best_params_}\n")

Running GridSearchCV for BNB...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best params for BNB: {'alpha': 0.1, 'binarize': 0.5}

Running GridSearchCV for GNB...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best params for GNB: {'var_smoothing': 1e-09}

Running GridSearchCV for MNB...
Fitting 5 folds for each of 3 candidates, totalling 15 fits


ValueError: 
All the 15 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\dilip\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\dilip\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dilip\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 759, in fit
    self._count(X, Y)
  File "C:\Users\dilip\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 881, in _count
    check_non_negative(X, "MultinomialNB (input X)")
  File "C:\Users\dilip\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1650, in check_non_negative
    raise ValueError("Negative values in data passed to %s" % whom)
ValueError: Negative values in data passed to MultinomialNB (input X)


In [None]:
# Find the best parameters
Re = Grid.cv_results_
print ("The Re score value for the best parameter is: {}".format(Grid.best_params_))

In [None]:
# Predict output using the data from input variables
Y_pred = Grid.predict(X_test)

In [None]:
# Generate confusion matrix from output variables
from sklearn.metrics import confusion_matrix as cm
cm = cm(Y_test, Y_pred)

In [None]:
# Generate report of the confusion matrix
from sklearn.metrics import classification_report as cr
cr = cr(Y_test, Y_pred)

In [None]:
# Extract weighted score from the grid
from sklearn.metrics import f1_score
f1_macro = f1_score (Y_test, Y_pred, average = 'weighted')
print ("The best parameter is :". format(Grid.best_params_), f1_macro)

In [None]:
print (" The confusion matrix is: \n", cm)

In [None]:
print (" The classification report is: \n", cr)

In [None]:
from sklearn.metrics import roc_auc_score as ras
ras = ras(Y_test, Grid.predict_proba (X_test)[:,1])

In [None]:
Table = pd.DataFrame.from_dict(Re)
Table

In [None]:
filename = "Pythoncode_kNN_grid.sav"
pickle.dump (Grid, open(filename, "wb"))

In [None]:
load_model=pickle.load(open("Pythoncode_kNN_grid.sav", "rb"))
Result = load_model.predict([[40,30000,1]])
Result