In [20]:
# import libraries
from numpy import mean
from numpy import std
from numpy import hstack
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split

# Read the two comma-separated files. header=None means no line is treated as
# a header row; skipinitialspace drops the blank that follows each comma.
df =  pd.read_csv('adult.data', sep=",", header=None, skipinitialspace=True)
df2 = pd.read_csv('adult.test', sep=",", header=None, skipinitialspace=True)

# Join the data and test files together
df = pd.concat([df, df2])

# Shuffle the rows. The fixed seed keeps the row order (and every result
# derived from it) identical across "Restart Kernel -> Run All".
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Replace all of ? with None
df = df.replace(['?'], [None])
# Drop all rows with None in them
df = df.dropna(axis=0)

# Check no None values remain. An assert is used because a bare expression in
# the middle of a cell is evaluated and then silently discarded.
assert not df.isnull().values.any(), "missing values remain after dropna"

# Adding column headers to our data
df.columns = ["Age", "Workclass", "Fnlwgt", "Education", "Education-num", "Marital-status", "Occupation", "Relationship", "Race", "Sex", "Capital-gain", "Capital-loss", "Hours-per-week", "Native-country", "Income"]
# Workclass, Fnlwgt, Race and Native-country are not worth using.
# Education = Education num, so drop Education
df = df.drop(columns=['Workclass', 'Race', 'Fnlwgt', 'Native-country', 'Education'])

In [21]:
# Numerically encode Occupation as a collar group: 0 = white collar, 1 = blue collar.
white_collar = (
    "Tech-support",
    "Sales",
    "Exec-managerial",
    "Prof-specialty",
    "Adm-clerical",
)
blue_collar = (
    "Craft-repair",
    "Other-service",  # unclear whether blue or white collar; treated as blue
    "Handlers-cleaners",
    "Machine-op-inspct",
    "Farming-fishing",
    "Transport-moving",
    "Priv-house-serv",
    "Protective-serv",
    "Armed-Forces",
)
occupation_mapping_dict = {
    **dict.fromkeys(white_collar, 0),
    **dict.fromkeys(blue_collar, 1),
}
df["Occupation"] = df["Occupation"].map(occupation_mapping_dict)

# Numerically encode Sex: Male -> 0, Female -> 1.
sex_mapping_dict = dict(Male=0, Female=1)
df["Sex"] = df["Sex"].map(sex_mapping_dict)

# Encode the Income target: 1 for the ">50K" labels, 0 for the "<=50K" labels.
# Both spellings (with and without a trailing period) are covered.
income_mapping_dict = {
    label: int(label.startswith(">"))
    for label in ("<=50K", ">50K", "<=50K.", ">50K.")
}
df["Income"] = df["Income"].map(income_mapping_dict)

# FOR MODELS
# Bucket Age into eight bins with edges 10, 20, ..., 90, labelled '0'..'7'.
bins = list(range(10, 91, 10))
names = [str(bucket) for bucket in range(8)]
df['Age'] = pd.cut(df['Age'], bins=bins, labels=names)

df

Unnamed: 0,Age,Education-num,Marital-status,Occupation,Relationship,Sex,Capital-gain,Capital-loss,Hours-per-week,Income
0,1,13,Never-married,0,Not-in-family,1,3325,0,43,0
1,3,13,Married-civ-spouse,0,Husband,0,99999,0,45,1
2,0,9,Never-married,1,Own-child,0,0,0,40,0
3,0,9,Never-married,1,Own-child,0,0,0,30,0
4,2,9,Divorced,1,Not-in-family,0,0,0,45,0
...,...,...,...,...,...,...,...,...,...,...
48837,1,9,Never-married,1,Not-in-family,0,0,0,50,0
48838,1,13,Never-married,0,Own-child,0,0,0,40,0
48839,2,5,Married-civ-spouse,1,Wife,1,0,0,48,0
48840,1,9,Never-married,1,Own-child,0,0,0,40,0


In [22]:
# Relationship and Marital-status have no natural ordering, so they are
# expanded into one indicator column per category (one-hot encoding).
df = pd.get_dummies(data=df, columns=['Relationship', 'Marital-status'])

In [23]:
# Frequency of each distinct Capital-gain value.
# 229 rows hold 99,999, which is a lot. Is this a cap value, or an error?
capital_gain_counts = df['Capital-gain'].value_counts()
capital_gain_counts

0        41432
15024      498
7688       391
7298       351
99999      229
         ...  
7262         1
1731         1
22040        1
1639         1
2387         1
Name: Capital-gain, Length: 121, dtype: int64

In [24]:
from sklearn.preprocessing import StandardScaler

# Standardise Capital-gain and Capital-loss in place on df.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook, so test rows influence the scaling
# statistics -- consider fitting on the training rows only.
col_names = ['Capital-gain', 'Capital-loss']
features = df[col_names]

scaler = StandardScaler()
scaler.fit(features.values)
df[col_names] = scaler.transform(features.values)

df


Unnamed: 0,Age,Education-num,Occupation,Sex,Capital-gain,Capital-loss,Hours-per-week,Income,Relationship_Husband,Relationship_Not-in-family,...,Relationship_Own-child,Relationship_Unmarried,Relationship_Wife,Marital-status_Divorced,Marital-status_Married-AF-spouse,Marital-status_Married-civ-spouse,Marital-status_Married-spouse-absent,Marital-status_Never-married,Marital-status_Separated,Marital-status_Widowed
0,1,13,0,1,0.296225,-0.21878,43,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,3,13,0,0,13.175193,-0.21878,45,1,1,0,...,0,0,0,0,0,1,0,0,0,0
2,0,9,1,0,-0.146733,-0.21878,40,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,0,9,1,0,-0.146733,-0.21878,30,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,2,9,1,0,-0.146733,-0.21878,45,0,0,1,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,1,9,1,0,-0.146733,-0.21878,50,0,0,1,...,0,0,0,0,0,0,0,1,0,0
48838,1,13,0,0,-0.146733,-0.21878,40,0,0,0,...,1,0,0,0,0,0,0,1,0,0
48839,2,5,1,1,-0.146733,-0.21878,48,0,0,0,...,0,0,1,0,0,1,0,0,0,0
48840,1,9,1,0,-0.146733,-0.21878,40,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [25]:
# Inspect the Age column's dtype before converting it to a number
df['Age'].dtype

CategoricalDtype(categories=['0', '1', '2', '3', '4', '5', '6', '7'], ordered=True)

In [26]:
# Turn the Age bin labels ('0'..'7') into numeric values.
df["Age"] = pd.to_numeric(df["Age"])

In [27]:
# Confirm the Age column is numeric after the conversion
df['Age'].dtype

dtype('int64')

In [28]:
# Split X and y by column name rather than by hard-coded position, so the
# selection stays correct if the column order changes upstream.
# X: every column except the Income target; y: Income kept as a one-column frame.
X = df.drop(columns=['Income'])
y = df[['Income']]

# Split into train and test: 20% held out, stratified on the target so both
# parts keep the same class balance. The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.values, test_size=0.20, stratify=y, random_state=42
)

In [29]:
# Naive Bayes estimators
from sklearn.naive_bayes import BernoulliNB, GaussianNB

# ComplementNB and MultinomialNB cannot be used because they require
# non-negative feature values.

# Baseline classifier with default settings
# (alternative tried: BernoulliNB(binarize=0.0))
model = GaussianNB()

# Show the default hyperparameters
model.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [30]:
# For Bernoulli NB: the distinct target labels.
# Despite its name, n_classes holds the label values themselves, not a count.
n_classes = np.unique(y.to_numpy())
n_classes

array([0, 1])

In [31]:
from sklearn.model_selection import GridSearchCV

# Gaussian NB search space: 100 log-spaced var_smoothing values from 1e0 down to 1e-9
param_grid = dict(var_smoothing=np.logspace(0, -9, num=100))

# # For Bernoulli NB hyperparameter tuning
# param_grid = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0],
#           'fit_prior': [True, False],
#           'class_prior': [None, [0.1,]* len(n_classes), ],
#           'binarize': [None, 0.0, 8.5, 10.0]
#          }

# Base estimator to tune (the name `rf` is historical; it is a Gaussian NB model)
rf = GaussianNB()
# rf = BernoulliNB()

# 3-fold grid search on all cores, verbose logging, train scores recorded
search_options = dict(cv=3, n_jobs=-1, verbose=2, return_train_score=True)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, **search_options)


In [32]:
# Run the grid search on the training split (labels flattened to 1-D)
grid_search.fit(X_train, np.ravel(y_train))
# Show the winning parameter combination
grid_search.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'var_smoothing': 0.001873817422860383}

In [33]:
# Testing best params

# Build the final model from the parameters the grid search actually selected.
# A value copy-pasted from an earlier run's output goes stale as soon as the
# data order or split changes and the search picks something else.
model = GaussianNB(**grid_search.best_params_)
# Best params found for the Bernoulli NB alternative in an earlier run:
# model=BernoulliNB(alpha=0.01, binarize=8.5, class_prior=None, fit_prior=True)

# Show the hyperparameters the model will be trained with
model.get_params()

{'priors': None, 'var_smoothing': 0.001873817422860383}

In [34]:
# Train the tuned model on the training split (labels flattened to 1-D)
model.fit(X_train, np.ravel(y_train))

GaussianNB(var_smoothing=0.001873817422860383)

In [35]:
# Evaluation helpers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict

# Cross-validated predictions for the training rows, using 20 folds
train_data_predictions = cross_val_predict(
    estimator=model,
    X=X_train,
    y=np.ravel(y_train),
    cv=20,
)

In [36]:
# Accuracy of the cross-validated predictions against the training labels
train_accuracy = accuracy_score(y_train, train_data_predictions)
print(train_accuracy)

0.8288691710202615


In [140]:
# Without hyperparameter tuning:
# 0.7204853912706969


# With best params hyperparameter tuning:
# GaussianNB(var_smoothing=0.001873817422860383)
# 0.8296707853055809
# 0.8296431434336733
# 0.8288138872764463
# 0.828841529148354


# With best params hyperparameter tuning:
# BernoulliNB(alpha=0.01, binarize=8.5)
# 0.7581612626807087

# Therefore GaussianNB is the better choice over BernoulliNB, and its best parameter is var_smoothing=0.001873817422860383
