# Census data classification using naive Bayes
### Youssef Salah Mostafa
### 22010442
### Intelligent Systems Department
### Level 2

## Library imports
First, we have to import the libraries needed.
We will use scikit-learn for the naive Bayes model, the train/test split and the evaluation metrics, pandas for data cleaning, and ucimlrepo to fetch the dataset.

In [120]:
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,classification_report
import pandas as pd

## Loading the dataset
We first have to load the dataset from the UCI Machine Learning Repository (through ucimlrepo) before using it

In [121]:
# The dataset is selected by its numeric id in the UCI ML repository
uciDatasetId = 2
dataset = fetch_ucirepo(id=uciDatasetId)

## Preprocessing

In [122]:
# Dividing the data into features and targets.
# A copy is taken so that adding the label column below does not modify the
# frame that is still held inside `dataset`.
features = dataset.data.features.copy()
# The target column contains 4 distinct strings (each class also appears with a
# trailing '.'), so every value is normalised to one of the two labels
target = ["<=50K" if '=' in i else ">50K" for i in dataset.data.targets['income']]
# The outcome column is appended so that rows removed during preprocessing take
# their label with them; the labels are paired with the rows by position
features['outcome'] = pd.Series(target, index=features.index)
# Checking the data
features.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,outcome
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [123]:
# Printing the dtype and the non-null count of every column
features.info()
# Calculating the descriptive statistics of the numeric columns
# (kept as the last expression so the notebook renders the table)
descriptiveStats = features.describe()
descriptiveStats

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  outcome         48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [124]:
# Counting the missing (null/NaN) entries in every column
null_Count = features.isna().sum()
null_Count

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
outcome             0
dtype: int64

In [125]:
# Printing the name of every constant column, i.e. a column that holds a single
# distinct value (a missing value counts as a value of its own)
for col in features.columns:
    distinctValues = features[col].nunique(dropna=False)
    if distinctValues != 1:
        continue
    print(col)

In [126]:
# Removes every row that has at least one null/NaN value.
# A new frame is built instead of dropping in place, so a frame that is still
# referenced elsewhere (e.g. by the loaded dataset object) is left untouched.
# reset_index() renumbers the rows from 0 and keeps the old row labels in a
# new 'index' column.
# NOTE(review): missing values spelled as the string '?' are not nulls and
# survive this step (a 'workclass_?' column appears after encoding) - confirm
# whether that is intended.
features = features.dropna().reset_index()
features

Unnamed: 0,index,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,outcome
0,0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47616,48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K
47617,48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
47618,48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
47619,48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


In [127]:
# Collects the names of the text (object dtype) columns, leaving out the label column
objectCols = features.select_dtypes(include=['object']).columns
categoricalCols = [name for name in objectCols if name != 'outcome']
# One hot encodes those columns: one indicator column per category
categoricalColsEncoded = pd.get_dummies(features[categoricalCols])
print(categoricalColsEncoded)
# Places the indicator columns next to the original columns
features = pd.concat([features, categoricalColsEncoded], axis=1)
print(features)
# Discards the raw text columns together with the 'index' column produced by reset_index
features = features.drop(columns=categoricalCols + ['index'])
features

       workclass_?  workclass_Federal-gov  workclass_Local-gov  \
0            False                  False                False   
1            False                  False                False   
2            False                  False                False   
3            False                  False                False   
4            False                  False                False   
...            ...                    ...                  ...   
47616        False                  False                False   
47617        False                  False                False   
47618        False                  False                False   
47619        False                  False                False   
47620        False                  False                False   

       workclass_Never-worked  workclass_Private  workclass_Self-emp-inc  \
0                       False              False                   False   
1                       False              False       

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,outcome,workclass_?,workclass_Federal-gov,workclass_Local-gov,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,13,2174,0,40,<=50K,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,50,83311,13,0,0,13,<=50K,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,38,215646,9,0,0,40,<=50K,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,53,234721,7,0,0,40,<=50K,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,28,338409,13,0,0,40,<=50K,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47616,33,245211,13,0,0,40,<=50K,False,False,False,...,False,False,False,False,False,False,False,True,False,False
47617,39,215419,13,0,0,36,<=50K,False,False,False,...,False,False,False,False,False,False,False,True,False,False
47618,38,374983,13,0,0,50,<=50K,False,False,False,...,False,False,False,False,False,False,False,True,False,False
47619,44,83891,13,5455,0,40,<=50K,False,False,False,...,False,False,False,False,False,False,False,True,False,False


## Training the model

In [128]:
# Dividing the dataset into training data (70%) and testing data (30%);
# random_state is fixed so the same split is produced on every run
predictors = features.drop(columns='outcome')
labels = features['outcome']
trainX, testX, trainY, testY = train_test_split(
    predictors, labels, test_size=0.3, random_state=17
)

In [129]:
# CategoricalNB reads every value as a category code in 0..n-1 and sizes its
# per-feature tables from the largest code seen while fitting; predicting on a
# row that holds a larger code fails with an IndexError. Giving the largest code
# of the whole frame (+1) as min_categories keeps the tables large enough for
# every row, whichever side of the split it landed on.
# NOTE(review): the numeric columns (age, fnlwgt, ...) are used as category
# codes here, not as continuous values - confirm this is intended.
categoryCounts = features.drop('outcome', axis=1).astype('int64').max().to_numpy() + 1
# Creating an object for the naive bayes classifier
naiveBayes = CategoricalNB(min_categories=categoryCounts)
# Fitting the model with the training data
naiveBayes.fit(trainX, trainY)

In [130]:
# Predicting the labels of the testing data
predY = naiveBayes.predict(testX)

# Calculating accuracy and f1-score ('>50K' is treated as the positive class)
print("Accuracy for the model is: ",round(accuracy_score(testY,predY)*100,2),"%")
print("F1-Score for the model is: ",round(f1_score(testY,predY,pos_label='>50K')*100,2),"%")
# Calculating the confusion matrix.
# The label order is passed explicitly so the matrix is always 2x2 with
# '<=50K' (negative) first and '>50K' (positive) second, matching pos_label above
confusionMatrix = confusion_matrix(testY,predY,labels=['<=50K','>50K'])
print("Confusion matrix:",confusionMatrix)
# Splitting the data from the confusion matrix:
# with the ordering above the flattened matrix reads tn, fp, fn, tp
tn, fp, fn, tp = confusionMatrix.ravel()
# Sensitivity: share of the actual '>50K' rows that were predicted '>50K'
senstivity = tp / (tp+fn)
# Specificity: share of the actual '<=50K' rows that were predicted '<=50K'
specifity = tn / (tn+fp)
# Printing sensitivity and specificity
print("Senstivity: ", senstivity) 
print("Specifity: ", specifity)
# Printing classification report for the prediction
print(classification_report(testY,predY))

Accuracy for the model is:  83.47 %
F1-Score for the model is:  68.52 %
Confusion matrix: [[9356 1456]
 [ 905 2570]]
Senstivity:  0.739568345323741
Specifity:  0.8653348131705513
              precision    recall  f1-score   support

       <=50K       0.91      0.87      0.89     10812
        >50K       0.64      0.74      0.69      3475

    accuracy                           0.83     14287
   macro avg       0.78      0.80      0.79     14287
weighted avg       0.85      0.83      0.84     14287



In [131]:
# Fetching class names so we can determine the column index of each class
classes = naiveBayes.classes_
print(classes)
# Calculating the posterior probability of every class for every row of the
# dataset (the rows used for training are included)
posteriorProb = naiveBayes.predict_proba(features.drop('outcome',axis=1))
# Looking up the column of '>50K' in the model's classes instead of assuming its position
highIncomeCol = list(classes).index('>50K')
# Showing the posterior probability of '>50K' for each row
posteriorProb[:,highIncomeCol]

['<=50K' '>50K']


array([4.70059607e-04, 9.94782464e-01, 1.09310352e-04, ...,
       9.79955738e-01, 3.62682827e-04, 9.98904507e-01])