# CS5830 Project 5: Naive Bayes

In [143]:
import pandas as pd
import numpy as np
import seaborn as sns

from ucimlrepo import fetch_ucirepo # Dataset
from mrmr import mrmr_classif # Feature Selection
from sklearn.preprocessing import StandardScaler # Data standardization
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Models
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

## Fetch and Clean the Data

In [144]:
adult = fetch_ucirepo(id=2)
X = adult.data.features
y = adult.data.targets

# Convert <=50k to 0 and >50k to 1 (dataset also has a . at the end of some values, which we need to account for)
print(y['income'].unique())
y['income'] = y['income'].map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1})
print(y['income'].unique())

df = pd.concat([X, y], axis=1)

df = df.replace('?', np.nan)
df = df.dropna()
df.head()

['<=50K' '>50K' '<=50K.' '>50K.']
[0 1]


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [145]:
"""
Declare categorical and numerical features
"""
adult_feats = adult.variables
# Remove role "Target"
adult_feats = adult_feats[adult_feats["role"] != "Target"]
display(adult_feats)

categorical_feats = adult_feats[(adult_feats["type"] == "Categorical") | (adult_feats["type"] == "Binary")]['name'].tolist()
numerical_feats = adult_feats[adult_feats["type"] == "Integer"]['name'].tolist()

print(f'Categorical: {categorical_feats}')
print(f'Numerical: {numerical_feats}')

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,age,Feature,Integer,Age,,,no
1,workclass,Feature,Categorical,Income,"Private, Self-emp-not-inc, Self-emp-inc, Feder...",,yes
2,fnlwgt,Feature,Integer,,,,no
3,education,Feature,Categorical,Education Level,"Bachelors, Some-college, 11th, HS-grad, Prof-...",,no
4,education-num,Feature,Integer,Education Level,,,no
5,marital-status,Feature,Categorical,Other,"Married-civ-spouse, Divorced, Never-married, S...",,no
6,occupation,Feature,Categorical,Other,"Tech-support, Craft-repair, Other-service, Sal...",,yes
7,relationship,Feature,Categorical,Other,"Wife, Own-child, Husband, Not-in-family, Other...",,no
8,race,Feature,Categorical,Race,"White, Asian-Pac-Islander, Amer-Indian-Eskimo,...",,no
9,sex,Feature,Binary,Sex,"Female, Male.",,no


Categorical: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
Numerical: ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [146]:
"""
Prepare categorical features
"""
# one-hot encode the categorical features
df = pd.get_dummies(df, columns=categorical_feats, drop_first=True)

X = df.drop('income', axis=1)
y = df['income']

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Scale the numerical features
scaler = StandardScaler()
X_train[numerical_feats] = scaler.fit_transform(X_train[numerical_feats])
X_test[numerical_feats] = scaler.transform(X_test[numerical_feats])

In [147]:
""" Now with all features cleaned, we can perform mRMR on the data"""
# selected_feats = mrmr_classif(X=df.iloc[:, :-1], y=df.iloc[:, -1], K=len(df.columns)-1)
# selected_feats

' Now with all features cleaned, we can perform mRMR on the data'

## Naive Bayes Classification

In [148]:
bool_feats = X.select_dtypes(include='bool').columns

bnb = BernoulliNB()
X_train_bnb = X_train[bool_feats]
X_test_bnb = X_test[bool_feats]
bnb.fit(X_train_bnb, y_train)
y_pred = bnb.predict(X_test_bnb)
bnb_scores = precision_recall_fscore_support(y_test, y_pred, average='binary')

gnb = GaussianNB()
X_train_gnb = X_train.drop(columns=bool_feats)
X_test_gnb = X_test.drop(columns=bool_feats)
gnb.fit(X_train_gnb, y_train)
y_pred = gnb.predict(X_test_gnb)
gnb_scores = precision_recall_fscore_support(y_test, y_pred, average='binary')

print(f'Naive Bayes (Bernoulli) Scores: {bnb_scores}')
print(f'Naive Bayes (Gaussian) Scores: {gnb_scores}')

Naive Bayes (Bernoulli) Scores: (0.5246263170791473, 0.7831016825164594, 0.6283198826118855, None)
Naive Bayes (Gaussian) Scores: (0.6467391304347826, 0.30468178493050474, 0.4142217802088513, None)


In [149]:
# Combine test results from the two models using a simple logical AND
y_pred = np.logical_or(y_pred, bnb.predict(X_test[bool_feats]))
combined_scores = precision_recall_fscore_support(y_test, y_pred, average='binary')

print(f'Combined Scores: {combined_scores}')

Combined Scores: (0.5089794130530004, 0.8500365764447696, 0.6367123287671232, None)
