# CS5830 Project 5: Naive Bayes

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from ucimlrepo import fetch_ucirepo # Dataset
from mrmr import mrmr_classif # Feature Selection
from sklearn.preprocessing import StandardScaler # Data standardization
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Models
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

## Fetch and Clean the Data

In [None]:
# Download the UCI "Adult" dataset (repository id 2).
adult = fetch_ucirepo(id=2)

# Take explicit copies: the income column is rewritten below, and assigning into the
# frames handed back by the dataset object would modify its data in place (and makes
# pandas emit a chained-assignment warning when the frame is a column subset).
X = adult.data.features.copy()
y = adult.data.targets.copy()

# Convert <=50K to 0 and >50K to 1. Some labels carry a trailing '.', so both spellings
# are listed. Any label not listed here becomes NaN and is removed by dropna() below.
income_codes = {'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1}
print(y['income'].unique())
y['income'] = y['income'].map(income_codes)
print(y['income'].unique())

df = pd.concat([X, y], axis=1)

# '?' is used as a missing-value marker; turn it into NaN and drop incomplete rows.
df = df.replace('?', np.nan)
df = df.dropna()
df.head()

In [None]:
"""
Declare categorical and numerical features
"""
# Variable metadata table; keep every row whose role is not "Target"
all_vars = adult.variables
adult_feats = all_vars[all_vars["role"] != "Target"]
display(adult_feats)

# Split the feature names by their declared type:
# "Categorical"/"Binary" -> categorical, "Integer" -> numerical
feat_type = adult_feats["type"]
categorical_feats = adult_feats.loc[feat_type.isin(["Categorical", "Binary"]), "name"].tolist()
numerical_feats = adult_feats.loc[feat_type == "Integer", "name"].tolist()

print(f'Categorical: {categorical_feats}')
print(f'Numerical: {numerical_feats}')

In [None]:
"""
Prepare categorical features
"""
# One-hot encode the categorical features (first level of each dropped).
# dtype=bool is requested explicitly so the dummy columns are boolean on every pandas
# version (older releases default to uint8); the Bernoulli model cell picks its inputs
# with select_dtypes(include='bool').
df = pd.get_dummies(df, columns=categorical_feats, drop_first=True, dtype=bool)

X = df.drop('income', axis=1)
y = df['income']

# Fixed seed so the split, and therefore the selected features and reported scores,
# are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Work on explicit copies so the column assignments below never write into slices of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale the numerical features: statistics are fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train[numerical_feats] = scaler.fit_transform(X_train[numerical_feats])
X_test[numerical_feats] = scaler.transform(X_test[numerical_feats])

In [None]:
""" Now with all features cleaned, we can perform mRMR on the data"""
# Run mRMR on the training split only, asking for K=20 features
selected_feats = mrmr_classif(X=X_train, y=y_train, K=20)
display(selected_feats)

# Restrict both splits to the selected columns
X_train, X_test = X_train.loc[:, selected_feats], X_test.loc[:, selected_feats]

X_train.head()

## Naive Bayes Classification

In [None]:
X_train.info()

# Boolean (one-hot) columns go to the Bernoulli model, every other column to the Gaussian one
bool_feats = X_train.select_dtypes(include='bool').columns
X_train_bnb, X_test_bnb = X_train[bool_feats], X_test[bool_feats]
X_train_gnb, X_test_gnb = X_train.drop(columns=bool_feats), X_test.drop(columns=bool_feats)

# Bernoulli Naive Bayes on the boolean columns
bnb = BernoulliNB().fit(X_train_bnb, y_train)
y_pred = bnb.predict(X_test_bnb)
bnb_scores = precision_recall_fscore_support(y_test, y_pred, average='binary')

# Gaussian Naive Bayes on the remaining columns.
# y_pred is left holding the Gaussian predictions when this cell finishes.
gnb = GaussianNB().fit(X_train_gnb, y_train)
y_pred = gnb.predict(X_test_gnb)
gnb_scores = precision_recall_fscore_support(y_test, y_pred, average='binary')

print(f'Naive Bayes (Bernoulli) Scores: precision={bnb_scores[0]}, recall={bnb_scores[1]}, f1={bnb_scores[2]}')
print(f'Naive Bayes (Gaussian) Scores: precision={gnb_scores[0]}, recall={gnb_scores[1]}, f1={gnb_scores[2]}')

In [None]:
# Combine test results from the two models using a simple logical OR:
# a sample is predicted as class 1 when either model predicts class 1.
# Both predictions are recomputed here instead of reading the shared y_pred variable,
# so re-running this cell always gives the same result (previously each re-run OR-ed
# the Bernoulli predictions into the already-combined y_pred).
gnb_pred = gnb.predict(X_test_gnb)
bnb_pred = bnb.predict(X_test_bnb)

# logical_or yields booleans; cast back to 0/1 integers to match the labels in y_test
y_pred = np.logical_or(gnb_pred, bnb_pred).astype(int)
combined_scores = precision_recall_fscore_support(y_test, y_pred, average='binary')

print(f'Combined Scores: precision={combined_scores[0]}, recall={combined_scores[1]}, f1={combined_scores[2]}')