In [13]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from sklearn.naive_bayes import GaussianNB

### Loading the cleaned dataset

In [14]:
# Read the cleaned income data set via the project's load_dataset helper.
cleaned_dataset_path = '../data/assignment2_income_cleaned.xlsx'
data = load_dataset(cleaned_dataset_path)

In [15]:
# Separate the target from the predictors: y is the 'income' column,
# X is every other column of the loaded frame.
y = data['income']
X = data.drop('income', axis=1)

In [16]:
# Label-encode the categorical feature columns on a copy of X, so the frame
# built from the loaded data is left untouched.
# NOTE: despite their names, X_train / y_train hold the *full* data set at this
# point; the train/test split cell further down rebinds them.
categorical_columns = [
    'workclass',
    'education',
    'marital status',
    'occupation',
    'sex',
    'gave birth this year',
]

# Keep one fitted encoder per column. A single shared LabelEncoder is re-fitted
# on every fit_transform call, so only the last mapping would survive and the
# feature encodings could neither be decoded nor re-applied to new rows.
feature_encoders = {}
X_train = X.copy()
for column in categorical_columns:
    feature_encoders[column] = LabelEncoder()
    X_train[column] = feature_encoders[column].fit_transform(X_train[column])

# `encoder` stays fitted on the target, so encoder.inverse_transform can map
# the integer class ids back to the original income labels.
# fit_transform returns a new array and does not modify y, so no copy is needed.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y)

### Models

In [17]:
# Hold out 20% of the encoded rows for evaluation; random_state pins the shuffle.
# NOTE: this cell rebinds X_train / y_train, which are also its inputs, so
# re-running it without re-running the encoding cell first splits the already
# reduced training set again.
split_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [18]:
# Fit a Gaussian Naive Bayes classifier on the training split.
nb_model = GaussianNB()
nb_model.fit(X=X_train, y=y_train)

In [19]:
# Predict the encoded income class for every row of the held-out split.
nb_preds = nb_model.predict(X=X_test)

In [20]:
# Fraction of held-out rows whose predicted class matches the true class.
nb_accuracy = accuracy_score(y_true=y_test, y_pred=nb_preds)

In [21]:
# Per-class precision / recall / F1 on the held-out split.
nb_report = classification_report(y_true=y_test, y_pred=nb_preds)
print(nb_report)

              precision    recall  f1-score   support

           0       0.55      0.75      0.64       625
           1       0.84      0.67      0.75      1175

    accuracy                           0.70      1800
   macro avg       0.69      0.71      0.69      1800
weighted avg       0.74      0.70      0.71      1800


In [22]:
# Report the overall accuracy computed above.
accuracy_label = "Naive Bayes Accuracy:"
print(accuracy_label, nb_accuracy)

Naive Bayes Accuracy: 0.7011111111111111


In [23]:
# Persist the fitted classifier through the project's save_model helper.
nb_model_path = '../output/saved_models/naive_bayes_model.joblib'
save_model(nb_model, nb_model_path)