In [279]:
from helper_functions import load_dataset
from typing import List, Tuple

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

### Loading the cleaned dataset

In [280]:
# Load the cleaned income dataset.
# Every later cell needs `data`, so fail fast when the file is missing.
# Printing a message and carrying on with `data = None` would only surface
# later as an unrelated AttributeError at the first `data.drop(...)` call.
try:
    data: pd.DataFrame = load_dataset('../data/assignment2_income_cleaned.xlsx')
except FileNotFoundError as exc:
    # The path is relative, so it is resolved against the kernel's working directory.
    raise FileNotFoundError(
        "File not found: '../data/assignment2_income_cleaned.xlsx'"
    ) from exc

In [281]:
# Encoding categorical variables
# label_encoder = LabelEncoder()
# data['sex'] = label_encoder.fit_transform(data['sex'])

In [282]:
# Separate the prediction target from the predictors.
y = data['income']               # target column
X = data.drop('income', axis=1)  # every remaining column is treated as a feature

In [283]:
# Integer-encode the categorical feature columns and the target.
#
# Each feature column gets its own LabelEncoder, kept in `feature_encoders`,
# so the integer codes can be mapped back to the original categories with
# `feature_encoders[col].inverse_transform(...)`. Re-fitting a single shared
# encoder would discard every mapping except the last one fitted.
#
# NOTE(review): LabelEncoder assigns integers in sorted-label order, which
# imposes an arbitrary ordering on nominal categories. Linear and
# distance-based models may be better served by one-hot encoding; confirm
# whether that matters for this analysis.
CATEGORICAL_COLUMNS = [
    'workclass',
    'education',
    'marital status',
    'occupation',
    'sex',
    'gave birth this year',
]

# Despite the names, `X_train` / `y_train` hold the FULL encoded dataset here;
# the train/test split cell further down narrows them to the training portion.
X_train = X.copy()  # work on a copy so the raw feature frame `X` stays untouched
feature_encoders = {}
for col in CATEGORICAL_COLUMNS:
    feature_encoders[col] = LabelEncoder()
    X_train[col] = feature_encoders[col].fit_transform(X_train[col])

# `encoder` is fitted on the target only, so `encoder.classes_` gives the
# income label behind each integer class.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y)

### Models

In [284]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE: this cell overwrites its own inputs (`X_train`, `y_train`), so running it
# a second time without re-running the encoding cell splits the training subset again.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [285]:
# Logistic regression classifier, allowed up to 1000 solver iterations.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X=X_train, y=y_train)

In [286]:
# k-nearest-neighbours classifier with the library's default settings.
knn_model = KNeighborsClassifier()
knn_model.fit(X=X_train, y=y_train)

In [287]:
# Gaussian naive Bayes classifier with the library's default settings.
nb_model = GaussianNB()
nb_model.fit(X=X_train, y=y_train)

In [288]:
# Decision Tree model
# Seeded so the fitted tree, and therefore its reported metrics, are reproducible
# across runs. scikit-learn permutes candidate features at each split, so an
# unseeded tree can differ between runs when several splits are equally good.
# The seed matches the one used for the train/test split.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

In [289]:
# Score the held-out test features with each fitted model, in the same order
# the models were trained.
lr_preds, knn_preds, nb_preds, dt_preds = (
    fitted.predict(X_test)
    for fitted in (lr_model, knn_model, nb_model, dt_model)
)

In [290]:
# Fraction of test rows each model labelled correctly.
lr_accuracy, knn_accuracy, nb_accuracy, dt_accuracy = (
    accuracy_score(y_test, predicted)
    for predicted in (lr_preds, knn_preds, nb_preds, dt_preds)
)

In [291]:
# Per-class precision / recall / F1 for logistic regression on the test split.
print(classification_report(y_true=y_test, y_pred=lr_preds))

              precision    recall  f1-score   support

           0       0.71      0.53      0.61       625
           1       0.78      0.89      0.83      1175

    accuracy                           0.76      1800
   macro avg       0.75      0.71      0.72      1800
weighted avg       0.76      0.76      0.75      1800


In [292]:
# Per-class precision / recall / F1 for k-nearest neighbours on the test split.
print(classification_report(y_true=y_test, y_pred=knn_preds))

              precision    recall  f1-score   support

           0       0.63      0.60      0.61       625
           1       0.79      0.81      0.80      1175

    accuracy                           0.74      1800
   macro avg       0.71      0.71      0.71      1800
weighted avg       0.74      0.74      0.74      1800


In [293]:
# Per-class precision / recall / F1 for Gaussian naive Bayes on the test split.
print(classification_report(y_true=y_test, y_pred=nb_preds))

              precision    recall  f1-score   support

           0       0.55      0.75      0.64       625
           1       0.84      0.67      0.75      1175

    accuracy                           0.70      1800
   macro avg       0.69      0.71      0.69      1800
weighted avg       0.74      0.70      0.71      1800


In [294]:
# Per-class precision / recall / F1 for the decision tree on the test split.
print(classification_report(y_true=y_test, y_pred=dt_preds))

              precision    recall  f1-score   support

           0       0.57      0.62      0.60       625
           1       0.79      0.75      0.77      1175

    accuracy                           0.71      1800
   macro avg       0.68      0.69      0.68      1800
weighted avg       0.72      0.71      0.71      1800


In [295]:
# Side-by-side accuracy summary, one line per model.
for label, score in [
    ("Logistic Regression Accuracy:", lr_accuracy),
    ("K-Nearest Neighbors Accuracy:", knn_accuracy),
    ("Naive Bayes Accuracy:", nb_accuracy),
    ("Decision Tree Accuracy:", dt_accuracy),
]:
    print(label, score)

Logistic Regression Accuracy: 0.7622222222222222
K-Nearest Neighbors Accuracy: 0.7377777777777778
Naive Bayes Accuracy: 0.7011111111111111
Decision Tree Accuracy: 0.7088888888888889
