In [None]:
from typing import List, Tuple

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

from helper_functions import load_dataset

### Loading the cleaned dataset

In [ ]:
# Load the cleaned income-levels dataset into `data`.
# NOTE(review): `load_dataset` is a project helper; it is annotated here as
# returning a pandas DataFrame -- confirm against helper_functions.
try:
    data: pd.DataFrame = load_dataset('../data/assignment2_income_levels_cleaned.xlsx')
except FileNotFoundError as err:
    # Fail fast: continuing with `data = None` only defers the failure to the
    # next cell, where it surfaces as an unrelated "NoneType is not
    # subscriptable" TypeError.
    raise FileNotFoundError(
        "File not found: '../data/assignment2_income_levels_cleaned.xlsx'. "
        "Check the path relative to the notebook's working directory."
    ) from err

In [ ]:
# Convert the categorical 'sex' column to integer codes.
# The fitted encoder is kept in `label_encoder` so the mapping stays available.
label_encoder = LabelEncoder()
label_encoder.fit(data['sex'])
data['sex'] = label_encoder.transform(data['sex'])

In [ ]:
# Separate the target column ('income') from the feature columns.
# `drop` returns a new frame, so `data` itself is left untouched.
y = data['income']
X = data.drop('income', axis=1)

### Models

In [ ]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [ ]:
# Train four classifiers on the same split and compare their test accuracy.

# Logistic Regression model.
# The target is treated as a class label (every other model here is a
# classifier and the metric is accuracy_score). LinearRegression emits
# continuous values, which accuracy_score rejects, so the linear model used
# for this task is its classification counterpart.
# max_iter is raised above the default so the solver has room to converge on
# unscaled features.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# K-Nearest Neighbors model
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

# Naive Bayes model (Gaussian)
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Decision Tree model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

# Predictions on the held-out test set
lr_preds = lr_model.predict(X_test)
knn_preds = knn_model.predict(X_test)
nb_preds = nb_model.predict(X_test)
dt_preds = dt_model.predict(X_test)

# Accuracy evaluation: fraction of test rows whose predicted label matches
lr_accuracy = accuracy_score(y_test, lr_preds)
knn_accuracy = accuracy_score(y_test, knn_preds)
nb_accuracy = accuracy_score(y_test, nb_preds)
dt_accuracy = accuracy_score(y_test, dt_preds)

print("Logistic Regression Accuracy:", lr_accuracy)
print("K-Nearest Neighbors Accuracy:", knn_accuracy)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Decision Tree Accuracy:", dt_accuracy)