In [1]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

### Loading the cleaned dataset

In [2]:
# Read the cleaned income spreadsheet through the project's loader helper.
cleaned_dataset_path = '../data/assignment2_income_cleaned.xlsx'
data = load_dataset(cleaned_dataset_path)

In [3]:
# Separate the label column (y) from the predictor columns (X).
target_column = 'income'
y = data[target_column]
X = data.drop(columns=[target_column])

In [4]:
# Feature columns that are converted to integer codes before modelling.
categorical_columns = [
    'workclass',
    'education',
    'marital status',
    'occupation',
    'sex',
    'gave birth this year',
]

# Work on a copy of X so the original feature DataFrame is left unmodified.
# Despite its name, X_train still holds every row here; the train/test split
# happens later in the notebook.
X_train = X.copy()

# Fit a separate encoder for each column and keep it. Re-fitting one shared
# encoder would throw away every column's category -> code mapping, making it
# impossible to encode new data (e.g. a held-out test file) consistently.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder;
# OrdinalEncoder / OneHotEncoder are the feature-side equivalents. The codes
# produced here are kept identical to the previous behaviour on purpose.
feature_encoders = {}
for column in categorical_columns:
    column_encoder = LabelEncoder()
    X_train[column] = column_encoder.fit_transform(X_train[column])
    feature_encoders[column] = column_encoder

# `encoder` stays fitted on the target, so encoder.inverse_transform can map
# predicted codes back to the original income labels.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y)

### Models

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE: X_train / y_train are rebound from the full encoded data to the training
# portion, so re-running this cell on its own splits the already-reduced set again.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, **split_options)

In [6]:
# Build the logistic-regression classifier with an iteration cap of 1000,
# then fit it on the training portion.
lr_params = {'max_iter': 1000}
lr_model = LogisticRegression(**lr_params)
lr_model.fit(X_train, y_train)

In [7]:
# Predicted (encoded) income classes for the held-out test features.
lr_preds = lr_model.predict(X=X_test)

In [8]:
# Fraction of held-out rows whose predicted class matches the true class.
lr_accuracy = accuracy_score(y_true=y_test, y_pred=lr_preds)

In [9]:
# Per-class precision, recall and F1 on the held-out rows.
lr_report = classification_report(y_test, lr_preds)
print(lr_report)

              precision    recall  f1-score   support

           0       0.71      0.53      0.61       625
           1       0.78      0.89      0.83      1175

    accuracy                           0.76      1800
   macro avg       0.75      0.71      0.72      1800
weighted avg       0.76      0.76      0.75      1800


In [10]:
# Report the overall accuracy computed above.
accuracy_label = "Logistic Regression Accuracy:"
print(accuracy_label, lr_accuracy)

Logistic Regression Accuracy: 0.7622222222222222


In [11]:
# load the test dataset
# test_data = load_dataset('../data/assignment2_test.xlsx')

In [12]:
# NOTE(review): lr_model was fitted on integer-encoded feature columns, so the
# test data presumably needs the same preprocessing (same columns, categorical
# values encoded with the same mapping) before predict() can be called - confirm
# against the test file before enabling the line below.
# test_predictions = lr_model.predict(test_data)

In [13]:
# test_predictions

In [14]:
# Persist the fitted logistic-regression model through the project's save helper.
lr_model_path = '../output/saved_models/logistic_regression_model.joblib'
save_model(lr_model, lr_model_path)