In [1]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression

### Loading the cleaned dataset

In [2]:
data: pd.DataFrame = load_dataset('../data/assignment2_income_cleaned.xlsx')

In [3]:
# Splitting the data into features (X) and target (y)
X, y = get_features_and_target(data, 'income')
columns_to_exclude = ['sex', 'ability to speak english', 'gave birth this year']
# Encoding the features and target, and excluding some columns
X_encoded, y_encoded = encode_all_features(X, y, columns_to_exclude)
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [4]:
X_train

Unnamed: 0,age,education,workinghours,marital status_Divorced,marital status_Husband,marital status_Never married,marital status_Separated,marital status_Widowed,marital status_Wife,occupation_Construction/Extraction,...,occupation_Protective Services,occupation_Repair/Maintenance,occupation_Sales,"occupation_Science, Engineering, Technology",occupation_Service/Hospitality,occupation_Transport,workclass_governmental,workclass_no paid work,workclass_private,workclass_self employed
6317,22,16,36,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
740,61,22,40,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3781,48,16,40,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7850,62,18,65,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2963,53,19,44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,22,19,25,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5191,24,16,28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5390,35,16,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
860,23,20,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


### Model

In [5]:
# Logistic Regression model
lr_model = LogisticRegression(max_iter=2000)
lr_model.fit(X_train, y_train)

In [6]:
# Predictions
lr_preds = lr_model.predict(X_test)
# Accuracy evaluation
lr_accuracy = accuracy_score(y_test, lr_preds)

print(classification_report(y_test, lr_preds))
print("Logistic Regression Accuracy:", lr_accuracy)

              precision    recall  f1-score   support

           0       0.81      0.87      0.84      1175
           1       0.72      0.63      0.67       625

    accuracy                           0.79      1800
   macro avg       0.77      0.75      0.76      1800
weighted avg       0.78      0.79      0.78      1800

Logistic Regression Accuracy: 0.7855555555555556


### Feature Importance using the model itself

In [7]:
# Get the coefficients of the logistic regression model
feature_importance_scores = lr_model.coef_[0]

# Create a DataFrame to store the feature importance scores
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': feature_importance_scores
})

# Sort the DataFrame by the absolute value of the coefficients
feature_importance_df['Absolute_Coefficient'] = feature_importance_df['Coefficient'].abs()
feature_importance_df = feature_importance_df.sort_values(by='Absolute_Coefficient', ascending=False).drop(columns='Absolute_Coefficient')

In [8]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
25,occupation_Service/Hospitality,-1.5942
24,"occupation_Science, Engineering, Technology",1.222623
14,occupation_Finance/Accounting,0.881249
28,workclass_no paid work,-0.747349
17,occupation_Management/Business,0.741893
16,occupation_Legal Services,0.716917
4,marital status_Husband,0.714221
19,occupation_Office/Administrative Support,-0.674846
5,marital status_Never married,-0.656019
18,occupation_Military Services,0.598502


In [9]:
from sklearn.inspection import permutation_importance
# https://stackoverflow.com/questions/34052115/how-to-find-the-importance-of-the-features-for-a-logistic-regression-model
# https://scikit-learn.org/stable/modules/permutation_importance.html
model_fi = permutation_importance(lr_model, X_encoded, y_encoded, n_repeats=30, random_state=0)

In [10]:
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model_fi['importances_mean']
})
feature_importance_df = feature_importance_df.sort_values(by='Coefficient', ascending=False)

In [11]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
2,workinghours,0.038441
1,education,0.03817
0,age,0.02883
4,marital status_Husband,0.015278
25,occupation_Service/Hospitality,0.011411
24,"occupation_Science, Engineering, Technology",0.008152
17,occupation_Management/Business,0.007022
5,marital status_Never married,0.00637
19,occupation_Office/Administrative Support,0.003052
8,marital status_Wife,0.003011


In [ ]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

knn = KNeighborsClassifier()
rfecv = RFECV(knn, cv=StratifiedKFold(), scoring='accuracy')

rfecv.fit(X_train, y_train)

selected_features = rfecv.support_
feature_ranking = rfecv.ranking_

In [12]:
# load the test dataset
# test_data = load_dataset('../data/assignment2_test.xlsx')

In [13]:
# test_predictions = lr_model.predict(test_data)

In [14]:
# test_predictions

### Saving the model

In [15]:
save_model(lr_model, '../output/saved_models/logistic_regression_model.joblib')