In [322]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression

### Loading the cleaned dataset

In [323]:
data: pd.DataFrame = load_dataset('../data/assignment2_income_cleaned.xlsx')

In [324]:
# Splitting the data into features (X) and target (y)
X, y = get_features_and_target(data, 'income')
columns_to_exclude = ['sex', 'ability to speak english', 'gave birth this year']
# Encoding the features and target, and excluding some columns
X_encoded, y_encoded = encode_all_features(X, y, columns_to_exclude)
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

  y = column_or_1d(y, warn=True)


In [ ]:
X_train

### Model

In [326]:
# Logistic Regression model
lr_model = LogisticRegression(max_iter=2000)
lr_model.fit(X_train, y_train)

In [327]:
# Predictions
lr_preds = lr_model.predict(X_test)
# Accuracy evaluation
lr_accuracy = accuracy_score(y_test, lr_preds)

print(classification_report(y_test, lr_preds))
print("Logistic Regression Accuracy:", lr_accuracy)

              precision    recall  f1-score   support

           0       0.80      0.87      0.83      1175
           1       0.71      0.59      0.64       625

    accuracy                           0.77      1800
   macro avg       0.75      0.73      0.74      1800
weighted avg       0.77      0.77      0.77      1800

Logistic Regression Accuracy: 0.7716666666666666


### Feature Importance using the model itself

In [328]:
# Get the coefficients of the logistic regression model
feature_importance_scores = lr_model.coef_[0]

# Create a DataFrame to store the feature importance scores
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': feature_importance_scores
})

# Sort the DataFrame by the absolute value of the coefficients
feature_importance_df['Absolute_Coefficient'] = feature_importance_df['Coefficient'].abs()
feature_importance_df = feature_importance_df.sort_values(by='Absolute_Coefficient', ascending=False).drop(columns='Absolute_Coefficient')

In [329]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
16,workclass_no paid work,-0.667489
11,marital status_Never married,-0.651976
10,marital status_Husband,0.640166
15,workclass_governmental,0.392334
7,sex_Female,-0.350035
13,marital status_Widowed,0.34127
8,sex_Male,0.323939
17,workclass_private,0.309997
12,marital status_Separated,-0.283289
1,education,0.277633


In [330]:
from sklearn.inspection import permutation_importance
# https://stackoverflow.com/questions/34052115/how-to-find-the-importance-of-the-features-for-a-logistic-regression-model
model_fi = permutation_importance(lr_model, X_encoded, y_encoded, n_repeats=30, random_state=0)

In [331]:
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model_fi['importances_mean']
})
feature_importance_df = feature_importance_df.sort_values(by='Coefficient', ascending=False)

In [332]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
1,education,0.066919
2,workinghours,0.042156
0,age,0.032967
10,marital status_Husband,0.012896
11,marital status_Never married,0.007193
15,workclass_governmental,0.003004
17,workclass_private,0.002559
8,sex_Male,0.00177
7,sex_Female,0.001415
4,occupation_encoded,0.001074


In [333]:
# load the test dataset
# test_data = load_dataset('../data/assignment2_test.xlsx')

In [334]:
# test_predictions = lr_model.predict(test_data)

In [335]:
# test_predictions

### Saving the model

In [336]:
save_model(lr_model, '../output/saved_models/logistic_regression_model.joblib')