In [65]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features
from helper.clfmodel_functions import tune_model, seq_feat_selection, multi_metric_cv, plot_multi_score_cv_results, forward_feat_selection_hypertuning
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression

### Loading the cleaned dataset

In [66]:
data: pd.DataFrame = load_dataset('../data/assignment2_income_cleaned.xlsx')

In [67]:
# Splitting the data into features (X) and target (y)
X, y = get_features_and_target(data, 'income')
columns_to_exclude = ['sex', 'ability to speak english', 'gave birth this year']
columns_to_exclude = []
# Encoding the features and target, and excluding some columns
X_encoded, y_encoded = encode_all_features(X, y, columns_to_exclude)
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [68]:
X_train

Unnamed: 0,age,education,workinghours,ability to speak english,gave birth this year_No,gave birth this year_Yes,sex_Female,sex_Male,workclass_governmental,workclass_no paid work,...,occupation_Sales,"occupation_Science, Engineering, Technology",occupation_Service/Hospitality,occupation_Transport,marital status_Divorced,marital status_Husband,marital status_Never married,marital status_Separated,marital status_Widowed,marital status_Wife
6317,22,16,36,0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
740,61,22,40,1,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3781,48,16,40,0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7850,62,18,65,0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2963,53,19,44,0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,22,19,25,0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5191,24,16,28,0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5390,35,16,40,0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
860,23,20,40,0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Model

In [69]:
# Logistic Regression model
lr_model = LogisticRegression(max_iter=2000)
lr_model.fit(X_train, y_train)

In [70]:
# Predictions
lr_preds = lr_model.predict(X_test)
# Accuracy evaluation
lr_accuracy = accuracy_score(y_test, lr_preds)

print(classification_report(y_test, lr_preds))
print("Logistic Regression Accuracy:", lr_accuracy)

              precision    recall  f1-score   support

           0       0.82      0.87      0.84      1175
           1       0.72      0.64      0.68       625

    accuracy                           0.79      1800
   macro avg       0.77      0.76      0.76      1800
weighted avg       0.79      0.79      0.79      1800

Logistic Regression Accuracy: 0.7911111111111111


### Feature Importance using the model itself

In [71]:
# Get the coefficients of the logistic regression model
feature_importance_scores = lr_model.coef_[0]

# Create a DataFrame to store the feature importance scores
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': feature_importance_scores
})

# Sort the DataFrame by the absolute value of the coefficients
feature_importance_df['Absolute_Coefficient'] = feature_importance_df['Coefficient'].abs()
feature_importance_df = feature_importance_df.sort_values(by='Absolute_Coefficient', ascending=False).drop(columns='Absolute_Coefficient')

In [72]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
28,occupation_Service/Hospitality,-1.540047
27,"occupation_Science, Engineering, Technology",1.224767
17,occupation_Finance/Accounting,0.971187
19,occupation_Legal Services,0.817
20,occupation_Management/Business,0.791085
16,"occupation_Farming, Fishing, Forestry",-0.641121
32,marital status_Never married,-0.612312
9,workclass_no paid work,-0.599867
21,occupation_Military Services,0.597584
31,marital status_Husband,0.582692


In [73]:
from sklearn.inspection import permutation_importance
# https://stackoverflow.com/questions/34052115/how-to-find-the-importance-of-the-features-for-a-logistic-regression-model
# https://scikit-learn.org/stable/modules/permutation_importance.html
model_fi = permutation_importance(lr_model, X_encoded, y_encoded, n_repeats=30, random_state=0)

In [74]:
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model_fi['importances_mean']
})
feature_importance_df = feature_importance_df.sort_values(by='Coefficient', ascending=False)

In [75]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
1,education,0.036122
2,workinghours,0.035159
0,age,0.027837
28,occupation_Service/Hospitality,0.010415
31,marital status_Husband,0.007744
27,"occupation_Science, Engineering, Technology",0.007148
20,occupation_Management/Business,0.00697
32,marital status_Never married,0.004722
8,workclass_governmental,0.003289
29,occupation_Transport,0.002159


### Hyperparameter Tuning

In [76]:
param_grid = {
    'penalty': ['l2'],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky'],
    'max_iter': [5000],
    'random_state': [42]
}

best_params, best_model, best_accuracy = tune_model(LogisticRegression(), X_train, y_train, X_test, y_test, param_grid)

print("\nBest Hyperparameters:", best_params)
print("Best Model:", best_model)
print("Best Model Accuracy:", best_accuracy)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END C=0.01, max_iter=5000, penalty=l2, random_state=42, solver=lbfgs; total time=   0.3s
[CV] END C=0.01, max_iter=5000, penalty=l2, random_state=42, solver=lbfgs; total time=   0.3s
[CV] END C=0.01, max_iter=5000, penalty=l2, random_state=42, solver=lbfgs; total time=   0.3s
[CV] END C=0.01, max_iter=5000, penalty=l2, random_state=42, solver=lbfgs; total time=   0.3s
[CV] END C=0.01, max_iter=5000, penalty=l2, random_state=42, solver=lbfgs; total time=   0.3s
[CV] END C=0.01, max_iter=5000, penalty=l2, random_state=42, solver=liblinear; total time=   0.0s
[CV] END C=0.01, max_iter=5000, penalty=l2, random_state=42, solver=liblinear; total time=   0.0s
[CV] END C=0.01, max_iter=5000, penalty=l2, random_state=42, solver=liblinear; total time=   0.0s
[CV] END C=0.01, max_iter=5000, penalty=l2, random_state=42, solver=liblinear; total time=   0.0s
[CV] END C=0.01, max_iter=5000, penalty=l2, random_state=42, solver=liblinear

In [77]:
# load the test dataset
# test_data = load_dataset('../data/assignment2_test.xlsx')

In [78]:
# test_predictions = lr_model.predict(test_data)

In [79]:
# test_predictions

### Saving the model

In [80]:
save_model(lr_model, '../output/saved_models/logistic_regression_model.joblib')