In [190]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_nominal_features
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression

### Loading the cleaned dataset

In [191]:
data: pd.DataFrame = load_dataset('../data/assignment2_income_cleaned.xlsx')

In [192]:
# Splitting the data into features (X) and target (y)
X, y = get_features_and_target(data, 'income')

columns_to_exclude = ['sex', 'ability to speak english', 'gave birth this year']
X = X.drop(columns=columns_to_exclude)

# List of nominal features
nominal_features_lc = ['workclass', 'marital status'] # low cardinality features
nominal_features_hc = ['occupation'] # high cardinality features

# Encoded datasets
X_encoded = encode_nominal_features(X, nominal_features_lc, nominal_features_hc)
y_encoded = y.map({'low': 0, 'high': 1})

  y = column_or_1d(y, warn=True)


### Model

In [193]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [194]:
X_train

Unnamed: 0,age,education,workinghours,occupation_encoded,workclass_governmental,workclass_no paid work,workclass_private,workclass_self employed,marital status_Divorced,marital status_Husband,marital status_Never married,marital status_Separated,marital status_Widowed,marital status_Wife
6317,22,16,36,11,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
740,61,22,40,14,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3781,48,16,40,10,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7850,62,18,65,11,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2963,53,19,44,10,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,22,19,25,16,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5191,24,16,28,14,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5390,35,16,40,10,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
860,23,20,40,16,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [195]:
# Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

In [196]:
# Predictions
lr_preds = lr_model.predict(X_test)

In [197]:
# Accuracy evaluation
lr_accuracy = accuracy_score(y_test, lr_preds)

In [198]:
print(classification_report(y_test, lr_preds))

              precision    recall  f1-score   support

           0       0.79      0.86      0.83      1175
           1       0.70      0.58      0.63       625

    accuracy                           0.77      1800
   macro avg       0.75      0.72      0.73      1800
weighted avg       0.76      0.77      0.76      1800


In [199]:
print("Logistic Regression Accuracy:", lr_accuracy)

Logistic Regression Accuracy: 0.7661111111111111


### Feature Importance using the model itself

In [200]:
# Get the coefficients of the logistic regression model
feature_importance_scores = lr_model.coef_[0]

# Create a DataFrame to store the feature importance scores
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': feature_importance_scores
})

# Sort the DataFrame by the absolute value of the coefficients
feature_importance_df['Absolute_Coefficient'] = feature_importance_df['Coefficient'].abs()
feature_importance_df = feature_importance_df.sort_values(by='Absolute_Coefficient', ascending=False).drop(columns='Absolute_Coefficient')

In [201]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
9,marital status_Husband,0.955678
5,workclass_no paid work,-0.572826
10,marital status_Never married,-0.53749
13,marital status_Wife,-0.401363
4,workclass_governmental,0.381343
6,workclass_private,0.30904
12,marital status_Widowed,0.280982
1,education,0.272311
11,marital status_Separated,-0.256555
2,workinghours,0.057537


In [202]:
from sklearn.inspection import permutation_importance
# https://stackoverflow.com/questions/34052115/how-to-find-the-importance-of-the-features-for-a-logistic-regression-model
model_fi = permutation_importance(lr_model, X_encoded, y_encoded, n_repeats=30, random_state=0)

In [203]:
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model_fi['importances_mean']
})
feature_importance_df = feature_importance_df.sort_values(by='Coefficient', ascending=False)

In [204]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
1,education,0.065767
2,workinghours,0.045085
0,age,0.032219
9,marital status_Husband,0.030433
10,marital status_Never married,0.004952
4,workclass_governmental,0.002878
6,workclass_private,0.002333
13,marital status_Wife,0.000711
7,workclass_self employed,0.000393
12,marital status_Widowed,0.000344


In [205]:
# load the test dataset
# test_data = load_dataset('../data/assignment2_test.xlsx')

In [206]:
# test_predictions = lr_model.predict(test_data)

In [207]:
# test_predictions

In [208]:
save_model(lr_model, '../output/saved_models/logistic_regression_model.joblib')