In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features, get_train_test_with_excluded_columns
from helper.clfmodel_functions import tune_model, forward_feat_selection_hypertuning

from helper.fairness_functions import print_male_female_metrics, get_male_female_data

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.inspection import permutation_importance

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'
pip install 'aif360[OptimalTransport]'


### Loading the cleaned dataset

In [2]:
data: pd.DataFrame = load_dataset('../data/assignment2_income_cleaned.xlsx')

### Feature Engineering (encoding) & Train-Test Split

In [3]:
# Splitting the data into features (X) and target (y)
X, y = get_features_and_target(data, 'income')
X_male, X_female = get_male_female_data(X, False)
# Encoding the features and target
X_encoded, y_encoded = encode_all_features(X, y, [])
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [4]:
X_train

Unnamed: 0,age,education,workinghours,ability to speak english,marital status_Divorced,marital status_Husband,marital status_Never married,marital status_Separated,marital status_Widowed,marital status_Wife,...,occupation_Service/Hospitality,occupation_Transport,workclass_governmental,workclass_no paid work,workclass_private,workclass_self employed,gave birth this year_No,gave birth this year_Yes,sex_Female,sex_Male
6317,22,16,36,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
740,61,22,40,1,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1
3781,48,16,40,0,1,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
7850,62,18,65,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
2963,53,19,44,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,22,19,25,0,0,0,1,0,0,0,...,1,0,0,0,1,0,1,0,1,0
5191,24,16,28,0,0,0,1,0,0,0,...,0,0,0,0,1,0,1,0,1,0
5390,35,16,40,0,0,0,1,0,0,0,...,0,0,0,0,1,0,1,0,0,1
860,23,20,40,0,0,0,1,0,0,0,...,1,0,0,0,1,0,1,0,0,1


### Model

Here, we quickly train and evaluate a Logistic Regression model with random parameters for demonstration.

In [5]:
# Logistic Regression model
lr_model = LogisticRegression(max_iter=2000)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_preds)

print(classification_report(y_test, lr_preds))
print("Logistic Regression Accuracy:", lr_accuracy)

              precision    recall  f1-score   support

           0       0.82      0.87      0.85      1175
           1       0.73      0.64      0.68       625

    accuracy                           0.79      1800
   macro avg       0.77      0.76      0.76      1800
weighted avg       0.79      0.79      0.79      1800

Logistic Regression Accuracy: 0.7916666666666666


### Feature Importance using the model itself

In [6]:
# Get the coefficients of the logistic regression model
feature_importance_scores = lr_model.coef_[0]
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': feature_importance_scores
})
# We sort the DataFrame by the absolute value of the coefficients
feature_importance_df['Absolute_Coefficient'] = feature_importance_df['Coefficient'].abs()
feature_importance_df = feature_importance_df.sort_values(by='Absolute_Coefficient', ascending=False).drop(columns='Absolute_Coefficient')

In [7]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
26,occupation_Service/Hospitality,-1.550581
25,"occupation_Science, Engineering, Technology",1.215813
15,occupation_Finance/Accounting,0.958572
17,occupation_Legal Services,0.821199
18,occupation_Management/Business,0.787061
19,occupation_Military Services,0.65001
14,"occupation_Farming, Fishing, Forestry",-0.613917
6,marital status_Never married,-0.611398
5,marital status_Husband,0.581464
29,workclass_no paid work,-0.565824


In [8]:
# https://stackoverflow.com/questions/34052115/how-to-find-the-importance-of-the-features-for-a-logistic-regression-model
# https://scikit-learn.org/stable/modules/permutation_importance.html
model_fi = permutation_importance(lr_model, X_encoded, y_encoded, n_repeats=30, random_state=0)
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model_fi['importances_mean']
})
feature_importance_df = feature_importance_df.sort_values(by='Coefficient', ascending=False)

In [9]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
1,education,0.03631852
2,workinghours,0.03525185
0,age,0.02828519
26,occupation_Service/Hospitality,0.01062222
5,marital status_Husband,0.0077
25,"occupation_Science, Engineering, Technology",0.007111111
18,occupation_Management/Business,0.006966667
6,marital status_Never married,0.004874074
28,workclass_governmental,0.003388889
27,occupation_Transport,0.002066667


### Hyperparameter Tuning

Here, we define the base parameter grid for our hyperparameter tuning function.

In [10]:
param_grid = {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky'],
        'max_iter': [5000],
        'random_state': [42]
    }

Below, we tune the hyperparameters of the Logistic Regression model using the defined parameter grid and using all features.

In [11]:
if True:
    best_params, best_model, best_accuracy = tune_model(LogisticRegression(), X_train, y_train, X_test, y_test, param_grid)
    
    print("\nBest Hyperparameters:", best_params)
    print("Best Model:", best_model)
    print("Best Model Accuracy:", best_accuracy)
    
    lr_preds = best_model.predict(X_test)
    print(classification_report(y_test, lr_preds))
    
    print_male_female_metrics(best_model, X, X_male, X_female, X_test, y_test)

Fitting 5 folds for each of 16 candidates, totalling 80 fits

Best Hyperparameters: {'C': 1, 'max_iter': 5000, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs'}
Best Model: LogisticRegression(C=1, max_iter=5000, random_state=42)
Best Model Accuracy: 0.7916666666666666
              precision    recall  f1-score   support

           0       0.82      0.87      0.85      1175
           1       0.73      0.64      0.68       625

    accuracy                           0.79      1800
   macro avg       0.77      0.76      0.76      1800
weighted avg       0.79      0.79      0.79      1800


Male FPR: 0.17912552891396333
Male TPR: 0.7314629258517034
Female FPR: 0.0536480686695279
Female TPR: 0.29365079365079366

Disparate Impact (DI): 0.257
Discrimination Score (DS): -0.303
Equal Opportunity Difference (EO): 0.438
Equalized Odds (EOdds): 0.125


Below, we tune the hyperparameters of the Logistic Regression model using the defined parameter grid and using a **subset of features**. We exclude the columns 'age', 'ability to speak english', and 'workclass'.

In [12]:
if True:
    columns_to_exclude = ['age', 'ability to speak english', 'workclass']
    X_train_, X_test_, y_train_, y_test_, X_male, X_female = get_train_test_with_excluded_columns(data, columns_to_exclude)
    
    best_params, best_model, best_accuracy = tune_model(LogisticRegression(), X_train_, y_train_, X_test_, y_test_, param_grid)
    
    print("\nBest Hyperparameters:", best_params)
    print("Best Model:", best_model)
    print("Best Model Accuracy:", best_accuracy)
    
    lr_preds = best_model.predict(X_test_)
    print(classification_report(y_test_, lr_preds))
    
    print_male_female_metrics(best_model, X, X_male, X_female, X_test_, y_test_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits

Best Hyperparameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs'}
Best Model: LogisticRegression(C=0.1, max_iter=5000, random_state=42)
Best Model Accuracy: 0.7733333333333333
              precision    recall  f1-score   support

           0       0.80      0.87      0.83      1175
           1       0.71      0.59      0.64       625

    accuracy                           0.77      1800
   macro avg       0.75      0.73      0.74      1800
weighted avg       0.77      0.77      0.77      1800


Male FPR: 0.18053596614950634
Male TPR: 0.6753507014028056
Female FPR: 0.04721030042918455
Female TPR: 0.23809523809523808

Disparate Impact (DI): 0.228
Discrimination Score (DS): -0.297
Equal Opportunity Difference (EO): 0.437
Equalized Odds (EOdds): 0.133


Because hyperparameter tuning results can vary depending on the chosen subset of features, we will use forward feature selection with hyperparameter tuning to find the best subset of features and hyperparameters for the model.

In [13]:
# Forward feature selection with hyperparameter tuning
if True:
    best_subset, best_params, best_score = forward_feat_selection_hypertuning(LogisticRegression(), param_grid, X_train, y_train, X_test, y_test)
    
    print("Best subset of features:", best_subset)
    print("Best hyperparameters:", best_params)
    print("Best model accuracy:", best_score)
    
    # Use the best subset and best hyperparameters for final model
    final_model_allf_ffs = LogisticRegression(**best_params)
    final_model_allf_ffs.fit(X_train[best_subset], y_train)
    final_model_preds = final_model_allf_ffs.predict(X_test[best_subset])
    final_model_accuracy = accuracy_score(y_test, final_model_preds)
    
    print(classification_report(y_test, final_model_preds))
    
    print_male_female_metrics(final_model_allf_ffs, X, X_male, X_female, X_test[best_subset], y_test)

{'penalty': ['l2'], 'C': [0.01, 0.1, 1, 10], 'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky'], 'max_iter': [5000], 'random_state': [42]}
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best subset: ['occupation_Construction/Extraction', 'occupation_Counseling/Mental Health Services', 'occupation_Education', 'occupation_Entertainment', 'occupation_Farming, Fishing, Forestry', 'occupation_Finance/Accounting', 'occupation_Healthcare/Medical Services', 'occupation_Legal Services', '

Below we repeat the above process, but this time excluding the sex-related columns

In [14]:
# Forward feature selection with hyperparameter tuning
if True:
    columns_to_exclude = ['sex', 'gave birth this year', 'marital status']
    X_train_, X_test_, y_train_, y_test_, X_male, X_female = get_train_test_with_excluded_columns(data, columns_to_exclude)
    
    best_subset, best_params, best_score = forward_feat_selection_hypertuning(LogisticRegression(), param_grid, X_train, y_train, X_test, y_test)
    
    print("Best subset of features:", best_subset)
    print("Best hyperparameters:", best_params)
    print("Best model accuracy:", best_score)
    
    # Use the best subset and best hyperparameters for final model
    final_model_sexexlc_ffs = LogisticRegression(**best_params)
    final_model_sexexlc_ffs.fit(X_train[best_subset], y_train)
    final_model_preds = final_model_sexexlc_ffs.predict(X_test[best_subset])
    final_model_accuracy = accuracy_score(y_test, final_model_preds)
    
    print(classification_report(y_test, final_model_preds))
    
    print_male_female_metrics(final_model_sexexlc_ffs, X, X_male, X_female, X_test[best_subset], y_test)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best subset: ['occupation_Construction/Extraction', 'occupation_Counseling/Mental Health Services', 'occupation_Education', 'occupation_Entertainment', 'occupation_Farming, Fishing, Forestry', 'occupation_Finance/Accounting', 'occupation_Healthcare/Medical Services', 'occupation_Legal Services', 'occupation_Management/Business', 'occupation_Military Services', 'occupation_Office/Administrative Support', 'occupation_Production/Assembly', 'occupatio

### AdaBoost Classifier

In [15]:
# Define the parameter grid for AdaBoost
param_grid = {
    'algorithm': ['SAMME', 'SAMME.R'],
    'n_estimators': [50, 75, 100, 150],
    'learning_rate': [0.1, 0.5, 1.0],
    'estimator': [LogisticRegression(random_state=42), final_model_allf_ffs],
    'random_state': [42]
}

Below, we tune the hyperparameters of the AdaBoost model using the defined parameter grid and using all features.

In [16]:
best_params, best_model, best_accuracy = tune_model(AdaBoostClassifier(), X_train, y_train, X_test, y_test, param_grid)

print("\nBest Hyperparameters:", best_params)
print("Best Model:", best_model)
print("Best Model Accuracy:", best_accuracy)

print_male_female_metrics(best_model, X, X_male, X_female, X_test, y_test)

Fitting 5 folds for each of 24 candidates, totalling 120 fits

Best Hyperparameters: {'algorithm': 'SAMME.R', 'estimator': LogisticRegression(random_state=42), 'learning_rate': 1.0, 'n_estimators': 150, 'random_state': 42}
Best Model: AdaBoostClassifier(estimator=LogisticRegression(random_state=42),
                   n_estimators=150, random_state=42)
Best Model Accuracy: 0.7833333333333333

Male FPR: 0.18758815232722145
Male TPR: 0.7214428857715431
Female FPR: 0.04935622317596566
Female TPR: 0.24603174603174602

Disparate Impact (DI): 0.224
Discrimination Score (DS): -0.317
Equal Opportunity Difference (EO): 0.475
Equalized Odds (EOdds): 0.138


Below, we tune the hyperparameters of the AdaBoost model using the defined parameter grid and using a **subset of features**. We exclude the sex-related columns.

In [17]:
columns_to_exclude = ['sex', 'gave birth this year', 'marital status']
X_train_, X_test_, y_train_, y_test_, X_male, X_female = get_train_test_with_excluded_columns(data, columns_to_exclude)

best_params, best_model, best_accuracy = tune_model(AdaBoostClassifier(), X_train_, y_train_, X_test_, y_test_, param_grid)

print("\nBest Hyperparameters:", best_params)
print("Best Model:", best_model)
print("Best Model Accuracy:", best_accuracy)

print_male_female_metrics(best_model, X, X_male, X_female, X_test_, y_test_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits

Best Hyperparameters: {'algorithm': 'SAMME.R', 'estimator': LogisticRegression(random_state=42), 'learning_rate': 1.0, 'n_estimators': 150, 'random_state': 42}
Best Model: AdaBoostClassifier(estimator=LogisticRegression(random_state=42),
                   n_estimators=150, random_state=42)
Best Model Accuracy: 0.7538888888888889

Male FPR: 0.14527503526093088
Male TPR: 0.5811623246492986
Female FPR: 0.12017167381974249
Female TPR: 0.40476190476190477

Disparate Impact (DI): 0.556
Discrimination Score (DS): -0.145
Equal Opportunity Difference (EO): 0.176
Equalized Odds (EOdds): 0.025


### Saving the model

In [18]:
save_model(lr_model, '../output/saved_models/logistic_regression_model.joblib')