In [1]:
import warnings
warnings.filterwarnings("ignore")

from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features
from helper.clfmodel_functions import tune_model, seq_feat_selection, multi_metric_cv, plot_multi_score_cv_results, forward_feat_selection_hypertuning
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

from helper.fairness_functions import split_male_female_metrics, statistical_measures, print_statistical_measures

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'
pip install 'aif360[OptimalTransport]'


### Loading the cleaned dataset

In [2]:
# Load the cleaned income dataset from the Excel file via the project helper.
data = load_dataset('../data/assignment2_income_cleaned.xlsx')

### Feature Engineering (encoding) & Train-Test Split

In [3]:
# Separate the predictors (X) from the 'income' target column (y).
X, y = get_features_and_target(data, 'income')

# Encode features and target; the exclusion list is empty, so no column is left out here.
X_encoded, y_encoded = encode_all_features(X, y, [])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Preview the first rows of the encoded training features.
X_train.head()

Unnamed: 0,age,education,workinghours,ability to speak english,sex_Female,sex_Male,workclass_governmental,workclass_no paid work,workclass_private,workclass_self employed,...,occupation_Service/Hospitality,occupation_Transport,marital status_Divorced,marital status_Husband,marital status_Never married,marital status_Separated,marital status_Widowed,marital status_Wife,gave birth this year_No,gave birth this year_Yes
6317,22,16,36,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
740,61,22,40,1,0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
3781,48,16,40,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
7850,62,18,65,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
2963,53,19,44,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0


### Model

Here, we quickly train and evaluate a Decision Tree model with default parameters for demonstration.

In [5]:
# Baseline Decision Tree with default hyperparameters. The seed is fixed so that the
# fitted tree (which is saved at the end of the notebook) and the scores printed
# below are the same on every run.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Evaluate on the held-out test split.
dt_preds = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_preds)

print(classification_report(y_test, dt_preds))
print("Decision Tree Accuracy:", dt_accuracy)

              precision    recall  f1-score   support

           0       0.78      0.79      0.78      1175
           1       0.59      0.58      0.58       625

    accuracy                           0.71      1800
   macro avg       0.68      0.68      0.68      1800
weighted avg       0.71      0.71      0.71      1800

Decision Tree Accuracy: 0.715


In [6]:
# Backward sequential feature selection is switched off; set the flag to True to run it.
RUN_BACKWARD_FEATURE_SELECTION = False

if RUN_BACKWARD_FEATURE_SELECTION:
    seq_feat_selection(
        DecisionTreeClassifier(),
        X_train,
        y_train,
        direction='backward',
    )

### Hyperparameter tuning

Here, we define the base parameter grid for our hyperparameter tuning function.

In [7]:
# Base Decision Tree search space handed to the tuning helpers below.
param_grid = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
    # Single-valued entry: the seed is pinned rather than searched over.
    random_state=[42],
)

Below, we tune the hyperparameters of the Decision Tree model using the defined parameter grid and using all features.


In [8]:
# Hyperparameter tuning on all features is switched off; set the flag to True to run it.
RUN_ALL_FEATURES_TUNING = False

if RUN_ALL_FEATURES_TUNING:
    best_params, best_model, best_accuracy = tune_model(
        DecisionTreeClassifier(),
        X_train,
        y_train,
        X_test,
        y_test,
        param_grid,
    )

    print("\nBest Hyperparameters:", best_params)
    print("Best Model:", best_model)
    print("Best Model Accuracy:", best_accuracy)

Below, we tune the hyperparameters of the Decision Tree model using the defined parameter grid and using a **subset of features**. We exclude the columns 'age', 'ability to speak english', and 'workclass'.

In [9]:
# Hyperparameter tuning on a feature subset is switched off; set the flag to True to run it.
RUN_FEATURE_SUBSET_TUNING = False

if RUN_FEATURE_SUBSET_TUNING:
    columns_to_exclude = ['age', 'ability to speak english', 'workclass']

    # Separate features from the target, then remove the excluded columns.
    X_, y_ = get_features_and_target(data, 'income')
    X_ = X_.drop(columns=columns_to_exclude)

    # Encode what is left and make the same 80/20 split as for the full feature set.
    X_encoded_, y_encoded_ = encode_all_features(X_, y_, columns_to_exclude)
    X_train_, X_test_, y_train_, y_test_ = train_test_split(
        X_encoded_,
        y_encoded_,
        test_size=0.2,
        random_state=42,
    )

    best_params, best_model, best_accuracy = tune_model(
        DecisionTreeClassifier(),
        X_train_,
        y_train_,
        X_test_,
        y_test_,
        param_grid,
    )

    print("\nBest Hyperparameters:", best_params)
    print("Best Model:", best_model)
    print("Best Model Accuracy:", best_accuracy)

In [10]:
# Forward feature selection combined with hyperparameter tuning.
# Switched off; set the flag to True to run it.
RUN_FORWARD_SELECTION_TUNING = False

if RUN_FORWARD_SELECTION_TUNING:
    best_subset, best_params, best_score = forward_feat_selection_hypertuning(
        DecisionTreeClassifier(),
        param_grid,
        X_train,
        y_train,
        X_test,
        y_test,
    )

    print("Best subset of features:", best_subset)
    print("Best hyperparameters:", best_params)
    print("Best model accuracy:", best_score)

    # Refit a final tree on the selected columns with the selected hyperparameters.
    final_model = DecisionTreeClassifier(**best_params)
    final_model.fit(X_train[best_subset], y_train)

    # Score the final tree on the same columns of the test split.
    final_model_preds = final_model.predict(X_test[best_subset])
    final_model_accuracy = accuracy_score(y_test, final_model_preds)

    print(classification_report(y_test, final_model_preds))

In [11]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost tuning on the full feature set; disabled by the constant-false guard.
if False:
    # AdaBoost search space. It has its own name so that enabling this cell does
    # not overwrite the Decision Tree `param_grid` used by the earlier tuning cells.
    adaboost_param_grid = {
        'algorithm': ['SAMME', 'SAMME.R'],
        'n_estimators': [50, 75, 100, 150],
        'learning_rate': [0.1, 0.4, 0.5, 1.0],
        'estimator': [None],
        'random_state': [42]
    }

    best_params, best_model, best_accuracy = tune_model(AdaBoostClassifier(), X_train, y_train, X_test, y_test, adaboost_param_grid)

    print("\nBest Hyperparameters:", best_params)
    print("Best Model:", best_model)
    print("Best Model Accuracy:", best_accuracy)

In [12]:
def get_male_female_data(data: pd.DataFrame, is_encoded: bool):
    """
    Split a dataframe into its male rows and its female rows.

    Parameters:
        data (pd.DataFrame): Dataframe with a 'sex_Male' column (when encoded)
            or a 'sex' column (when not encoded).
        is_encoded (bool): If True, rows are selected on 'sex_Male' being 1 (male)
            or 0 (female); otherwise on 'sex' being 'Male' or 'Female'.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: Male rows and female rows, in that order.
    """
    if is_encoded:
        is_male = data['sex_Male'] == 1
        is_female = data['sex_Male'] == 0
    else:
        is_male = data['sex'] == 'Male'
        is_female = data['sex'] == 'Female'

    return data[is_male], data[is_female]

In [13]:
def get_male_female_test_data(X_male, X_female, X_test_, y_test_):
    """
    Restrict the test split to its male rows and to its female rows.

    Rows are matched by index label: a test row belongs to a group when its
    label also occurs in that group's dataframe.

    Parameters:
        X_male (pd.DataFrame): Dataframe containing male samples.
        X_female (pd.DataFrame): Dataframe containing female samples.
        X_test_ (pd.DataFrame): Test features dataframe.
        y_test_ (pd.Series): Test target series.

    Returns:
        Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
            Test data for males (features, target) and females (features, target).
    """
    def _test_rows_for(group_frame):
        # Labels present both in the test split and in this group.
        shared_labels = X_test_.index.intersection(group_frame.index)
        return X_test_.loc[shared_labels], y_test_.loc[shared_labels]

    X_male_test, y_male_test = _test_rows_for(X_male)
    X_female_test, y_female_test = _test_rows_for(X_female)

    return X_male_test, y_male_test, X_female_test, y_female_test

In [14]:
from sklearn.ensemble import AdaBoostClassifier

# Split the raw features by sex BEFORE the 'sex' column is dropped, so the row
# labels of each group can be matched against the test split further down.
X_, y_ = get_features_and_target(data, 'income')
X_male, X_female = get_male_female_data(X_, False)

# Drop the listed columns from the model inputs.
columns_to_exclude = ['sex', 'gave birth this year', 'marital status']
X__ = X_.drop(columns=columns_to_exclude)

# Encode the remaining features and the target, then make the 80/20 split.
X_encoded_, y_encoded_ = encode_all_features(X__, y_, columns_to_exclude)
X_train_, X_test_, y_train_, y_test_ = train_test_split(X_encoded_, y_encoded_, test_size=0.2, random_state=42)

# Per-sex views of the test split, matched by index label.
X_male_test, y_male_test, X_female_test, y_female_test = get_male_female_test_data(X_male, X_female, X_test_, y_test_)

print(len(X_male_test))
print(len(X_female_test))

# AdaBoost search space. It has its own name so that it does not overwrite the
# Decision Tree `param_grid` used by the earlier tuning cells.
adaboost_param_grid = {
    'algorithm': ['SAMME', 'SAMME.R'],
    'n_estimators': [50, 75, 100, 150],
    'learning_rate': [0.1, 0.4, 0.5, 1.0],
    'estimator': [None],
    'random_state': [42]
}

best_params, best_model, best_accuracy = tune_model(AdaBoostClassifier(), X_train_, y_train_, X_test_, y_test_, adaboost_param_grid)

print("\nBest Hyperparameters:", best_params)
print("Best Model:", best_model)
print("Best Model Accuracy:", best_accuracy)

1208
592
Fitting 5 folds for each of 32 candidates, totalling 160 fits

Best Hyperparameters: {'algorithm': 'SAMME.R', 'estimator': None, 'learning_rate': 0.5, 'n_estimators': 150, 'random_state': 42}
Best Model: AdaBoostClassifier(learning_rate=0.5, n_estimators=150, random_state=42)
Best Model Accuracy: 0.7716666666666666


In [15]:
# Male/female test subsets taken from the reduced-feature test split (X_test_ / y_test_).
split_testsets = [X_male_test, y_male_test, X_female_test, y_female_test]

# Refit AdaBoost with the tuned hyperparameters on the reduced-feature training data.
model = AdaBoostClassifier(**best_params)
model.fit(X_train_, y_train_)

# Pass the test split that matches the features the model was fitted on
# (X_test_ / y_test_), not the full-feature X_test / y_test from the first split.
fpr_male, fpr_female, tpr_male, tpr_female = split_male_female_metrics(model, X_test_, y_test_, split_testsets=split_testsets)

print("Male FPR:", fpr_male)
print("Male TPR:", tpr_male)
print("Female FPR:", fpr_female)
print("Female TPR:", tpr_female)

Male FPR: 0.14809590973201692
Male TPR: 0.6332665330661322
Female FPR: 0.13304721030042918
Female TPR: 0.5158730158730159


In [17]:
# Predictions of the fitted model on the reduced-feature test split.
y_pred = model.predict(X_test_)

# Re-attach the raw 'sex' column by index label, coded as 0 = Female, 1 = Male.
X_test_with_sex = X_test_.join(X_['sex'].map({'Female': 0, 'Male': 1}))

# Compute the fairness measures with 'sex' as the attribute, then print them.
DI, DS, EO, EOdds, conf_matrix = statistical_measures(
    X_test_with_sex,
    y_test_,
    y_pred,
    'sex',
    use_lib_implementation=False,
)
print_statistical_measures(DI, DS, EO, EOdds)

Disparate Impact (DI): 0.616
Discrimination Score (DS): -0.134
Equal Opportunity Difference (EO): 0.117
Equalized Odds (EOdds): 0.015


### Saving the model

In [None]:
# Save the baseline Decision Tree (`dt_model`) under the saved-models output folder.
# NOTE(review): this saves the untuned Decision Tree, not the tuned AdaBoost `model`
# fitted above — confirm that this is the model meant to be kept.
save_model(dt_model, '../output/saved_models/decision_tree_model.joblib')