In [116]:
import warnings
warnings.filterwarnings("ignore")

from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features
from helper.fairness_functions import statistical_measures, print_conf_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

In [117]:
# Cleaned income dataset; the path is relative, so it resolves against the notebook's working directory
DATASET_PATH = '../data/assignment2_income_cleaned.xlsx'
data = load_dataset(DATASET_PATH)

In [118]:
# Split the loaded data into the feature matrix (X) and the 'income' target (y)
X, y = get_features_and_target(data, 'income')
# NOTE(review): this list is defined but is not passed to any call in this cell
columns_to_exclude = ['age', 'ability to speak english', 'gave birth this year']
# Encode the features and the target. An empty list (not columns_to_exclude) is
# passed as the third argument — presumably that argument names columns to be
# left out of the encoding, so nothing is exempted here; TODO confirm against
# encode_all_features.
X_encoded, y_encoded = encode_all_features(X, y, [])
# Hold out 20% of the rows as the test set; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [119]:
# Preview the first rows of the encoded training features
X_train.head()

Unnamed: 0,age,education,workinghours,ability to speak english,gave birth this year_No,gave birth this year_Yes,marital status_Divorced,marital status_Husband,marital status_Never married,marital status_Separated,...,occupation_Sales,"occupation_Science, Engineering, Technology",occupation_Service/Hospitality,occupation_Transport,sex_Female,sex_Male,workclass_governmental,workclass_no paid work,workclass_private,workclass_self employed
6317,22,16,36,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
740,61,22,40,1,1,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,1
3781,48,16,40,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
7850,62,18,65,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
2963,53,19,44,0,1,0,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0


In [120]:
# Collapse the one-hot sex columns into a single 'sex' indicator derived from
# 'sex_Male', because the fairness helper below is pointed at a column named 'sex'.
X_test = (
    X_test
    .assign(sex=X_test['sex_Male'] * 1)
    .drop(columns=['sex_Male', 'sex_Female'])
)
X_train = (
    X_train
    .assign(sex=X_train['sex_Male'] * 1)
    .drop(columns=['sex_Male', 'sex_Female'])
)

# Fairness measures with 'sex' as the attribute of interest
DI, DS, EO, EOdds, conf_matrix = statistical_measures(
    X_train, y_train, X_test, y_test, 'sex', use_lib_implementation=False
)

# One line per measure, each rounded to three decimals
print(
    f"Disparate Impact (DI): {DI:.3f}",
    f"Discrimination Score (DS): {DS:.3f}",
    f"Equal Opportunity Difference (EO): {EO:.3f}",
    f"Equalized Odds (EOdds): {EOdds:.3f}",
    sep="\n",
)

print_conf_matrix(conf_matrix)

Disparate Impact (DI): 0.525
Discrimination Score (DS): -0.198
Equal Opportunity Difference (EO): 0.179
Equalized Odds (EOdds): 0.097

Confusion Matrix: 
Percentage of True positives = 0.21777777777777776
Percentage of True negatives = 0.5177777777777778
Percentage of False positives = 0.135
Percentage of False negatives = 0.12944444444444445
FPR:  0.20680851063829786
TPR:  0.6272
PPP:  0.6173228346456693


In [121]:
def get_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[float, float, float, float]:
    """Score predictions against the true labels with four classification metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        The tuple ``(accuracy, precision, recall, f1)``, each value produced by
        the matching scikit-learn scorer called with its default settings.
    """
    # The order of this tuple fixes the order of the returned values
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)
#model, X_train, X_test, y_train, y_test
def split_male_female_metrics(data: pd.DataFrame, is_encoded: bool = True) -> pd.DataFrame:
    """Train one decision tree on all rows and report its test metrics per sex.

    The frame is split into features and the 'income' target, divided 80/20
    into train and test sets (``random_state=42``), and a single
    ``DecisionTreeClassifier(random_state=42)`` is fitted on the whole training
    set. The test set is then partitioned on the 'sex_Male' column and each
    partition is scored separately. As a side effect, the confusion matrix of
    each group is printed through ``print_conf_matrix``.

    Args:
        data: Frame containing the 'income' target column. After encoding, the
            features must contain a 'sex_Male' column (1 for male rows,
            0 for female rows).
        is_encoded: When False, features and target are first passed through
            ``encode_all_features``; when True, ``data`` is used as-is.

    Returns:
        A two-row DataFrame (Male, Female) with the columns 'Gender',
        'Accuracy', 'Precision', 'Recall' and 'F1-score'.

    Raises:
        KeyError: If the feature matrix has no 'sex_Male' column.
    """
    X_, y_ = get_features_and_target(data, 'income')
    if not is_encoded:
        X_, y_ = encode_all_features(X_, y_, [])

    X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=42)
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)

    rows = []
    # Male is evaluated (and printed) first, then Female
    for gender, sex_male_flag in (('Male', 1), ('Female', 0)):
        in_group = X_test['sex_Male'] == sex_male_flag
        y_group_true = y_test[in_group]
        y_group_pred = model.predict(X_test[in_group])

        accuracy, precision, recall, f1 = get_metrics(y_group_true, y_group_pred)

        # Pin the label set to the classes the model was trained on so the
        # matrix keeps the same shape even if a group lacks one of the classes.
        group_conf_matrix = confusion_matrix(y_group_true, y_group_pred, labels=model.classes_)

        print(f"\nConfusion Matrix for {gender} group")
        print_conf_matrix(group_conf_matrix)

        rows.append({
            'Gender': gender,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1-score': f1,
        })

    # One row per group, in the fixed column order callers display
    return pd.DataFrame(rows, columns=['Gender', 'Accuracy', 'Precision', 'Recall', 'F1-score'])

# def split_male_female_metrics(data: pd.DataFrame, is_encoded: bool = True) -> pd.DataFrame:
#     
#     X_, y_ = get_features_and_target(data, 'income')
#     if not is_encoded:
#         X_, y_ = encode_all_features(X_, y_, ['sex'])
#         X_['sex'] = X_['sex'].map({'Female': 0, 'Male': 1})
#         
#     model = DecisionTreeClassifier(random_state=42)
#     X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=42)
#     model.fit(X_train, y_train)
#         
#     X_male = X_[X_['sex'] == 1]
#     X_female = X_[X_['sex'] == 0]
#     y_male = y_[X_['sex'] == 1]
#     y_female = y_[X_['sex'] == 0]
#     
#     print(f"Length of Male dataset: {len(X_male)}")
#     print(f"Length of Female dataset: {len(X_female)}")
#     
#     # X_male_train, X_male_test, y_male_train, y_male_test = train_test_split(X_male, y_male, test_size=0.2, random_state=42)
#     X_male_train = X_train[X_train['sex'] == 1]
#     X_male_test = X_test[X_test['sex'] == 1]
#     y_male_train = y_train[X_train['sex'] == 1]
#     y_male_test = y_test[X_test['sex'] == 1]
#     
#     clf_male = model # DecisionTreeClassifier(random_state=42)
#     # clf_male.fit(X_male_train, y_male_train)
#     y_male_pred = clf_male.predict(X_male_test)
#     
#     male_accuracy, male_precision, male_recall, male_f1 = get_metrics(y_male_test, y_male_pred)
#     
#     # X_female_train, X_female_test, y_female_train, y_female_test = train_test_split(X_female, y_female, test_size=0.2, random_state=42)
#     X_female_train = X_train[X_train['sex'] == 0]
#     X_female_test = X_test[X_test['sex'] == 0]
#     y_female_train = y_train[X_train['sex'] == 0]
#     y_female_test = y_test[X_test['sex'] == 0]
#     
#     clf_female = model # DecisionTreeClassifier(random_state=42)
#     # clf_female.fit(X_female_train, y_female_train)
#     y_female_pred = clf_female.predict(X_female_test)
#     
#     female_accuracy, female_precision, female_recall, female_f1 = get_metrics(y_female_test, y_female_pred)
#     
#     # calculate male conf matrix
#     conf_matrix_male = confusion_matrix(y_male_test, y_male_pred)
#     # calculate female conf matrix
#     conf_matrix_female = confusion_matrix(y_female_test, y_female_pred)
#     
#     print("\nConfusion Matrix for Male group")
#     print_conf_matrix(conf_matrix_male)
#     print("\nConfusion Matrix for Female group")
#     print_conf_matrix(conf_matrix_female)
#     
#     # Compare the model's accuracy for male and female groups
#     metrics_table = pd.DataFrame({
#         'Gender': ['Male', 'Female'],
#         'Accuracy': [male_accuracy, female_accuracy],
#         'Precision': [male_precision, female_precision],
#         'Recall': [male_recall, female_recall],
#         'F1-score': [male_f1, female_f1]
#     })
#     
#     return metrics_table

In [122]:
# Baseline: per-sex metrics on the original (non-resampled) data.
# is_encoded=False makes the function run encode_all_features on the raw frame itself.
metrics_table = split_male_female_metrics(data, is_encoded=False)
metrics_table


Confusion Matrix for Male group

Confusion Matrix: 
Percentage of True positives = 0.25
Percentage of True negatives = 0.44950331125827814
Percentage of False positives = 0.13741721854304637
Percentage of False negatives = 0.1630794701986755
FPR:  0.23413258110014104
TPR:  0.6052104208416834
PPP:  0.6452991452991453

Confusion Matrix for Female group

Confusion Matrix: 
Percentage of True positives = 0.10304054054054054
Percentage of True negatives = 0.6587837837837838
Percentage of False positives = 0.12837837837837837
Percentage of False negatives = 0.1097972972972973
FPR:  0.1630901287553648
TPR:  0.48412698412698413
PPP:  0.44525547445255476


Unnamed: 0,Gender,Accuracy,Precision,Recall,F1-score
0,Male,0.699503,0.645299,0.60521,0.624612
1,Female,0.761824,0.445255,0.484127,0.463878


### Resampling the dataset

In [123]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top
from imblearn.over_sampling import ADASYN

# Fixed seed so the resampling is repeatable
adasyn = ADASYN(random_state=42)

# Rebuild features/target from the raw data; 'sex' is passed in the third
# argument and afterwards mapped by hand to 0 (Female) / 1 (Male)
X_, y_ = get_features_and_target(data, 'income')
X_, y_ = encode_all_features(X_, y_, ['sex'])
X_['sex'] =  X_['sex'].map({'Female': 0, 'Male': 1})

# Put the income target back next to the features so it is resampled together with them
X_with_y = pd.concat([X_, y_], axis=1)

# From here on the roles are swapped for the resampler: 'sex' is the target
# and every other column (income included) is a feature.
# NOTE(review): X_ / y_ now mean something different than above — y_ is sex, not income.
X_ = X_with_y.drop(columns=['sex'])
y_ = X_with_y['sex']

# Resample the dataset with 'sex' as the class label
# NOTE(review): income is part of the resampled feature matrix, so any rows
# added by the resampler carry generated income values — confirm this is intended.
X_resampled, y_resampled = adasyn.fit_resample(X_, y_)

# Reassemble a single frame: resampled features (incl. income) plus the 'sex' column
X_with_y_resampled = pd.concat([X_resampled, y_resampled], axis=1)

In [124]:
# Percentage of rows per sex in the resampled frame
X_with_y_resampled['sex'].value_counts() / len(X_with_y_resampled) * 100

sex
1    52.232959
0    47.767041
Name: count, dtype: float64

In [125]:
# Row count for every (sex, income) pair; combinations that never occur are filled with 0
counts = (
    X_with_y_resampled
    .groupby(['sex', 'income'])
    .size()
    .unstack(fill_value=0)
)
# Number of rows per sex, used as the denominator below
total_counts = X_with_y_resampled['sex'].value_counts()
# Share (in percent) of each income level within each sex; the division is aligned on the sex index
percentage = counts.div(total_counts, axis=0) * 100

In [126]:
# Display the income distribution (in percent) within each sex
percentage

income,0,1
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,84.344815,15.655185
1,58.9,41.1


In [127]:
# Split the resampled frame again with 'income' as the target
# (presumably 'sex' is back among the features — the next cell passes it as the attribute name; verify)
X_, y_ = get_features_and_target(X_with_y_resampled, 'income')

In [128]:
# 80/20 split of the resampled data, with the same seed as the baseline split
X_train, X_test, y_train, y_test = train_test_split(
    X_, y_, test_size=0.2, random_state=42
)

# Fairness measures with 'sex' as the attribute of interest
DI, DS, EO, EOdds, conf_matrix = statistical_measures(
    X_train, y_train, X_test, y_test, 'sex', use_lib_implementation=False
)

# One line per measure, each rounded to three decimals
print(
    f"Disparate Impact (DI): {DI:.3f}",
    f"Discrimination Score (DS): {DS:.3f}",
    f"Equal Opportunity Difference (EO): {EO:.3f}",
    f"Equalized Odds (EOdds): {EOdds:.3f}",
    sep="\n",
)

print_conf_matrix(conf_matrix)

Disparate Impact (DI): 0.355
Discrimination Score (DS): -0.252
Equal Opportunity Difference (EO): 0.094
Equalized Odds (EOdds): 0.161

Confusion Matrix: 
Percentage of True positives = 0.17275892080069627
Percentage of True negatives = 0.6100957354221062
Percentage of False positives = 0.10008703220191471
Percentage of False negatives = 0.11705831157528286
FPR:  0.1409313725490196
TPR:  0.5960960960960962
PPP:  0.6331738437001595


In [129]:
# split_male_female_metrics filters on a 'sex_Male' column, so turn the 0/1
# 'sex' column back into labels and one-hot encode it into sex_Female / sex_Male.
X_with_y_resampled = pd.get_dummies(
    X_with_y_resampled.assign(
        sex=X_with_y_resampled['sex'].map({0: 'Female', 1: 'Male'})
    ),
    columns=['sex'],
    dtype=int,
)
# The frame is already fully encoded at this point, hence is_encoded=True
metrics_table = split_male_female_metrics(X_with_y_resampled, is_encoded=True)


Confusion Matrix for Male group

Confusion Matrix: 
Percentage of True positives = 0.2542927228127555
Percentage of True negatives = 0.4260016353229763
Percentage of False positives = 0.16189697465249386
Percentage of False negatives = 0.15780866721177433
FPR:  0.2753824756606398
TPR:  0.6170634920634921
PPP:  0.6110019646365422

Confusion Matrix for Female group

Confusion Matrix: 
Percentage of True positives = 0.08372093023255814
Percentage of True negatives = 0.7683720930232558
Percentage of False positives = 0.08093023255813954
Percentage of False negatives = 0.06697674418604652
FPR:  0.09529025191675794
TPR:  0.5555555555555556
PPP:  0.5084745762711864


In [130]:
# Display the per-sex metrics for the model trained on the resampled data
metrics_table

Unnamed: 0,Gender,Accuracy,Precision,Recall,F1-score
0,Male,0.680294,0.611002,0.617063,0.614018
1,Female,0.852093,0.508475,0.555556,0.530973
