In [135]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features
from helper.fairness_functions import statistical_measures
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings("ignore")

In [136]:
data = load_dataset('../data/assignment2_income_cleaned.xlsx')

In [137]:
# Splitting the data into features (X) and target (y)
X, y = get_features_and_target(data, 'income')
columns_to_exclude = ['age', 'ability to speak english', 'gave birth this year']
# Encoding the features and target, and excluding some columns
X_encoded, y_encoded = encode_all_features(X, y, [])
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [138]:
X_train

Unnamed: 0,age,education,workinghours,ability to speak english,gave birth this year_No,gave birth this year_Yes,marital status_Divorced,marital status_Husband,marital status_Never married,marital status_Separated,...,occupation_Management/Business,occupation_Military Services,occupation_Office/Administrative Support,occupation_Production/Assembly,occupation_Protective Services,occupation_Repair/Maintenance,occupation_Sales,"occupation_Science, Engineering, Technology",occupation_Service/Hospitality,occupation_Transport
6317,22,16,36,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
740,61,22,40,1,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3781,48,16,40,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7850,62,18,65,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2963,53,19,44,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,22,19,25,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
5191,24,16,28,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
5390,35,16,40,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
860,23,20,40,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [139]:
X_test['sex'] = X_test['sex_Male'] * 1
X_train['sex'] = X_train['sex_Male'] * 1
X_test = X_test.drop(columns=['sex_Male', 'sex_Female'])
X_train = X_train.drop(columns=['sex_Male', 'sex_Female'])

DI, DS, EO, EOdds = statistical_measures(X_train, y_train, X_test, y_test, 'sex')
    
print(f"Disparate Impact (DI): {DI:.3f}")
print(f"Discrimination Score (DS): {DS:.3f}")
print(f"Equal Opportunity Difference (EO): {EO:.3f}")
print(f"Equalized Odds (EOdds): {EOdds:.3f}")

Disparate Impact (DI): 0.525
Discrimination Score (DS): -0.198
Equal Opportunity Difference (EO): 0.179
Equalized Odds (EOdds): 0.097


In [140]:
from sklearn.metrics import precision_score, recall_score, f1_score

columns_to_exclude = []
data = data.drop(columns=columns_to_exclude)

# Split the dataset into male and female groups
male_data = data[data['sex'] == 'Male']
female_data = data[data['sex'] == 'Female']

print(f"Length of Male dataset: {len(male_data)}")
print(f"Length of Female dataset: {len(female_data)}")

X_male, y_male = get_features_and_target(male_data, 'income')

X_male, y_male = encode_all_features(X_male, y_male, ['sex'] + columns_to_exclude)
X_male['sex'] = X_male['sex'].map({'Male': 1})
X_male_train, X_male_test, y_male_train, y_male_test = train_test_split(X_male, y_male, test_size=0.2, random_state=42)
clf_male = DecisionTreeClassifier(random_state=42)
clf_male.fit(X_male_train, y_male_train)
y_male_pred = clf_male.predict(X_male_test)

male_accuracy = accuracy_score(y_male_test, y_male_pred)
male_precision = precision_score(y_male_test, y_male_pred)
male_recall = recall_score(y_male_test, y_male_pred)
male_f1 = f1_score(y_male_test, y_male_pred)

X_female, y_female = get_features_and_target(female_data, 'income')

X_female, y_female = encode_all_features(X_female, y_female, ['sex'] + columns_to_exclude)
X_female['sex'] = X_female['sex'].map({'Female': 0})
X_female_train, X_female_test, y_female_train, y_female_test = train_test_split(X_female, y_female, test_size=0.2, random_state=42)
clf_female = DecisionTreeClassifier(random_state=42)
clf_female.fit(X_female_train, y_female_train)
y_female_pred = clf_female.predict(X_female_test)

female_accuracy = accuracy_score(y_female_test, y_female_pred)
female_precision = precision_score(y_female_test, y_female_pred)
female_recall = recall_score(y_female_test, y_female_pred)
female_f1 = f1_score(y_female_test, y_female_pred)

# Compare the model's accuracy for male and female groups
metrics_table = pd.DataFrame({
    'Gender': ['Male', 'Female'],
    'Accuracy': [male_accuracy, female_accuracy],
    'Precision': [male_precision, female_precision],
    'Recall': [male_recall, female_recall],
    'F1-score': [male_f1, female_f1]
})

Index(['age', 'workclass', 'education', 'marital status', 'occupation',
       'workinghours', 'sex', 'ability to speak english',
       'gave birth this year', 'income'],
      dtype='object')
marital status
Husband          3588
Never married    2388
Wife             1571
Divorced         1105
Widowed           188
Separated         160
Name: count, dtype: int64
   age      workclass  education marital status                   occupation  \
0   52  self employed         16        Widowed                    Transport   
1   60        private         20       Divorced  Healthcare/Medical Services   
2   64        private         21       Divorced          Management/Business   
3   64        private         17        Husband                    Transport   
4   31        private         15        Husband                    Transport   

   workinghours     sex  ability to speak english gave birth this year income  
0            50    Male                         0                   No  

In [141]:
metrics_table

Unnamed: 0,Gender,Accuracy,Precision,Recall,F1-score
0,Male,0.689167,0.633197,0.614314,0.623613
1,Female,0.773333,0.394958,0.423423,0.408696


In [142]:
from imblearn.over_sampling import ADASYN

# Initialize ADASYN
adasyn = ADASYN(random_state=42)

# Resample the dataset
X_resampled, y_resampled = adasyn.fit_resample(X, y)

ValueError: could not convert string to float: 'self employed'