## Investigating which resampling technique will results in balance of fairness acroos metrics of statistical partiy and equalized odds

In [50]:
import pandas as pd

# Load the data
file_path = 'data/df_cleaned.csv'
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,sex,juv_fel_count,juv_misd_count,juv_other_count,priors_count,two_year_recid,age_cat_25-45,age_cat_Greaterthan45,age_cat_Lessthan25,charge_degree,race
0,1,-0.14965,-0.190564,-0.247266,-0.709995,1,1,0,0,1,1
1,1,-0.14965,-0.190564,1.827564,0.110444,1,0,0,1,1,1
2,1,-0.14965,-0.190564,-0.247266,2.16154,1,1,0,0,1,0
3,0,-0.14965,-0.190564,-0.247266,-0.709995,0,1,0,0,0,0
4,1,-0.14965,-0.190564,-0.247266,-0.709995,0,1,0,0,1,0


In [51]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, roc_auc_score
from fairness_metrics import statistical_parity, equalized_odds
from metricFrame_vis import analyze_metrics_using_metricFrame

In [58]:
# Initialize a DataFrame to store the results of our analysis for esay comparison

resampling_results_df = pd.DataFrame(columns=[
    'resampling_group',
    'model_accuracy',
    'AUC', 
    'f1', 
    'precision', 
    'recall', 
    'statistical_parity_race',
    'equalized_odds_race',
])

# Function to update the DataFrame with new results
def update_results(resampling_group,
                   model_accuracy, 
                   AUC, 
                   f1, 
                   precision, 
                   recall, 
                   statistical_parity_race, 
                   equalized_odds_race
                  ):
    new_entry = {
        'resampling_group': resampling_group, 
        'model_accuracy': model_accuracy, 
        'AUC': AUC, 
        'f1': f1, 
        'precision': precision, 
        'recall': recall, 
        'statistical_parity_race': str(statistical_parity_race), 
        'equalized_odds_race': str(equalized_odds_race), 
    }
    new_row = pd.DataFrame(new_entry, index=[len(resampling_results_df)])
    return pd.concat([new_row, resampling_results_df])


# Example usage after evaluating a model
# resampling_results_df = update_results(
#     resampling_group='upsampling',
#     model_accuracy=0.85,  
#     AUC=0.90,  
#     f1=0.87, 
#     precision=0.88,  
#     recall=0.86, 
#     statistical_parity_sex =0.89, 
#     equalized_odds_sex =0.99,
#     statistical_parity_race = 0.23, 
#     equalized_odds_race =0.33
# )

## 1. Upsampled race

In [59]:
# Upsampling RACE feature
group_1 = df[df['race'] == 1]
group_0 = df[df['race'] == 0]


# Function to resample each subgroup to have the same number of samples
def resample_group(df, group_size):
    return df.sample(n=group_size, replace=True, random_state=42)

max_group_size = max(len(group_0[group_0['two_year_recid'] == 1]), len(group_1[group_1['two_year_recid'] == 1]))

# Resample each group to have the same number of samples
group_0_recidivate = resample_group(group_0[group_0['two_year_recid'] == 1], max_group_size)
group_1_recidivate = resample_group(group_1[group_1['two_year_recid'] == 1], max_group_size)

group_0_not_recidivate = resample_group(group_0[group_0['two_year_recid'] == 0], max_group_size)
group_1_not_recidivate = resample_group(group_1[group_1['two_year_recid'] == 0], max_group_size)

# Combine the resampled data back
df_upsampled_race = pd.concat([group_0_recidivate, group_0_not_recidivate, group_1_recidivate, group_1_not_recidivate])

# Shuffle the resampled data
df_upsampled_race = df_upsampled_race.sample(frac=1).reset_index(drop=True)

# Data Preparation
X = df_upsampled_race.drop(['two_year_recid'], axis=1)
y = df_upsampled_race['two_year_recid']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Remove sex and race from the training data
X_train = X_train.drop(['sex', 'race'], axis=1)
sensitive_feature_race = X_test['race']
sensitive_feature_sex = X_test['sex']
X_test = X_test.drop(['sex', 'race'], axis=1)

# Initialize the Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(learning_rate=0.01, 
                                           max_depth=3, 
                                           n_estimators=300, 
                                           random_state=42)

# Fit the grid search to the data
gb_classifier.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred_gb = gb_classifier.predict(X_test)

model_accuracy_gb = accuracy_score(y_test, y_pred_gb)
AUC = roc_auc_score(y_test, y_pred_gb)
f1 = f1_score(y_test, y_pred_gb)
precision = precision_score(y_test, y_pred_gb)
recall = recall_score(y_test, y_pred_gb)

statistical_parity_race = statistical_parity(y_test, y_pred_gb, sensitive_feature_race)
equalized_odds_race = equalized_odds(y_test, y_pred_gb, sensitive_feature_race)

statistical_parity_sex = statistical_parity(y_test, y_pred_gb, sensitive_feature_sex)
equalized_odds_sex = equalized_odds(y_test, y_pred_gb, sensitive_feature_sex)

resampling_results_df = update_results(
    resampling_group='Upsampled_Race',
    model_accuracy=model_accuracy_gb,  
    AUC=AUC,  
    f1=f1, 
    precision=precision,  
    recall=recall, 
    statistical_parity_race = statistical_parity_race, 
    equalized_odds_race = equalized_odds_race
)

In [60]:
resampling_results_df

Unnamed: 0,resampling_group,model_accuracy,AUC,f1,precision,recall,statistical_parity_race,equalized_odds_race
0,Upsampled_Race,0.660983,0.661613,0.640808,0.689143,0.598808,"{'Statistical Parity Difference': 0.184, 'Stat...","{'Equalized Odds Difference': 0.209, 'Equalize..."


## 2. Downsampled_Race

In [61]:
# Upsampling RACE feature
group_1 = df[df['race'] == 1]
group_0 = df[df['race'] == 0]


# Function to resample each subgroup to have the same number of samples
def resample_group(df, group_size):
    return df.sample(n=group_size, replace=True, random_state=42)

min_group_size = min(len(group_0[group_0['two_year_recid'] == 1]), len(group_1[group_1['two_year_recid'] == 1]))

# Resample each group to have the same number of samples
group_0_recidivate = resample_group(group_0[group_0['two_year_recid'] == 1], min_group_size)
group_1_recidivate = resample_group(group_1[group_1['two_year_recid'] == 1], min_group_size)

group_0_not_recidivate = resample_group(group_0[group_0['two_year_recid'] == 0], min_group_size)
group_1_not_recidivate = resample_group(group_1[group_1['two_year_recid'] == 0], min_group_size)

# Combine the resampled data back
df_downsampled_race = pd.concat([group_0_recidivate, group_0_not_recidivate, group_1_recidivate, group_1_not_recidivate])

# Shuffle the resampled data
df_downsampled_race = df_downsampled_race.sample(frac=1).reset_index(drop=True)

# Data Preparation
X = df_downsampled_race.drop(['two_year_recid'], axis=1)
y = df_downsampled_race['two_year_recid']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Remove sex and race from the training data
X_train = X_train.drop(['sex', 'race'], axis=1)
sensitive_feature_race = X_test['race']
sensitive_feature_sex = X_test['sex']
X_test = X_test.drop(['sex', 'race'], axis=1)

# Initialize the Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(learning_rate=0.01, 
                                           max_depth=3, 
                                           n_estimators=300, 
                                           random_state=42)

# Fit the grid search to the data
gb_classifier.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred_gb = gb_classifier.predict(X_test)

model_accuracy_gb = accuracy_score(y_test, y_pred_gb)
AUC = roc_auc_score(y_test, y_pred_gb)
f1 = f1_score(y_test, y_pred_gb)
precision = precision_score(y_test, y_pred_gb)
recall = recall_score(y_test, y_pred_gb)

statistical_parity_race = statistical_parity(y_test, y_pred_gb, sensitive_feature_race)
equalized_odds_race = equalized_odds(y_test, y_pred_gb, sensitive_feature_race)

resampling_results_df = update_results(
    resampling_group='Downsampled_Race',
    model_accuracy=model_accuracy_gb,  
    AUC=AUC,  
    f1=f1, 
    precision=precision,  
    recall=recall, 
    statistical_parity_race = statistical_parity_race, 
    equalized_odds_race = equalized_odds_race
)

In [62]:
resampling_results_df

Unnamed: 0,resampling_group,model_accuracy,AUC,f1,precision,recall,statistical_parity_race,equalized_odds_race
1,Downsampled_Race,0.62614,0.627239,0.601081,0.65566,0.55489,"{'Statistical Parity Difference': 0.218, 'Stat...","{'Equalized Odds Difference': 0.278, 'Equalize..."
0,Upsampled_Race,0.660983,0.661613,0.640808,0.689143,0.598808,"{'Statistical Parity Difference': 0.184, 'Stat...","{'Equalized Odds Difference': 0.209, 'Equalize..."
