In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import ClassificationMetric
from sklearn.model_selection import train_test_split
from aif360.algorithms.postprocessing import EqOddsPostprocessing


  warn_deprecated('vmap', 'torch.vmap')


In [63]:
def compute_performance_metrics(y_test, y_pred, model_name):
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")
    print(f"{model_name} Recall: {recall}")
    print(f"{model_name} F1 Score: {f1_score_value}")

def compute_classification_metric(dataset, predictions, label_name_v, favorable_label_v, unfavorable_label_v, privileged_attribute, unprivileged_attributes):
    features = [privileged_attribute] + unprivileged_attributes

    aif_age_dataset = BinaryLabelDataset(
            df=dataset,
            favorable_label=favorable_label_v,
            unfavorable_label=unfavorable_label_v,
            label_names=[label_name_v],
            protected_attribute_names=features,
            privileged_protected_attributes=[privileged_attribute],
        )

    aif_age_pred = BinaryLabelDataset(
            df=predictions,
            favorable_label=favorable_label_v,
            unfavorable_label=unfavorable_label_v,
            label_names=[label_name_v],
            protected_attribute_names=features,
            privileged_protected_attributes=[privileged_attribute],
        )

    age_privileged_group = [{privileged_attribute: 1}]
    age_unprivileged_groups = [{attr: 1} for attr in unprivileged_attributes]

    fairness_metrics = ClassificationMetric(dataset=aif_age_dataset,
                            classified_dataset=aif_age_pred,
                            unprivileged_groups=age_unprivileged_groups,
                            privileged_groups=age_privileged_group)
    
    return fairness_metrics

def compute_fairness_metrics(fairness_metrics: ClassificationMetric):
    # Values less than 0 indicate that privileged group has higher
    # proportion of predicted positive outcomes than unprivileged group.
    # Value higher than 0 indicates that unprivileged group has higher proportion
    # of predicted positive outcomes than privileged group.
    SPD = round(fairness_metrics.statistical_parity_difference(),3)

    # Measures the deviation from the equality of opportunity, which means that the same
    # proportion of each population receives the favorable outcome. This measure must be equal to 0 to be fair.
    EOD = round(fairness_metrics.equal_opportunity_difference(),3)

    # Average of difference in False Positive Rate and True Positive Rate for unprivileged and privileged groups
    # A value of 0 indicates equality of odds, which means that samples in both the privileged and unprivileged
    # groups have the same probability of being classified positively.
    AOD = round(fairness_metrics.average_odds_difference(),3)

    print(f"Statistical Parity Difference (SPD): {SPD}")
    print(f"Equal Opportunity Difference (EOD): {EOD}")
    print(f"Average Odds Difference: {AOD}")

In [2]:
dataset_path = 'bankTrainedRaw.csv'
df_raw = pd.read_csv(dataset_path)
pd.set_option('display.max_columns', None)

In [65]:

X = df_raw.drop(columns="deposit")
y = df_raw["deposit"]

# Define four sets and apply the function
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, # 0.2 indicates a test set size of 20%
                                                    random_state=42)

privileged_attributeAge = "age_25<=age<60"
unprivileged_attributesAge = ["age_age<25", "age_age>=60"]
age_features = [privileged_attributeAge] + unprivileged_attributesAge


In [66]:
# Add your functions here
# compute_performance_metrics and compute_classification_metric
# Add the definition for compute_fairness_metrics

# Train the Decision Tree classifier
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)

# Predizione sui dati di test
dt_predictions = dt_clf.predict(X_test)

# Calcolo delle metriche di performance per il Decision Tree grezzo
#compute_performance_metrics(y_test, dt_predictions, "Decision Tree")

# Aggiunta della colonna del target al dataframe di X_test e rimozione di valori mancanti
dataset = X_test.copy(deep=True)
dataset['deposit'] = y_test
#dataset = dataset.dropna()

# Conversione dei dati in formato BinaryLabelDataset
test_dataset = BinaryLabelDataset(df=dataset, label_names=['deposit'], protected_attribute_names=age_features)

# Creazione di un dataset di predizioni per il post-processing
predictions = dataset.copy(deep=True)
predictions['deposit'] = dt_predictions
#predictions = predictions.dropna()

test_pred_dataset = BinaryLabelDataset(df=predictions, label_names=['deposit'], protected_attribute_names=age_features)

# Applicazione del post-processing con EqOddsPostprocessing
dt_eq_odds = EqOddsPostprocessing(privileged_groups=[{age_features[0]: 1}], unprivileged_groups=[{age_features[0]: 0}])
dt_eq_odds = dt_eq_odds.fit(test_dataset, test_pred_dataset)

# Fai delle predizioni post-processate
dt_eq_odds_pred = dt_eq_odds.predict(test_pred_dataset)
dt_eq_odds_labels = dt_eq_odds_pred.labels

# Calcolo delle metriche di performance per il modello post-processato
compute_performance_metrics(y_test, dt_eq_odds_labels, "Decision Tree Post-processed")

Decision Tree Post-processed Accuracy: 0.7138378862516793
Decision Tree Post-processed Recall: 0.8341143392689785
Decision Tree Post-processed F1 Score: 0.7358412567176519


In [67]:
# Add your functions here
# compute_performance_metrics and compute_classification_metric
# Add the definition for compute_fairness_metrics

# Train the SVM
svm_pipeline = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
svm_classifier = svm_pipeline.named_steps['linearsvc'].fit(X_train, y_train.values.ravel())
svm_classifier.fit(X_train, y_train)

# Predizione sui dati di test
svm_predictions = svm_classifier.predict(X_test)

# Aggiunta della colonna del target al dataframe di X_test e rimozione di valori mancanti
dataset = X_test.copy(deep=True)
dataset['deposit'] = y_test
#dataset = dataset.dropna()

# Conversione dei dati in formato BinaryLabelDataset
test_dataset = BinaryLabelDataset(df=dataset, label_names=['deposit'], protected_attribute_names=age_features)

# Creazione di un dataset di predizioni per il post-processing
predictions = dataset.copy(deep=True)
predictions['deposit'] = svm_predictions
#predictions = predictions.dropna()

test_pred_dataset = BinaryLabelDataset(df=predictions, label_names=['deposit'], protected_attribute_names=age_features)

# Applicazione del post-processing con EqOddsPostprocessing
svm_eq_odds = EqOddsPostprocessing(privileged_groups=[{age_features[0]: 1}], unprivileged_groups=[{age_features[0]: 0}])
svm_eq_odds = svm_eq_odds.fit(test_dataset, test_pred_dataset)

# Fai delle predizioni post-processate
svm_eq_odds_pred = svm_eq_odds.predict(test_pred_dataset)
svm_eq_odds_labels = svm_eq_odds_pred.labels

# Calcolo delle metriche di performance per il modello post-processato
compute_performance_metrics(y_test, svm_eq_odds_labels, "Decision Tree Post-processed")

Decision Tree Post-processed Accuracy: 0.7111509180474698
Decision Tree Post-processed Recall: 0.8359887535145267
Decision Tree Post-processed F1 Score: 0.7344586249485385


In [68]:
# Add your functions here
# compute_performance_metrics and compute_classification_metric
# Add the definition for compute_fairness_metrics

# Train the Random Forest
rf_classifier = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth = None, random_state=42)
rf_classifier.fit(X_train, y_train)


# Predizione sui dati di test
rf_predictions = rf_classifier.predict(X_test)

# Aggiunta della colonna del target al dataframe di X_test e rimozione di valori mancanti
dataset = X_test.copy(deep=True)
dataset['deposit'] = y_test
#dataset = dataset.dropna()

# Conversione dei dati in formato BinaryLabelDataset
test_dataset = BinaryLabelDataset(df=dataset, label_names=['deposit'], protected_attribute_names=age_features)

# Creazione di un dataset di predizioni per il post-processing
predictions = dataset.copy(deep=True)
predictions['deposit'] = rf_predictions
#predictions = predictions.dropna()

test_pred_dataset = BinaryLabelDataset(df=predictions, label_names=['deposit'], protected_attribute_names=age_features)

# Applicazione del post-processing con EqOddsPostprocessing
rf_eq_odds = EqOddsPostprocessing(privileged_groups=[{age_features[0]: 1}], unprivileged_groups=[{age_features[0]: 0}])
rf_eq_odds = rf_eq_odds.fit(test_dataset, test_pred_dataset)

# Fai delle predizioni post-processate
rf_eq_odds_pred = rf_eq_odds.predict(test_pred_dataset)
rf_eq_odds_labels = rf_eq_odds_pred.labels

# Calcolo delle metriche di performance per il modello post-processato
compute_performance_metrics(y_test, rf_eq_odds_labels, "Decision Tree Post-processed")

Decision Tree Post-processed Accuracy: 0.7277205553067622
Decision Tree Post-processed Recall: 0.8940955951265229
Decision Tree Post-processed F1 Score: 0.7583465818759937


In [69]:
# Creazione del dataset di test con le etichette originali e predizioni del modello
test_dataset_with_labels = X_test.copy(deep=True)
test_dataset_with_labels['deposit'] = y_test

# Creazione del dataset di predizioni del modello post-processato
predictions_post_processed = X_test.copy(deep=True)
predictions_post_processed['deposit'] = dt_eq_odds_labels

In [70]:
# Calcolo delle metriche di fairness
fairness_metrics = compute_classification_metric(test_dataset_with_labels,predictions_post_processed,'deposit',1,0,privileged_attributeAge,unprivileged_attributesAge)
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): 0.164
Equal Opportunity Difference (EOD): -0.012
Average Odds Difference: -0.001


In [71]:
# Creazione del dataset di test con le etichette originali e predizioni del modello
test_dataset_with_labels = X_test.copy(deep=True)
test_dataset_with_labels['deposit'] = y_test

# Creazione del dataset di predizioni del modello post-processato
predictions_post_processed = X_test.copy(deep=True)
predictions_post_processed['deposit'] = svm_eq_odds_labels

In [72]:
# Calcolo delle metriche di fairness
fairness_metrics = compute_classification_metric(test_dataset_with_labels,predictions_post_processed,'deposit',1,0,privileged_attributeAge,unprivileged_attributesAge)
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): 0.169
Equal Opportunity Difference (EOD): -0.001
Average Odds Difference: 0.001


In [73]:
# Creazione del dataset di test con le etichette originali e predizioni del modello
test_dataset_with_labels = X_test.copy(deep=True)
test_dataset_with_labels['deposit'] = y_test

# Creazione del dataset di predizioni del modello post-processato
predictions_post_processed = X_test.copy(deep=True)
predictions_post_processed['deposit'] = rf_eq_odds_labels

In [74]:
# Calcolo delle metriche di fairness
fairness_metrics = compute_classification_metric(test_dataset_with_labels,predictions_post_processed,'deposit',1,0,privileged_attributeAge,unprivileged_attributesAge)
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): 0.187
Equal Opportunity Difference (EOD): 0.015
Average Odds Difference: -0.016
