In [33]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import ClassificationMetric
from sklearn.model_selection import train_test_split

In [34]:
# Location of the bank-marketing CSV, relative to the notebook's working directory.
dataset_path = 'BankDataset/bank.csv'

# Load the raw table; the following cells transform this frame step by step.
df_raw = pd.read_csv(filepath_or_buffer=dataset_path)

In [35]:
# Show the column names of the raw dataset.
df_raw.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'deposit'],
      dtype='object')

In [36]:
def compute_performance_metrics(y_test, y_pred, model_name):
    """Print accuracy, recall and F1 score for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, compared against ``y_test`` by the
            scikit-learn scoring functions.
        model_name: Text used as the prefix of every printed line.

    Returns:
        None. The scores are only printed.
    """
    # All scores are computed before anything is printed, so a failing
    # scorer never leaves a partial report behind.
    scores = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
    }
    for label, value in scores.items():
        print(f"{model_name} {label}: {value}")

def compute_classification_metric(dataset, predictions, label_name_v, favorable_label_v, unfavorable_label_v, privileged_attribute, unprivileged_attributes):
    """Build an AIF360 ``ClassificationMetric`` from true and predicted labels.

    Group membership is read from indicator columns: a row belongs to a group
    when the corresponding column equals 1.

    Args:
        dataset: DataFrame holding the features and the ground-truth label.
        predictions: DataFrame with the same columns as ``dataset`` whose
            label column holds the model predictions.
        label_name_v: Name of the label column in both frames.
        favorable_label_v: Label value considered favorable.
        unfavorable_label_v: Label value considered unfavorable.
        privileged_attribute: Name of the indicator column that marks the
            privileged group.
        unprivileged_attributes: Sequence of indicator column names, each
            marking one unprivileged group.

    Returns:
        ClassificationMetric comparing ``dataset`` (truth) with
        ``predictions``, with the privileged group defined by
        ``privileged_attribute == 1`` and the unprivileged groups defined by
        any of ``unprivileged_attributes == 1``.
    """
    # Accept any sequence (list, tuple, ...) without mutating the caller's object.
    unprivileged_attributes = list(unprivileged_attributes)
    protected_names = [privileged_attribute] + unprivileged_attributes

    def _to_aif_dataset(frame):
        # AIF360 expects, for every protected column, the list of values that
        # count as privileged / unprivileged (not the column names). It only
        # honours them when BOTH lists are supplied. For the privileged
        # indicator the privileged value is 1; for an unprivileged indicator
        # the privileged value is 0 (i.e. "not a member of that group").
        privileged_values = [[1.0]] + [[0.0] for _ in unprivileged_attributes]
        unprivileged_values = [[0.0]] + [[1.0] for _ in unprivileged_attributes]
        return BinaryLabelDataset(
            df=frame,
            favorable_label=favorable_label_v,
            unfavorable_label=unfavorable_label_v,
            label_names=[label_name_v],
            protected_attribute_names=protected_names,
            privileged_protected_attributes=privileged_values,
            unprivileged_protected_attributes=unprivileged_values,
        )

    aif_true_dataset = _to_aif_dataset(dataset)
    aif_pred_dataset = _to_aif_dataset(predictions)

    # A list of dicts is an OR of conditions: a row is unprivileged when any
    # of the unprivileged indicator columns equals 1.
    privileged_groups = [{privileged_attribute: 1}]
    unprivileged_groups = [{attr: 1} for attr in unprivileged_attributes]

    fairness_metrics = ClassificationMetric(
        dataset=aif_true_dataset,
        classified_dataset=aif_pred_dataset,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups,
    )

    return fairness_metrics

def compute_fairness_metrics(fairness_metrics: ClassificationMetric):
    """Print three group-fairness metrics, each rounded to three decimals.

    Args:
        fairness_metrics: Metric object exposing
            ``statistical_parity_difference``, ``equal_opportunity_difference``
            and ``average_odds_difference``.

    Returns:
        None. The values are only printed.
    """
    decimals = 3

    # Statistical parity difference: rate of favorable predictions for the
    # unprivileged group minus the same rate for the privileged group.
    # Negative -> the privileged group is favored; positive -> the
    # unprivileged group is favored; 0 -> parity.
    spd_value = round(fairness_metrics.statistical_parity_difference(), decimals)

    # Equal opportunity difference: difference in true positive rate between
    # the unprivileged and the privileged group. 0 means both groups have the
    # same chance of a favorable prediction when the true label is favorable.
    eod_value = round(fairness_metrics.equal_opportunity_difference(), decimals)

    # Average odds difference: mean of the false-positive-rate difference and
    # the true-positive-rate difference between the unprivileged and the
    # privileged group. 0 indicates equality of odds.
    aod_value = round(fairness_metrics.average_odds_difference(), decimals)

    report_lines = (
        f"Statistical Parity Difference (SPD): {spd_value}",
        f"Equal Opportunity Difference (EOD): {eod_value}",
        f"Average Odds Difference: {aod_value}",
    )
    for line in report_lines:
        print(line)

In [37]:
# Preview the first rows of the raw dataset.
df_raw.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [38]:
# Encode the target column 'deposit' as integers: 'yes' -> 1, 'no' -> 0.
# Series.map is used instead of Series.replace because replacing strings with
# integers triggers pandas' silent-downcasting FutureWarning.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run on an
# already-encoded column.
deposit_encoding = {'yes': 1, 'no': 0, 1: 1, 0: 0}
df_raw['deposit'] = df_raw['deposit'].map(deposit_encoding)

# Values missing from the mapping become NaN; fail loudly instead of training on them.
assert df_raw['deposit'].isin([0, 1]).all(), "unexpected value in 'deposit' column"
df_raw.head()

  df_raw['deposit'] = df_raw['deposit'].replace({'yes': 1, 'no': 0})


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,1
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,1
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,1
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,1
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,1


In [39]:
def categorize_age(x):
    """Return the age-bucket label ('age<25', '25<=age<60' or 'age>=60') for ``x``."""
    if x < 25:
        return 'age<25'
    if x < 60:
        return '25<=age<60'
    return 'age>=60'


# Replace the numeric 'age' column with its bucket label.
# NOTE: not re-runnable as-is -- a second run would compare strings with ints.
df_raw['age'] = df_raw['age'].apply(categorize_age)
df_raw.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,25<=age<60,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,1
1,25<=age<60,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,1
2,25<=age<60,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,1
3,25<=age<60,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,1
4,25<=age<60,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,1


In [40]:
# Keep only the object-dtype columns and list their names;
# these are the candidates for one-hot encoding.
feature_object = df_raw.select_dtypes(include='object')
feature_object.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'poutcome'],
      dtype='object')

In [41]:
# Columns to one-hot encode; each becomes one '<column>_<value>' indicator
# column per distinct value.
categorical_columns = [
    "age",
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "poutcome",
]
df_raw = pd.get_dummies(data=df_raw, columns=categorical_columns)
df_raw.head()

Unnamed: 0,balance,day,duration,campaign,pdays,previous,deposit,age_25<=age<60,age_age<25,age_age>=60,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,2343,5,1042,1,-1,0,1,True,False,False,...,False,False,True,False,False,False,False,False,False,True
1,45,5,1467,1,-1,0,1,True,False,False,...,False,False,True,False,False,False,False,False,False,True
2,1270,5,1389,1,-1,0,1,True,False,False,...,False,False,True,False,False,False,False,False,False,True
3,2476,5,579,1,-1,0,1,True,False,False,...,False,False,True,False,False,False,False,False,False,True
4,184,5,673,2,-1,0,1,True,False,False,...,False,False,True,False,False,False,False,False,False,True


In [42]:
# Separate the target column from the predictors.
y = df_raw["deposit"]
X = df_raw.drop("deposit", axis=1)

# Hold out 20% of the rows as test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [43]:
# Train a decision tree (fixed seed) on the training split; fit() returns the estimator itself.
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out test set and report the scores.
dt_predictions = dt_clf.predict(X_test)

compute_performance_metrics(y_test, dt_predictions, model_name="Decision Tree")

Decision Tree Accuracy: 0.7908643081056874
Decision Tree Recall: 0.7713214620431116
Decision Tree F1 Score: 0.7789872219592996


In [44]:
# Linear-kernel SVM preceded by feature standardization.
svm_classifier = make_pipeline(
    StandardScaler(),
    SVC(kernel='linear'),
)

# Train the classifier on the training data.
svm_classifier.fit(X_train, y_train)

# Predict the test data and report the scores.
svm_pred = svm_classifier.predict(X_test)

compute_performance_metrics(y_test, svm_pred, model_name="SVM")

SVM Accuracy: 0.8159426780116436
SVM Recall: 0.8003748828491096
SVM F1 Score: 0.8060405851816895


In [45]:
# Random forest of 100 unbounded-depth trees using the Gini criterion (fixed seed).
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    random_state=42,
)
rf_classifier.fit(X_train, y_train)

# Predict the held-out test set and report the scores.
rf_predictions = rf_classifier.predict(X_test)

compute_performance_metrics(y_test, rf_predictions, model_name="Random Forest")

Random Forest Accuracy: 0.8392297357814599
Random Forest Recall: 0.85941893158388
Random Forest F1 Score: 0.836297309621523


In [46]:
# FAIRNESS METRICS FOR THE DECISION TREE MODEL
# Test features plus the true label (assign returns a new frame, X_test is untouched).
dataset = X_test.assign(deposit=y_test)
# Same frame, but the label column holds the decision tree's predictions.
predictions = dataset.assign(deposit=dt_predictions)

In [51]:
# Privileged group: age from 25 to 59 (column 'age_25<=age<60').
# Unprivileged groups: age below 25 and age 60 or above.
# Favorable label: 1
# Unfavorable label: 0

unprivileged_attributes = ["age_age<25", "age_age>=60"]
fairness_metrics = compute_classification_metric(
    dataset=dataset,
    predictions=predictions,
    label_name_v="deposit",
    favorable_label_v=1,
    unfavorable_label_v=0,
    privileged_attribute="age_25<=age<60",
    unprivileged_attributes=unprivileged_attributes,
)
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): 0.316
Equal Opportunity Difference (EOD): 0.064
Average Odds Difference: 0.142


In [55]:
# FAIRNESS METRICS FOR THE SVM MODEL
# Test features plus the true label (assign returns a new frame, X_test is untouched).
dataset = X_test.assign(deposit=y_test)
# Same frame, but the label column holds the SVM's predictions.
predictions = dataset.assign(deposit=svm_pred)

In [57]:
# Privileged group: age from 25 to 59 (column 'age_25<=age<60').
# Unprivileged groups: age below 25 and age 60 or above.
# Favorable label: 1
# Unfavorable label: 0

unprivileged_attributes = ["age_age<25", "age_age>=60"]
fairness_metrics = compute_classification_metric(
    dataset=dataset,
    predictions=predictions,
    label_name_v="deposit",
    favorable_label_v=1,
    unfavorable_label_v=0,
    privileged_attribute="age_25<=age<60",
    unprivileged_attributes=unprivileged_attributes,
)
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): 0.359
Equal Opportunity Difference (EOD): 0.088
Average Odds Difference: 0.166


In [58]:
# FAIRNESS METRICS FOR THE RANDOM FOREST MODEL
# Test features plus the true label (assign returns a new frame, X_test is untouched).
dataset = X_test.assign(deposit=y_test)
# Same frame, but the label column holds the random forest's predictions.
predictions = dataset.assign(deposit=rf_predictions)

In [59]:
# Privileged group: age from 25 to 59 (column 'age_25<=age<60').
# Unprivileged groups: age below 25 and age 60 or above.
# Favorable label: 1
# Unfavorable label: 0

unprivileged_attributes = ["age_age<25", "age_age>=60"]
fairness_metrics = compute_classification_metric(
    dataset=dataset,
    predictions=predictions,
    label_name_v="deposit",
    favorable_label_v=1,
    unfavorable_label_v=0,
    privileged_attribute="age_25<=age<60",
    unprivileged_attributes=unprivileged_attributes,
)
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): 0.362
Equal Opportunity Difference (EOD): 0.064
Average Odds Difference: 0.162
