In [67]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import ClassificationMetric
from sklearn.model_selection import train_test_split

In [68]:
# Path of the German Credit CSV (relative, so it is resolved against the working directory).
dataset_path = 'GermanCreditDataset/German-Dataset.csv'
# Load the raw table; the following cells recode and encode it under the same name.
df_raw = pd.read_csv(dataset_path)

In [69]:
def compute_performance_metrics(y_test, y_pred, model_name):
    """Print accuracy, recall and F1 score of a model's predictions.

    Args:
        y_test: Ground-truth labels.
        y_pred: Labels predicted by the model for the same samples.
        model_name: Text placed at the start of every printed line.

    Returns:
        None. The three scores are only printed, one per line.
    """
    # All three scores are computed before anything is printed, so a failing
    # scorer leaves no partial report behind.
    accuracy, recall, f1_score_value = (
        scorer(y_test, y_pred)
        for scorer in (accuracy_score, recall_score, f1_score)
    )
    print(f"{model_name} Accuracy: {accuracy}")
    print(f"{model_name} Recall: {recall}")
    print(f"{model_name} F1 Score: {f1_score_value}")

def compute_classification_metric(dataset, predictions, label_name_v, favorable_label_v, unfavorable_label_v, privileged_attribute, unprivileged_attributes):
    """Build an AIF360 ``ClassificationMetric`` comparing true and predicted labels.

    Args:
        dataset: DataFrame with the features and the ground-truth label column.
        predictions: DataFrame with the same layout whose label column holds
            the model's predictions instead.
        label_name_v: Name of the label column in both frames.
        favorable_label_v: Label value treated as the favorable outcome.
        unfavorable_label_v: Label value treated as the unfavorable outcome.
        privileged_attribute: Name of the indicator column that equals 1 for
            members of the privileged group.
        unprivileged_attributes: Iterable with the names of the indicator
            columns that equal 1 for members of the unprivileged group(s).

    Returns:
        A ``ClassificationMetric`` whose privileged group is
        ``{privileged_attribute: 1}`` and whose unprivileged groups are
        ``{name: 1}`` for each name in ``unprivileged_attributes``.
    """
    # Materialise once so any iterable (list, tuple, generator) can be reused below.
    unprivileged_attributes = list(unprivileged_attributes)
    protected_columns = [privileged_attribute] + unprivileged_attributes

    def _to_aif360(frame):
        """Wrap a DataFrame as a BinaryLabelDataset with the shared label settings."""
        # ``privileged_protected_attributes`` is intentionally not passed. AIF360
        # documents it as a list of arrays of privileged *values* (one per
        # protected attribute); the previous code supplied column names instead.
        # NOTE(review): the library appears to infer these values from the data
        # whenever the unprivileged counterpart is omitted, so the old argument
        # had no effect - confirm against the installed aif360 version.
        return BinaryLabelDataset(
            df=frame,
            favorable_label=favorable_label_v,
            unfavorable_label=unfavorable_label_v,
            label_names=[label_name_v],
            protected_attribute_names=protected_columns,
        )

    true_dataset = _to_aif360(dataset)
    predicted_dataset = _to_aif360(predictions)

    # Group membership is expressed through the indicator columns being 1.
    privileged_groups = [{privileged_attribute: 1}]
    unprivileged_groups = [{attr: 1} for attr in unprivileged_attributes]

    return ClassificationMetric(
        dataset=true_dataset,
        classified_dataset=predicted_dataset,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups,
    )

def compute_fairness_metrics(fairness_metrics: ClassificationMetric):
    """Print three group-fairness measures, each rounded to three decimals.

    Args:
        fairness_metrics: Metric object that provides
            ``statistical_parity_difference``, ``equal_opportunity_difference``
            and ``average_odds_difference``.

    Returns:
        None. The values are only printed.

    Reading the numbers (per the AIF360 definitions, unprivileged minus privileged):
        * SPD - difference in the rate of favorable predictions. Negative means
          the privileged group receives favorable predictions more often,
          positive means the unprivileged group does.
        * EOD - difference in true positive rates. 0 means equal opportunity.
        * AOD - mean of the false-positive-rate difference and the
          true-positive-rate difference. 0 means equalized odds.
    """
    decimals = 3
    SPD, EOD, AOD = (
        round(measure(), decimals)
        for measure in (
            fairness_metrics.statistical_parity_difference,
            fairness_metrics.equal_opportunity_difference,
            fairness_metrics.average_odds_difference,
        )
    )

    print(f"Statistical Parity Difference (SPD): {SPD}")
    print(f"Equal Opportunity Difference (EOD): {EOD}")
    print(f"Average Odds Difference: {AOD}")

In [70]:
# Codes in the 'sex' column: A92 = female, A93 = male
# Age groups used later: young <= 25, old > 25
# Target: 1 = good, 2 = bad
df_raw.head()

Unnamed: 0,Status of exisiting checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,sex,Other debtors / guarantors,...,Property,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,foreign worker,Target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [71]:
# List the column names of the raw table.
df_raw.columns

Index(['Status of exisiting checking account', 'Duration in month',
       'Credit history', 'Purpose', 'Credit amount', 'Savings account/bonds',
       'Present employment since',
       'Installment rate in percentage of disposable income', 'sex',
       'Other debtors / guarantors', 'Present residence since', 'Property',
       'Age in years', 'Other installment plans', 'Housing',
       'Number of existing credits at this bank', 'Job',
       'Number of people being liable to provide maintenance for', 'Telephone',
       'foreign worker', 'Target'],
      dtype='object')

In [72]:
# Collapse the codes of the 'sex' column into just Male / Female.
# The mapping is applied to that column only, so an identical code occurring in
# any other column can never be rewritten by accident.
sex_labels = {'A93': 'Male', 'A91': 'Male', 'A94': 'Male', 'A92': 'Female', 'A95': 'Female'}
df_raw = df_raw.assign(sex=df_raw['sex'].replace(sex_labels))
df_raw.head()

Unnamed: 0,Status of exisiting checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,sex,Other debtors / guarantors,...,Property,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,foreign worker,Target
0,A11,6,A34,A43,1169,A65,A75,4,Male,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,Female,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,Male,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,Male,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,Male,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [73]:
# Add an 'age' column that bins 'Age in years' into 'young' (up to and
# including the threshold) and 'old' (above it).
age_threshold=25


def _age_group(years):
    """Return 'young' when ``years`` is at most ``age_threshold``, else 'old'."""
    if years <= age_threshold:
        return 'young'
    return 'old'


df_raw['age'] = df_raw['Age in years'].apply(_age_group)
df_raw.head()

Unnamed: 0,Status of exisiting checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,sex,Other debtors / guarantors,...,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,foreign worker,Target,age
0,A11,6,A34,A43,1169,A65,A75,4,Male,A101,...,67,A143,A152,2,A173,1,A192,A201,1,old
1,A12,48,A32,A43,5951,A61,A73,2,Female,A101,...,22,A143,A152,1,A173,1,A191,A201,2,young
2,A14,12,A34,A46,2096,A61,A74,2,Male,A101,...,49,A143,A152,1,A172,2,A191,A201,1,old
3,A11,42,A32,A42,7882,A61,A74,2,Male,A103,...,45,A143,A153,1,A173,2,A191,A201,1,old
4,A11,24,A33,A40,4870,A61,A73,3,Male,A101,...,53,A143,A153,2,A173,2,A191,A201,2,old


In [74]:
# 'Age in years' is NOT removed here: the drop below is disabled, so the numeric
# age stays in the frame next to the derived 'age' column.
#columns_to_drop = ["Age in years"]
#df_raw = df_raw.drop(columns=columns_to_drop)
# Discard every row that contains a missing value.
df_raw = df_raw.dropna()
df_raw.head()

Unnamed: 0,Status of exisiting checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,sex,Other debtors / guarantors,...,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,foreign worker,Target,age
0,A11,6,A34,A43,1169,A65,A75,4,Male,A101,...,67,A143,A152,2,A173,1,A192,A201,1,old
1,A12,48,A32,A43,5951,A61,A73,2,Female,A101,...,22,A143,A152,1,A173,1,A191,A201,2,young
2,A14,12,A34,A46,2096,A61,A74,2,Male,A101,...,49,A143,A152,1,A172,2,A191,A201,1,old
3,A11,42,A32,A42,7882,A61,A74,2,Male,A103,...,45,A143,A153,1,A173,2,A191,A201,1,old
4,A11,24,A33,A40,4870,A61,A73,3,Male,A101,...,53,A143,A153,2,A173,2,A191,A201,2,old


In [75]:
# Column dtypes and non-null counts before the categorical columns are encoded.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column                                                    Non-Null Count  Dtype 
---  ------                                                    --------------  ----- 
 0   Status of exisiting checking account                      1000 non-null   object
 1   Duration in month                                         1000 non-null   int64 
 2   Credit history                                            1000 non-null   object
 3   Purpose                                                   1000 non-null   object
 4   Credit amount                                             1000 non-null   int64 
 5   Savings account/bonds                                     1000 non-null   object
 6   Present employment since                                  1000 non-null   object
 7   Installment rate in percentage of disposable income       1000 non-null   int64 
 8   sex                          

In [76]:
# Columns that get one-hot encoded: the coded categorical features plus the
# derived 'age' group.
categorical_columns = [
    "Status of exisiting checking account",
    "Credit history",
    "Purpose",
    "Savings account/bonds",
    "Present employment since",
    "sex",
    "Other debtors / guarantors",
    "Property",
    "Other installment plans",
    "Housing",
    "Job",
    "Telephone",
    "foreign worker",
    "age",
]
# Each listed column is replaced by one indicator column per distinct value.
df_raw = pd.get_dummies(data=df_raw, columns=categorical_columns)

In [77]:
# Column dtypes and non-null counts after one-hot encoding.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 62 columns):
 #   Column                                                    Non-Null Count  Dtype
---  ------                                                    --------------  -----
 0   Duration in month                                         1000 non-null   int64
 1   Credit amount                                             1000 non-null   int64
 2   Installment rate in percentage of disposable income       1000 non-null   int64
 3   Present residence since                                   1000 non-null   int64
 4   Age in years                                              1000 non-null   int64
 5   Number of existing credits at this bank                   1000 non-null   int64
 6   Number of people being liable to provide maintenance for  1000 non-null   int64
 7   Target                                                    1000 non-null   int64
 8   Status of exisiting checking account_A1

In [78]:
# Preview the encoded table.
df_raw.head()

Unnamed: 0,Duration in month,Credit amount,Installment rate in percentage of disposable income,Present residence since,Age in years,Number of existing credits at this bank,Number of people being liable to provide maintenance for,Target,Status of exisiting checking account_A11,Status of exisiting checking account_A12,...,Job_A171,Job_A172,Job_A173,Job_A174,Telephone_A191,Telephone_A192,foreign worker_A201,foreign worker_A202,age_old,age_young
0,6,1169,4,4,67,2,1,1,True,False,...,False,False,True,False,False,True,True,False,True,False
1,48,5951,2,2,22,1,1,2,False,True,...,False,False,True,False,True,False,True,False,False,True
2,12,2096,2,3,49,1,2,1,False,False,...,False,True,False,False,True,False,True,False,True,False
3,42,7882,2,4,45,1,2,1,True,False,...,False,False,True,False,True,False,True,False,True,False
4,24,4870,3,4,53,2,2,2,True,False,...,False,False,True,False,True,False,True,False,True,False


In [79]:
# Recode the 'Target' column: 1 (good) stays 1, 2 (bad) becomes 0.
target_recoding = {1: 1, 2: 0}
df_raw['Target'] = df_raw['Target'].replace(target_recoding)
df_raw.head()

Unnamed: 0,Duration in month,Credit amount,Installment rate in percentage of disposable income,Present residence since,Age in years,Number of existing credits at this bank,Number of people being liable to provide maintenance for,Target,Status of exisiting checking account_A11,Status of exisiting checking account_A12,...,Job_A171,Job_A172,Job_A173,Job_A174,Telephone_A191,Telephone_A192,foreign worker_A201,foreign worker_A202,age_old,age_young
0,6,1169,4,4,67,2,1,1,True,False,...,False,False,True,False,False,True,True,False,True,False
1,48,5951,2,2,22,1,1,0,False,True,...,False,False,True,False,True,False,True,False,False,True
2,12,2096,2,3,49,1,2,1,False,False,...,False,True,False,False,True,False,True,False,True,False
3,42,7882,2,4,45,1,2,1,True,False,...,False,False,True,False,True,False,True,False,True,False
4,24,4870,3,4,53,2,2,0,True,False,...,False,False,True,False,True,False,True,False,True,False


In [80]:
# List the column names after encoding.
df_raw.columns

Index(['Duration in month', 'Credit amount',
       'Installment rate in percentage of disposable income',
       'Present residence since', 'Age in years',
       'Number of existing credits at this bank',
       'Number of people being liable to provide maintenance for', 'Target',
       'Status of exisiting checking account_A11',
       'Status of exisiting checking account_A12',
       'Status of exisiting checking account_A13',
       'Status of exisiting checking account_A14', 'Credit history_A30',
       'Credit history_A31', 'Credit history_A32', 'Credit history_A33',
       'Credit history_A34', 'Purpose_A40', 'Purpose_A41', 'Purpose_A410',
       'Purpose_A42', 'Purpose_A43', 'Purpose_A44', 'Purpose_A45',
       'Purpose_A46', 'Purpose_A48', 'Purpose_A49',
       'Savings account/bonds_A61', 'Savings account/bonds_A62',
       'Savings account/bonds_A63', 'Savings account/bonds_A64',
       'Savings account/bonds_A65', 'Present employment since_A71',
       'Present employmen

In [97]:
# Write the current frame to a second CSV, leaving out the index column.
# NOTE(review): this cell shows execution count 97 while its neighbours show 80
# and 81, i.e. it was run out of order - confirm the file on disk matches the
# frame as it exists at this point of a top-to-bottom run.
df_raw.to_csv('GermanCreditDataset/German-Dataset2.csv', index=False) 

In [81]:
# Separate the label from the features.
y = df_raw["Target"]
X = df_raw.drop(columns=["Target"])

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [82]:
# Train a decision tree with a fixed seed (``fit`` returns the fitted estimator).
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the labels of the held-out test rows.
dt_predictions = dt_clf.predict(X_test)

compute_performance_metrics(y_test=y_test, y_pred=dt_predictions, model_name="Decision Tree")

Decision Tree Accuracy: 0.69
Decision Tree Recall: 0.7801418439716312
Decision Tree F1 Score: 0.7801418439716312


In [83]:
# Standardise the features and feed them to a linear-kernel SVM in one pipeline,
# then train it on the training data (``fit`` returns the fitted pipeline).
svm_classifier = make_pipeline(StandardScaler(), SVC(kernel='linear')).fit(X_train, y_train)

# Predict the labels of the test data.
svm_pred = svm_classifier.predict(X_test)

compute_performance_metrics(y_test=y_test, y_pred=svm_pred, model_name="SVM")

SVM Accuracy: 0.76
SVM Recall: 0.8723404255319149
SVM F1 Score: 0.8367346938775511


In [84]:
# Train a 100-tree random forest (Gini criterion, unlimited depth, fixed seed).
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    random_state=42,
).fit(X_train, y_train)

# Predict the labels of the held-out test rows.
rf_predictions = rf_classifier.predict(X_test)

compute_performance_metrics(y_test=y_test, y_pred=rf_predictions, model_name="Random Forest")

Random Forest Accuracy: 0.76
Random Forest Recall: 0.900709219858156
Random Forest F1 Score: 0.8410596026490066


In [85]:
# FAIRNESS METRICS FOR THE DECISION TREE MODEL
# Copy of the test features with the true labels attached as 'Target'.
dataset = X_test.assign(Target=y_test)
# Same frame, except that 'Target' holds the decision tree's predictions.
predictions = dataset.assign(Target=dt_predictions)

In [86]:
# Privileged attribute: column sex_Male
# Unprivileged attribute: column sex_Female
# Favorable label value: 1
# Unfavorable label value: 0

unprivileged_attributes = ["sex_Female"]
fairness_metrics = compute_classification_metric(
    dataset=dataset,
    predictions=predictions,
    label_name_v="Target",
    favorable_label_v=1,  # the favorable label (1) precedes the unfavorable one (0)
    unfavorable_label_v=0,
    privileged_attribute="sex_Male",
    unprivileged_attributes=unprivileged_attributes,
)
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): -0.012
Equal Opportunity Difference (EOD): -0.015
Average Odds Difference: -0.005


In [87]:
# Privileged attribute: column age_old
# Unprivileged attribute: column age_young
# Favorable label value: 1, unfavorable label value: 0

unprivileged_attributes = ["age_young"]
fairness_metrics = compute_classification_metric(
    dataset=dataset,
    predictions=predictions,
    label_name_v="Target",
    favorable_label_v=1,
    unfavorable_label_v=0,
    privileged_attribute="age_old",
    unprivileged_attributes=unprivileged_attributes,
)
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): -0.089
Equal Opportunity Difference (EOD): -0.127
Average Odds Difference: -0.055


In [88]:
# FAIRNESS METRICS FOR THE SVM MODEL
# Copy of the test features with the true labels attached as 'Target'.
dataset = X_test.assign(Target=y_test)
# Same frame, except that 'Target' holds the SVM's predictions.
predictions = dataset.assign(Target=svm_pred)

In [89]:
# Privileged attribute: column sex_Male
# Unprivileged attribute: column sex_Female
# Favorable label value: 1
# Unfavorable label value: 0

unprivileged_attributes = ["sex_Female"]
fairness_metrics = compute_classification_metric(
    dataset=dataset,
    predictions=predictions,
    label_name_v="Target",
    favorable_label_v=1,  # the favorable label (1) precedes the unfavorable one (0)
    unfavorable_label_v=0,
    privileged_attribute="sex_Male",
    unprivileged_attributes=unprivileged_attributes,
)
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): -0.021
Equal Opportunity Difference (EOD): 0.035
Average Odds Difference: -0.051


In [90]:
# Privileged attribute: column age_old
# Unprivileged attribute: column age_young
# Favorable label value: 1, unfavorable label value: 0

unprivileged_attributes = ["age_young"]
fairness_metrics = compute_classification_metric(
    dataset=dataset,
    predictions=predictions,
    label_name_v="Target",
    favorable_label_v=1,
    unfavorable_label_v=0,
    privileged_attribute="age_old",
    unprivileged_attributes=unprivileged_attributes,
)
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): -0.379
Equal Opportunity Difference (EOD): -0.242
Average Odds Difference: -0.447


In [91]:
# FAIRNESS METRICS FOR THE RANDOM FOREST MODEL
# Copy of the test features with the true labels attached as 'Target'.
dataset = X_test.assign(Target=y_test)
# Same frame, except that 'Target' holds the random forest's predictions.
predictions = dataset.assign(Target=rf_predictions)

In [92]:
# Privileged attribute: column sex_Male
# Unprivileged attribute: column sex_Female
# Favorable label value: 1
# Unfavorable label value: 0

unprivileged_attributes = ["sex_Female"]
fairness_metrics = compute_classification_metric(
    dataset=dataset,
    predictions=predictions,
    label_name_v="Target",
    favorable_label_v=1,  # the favorable label (1) precedes the unfavorable one (0)
    unfavorable_label_v=0,
    privileged_attribute="sex_Male",
    unprivileged_attributes=unprivileged_attributes,
)
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): 0.023
Equal Opportunity Difference (EOD): 0.066
Average Odds Difference: 0.0


In [93]:
# Privileged attribute: column age_old
# Unprivileged attribute: column age_young
# Favorable label value: 1, unfavorable label value: 0

unprivileged_attributes = ["age_young"]
fairness_metrics = compute_classification_metric(
    dataset=dataset,
    predictions=predictions,
    label_name_v="Target",
    favorable_label_v=1,
    unfavorable_label_v=0,
    privileged_attribute="age_old",
    unprivileged_attributes=unprivileged_attributes,
)
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): -0.276
Equal Opportunity Difference (EOD): -0.143
Average Odds Difference: -0.343
