In [20]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, f1_score
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import ClassificationMetric
from sklearn.model_selection import train_test_split
from aif360.sklearn.inprocessing import ExponentiatedGradientReduction
#from aif360.algorithms.inprocessing import GerryFairClassifier

In [21]:
def compute_performance_metrics(y_test, y_pred, model_name):
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1_score_value = f1_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")
    print(f"{model_name} Recall: {recall}")
    print(f"{model_name} F1 Score: {f1_score_value}")

def compute_classification_metric(dataset, predictions, label_name_v, favorable_label_v, unfavorable_label_v, privileged_attribute, unprivileged_attributes):
    features = [privileged_attribute] + unprivileged_attributes

    aif_age_dataset = BinaryLabelDataset(
            df=dataset,
            favorable_label=favorable_label_v,
            unfavorable_label=unfavorable_label_v,
            label_names=[label_name_v],
            protected_attribute_names=features,
            privileged_protected_attributes=[privileged_attribute],
        )

    aif_age_pred = BinaryLabelDataset(
            df=predictions,
            favorable_label=favorable_label_v,
            unfavorable_label=unfavorable_label_v,
            label_names=[label_name_v],
            protected_attribute_names=features,
            privileged_protected_attributes=[privileged_attribute],
        )

    age_privileged_group = [{privileged_attribute: 1}]
    age_unprivileged_groups = [{attr: 1} for attr in unprivileged_attributes]

    fairness_metrics = ClassificationMetric(dataset=aif_age_dataset,
                            classified_dataset=aif_age_pred,
                            unprivileged_groups=age_unprivileged_groups,
                            privileged_groups=age_privileged_group)
    
    return fairness_metrics

def compute_fairness_metrics(fairness_metrics: ClassificationMetric):
    # Values less than 0 indicate that privileged group has higher
    # proportion of predicted positive outcomes than unprivileged group.
    # Value higher than 0 indicates that unprivileged group has higher proportion
    # of predicted positive outcomes than privileged group.
    SPD = round(fairness_metrics.statistical_parity_difference(),3)

    # Measures the deviation from the equality of opportunity, which means that the same
    # proportion of each population receives the favorable outcome. This measure must be equal to 0 to be fair.
    EOD = round(fairness_metrics.equal_opportunity_difference(),3)

    # Average of difference in False Positive Rate and True Positive Rate for unprivileged and privileged groups
    # A value of 0 indicates equality of odds, which means that samples in both the privileged and unprivileged
    # groups have the same probability of being classified positively.
    AOD = round(fairness_metrics.average_odds_difference(),3)

    print(f"Statistical Parity Difference (SPD): {SPD}")
    print(f"Equal Opportunity Difference (EOD): {EOD}")
    print(f"Average Odds Difference: {AOD}")

In [22]:
dataset_path = 'bankTrainedTidy.csv'
df_raw = pd.read_csv(dataset_path)
pd.set_option('display.max_columns', None)

In [23]:

X = df_raw.drop(columns="deposit")
y = df_raw["deposit"]

# Define four sets and apply the function
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, # 0.2 indicates a test set size of 20%
                                                    random_state=42)

privileged_attributeAge = "age_25<=age<60"
unprivileged_attributesAge = ["age_age<25", "age_age>=60"]
age_features = [privileged_attributeAge] + unprivileged_attributesAge


In [24]:
# First, we train our decision tree regularly
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)


# We create an ExponentiatedGradientReduction object that acts on the trained model to change its internal behavior
processed_dt_clf = ExponentiatedGradientReduction(prot_attr=age_features, estimator=dt_clf, constraints='DemographicParity', drop_prot_attr=False)


# Then we train again the same model updated with the reduction
fair_dt_clf = processed_dt_clf.fit(X_train, y_train)

# After the training phase, the model will be tested by predicting the values on the test set
fair_dt_predictions = fair_dt_clf.predict(X_test)

compute_performance_metrics(y_test,fair_dt_predictions,"Decision Tree")



You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  self.pos_basis[i]["+", e, g] = 1
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series

Decision Tree Accuracy: 0.7621818181818182
Decision Tree Recall: 0.6265938069216758
Decision Tree F1 Score: 0.677832512315271


In [25]:
# Creo la pipeline senza addestrarla
svm_pipeline = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))

# Addestro il classificatore sui dati di training 
svm_classifier = svm_pipeline.named_steps['linearsvc'].fit(X_train, y_train.values.ravel())

# Configurazione e addestramento ExponentiatedGradientReduction
processed_svm = ExponentiatedGradientReduction(
    prot_attr=age_features, 
    estimator=svm_classifier, 
    constraints='DemographicParity',
    drop_prot_attr=False
)

fair_svm = processed_svm.fit(X_train, y_train)

# predizioni sui dati di test
fair_svm_pred = fair_svm.predict(X_test)

# Calcolo le metriche di performance
compute_performance_metrics(y_test, fair_svm_pred, "SVM")

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  self.pos_basis[i]["+", e, g] = 1
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series

SVM Accuracy: 0.7985454545454546
SVM Recall: 0.6648451730418944
SVM F1 Score: 0.7249255213505462


In [26]:
# First, we train our decision tree regularly
rf_classifier = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth = None, random_state=42)
rf_classifier.fit(X_train, y_train)

# We create an ExponentiatedGradientReduction object that acts on the trained model to change its internal behavior
processed_rf_classifier = ExponentiatedGradientReduction(prot_attr=age_features, estimator=rf_classifier, constraints='DemographicParity', drop_prot_attr=False)

# Then we train again the same model updated with the reduction
fair_rf_classifier = processed_rf_classifier.fit(X_train, y_train)

# After the training phase, the model will be tested by predicting the values on the test set
fair_rf_predictions = fair_rf_classifier.predict(X_test)

compute_performance_metrics(y_test,fair_rf_predictions,"Random Forest")

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  self.pos_basis[i]["+", e, g] = 1
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series

Random Forest Accuracy: 0.8087272727272727
Random Forest Recall: 0.6593806921675774
Random Forest F1 Score: 0.7335359675785208


In [27]:
#CALCOLO LE METRICHE DI FAIRNESS CONSIDERANDO COME MODELLO IL DECISION TREE
dataset = X_test.copy(deep=True) # we create a copy of the test set
dataset['deposit'] = y_test  # and join the target feature with the others
predictions = dataset.copy(deep=True) # we do the same task
predictions['deposit'] = fair_dt_predictions # but this time the target feature is made by the predictions of our model

In [28]:
#Attributo privilegiato la colonna age_25<=age<60
#Attributo non privilegiato la colonna age_age<25, age_age>=60
# Valore favorevole 1
# Valore non favorevole 0

fairness_metrics = compute_classification_metric(dataset,predictions,"deposit",1,0,privileged_attributeAge, unprivileged_attributesAge) #prima favorevole
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): 0.006
Equal Opportunity Difference (EOD): -0.284
Average Odds Difference: -0.139


In [29]:
#CALCOLO LE METRICHE DI FAIRNESS CONSIDERANDO COME MODELLO L'SVM
dataset = X_test.copy(deep=True)
dataset['deposit'] = y_test
predictions = dataset.copy(deep=True) 
predictions['deposit'] = fair_svm_pred

In [30]:
#Attributo privilegiato la colonna age_25<=age<60
#Attributo non privilegiato la colonna age_age<25, age_age>=60
# Valore favorevole 1
# Valore non favorevole 0

fairness_metrics = compute_classification_metric(dataset,predictions,"deposit",1,0,privileged_attributeAge,unprivileged_attributesAge) #prima favorevole
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): 0.065
Equal Opportunity Difference (EOD): -0.244
Average Odds Difference: -0.121


In [31]:
#CALCOLO LE METRICHE DI FAIRNESS CONSIDERANDO COME MODELLO Il Random Forest
dataset = X_test.copy(deep=True)
dataset['deposit'] = y_test
predictions = dataset.copy(deep=True) 
predictions['deposit'] = fair_rf_predictions

In [32]:
#Attributo privilegiato la colonna age_25<=age<60
#Attributo non privilegiato la colonna age_age<25, age_age>=60
# Valore favorevole 1
# Valore non favorevole 0

fairness_metrics = compute_classification_metric(dataset,predictions,"deposit",1,0,privileged_attributeAge,unprivileged_attributesAge) #prima favorevole
compute_fairness_metrics(fairness_metrics)

Statistical Parity Difference (SPD): 0.072
Equal Opportunity Difference (EOD): -0.238
Average Odds Difference: -0.127
