In [None]:
%pip install -qU fairlearn pandas scikit-learn scipy matplotlib seaborn ucimlrepo ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
from pandas import DataFrame
import numpy as np
from ucimlrepo import fetch_ucirepo

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy import stats

from typing import Optional, Callable, List, Tuple
import sys
from IPython.display import Markdown, display

In [None]:
def load_uci_dataset(
    id: int,
    get_column_names: Optional[Callable[[np.ndarray], List[str]]] = None,
) -> DataFrame:
    # Fetch dataset
    dataset = fetch_ucirepo(id=id)

    X = dataset.data.features
    y = dataset.data.targets

    y = pd.DataFrame(y)

    # Concatenate features and target
    df = pd.concat([X, y], axis=1)

    # Display metadata and variable information
    print("Dataset Metadata:")
    print(dataset.metadata)
    print("\nVariable Information:")
    print(dataset.variables)

    if get_column_names is not None:
        column_names = get_column_names(dataset.variables)
        df.columns = column_names

    # Rename the last column to "target"
    df = df.rename(columns={df.columns[-1]: "target"})

    print(df.columns.values)

    # Return the complete DataFrame with all features and target column labeled "target"
    return df

In [142]:
dataset_names = ["Adult", "Credit", "Bank"]

In [143]:
adult = load_uci_dataset(2)
credit_card = load_uci_dataset(
    350, lambda dataset_variables: dataset_variables["description"][1:].str.lower()  # Ignore ID column
)
bank_marketing = load_uci_dataset(222)

Dataset Metadata:
{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extra

In [144]:
for name, dataset in zip(dataset_names, [adult, credit_card, bank_marketing]):
    print(f"Dataset: {name}")
    print(dataset.head())
    print(dataset.info())
    print("\n---\n")

Dataset: Adult
   age         workclass  fnlwgt  ... hours-per-week  native-country target
0   39         State-gov   77516  ...             40   United-States  <=50K
1   50  Self-emp-not-inc   83311  ...             13   United-States  <=50K
2   38           Private  215646  ...             40   United-States  <=50K
3   53           Private  234721  ...             40   United-States  <=50K
4   28           Private  338409  ...             40            Cuba  <=50K

[5 rows x 15 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 

In [None]:
def prepare_dataset(
    df: DataFrame,
    features: List[str],
    sensitive_feature: str,
    columns_names: Optional[List[str]] = None,
    convert_target: Optional[Callable[[str], int]] = None,
) -> Tuple[np.ndarray[np.ndarray], np.ndarray, np.ndarray]:
    pass
    df = df.dropna()

    # Select features
    X = df[features]
    y = df["target"]
    if convert_target is not None:
        y = y.apply(convert_target)

    if columns_names is not None:
        X.columns = columns_names

    sensitive_attribute = df[sensitive_feature]  # "sex" or "race" or "age"

    # Normalize features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, y.values, sensitive_attribute.values

In [146]:
features_adult = ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
features_credit_card = credit_card.columns[:-1]  # Features are all columns except target
# features_bank_marketing = [col for col in bank_marketing.columns if bank_marketing[col].dtype == "int64" and col != "target"]
# print(features_bank_marketing)

# "duration" not included as specified on https://archive.ics.uci.edu/dataset/222/bank+marketing
features_bank_marketing = ["age", "balance", "campaign", "pdays", "previous"]
# ['limit_bal', 'sex', 'education', 'marriage', 'age', 'pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6', 'bill_amt1', 'bill_amt2', 'bill_amt3', 'bill_amt4', 'bill_amt5', 'bill_amt6', 'pay_amt1', 'pay_amt2', 'pay_amt3', 'pay_amt4', 'pay_amt5', 'pay_amt6']

adult_preprocessed = prepare_dataset(
    adult,
    features=features_adult,
    sensitive_feature="sex",
    convert_target=lambda t: 0 if t.strip() == "<=50K" else 1,
)
credit_preprocessed = prepare_dataset(
    credit_card,
    features_credit_card,
    sensitive_feature="sex",
)
bank_preprocessed = prepare_dataset(
    bank_marketing,
    features_bank_marketing,
    sensitive_feature="marital",
    convert_target=lambda t: 0 if t.strip() == "no" else 1,
)
# prepare_dataset(
#     bank_marketing,
#     features=[
#         "age",
#         "job",
#         "marital",
#         "education",
#         "default",
#         "housing",
#         "loan",
#         "contact",
#         "month",
#         "day_of_week",
#         "duration",
#         "campaign",
#         "pdays",
#         "previous",
#         "poutcome",
#         "emp.var.rate",
#         "cons.price.idx",
#         "cons.conf.idx",
#         "euribor3m",
#         "nr.employed",
#     ],
#     sensitive_feature="marital",
# )

In [147]:
RANDOM_STATE = 42   # for reproducibility

In [148]:
def assess_fairness(X, y, sensitive_attribute, model, k=20):
    kf = KFold(n_splits=k, shuffle=True, random_state=RANDOM_STATE)

    metrics = {
        group: {"TPR": [], "FPR": [], "FN_FP_ratio": []}
        for group in np.unique(sensitive_attribute)
    }

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        sensitive_train, sensitive_test = (
            sensitive_attribute[train_index],
            sensitive_attribute[test_index],
        )

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        for group in np.unique(sensitive_attribute):
            group_mask = sensitive_test == group
            tn, fp, fn, tp = confusion_matrix(
                y_test[group_mask], y_pred[group_mask]
            ).ravel()

            tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
            fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
            fn_fp_ratio = fn / fp if fp > 0 else 0

            metrics[group]["TPR"].append(tpr)
            metrics[group]["FPR"].append(fpr)
            metrics[group]["FN_FP_ratio"].append(fn_fp_ratio)

    return metrics

In [149]:
def statistical_test(metrics, group1, group2):
    results = {}
    for metric in ["TPR", "FPR", "FN_FP_ratio"]:
        t_stat, p_value = stats.ttest_ind(
            metrics[group1][metric], metrics[group2][metric]
        )
        results[metric] = {"t_statistic": t_stat, "p_value": p_value}
    return results

In [166]:
USE_ALL_PROCESSORS = -1
models = {
    "Logistic Regression": LogisticRegression(random_state=RANDOM_STATE, n_jobs=USE_ALL_PROCESSORS),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=USE_ALL_PROCESSORS),
}

In [None]:
def run_dataset(X, y, sensitive_attribute):
    # Assess fairness for each model
    for model_name, model in models.items():
        print(f"\nAssessing fairness for {model_name}")
        metrics = assess_fairness(X, y, sensitive_attribute, model)

        # Print average metrics for each group
        groups = np.unique(sensitive_attribute)

        assert(len(groups) == 2) # Assumes sensitive attributes only have 2 possible values

        for group in groups:
            print(f"\nGroup: {group}")
            for metric, values in metrics[group].items():
                print(f"{metric}: {np.mean(values):.4f}")

        # Perform statistical test
        results = statistical_test(metrics, groups[0], groups[1])
        print("\nStatistical Test Results:")
        for metric, statistics in results.items():
            print(
                f"{metric}: t-statistic = {statistics['t_statistic']:.4f}, p-value = {statistics['p_value']:.4f}"
            )

        # Train and evaluate model on entire dataset
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=RANDOM_STATE
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"\nOverall Accuracy: {accuracy:.4f}")

In [176]:
output_file = "output.md"

In [177]:
%%capture output
# Direct output to stdout to a .md file, then show it in Markdown
print("# Training and testing models for biases")
for name, dataset in zip(dataset_names, [adult_preprocessed, credit_preprocessed, ]):
    print(f"## Dataset {name}")
    X, y, sensitive_attribute = dataset
    run_dataset(X, y, sensitive_attribute)

with open(output_file, "w") as f:
    f.write(output.stdout)


In [178]:
with open(output_file) as f:
    display(Markdown(f.read()))

# Training and testing models for biases
## Dataset Adult

Assessing fairness for Logistic Regression

Group: Female
TPR: 0.3756
FPR: 0.2295
FN_FP_ratio: 1.7363

Group: Male
TPR: 0.5554
FPR: 0.2768
FN_FP_ratio: 1.7944

Statistical Test Results:
TPR: t-statistic = -23.5105, p-value = 0.0000
FPR: t-statistic = -8.6284, p-value = 0.0000
FN_FP_ratio: t-statistic = -1.0967, p-value = 0.2797

Overall Accuracy: 0.6252

Assessing fairness for Random Forest

Group: Female
TPR: 0.3904
FPR: 0.2777
FN_FP_ratio: 1.4016

Group: Male
TPR: 0.5563
FPR: 0.2931
FN_FP_ratio: 1.6904

Statistical Test Results:
TPR: t-statistic = -20.7366, p-value = 0.0000
FPR: t-statistic = -2.2705, p-value = 0.0289
FN_FP_ratio: t-statistic = -6.5685, p-value = 0.0000

Overall Accuracy: 0.6156
## Dataset Credit

Assessing fairness for Logistic Regression

Group: 1
TPR: 0.2649
FPR: 0.0366
FN_FP_ratio: 6.8539

Group: 2
TPR: 0.2200
FPR: 0.0210
FN_FP_ratio: 11.1209

Statistical Test Results:
TPR: t-statistic = 4.1715, p-value = 0.0002
FPR: t-statistic = 6.3880, p-value = 0.0000
FN_FP_ratio: t-statistic = -3.7590, p-value = 0.0006

Overall Accuracy: 0.8097

Assessing fairness for Random Forest

Group: 1
TPR: 0.3763
FPR: 0.0682
FN_FP_ratio: 3.0430

Group: 2
TPR: 0.3558
FPR: 0.0513
FN_FP_ratio: 3.4129

Statistical Test Results:
TPR: t-statistic = 1.9164, p-value = 0.0629
FPR: t-statistic = 4.5476, p-value = 0.0001
FN_FP_ratio: t-statistic = -1.5738, p-value = 0.1238

Overall Accuracy: 0.8160


In [163]:
with open(output_file) as f:
    display(Markdown(f.read()))

# Training and testing models for biases
## Dataset Adult

Assessing fairness for Logistic Regression

Group: Female
TPR: 0.3756
FPR: 0.2295
FN_FP_ratio: 1.7363

Group: Male
TPR: 0.5554
FPR: 0.2768
FN_FP_ratio: 1.7944

Statistical Test Results:
TPR: t-statistic = -23.5105, p-value = 0.0000
FPR: t-statistic = -8.6284, p-value = 0.0000
FN_FP_ratio: t-statistic = -1.0967, p-value = 0.2797

Overall Accuracy: 0.6252

Assessing fairness for Random Forest

Group: Female
TPR: 0.3904
FPR: 0.2777
FN_FP_ratio: 1.4016

Group: Male
TPR: 0.5563
FPR: 0.2931
FN_FP_ratio: 1.6904

Statistical Test Results:
TPR: t-statistic = -20.7366, p-value = 0.0000
FPR: t-statistic = -2.2705, p-value = 0.0289
FN_FP_ratio: t-statistic = -6.5685, p-value = 0.0000

Overall Accuracy: 0.6156
## Dataset Credit

Assessing fairness for Logistic Regression

Group: 1
TPR: 0.2649
FPR: 0.0366
FN_FP_ratio: 6.8539

Group: 2
TPR: 0.2200
FPR: 0.0210
FN_FP_ratio: 11.1209

Statistical Test Results:
TPR: t-statistic = 4.1715, p-value = 0.0002
FPR: t-statistic = 6.3880, p-value = 0.0000
FN_FP_ratio: t-statistic = -3.7590, p-value = 0.0006

Overall Accuracy: 0.8097

Assessing fairness for Random Forest

Group: 1
TPR: 0.3763
FPR: 0.0682
FN_FP_ratio: 3.0430

Group: 2
TPR: 0.3558
FPR: 0.0513
FN_FP_ratio: 3.4129

Statistical Test Results:
TPR: t-statistic = 1.9164, p-value = 0.0629
FPR: t-statistic = 4.5476, p-value = 0.0001
FN_FP_ratio: t-statistic = -1.5738, p-value = 0.1238

Overall Accuracy: 0.8160
