In [26]:
%pip install -qU fairlearn pandas scikit-learn scipy matplotlib seaborn ucimlrepo ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [27]:
import pandas as pd
from pandas import DataFrame
import numpy as np
from ucimlrepo import fetch_ucirepo

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, confusion_matrix
from scipy import stats

from typing import Optional, Callable, List, Tuple
from IPython.display import Markdown, display

In [28]:
def load_uci_dataset(
    id: int,
    get_column_names: Optional[Callable[[np.ndarray], List[str]]] = None,
) -> DataFrame:
    # Fetch dataset
    dataset = fetch_ucirepo(id=id)

    X = dataset.data.features
    y = dataset.data.targets

    y = pd.DataFrame(y)

    # Concatenate features and target
    df = pd.concat([X, y], axis=1)

    # Display metadata and variable information
    print("Dataset Metadata:")
    print(dataset.metadata)
    print("\nVariable Information:")
    print(dataset.variables)

    if get_column_names is not None:
        column_names = get_column_names(dataset.variables)
        df.columns = column_names

    # Rename the last column to "target"
    df = df.rename(columns={df.columns[-1]: "target"})

    print(df.columns.values)

    # Return the complete DataFrame with all features and target column labeled "target"
    return df

In [29]:
dataset_names = ["Adult", "Credit", "Bank"]

In [30]:
adult = load_uci_dataset(2)
credit_card = load_uci_dataset(
    350, lambda dataset_variables: dataset_variables["description"][1:].str.lower()  # Ignore ID column
)
bank_marketing = load_uci_dataset(222)

Dataset Metadata:
{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extra

In [31]:
for name, dataset in zip(dataset_names, [adult, credit_card, bank_marketing]):
    print(f"Dataset: {name}")
    print(dataset.head())
    print(dataset.info())
    print("\n---\n")

Dataset: Adult
   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country target  
0          2174             0              40  United-States  <=50K  
1             0            

In [32]:
def prepare_dataset(
    df: DataFrame,
    features: List[str],
    sensitive_attribute: str,
    columns_names: Optional[List[str]] = None,
    convert_target: Optional[Callable[[str], int]] = None,
) -> Tuple[np.ndarray[np.ndarray], np.ndarray, np.ndarray]:
    pass
    df = df.dropna()

    # Select features
    X = df[features]
    y = df["target"]
    if convert_target is not None:
        y = y.apply(convert_target)

    if columns_names is not None:
        X.columns = columns_names

    sensitive_series = df[sensitive_attribute]  # e.g. "sex" or "race"

    # Normalize features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, y.values, sensitive_series.values

In [35]:
features_adult = ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
features_credit_card = credit_card.columns[:-1]  # Features are all columns except target
# features_bank_marketing = [col for col in bank_marketing.columns if bank_marketing[col].dtype == "int64" and col != "target"]
# print(features_bank_marketing)

# "duration" not included as specified on https://archive.ics.uci.edu/dataset/222/bank+marketing
features_bank_marketing = ["age", "balance", "campaign", "pdays", "previous"]
# ['limit_bal', 'sex', 'education', 'marriage', 'age', 'pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6', 'bill_amt1', 'bill_amt2', 'bill_amt3', 'bill_amt4', 'bill_amt5', 'bill_amt6', 'pay_amt1', 'pay_amt2', 'pay_amt3', 'pay_amt4', 'pay_amt5', 'pay_amt6']

adult_preprocessed = prepare_dataset(
    adult,
    features=features_adult,
    sensitive_attribute="sex",
    convert_target=lambda t: 0 if t.strip() == "<=50K" else 1,
)
print(adult_preprocessed[0].shape)
print(adult_preprocessed[1].shape)
print(adult_preprocessed[2].shape)
credit_preprocessed = prepare_dataset(
    credit_card,
    features_credit_card,
    sensitive_attribute="sex",
)
bank_preprocessed = prepare_dataset(
    bank_marketing,
    features_bank_marketing,
    sensitive_attribute="marital",
    convert_target=lambda t: 0 if t.strip() == "no" else 1,
)

(47621, 5)
(47621,)
(47621,)


In [36]:
for arr in adult_preprocessed:
    print(arr)

[[ 0.02650056  1.13272862  0.14462945 -0.2174557  -0.04894289]
 [ 0.83778069  1.13272862 -0.14573472 -0.2174557  -2.25118792]
 [-0.04725218 -0.42472622 -0.14573472 -0.2174557  -0.04894289]
 ...
 [-0.04725218  1.13272862 -0.14573472 -0.2174557   0.76670342]
 [ 0.39526426  1.13272862  0.58284695 -0.2174557  -0.04894289]
 [-0.2685104   1.13272862 -0.14573472 -0.2174557   1.58234973]]
[0 0 0 ... 1 1 1]
['Male' 'Male' 'Male' ... 'Male' 'Male' 'Male']


In [37]:
RANDOM_STATE = 42   # for reproducibility

In [38]:
def assess_fairness(X, y, sensitive_attribute, model, k=20):
    kf = KFold(n_splits=k, shuffle=True, random_state=RANDOM_STATE)

    metrics = {
        group: {"TPR": [], "FPR": [], "FN_FP_ratio": []}
        for group in np.unique(sensitive_attribute)
    }

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        sensitive_train, sensitive_test = (
            sensitive_attribute[train_index],
            sensitive_attribute[test_index],
        )

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        for group in np.unique(sensitive_attribute):
            group_mask = sensitive_test == group
            tn, fp, fn, tp = confusion_matrix(
                y_test[group_mask], y_pred[group_mask]
            ).ravel()

            tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
            fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
            fn_fp_ratio = fn / fp if fp > 0 else 0

            metrics[group]["TPR"].append(tpr)
            metrics[group]["FPR"].append(fpr)
            metrics[group]["FN_FP_ratio"].append(fn_fp_ratio)

    return metrics

In [None]:
METRICS_TO_TEST = ["TPR", "FPR", "FN_FP_ratio"]

def statistical_test(metrics, groups):
    if len(groups) == 2:
        return statistical_t_test(metrics, *groups)
    return statistical_anova_test(metrics, *groups)

def statistical_anova_test(metrics, *groups):
    results = {}
    for metric in METRICS_TO_TEST:
        # Perform one-way ANOVA for more than two groups
        data = []
        group_labels = []
        for group in groups:
            data.extend(metrics[group][metric])
            group_labels.extend([group] * len(metrics[group][metric]))
        
        f_stat, p_value = stats.f_oneway(*[metrics[group][metric] for group in groups])
        results[metric] = {
            "test": "ANOVA",
            "statistic": f_stat,
            "p_value": p_value
        }

        # Perform post-hoc Tukey's HSD test
        tukey_results = stats.tukey_hsd(data, group_labels)
        results[metric]["post_hoc"] = tukey_results
    return results

def statistical_t_test(metrics, group1, group2):
    results = {}
    for metric in METRICS_TO_TEST:
        t_stat, p_value = stats.ttest_ind(
            metrics[group1][metric], metrics[group2][metric]
        )
        results[metric] = {
            "test": "t-test",
            "statistic": t_stat,
            "p_value": p_value
        }
    return results

In [51]:
USE_ALL_PROCESSORS = -1
models = {
    "Logistic Regression": LogisticRegression(random_state=RANDOM_STATE, n_jobs=USE_ALL_PROCESSORS),
    # "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=USE_ALL_PROCESSORS),
    # "Support Vector Machine": SVC(random_state=RANDOM_STATE),
    # "K Nearest Neighbors": KNeighborsClassifier(n_jobs=USE_ALL_PROCESSORS),
}

In [57]:
STATISTIC_SIGNIFICANCE_MAXIMUM = 0.05

def run_dataset(X, y, sensitive_attribute):
    # Assess fairness for each model
    for i, (model_name, model) in enumerate(models.items()):
        print(f"\n\n### {i + 1}. Assessing fairness for {model_name}")
        metrics = assess_fairness(X, y, sensitive_attribute, model)

        # Print average metrics for each group
        groups = np.unique(sensitive_attribute)

        assert(len(groups) == 2) # Assumes sensitive attributes only have 2 possible values

        for group in groups:
            print(f"\n\nGroup: **{group}**")
            for metric, values in metrics[group].items():
                print(f"\n- {metric}: {np.mean(values):.4f}")

        # Perform statistical test
        results = statistical_test(metrics, groups)
        print("\n\n#### Statistical Test Results:")
        for metric, statistics in results.items():
            print(f"\n- {metric}:")
            print(f"\n\t- Test: {statistics['test']}")
            print(f"\n\t- Statistic: {statistics['statistic']:.4f}")
            print(f"\n\t- p-value: {statistics['p_value']:.4f}")

            if statistics['test'] == "ANOVA" and statistics['p_value'] < 0.05:
                print("\n\t- Post-hoc Tukey HSD Results:")
                print(statistics['post_hoc'])

        # for metric, statistics in results.items():
        #     print(
        #         f"\n- {metric}: `t-statistic = {statistics['t_statistic']:.4f}`, `p-value = {statistics['p_value']:.4f}`"
        #     )

        # Train and evaluate model on entire dataset
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=RANDOM_STATE
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"\n\n**Overall Accuracy**: {accuracy:.4f}")

In [58]:
output_file = "output2.md"

In [59]:
%%capture output
# Capture output from stdout to output variable
print("# Training and testing models for biases")
for name, dataset in zip(dataset_names, [adult_preprocessed, credit_preprocessed, ]):
    print(f"## Dataset {name}")
    X, y, sensitive_attribute = dataset
    run_dataset(X, y, sensitive_attribute)

In [60]:
with open(output_file, "w") as f:
    f.write(output.stdout)  # This actually works

In [61]:
with open(output_file) as f:
    display(Markdown(f.read()))

# Training and testing models for biases
## Dataset Adult


### 1. Assessing fairness for Logistic Regression


Group: **Female**

- TPR: 0.3756

- FPR: 0.2295

- FN_FP_ratio: 1.7363


Group: **Male**

- TPR: 0.5554

- FPR: 0.2768

- FN_FP_ratio: 1.7944


#### Statistical Test Results:

- TPR:

	- Test: t-test

	- Statistic: -23.5105

	- p-value: 0.0000

- FPR:

	- Test: t-test

	- Statistic: -8.6284

	- p-value: 0.0000

- FN_FP_ratio:

	- Test: t-test

	- Statistic: -1.0967

	- p-value: 0.2797


**Overall Accuracy**: 0.6252
## Dataset Credit


### 1. Assessing fairness for Logistic Regression


Group: **1**

- TPR: 0.2649

- FPR: 0.0366

- FN_FP_ratio: 6.8539


Group: **2**

- TPR: 0.2200

- FPR: 0.0210

- FN_FP_ratio: 11.1209


#### Statistical Test Results:

- TPR:

	- Test: t-test

	- Statistic: 4.1715

	- p-value: 0.0002

- FPR:

	- Test: t-test

	- Statistic: 6.3880

	- p-value: 0.0000

- FN_FP_ratio:

	- Test: t-test

	- Statistic: -3.7590

	- p-value: 0.0006


**Overall Accuracy**: 0.8097
