In [None]:
%pip install -q ucimlrepo
%pip install -q sklearn
%pip install -q semopy
%pip install -q tableone

# Detecting and mitigating bias using causal modelling

## The method

Introduced by Hui and Lau (2024, doi:[10.1109/CCCIS63483.2024.00016](https://doi.org/10.1109/CCCIS63483.2024.00016)).

**Aim:** Detect and mitigate bias on a sensitive attribute in a black-box predictive model, by modelling and inferring the causal relationship between the attribute, the model's predicted outcome and the ground-truth outcome.

## The experiment

Reproducing Hui and Lau's study and results by training a baseline bias-unaware model on the [UCI Adult dataset](https://archive-beta.ics.uci.edu/dataset/2/adult) and applying causal modelling to detect and correct the bias introduced by the model between the protected attribute (sex) and the prediction. 

### Hypothesis
We are able to reproduce the results of the study.

In [None]:
import semopy
from sklearn.metrics import accuracy_score, confusion_matrix

def get_causal_model_params(X_train, y_train, y_pred_proba, protected_attribute):
  '''
    Fits a semopy model on the training data and predictions and returns
    the estimated path from the protected attribute to the predicted outcome.

    The model description handed to semopy is:
      y_true ~ protected_attribute
      y_pred ~ y_true + protected_attribute

    Side effect: prints the full fitted parameter table as markdown.

    Inputs
      X_train: training features (must contain the protected attribute column)
      y_train: training labels
      y_pred_proba: predicted probabilities for the training rows
      protected_attribute: name of the protected attribute column in X_train

    Outputs
      [beta2, beta2_pvalue] where
        beta2: estimate of the y_pred ~ protected_attribute path
        beta2_pvalue: p-value reported for that path

    Raises
      IndexError: if the fitted parameter table has no
        y_pred ~ protected_attribute row
  '''
  # Local import: the function must not depend on pandas having been
  # imported by some other notebook cell before it is called.
  import pandas as pd

  # Assemble the three observed variables under the names used in the
  # model description. Series inputs are aligned on the index of the
  # protected attribute column; array inputs are assigned positionally.
  causal_features = pd.DataFrame()
  causal_features['protected_attribute'] = X_train[protected_attribute]
  causal_features['y_true'] = y_train
  causal_features['y_pred'] = y_pred_proba

  model_desc='''
    y_true ~ protected_attribute
    y_pred ~ y_true + protected_attribute
  '''

  causal_model = semopy.Model(model_desc)
  causal_model.fit(causal_features)
  causal_params = causal_model.inspect()

  print(causal_params.to_markdown())

  # Locate the single row describing the protected_attribute -> y_pred path
  is_target_path = ((causal_params.rval == "protected_attribute") &
                    (causal_params.lval == "y_pred"))
  target_path = causal_params.loc[is_target_path]
  if target_path.empty:
    raise IndexError(
      "No 'y_pred ~ protected_attribute' row in the fitted parameter table")

  beta2 = target_path['Estimate'].values[0]
  beta2_pvalue = target_path['p-value'].values[0]

  return [beta2, beta2_pvalue]

def get_perf_metrics(y_true, y_pred, y_pred_proba):
  '''
    Calculates the performance metrics for a given set of binary (0/1)
    predictions.

    Inputs
      y_true: true labels (0 = negative, 1 = positive)
      y_pred: predicted labels (0/1)
      y_pred_proba: predicted probabilities (currently unused; kept so
        existing callers do not need to change)

    Outputs
      [accuracy, FNR] where
        accuracy: accuracy score
        FNR: False Negative Rate, fn / (fn + tp); NaN when y_true
          contains no positive sample
  '''
  accuracy = accuracy_score(y_true, y_pred)

  # Pin the label set so the matrix is always 2x2, even when a (sub)group
  # contains a single class; otherwise the 4-way unpacking below fails.
  tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

  # FNR is undefined without positive samples: report NaN explicitly
  # instead of relying on a division-by-zero warning.
  n_positive = fn + tp
  FNR = fn / n_positive if n_positive > 0 else float('nan')

  return [accuracy, FNR]

# Data Pre-Processing

In [38]:
from tableone import TableOne
from ucimlrepo import fetch_ucirepo
import pandas as pd

# Fetch the UCI Adult dataset (repository id 2); this needs network access
adult = fetch_ucirepo(id=2)

# Raw features and targets (as pandas dataframes)
features_raw = adult.data.features
targets_raw = adult.data.targets

# Single working frame; hyphens in column names are replaced by underscores.
# rename() returns a new frame, so the fetched data is left untouched.
adult_df = (
  pd.concat([features_raw, targets_raw], axis=1)
    .rename(columns=lambda name: name.replace('-', '_'))
)

# Reduce dimensionality of categorical features before one-hot encoding
# native_country: bucket rare countries in 'other', handle na and ? as 'Unknown'
RARE_COUNTRY_MAX_SHARE = 0.01  # countries at or below this share are bucketed
country_counts = adult_df['native_country'].value_counts(normalize=True)
rare_countries = country_counts[country_counts <= RARE_COUNTRY_MAX_SHARE]

# '?' and NaN must map to the SAME label ('Unknown'); using different
# spellings would create two separate "unknown" categories.
adult_df['native_country_grouped'] = (
  adult_df['native_country']
    .replace(rare_countries.index, 'other')
    .replace('?', 'Unknown')
    .fillna('Unknown')
)

# occupation: handle na and ? as unknown, categorise occupations
adult_df['occupation_grouped'] = adult_df['occupation'].replace('?', 'Unknown').fillna('Unknown')
occ_map = {
    'Prof-specialty': 'Professional', 'Exec-managerial': 'Professional',
    'Adm-clerical': 'Service_Admin', 'Sales': 'Service_Admin', 
    'Tech-support': 'Service_Admin', 'Protective-serv': 'Service_Admin',
    'Craft-repair': 'Blue_Collar', 'Machine-op-inspct': 'Blue_Collar', 
    'Transport-moving': 'Blue_Collar', 'Handlers-cleaners': 'Blue_Collar', 
    'Farming-fishing': 'Blue_Collar',
    'Other-service': 'Other', 'Priv-house-serv': 'Other', 'Armed-Forces': 'Other','Unknown':'Unknown'
}
# Occupations missing from occ_map come back as NaN from map(); label them 'Unknown'
adult_df['occupation_grouped'] = adult_df['occupation_grouped'].map(occ_map).fillna('Unknown')

# workclass: handle na and ? as unknown, categorise workclasses
adult_df['workclass_grouped'] = adult_df['workclass'].replace('?', 'Unknown').fillna('Unknown')
adult_df.loc[adult_df['workclass_grouped'].str.contains('gov'), 'workclass_grouped'] = 'Gov'
adult_df.loc[adult_df['workclass_grouped'].str.contains('Self-emp'), 'workclass_grouped'] = 'Self-emp'
# Values containing 'Never' are folded into the 'Without-pay' group
adult_df.loc[adult_df['workclass_grouped'].str.contains('Never'), 'workclass_grouped'] = 'Without-pay'

# income label and sex feature (labels appear both with and without a trailing '.')
adult_df['income'] = adult_df['income'].map({'<=50K':0,'<=50K.':0,'>50K':1,'>50K.':1})
adult_df['sex'] = adult_df['sex'].map({'Male':1,'Female':0})

# map() silently turns unexpected values into NaN; fail early if that happened
assert adult_df['income'].notna().all(), 'income contains values missing from the label mapping'
assert adult_df['sex'].notna().all(), 'sex contains values missing from the mapping'

# Select features and labels
# NOTE(review): columns kept in X but listed in neither cat_features nor
# num_features are presumably ignored by the downstream ColumnTransformer
# (default remainder handling) - confirm that this is intended.
X = adult_df.drop(['workclass','education','occupation','native_country','fnlwgt','income'], axis=1)
cat_features = ['marital_status', 'relationship', 'race', 'sex', 'native_country_grouped', 'occupation_grouped', 'workclass_grouped']
num_features = ['age','capital_gain','capital_loss','hours_per_week']

y = adult_df['income']

# Baseline statistics, stratified by sex
table1 = TableOne(adult_df,
                groupby='sex',
                continuous=num_features,
                categorical=cat_features)
print(table1.tabulate(tablefmt="github"))



|                               |                       | Missing   | Overall         | 0              | 1               |
|-------------------------------|-----------------------|-----------|-----------------|----------------|-----------------|
| n                             |                       |           | 48842           | 16192          | 32650           |
| age, mean (SD)                |                       | 0         | 38.6 (13.7)     | 36.9 (14.1)    | 39.5 (13.4)     |
| marital_status, n (%)         | Divorced              |           | 6633 (13.6)     | 4001 (24.7)    | 2632 (8.1)      |
|                               | Married-AF-spouse     |           | 37 (0.1)        | 25 (0.2)       | 12 (0.0)        |
|                               | Married-civ-spouse    |           | 22379 (45.8)    | 2480 (15.3)    | 19899 (60.9)    |
|                               | Married-spouse-absent |           | 628 (1.3)       | 304 (1.9)      | 324 (1.0)       |
|               

# Model Training

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import numpy as np

# Hold out 33% of the rows as a test set; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
  X, y,
  test_size=0.33,
  random_state=4,
)

# Pre-processing: scale the numeric features and one-hot encode the
# categorical ones (the first level of each categorical feature is dropped)
preprocessor = ColumnTransformer(
  transformers=[
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(drop='first'), cat_features),
  ]
)

# Baseline, bias-unaware classifier: pre-processing + logistic regression
pipeline_steps = [
  ('preprocessor', preprocessor),
  ('lg', LogisticRegression(max_iter=1000)),
]
pred_model = Pipeline(pipeline_steps)
pred_model.fit(X_train, y_train)

# Predicted outcome probabilities (column 1 of predict_proba)
y_pred_prob_train = pred_model.predict_proba(X_train)[:, 1]
y_pred_prob_test = pred_model.predict_proba(X_test)[:, 1]

# Classification threshold: the quantile of the training probabilities at
# the prevalence of the negative class in the training set
positive_count = y_train.sum()
neg_class_prevalence = 1 - positive_count/len(y_train)
class_threshold = np.quantile(y_pred_prob_train, neg_class_prevalence)

# Hard labels: 1 when the probability reaches the threshold, else 0
y_pred_train = (y_pred_prob_train >= class_threshold).astype(int)
y_pred_test = (y_pred_prob_test >= class_threshold).astype(int)


# Causal-based bias detection and mitigation

In [20]:
# Causal model analysis on the training set.
# Model considered:
#   y_pred = beta0 + beta1*y_true + beta2*a
# with a the protected attribute, i.e. sex
causal_estimates = get_causal_model_params(X_train, y_train, y_pred_prob_train, 'sex')
beta2, beta2_pvalue = causal_estimates

beta2_rounded = round(beta2, 3)
print(f'\nCausal path between protected attribute (sex) and prediction: {beta2_rounded} (P-value = {beta2_pvalue})')

# Remove the estimated effect of sex from the training probabilities and
# derive the classification threshold for the corrected predictions,
# using the same negative-class prevalence as for the baseline
sex_effect_train = beta2 * X_train['sex']
y_correct_pred_prob_train = y_pred_prob_train - sex_effect_train
correct_class_threshold = np.quantile(y_correct_pred_prob_train, neg_class_prevalence)

# Apply the same correction to the test probabilities, then threshold them
sex_effect_test = beta2 * X_test['sex']
y_correct_pred_prob_test = y_pred_prob_test - sex_effect_test
y_correct_pred_test = y_correct_pred_prob_test.ge(correct_class_threshold).astype(int)

|    | lval   | op   | rval                |   Estimate |    Std. Err |   z-value |   p-value |
|---:|:-------|:-----|:--------------------|-----------:|------------:|----------:|----------:|
|  0 | y_true | ~    | protected_attribute |  0.195933  | 0.00489045  |   40.0645 |         0 |
|  1 | y_pred | ~    | y_true              |  0.380323  | 0.00269746  |  140.993  |         0 |
|  2 | y_pred | ~    | protected_attribute |  0.120873  | 0.0024442   |   49.453  |         0 |
|  3 | y_true | ~~   | y_true              |  0.173681  | 0.0013578   |  127.914  |         0 |
|  4 | y_pred | ~~   | y_pred              |  0.0413552 | 0.000323305 |  127.914  |         0 |

Causal path between protected attribute (sex) and prediction: 0.121 (P-value = 0.0)


# Performance and fairness

In [36]:
def _group_metrics(frame, pred_col, prob_col):
  '''Returns [accuracy, FNR] of the `pred_col` predictions within `frame`.'''
  return get_perf_metrics(frame['y_true'], frame[pred_col], frame[prob_col])

def _metric_row(model, group, accuracy, fnr):
  '''Builds one record of the comparison table; TPR is derived as 1 - FNR.'''
  return {'Model': model, 'Group': group,
          'Accuracy': accuracy,
          'FNR': fnr,
          'TPR': 1-fnr}

# Global performance of the baseline model
accuracy_train, FNR_train = get_perf_metrics(y_train, y_pred_train, y_pred_prob_train)
accuracy_test, FNR_test = get_perf_metrics(y_test, y_pred_test, y_pred_prob_test)

# Test-set audit frame: features plus labels, baseline and corrected predictions
audit_df = X_test.assign(
  y_true=y_test,
  y_pred=y_pred_test,
  y_pred_prob=y_pred_prob_test,
  y_correct_pred_prob=y_correct_pred_prob_test,
  y_correct_pred=y_correct_pred_test,
)

# Strata of the protected attribute (sex: 1 = male, 0 = female)
male_df = audit_df.loc[audit_df['sex'].eq(1)]
female_df = audit_df.loc[audit_df['sex'].eq(0)]

# Baseline performance stratified by sex
accuracy_m, FNR_m = _group_metrics(male_df, 'y_pred', 'y_pred_prob')
accuracy_f, FNR_f = _group_metrics(female_df, 'y_pred', 'y_pred_prob')

# Corrected performance: global, then stratified by sex
accuracy_correct_test, FNR_correct_test = get_perf_metrics(y_test, y_correct_pred_test, y_correct_pred_prob_test)
accuracy_correct_m, FNR_correct_m = _group_metrics(male_df, 'y_correct_pred', 'y_correct_pred_prob')
accuracy_correct_f, FNR_correct_f = _group_metrics(female_df, 'y_correct_pred', 'y_correct_pred_prob')

# Gather performance metrics for comparison
perf_metrics = [
  _metric_row('Baseline', 'Overall', accuracy_test, FNR_test),
  _metric_row('Baseline', 'Female', accuracy_f, FNR_f),
  _metric_row('Baseline', 'Male', accuracy_m, FNR_m),
  _metric_row('Corrected', 'Overall', accuracy_correct_test, FNR_correct_test),
  _metric_row('Corrected', 'Female', accuracy_correct_f, FNR_correct_f),
  _metric_row('Corrected', 'Male', accuracy_correct_m, FNR_correct_m),
]

performance_comparison = (
  pd.DataFrame(perf_metrics)
    .pivot_table(index=['Model','Group'])
    .round(3)
)

print(performance_comparison)

# Fairness gap reported as the male-minus-female difference in FNR
eod_baseline = FNR_m-FNR_f
eod_corrected = FNR_correct_m-FNR_correct_f
print(f'\nBaseline Equal Opportunity Difference: {eod_baseline:.3f}')
print(f'\nCorrected Equal Opportunity Difference: {eod_corrected:.3f}')


                   Accuracy    FNR    TPR
Model     Group                          
Baseline  Female      0.926  0.406  0.594
          Male        0.786  0.327  0.673
          Overall     0.832  0.339  0.661
Corrected Female      0.917  0.337  0.663
          Male        0.788  0.344  0.656
          Overall     0.830  0.343  0.657

Baseline Equal Opportunity Difference: -0.079

Corrected Equal Opportunity Difference: 0.007
