# This notebook is a quick introduction to the practical application and visualization of fairness and unfairness in an ML model. It uses the Fairlearn 0.8.0 package; the data are fetched from OpenML with scikit-learn's `fetch_openml`.

## The data come from the [UCI Adult dataset](https://archive.ics.uci.edu/ml/datasets/Adult); the task is to predict whether a person makes more than (label 1) or not more than (label 0) $50,000 a year

In [1]:
# Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml

In [6]:
pip install fairlearn

Collecting fairlearn
  Downloading fairlearn-0.8.0-py3-none-any.whl (235 kB)
     ------------------------------------- 235.0/235.0 kB 76.1 kB/s eta 0:00:00
Installing collected packages: fairlearn
Successfully installed fairlearn-0.8.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Fetch the Adult data (OpenML dataset id 1590) as pandas objects.
data = fetch_openml(data_id=1590, parser='auto', as_frame=True)

# One-hot encode the feature table so it can be fed to the classifier.
X = pd.get_dummies(data.data)

# Binary target: 1 where the class label is '>50K', 0 otherwise.
y_true = data.target.eq('>50K').mul(1)

# Keep the sensitive feature on its own for the per-group metrics,
# and show how many rows fall into each group.
sex = data.data.loc[:, 'sex']
sex.value_counts()

Male      32650
Female    16192
Name: sex, dtype: int64

In [3]:
# Evaluate fairness-related metrics
from fairlearn.metrics import MetricFrame
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Baseline (unmitigated) model: a shallow decision tree fitted on all rows.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so tied splits could otherwise be resolved differently from
# one run to the next and the metrics printed below would not be reproducible.
classifier = DecisionTreeClassifier(
    min_samples_leaf=10,
    max_depth=4,
    random_state=0,
)
classifier.fit(X, y_true)

In [5]:
# Predict on the same X the classifier was fitted on (in-sample predictions).
y_pred = classifier.predict(X)

In [6]:
# Wrap accuracy in a MetricFrame keyed on `sex`, then report the value
# computed over all rows.
gm = MetricFrame(
    y_true=y_true,
    y_pred=y_pred,
    sensitive_features=sex,
    metrics=accuracy_score,
)
print(gm.overall)

0.8443552680070431


In [7]:
# Accuracy broken down by each value of the sensitive feature (sex).
print(gm.by_group)

sex
Female    0.925148
Male      0.804288
Name: accuracy_score, dtype: float64


In [8]:
from fairlearn.metrics import selection_rate

# Selection rate: the fraction of predictions equal to 1, i.e. the share of
# people the model places in the ">50K" class.
sr = MetricFrame(
    y_true=y_true,
    y_pred=y_pred,
    sensitive_features=sex,
    metrics=selection_rate,
)

# Overall value first, then the per-group breakdown.
print(sr.overall, sr.by_group, sep="\n")

0.16385487899758405
sex
Female    0.063550
Male      0.213599
Name: selection_rate, dtype: float64


In [18]:
# Plot metrics
# Every metric referenced in the dictionary is imported here so the cell runs
# on its own; precision_score, false_positive_rate, false_negative_rate and
# count were never imported anywhere else in the notebook.
from fairlearn.metrics import (
    MetricFrame,
    count,
    false_negative_rate,
    false_positive_rate,
    selection_rate,
)
from sklearn.metrics import accuracy_score, precision_score

# Display name -> metric callable; each entry becomes one subplot.
metrics = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "false positive rate": false_positive_rate,
    "false negative rate": false_negative_rate,
    "selection rate": selection_rate,
    "count": count,
}
metric_frame = MetricFrame(
    metrics=metrics,
    y_true=y_true,
    y_pred=y_pred,
    sensitive_features=sex,
)

# One bar chart per metric, one bar per group of `sex`; the 3x3 grid leaves
# room for the six metrics. The trailing semicolon hides the Axes repr.
metric_frame.by_group.plot.bar(
    subplots=True,
    layout=[3, 3],
    legend=False,
    figsize=[12, 8],
    title="Show all metrics",
);

In [13]:
# Reduce the selection-rate gap between groups by training under a
# demographic-parity constraint.
from fairlearn.reductions import (
    DemographicParity,
    ExponentiatedGradient,
)

# Seed NumPy's global generator for consistent results with ExponentiatedGradient.
np.random.seed(0)

# Same tree settings as the baseline model, wrapped by the mitigator.
classifier = DecisionTreeClassifier(max_depth=4, min_samples_leaf=10)
constraint = DemographicParity()
mitigator = ExponentiatedGradient(estimator=classifier, constraints=constraint)
mitigator.fit(X, y_true, sensitive_features=sex)

In [14]:
# Predict with the mitigated model on the same X it was fitted on.
y_pred_mitigated = mitigator.predict(X)

In [15]:
# Selection rate of the mitigated predictions, overall and per group of `sex`.
sr_mitigated = MetricFrame(
    y_true=y_true,
    y_pred=y_pred_mitigated,
    sensitive_features=sex,
    metrics=selection_rate,
)

In [16]:
# Selection rate of the mitigated predictions over all rows.
print(sr_mitigated.overall)

0.16614798738790384


In [17]:
# Selection rate of the mitigated predictions for each value of `sex`.
print(sr_mitigated.by_group)

sex
Female    0.155262
Male      0.171547
Name: selection_rate, dtype: float64
