In [9]:
# %pip install --upgrade scikit-learn scikit-lego pandas matplotlib

In [10]:
import pandas as pd
from sklego.datasets import load_arrests

In [11]:
# Load the arrests data and turn the three sensitive attributes into boolean flags.
df = load_arrests(give_pandas=True).assign(
    colour=lambda frame: frame['colour'].eq('Black'),
    sex=lambda frame: frame['sex'].eq('Female'),
    age=lambda frame: frame['age'].lt(25),
)

# Binary target: 1 when the person was released, 0 otherwise.
y = df['released'].eq('Yes').astype(int)
# Feature frame: everything except the target column.
X = df.drop(columns='released')
print(df.head().to_markdown())

|    | released   | colour   |   year | age   | sex   | employed   | citizen   |   checks |
|---:|:-----------|:---------|-------:|:------|:------|:-----------|:----------|---------:|
|  0 | Yes        | False    |   2002 | True  | False | Yes        | Yes       |        3 |
|  1 | No         | True     |   1999 | True  | False | Yes        | Yes       |        3 |
|  2 | Yes        | False    |   2000 | True  | False | Yes        | Yes       |        3 |
|  3 | No         | True     |   2000 | False | False | Yes        | Yes       |        1 |
|  4 | Yes        | True     |   1999 | False | True  | Yes        | Yes       |        1 |


In [58]:
# Imported here because this cell sits above the notebook's main import cell;
# without it a fresh "Restart & Run All" fails with NameError on PandasTypeSelector.
from sklego.preprocessing import PandasTypeSelector

# Sanity check: the 'number' selector keeps only the numeric columns of X
# (the boolean sensitive flags and the string columns are not selected).
PandasTypeSelector('number').fit_transform(X, y)

Unnamed: 0,year,checks
0,2002,3
1,1999,3
2,2000,3
3,2000,1
4,1999,1
...,...,...
5221,2000,0
5222,2000,0
5223,1999,1
5224,1998,4


In [40]:
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklego.metrics import equal_opportunity_score
from sklego.preprocessing import PandasTypeSelector, ColumnDropper, ColumnSelector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklego.linear_model import EqualOpportunityClassifier


# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old spelling. Try the new name first and fall back to the
# legacy one so the notebook runs on both old and upgraded installations.
try:
    dense_one_hot = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    dense_one_hot = OneHotEncoder(sparse=False, drop='first')

# String-valued columns -> dense one-hot encoding (first level dropped).
categorical_pipeline = make_pipeline(
    PandasTypeSelector('object'),
    dense_one_hot,
)
# Numeric columns -> standardised values.
numerical_pipeline = make_pipeline(
    PandasTypeSelector('number'),
    StandardScaler()
)
# Baseline: remove the sensitive attributes entirely, then fit a
# class-balanced logistic regression on the remaining features.
pipeline = make_pipeline(
    ColumnDropper(['colour', 'age', 'sex']),
    make_union(
        categorical_pipeline,
        numerical_pipeline,
    ),
    LogisticRegression(class_weight='balanced')
)

# Fairness-aware model: the sensitive attributes are listed first in the union
# so that they occupy positions 0, 1 and 2 of the combined feature matrix,
# which is what `sensitive_cols=[0, 1, 2]` refers to.
eq_op_pipeline = make_pipeline(
    make_union(
        ColumnSelector(['colour', 'age', 'sex']),
        categorical_pipeline,
        numerical_pipeline,
    ),
    EqualOpportunityClassifier(covariance_threshold=0.9, positive_target=1, sensitive_cols=[0, 1, 2])
)

In [41]:
from sklearn.model_selection import cross_validate, StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score, make_scorer

def do_gridsearch(pipe_model):
    """Cross-validate ``pipe_model`` on the notebook-level ``X`` and ``y``.

    The parameter grid is empty, so the search only evaluates the pipeline as
    given, using stratified 5-fold cross-validation. Each fold is scored on
    equal opportunity for ``colour``, ``age`` and ``sex`` as well as on
    precision and recall of the positive class (label 1). The final estimator
    is refit according to the ``precision`` scorer.

    Parameters
    ----------
    pipe_model : estimator
        The pipeline (or any estimator) to evaluate.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    fairness_scorers = {
        'eq_op_colour': equal_opportunity_score('colour', positive_target=1),
        'eq_op_age': equal_opportunity_score('age', positive_target=1),
        'eq_op_sex': equal_opportunity_score('sex', positive_target=1),
    }
    quality_scorers = {
        'precision': make_scorer(precision_score, pos_label=1),
        'recall': make_scorer(recall_score, pos_label=1),
    }

    search = GridSearchCV(
        estimator=pipe_model,
        param_grid={},
        scoring={**fairness_scorers, **quality_scorers},
        cv=StratifiedKFold(n_splits=5),
        refit='precision',
        n_jobs=1,
    )
    search.fit(X, y)
    return search

In [42]:
# Run the same cross-validated evaluation on both pipelines so their scores are comparable.
# Baseline logistic regression with the sensitive columns dropped:
standard_model = do_gridsearch(pipeline)
# Equal-opportunity classifier that sees the sensitive columns:
fair_model = do_gridsearch(eq_op_pipeline)

In [53]:
# Intercept and coefficients of the last step (the logistic regression) of the refit baseline pipeline.
standard_model.best_estimator_[-1].intercept_, standard_model.best_estimator_[-1].coef_

(array([-1.06557607]),
 array([[ 0.79136998,  0.75373455, -0.01010115, -0.59511747]]))

In [54]:
# Intercept and coefficients of the last step (the equal-opportunity classifier) of the refit fair pipeline.
fair_model.best_estimator_[-1].intercept_, fair_model.best_estimator_[-1].coef_

(array([[0.5833983]]),
 array([[ 0.77103632,  0.68263498, -0.01963918, -0.57983793]]))

In [45]:
# Baseline model: keep only the columns whose name contains 'mean_' (fold-averaged timings and test scores).
pd.DataFrame(standard_model.cv_results_).filter(like='mean_')

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_eq_op_colour,mean_test_eq_op_age,mean_test_eq_op_sex,mean_test_precision,mean_test_recall
0,0.038761,0.06161,0.698671,0.786174,0.830984,0.91879,0.634526


In [46]:
# Fair model: keep only the columns whose name contains 'mean_' (fold-averaged timings and test scores).
pd.DataFrame(fair_model.cv_results_).filter(like='mean_')

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_eq_op_colour,mean_test_eq_op_age,mean_test_eq_op_sex,mean_test_precision,mean_test_recall
0,0.636421,0.053975,0.974033,0.992975,0.989297,0.835393,0.989387


In [None]:
# Show the refit fair pipeline.
# NOTE(review): this cell referred to `fitted_pipeline`, which is never defined
# in this notebook; `fair_model.best_estimator_` is assumed to be the intended
# object - confirm.
fair_model.best_estimator_

In [65]:
# Share of rows the fair model predicts as released.
# The target is encoded as 0/1 when `y` is built, so predictions are compared
# with 1 rather than with the original 'Yes' label.
# NOTE(review): this cell referred to `fitted_pipeline`, which is never defined
# in this notebook; `fair_model` is assumed to be the intended estimator - confirm.
(fair_model.predict(X) == 1).mean()

0.9747416762342136

In [17]:
# Non-null counts per column for every (released, colour) combination.
df.groupby(['released', 'colour']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,year,age,sex,employed,citizen,checks
released,colour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
No,False,559,559,559,559,559,559
No,True,333,333,333,333,333,333
Yes,False,3379,3379,3379,3379,3379,3379
Yes,True,955,955,955,955,955,955


In [18]:
# Non-null counts per column for every (released, sex) combination.
df.groupby(['released', 'sex']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,colour,year,age,employed,citizen,checks
released,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
No,False,829,829,829,829,829,829
No,True,63,63,63,63,63,63
Yes,False,3954,3954,3954,3954,3954,3954
Yes,True,380,380,380,380,380,380


In [20]:
# Number of rows per year.
df['year'].value_counts()

2000    1270
2001    1211
1999    1099
1998     877
1997     492
2002     277
Name: year, dtype: int64

In [21]:
# Number of rows per value of the boolean `sex` flag (True means the original value was 'Female').
df['sex'].value_counts()

False    4783
True      443
Name: sex, dtype: int64

In [22]:
# Number of rows per value of the boolean `colour` flag (True means the original value was 'Black').
df['colour'].value_counts()

False    3938
True     1288
Name: colour, dtype: int64