# Imports

In [12]:
import random
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from aif360.datasets import GermanDataset
from aif360.algorithms.preprocessing import Reweighing, OptimPreproc, LFR, DisparateImpactRemover
from aif360.algorithms.preprocessing.optim_preproc_helpers.opt_tools import OptTools
from aif360.algorithms.preprocessing.optim_preproc_helpers.distortion_functions  import get_distortion_german, get_distortion_adult
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_german, load_preproc_data_adult
from copy import deepcopy
import numpy as np

from FairBoost import FairBoost, Bootstrap_type

In [13]:
np.random.seed(42)

# Debug

In [2]:
# !pip install ipdb
import  ipdb

In [3]:
%pdb off

Automatic pdb calling has been turned OFF


# Code

In [4]:
dataset_orig = load_preproc_data_adult()
dataset_orig_train, dataset_orig_test = dataset_orig.split([0.7], shuffle=True)

In [5]:
optim_options = {
    "distortion_fun": get_distortion_adult,
    "epsilon": 0.05,
    "clist": [0.99, 1.99, 2.99],
    "dlist": [.1, 0.05, 0] 
    }   


privileged_groups = [{'sex': 1.0}]
unprivileged_groups = [{'sex': 0.0}]
pp2 = OptimPreproc(OptTools, optim_options,
                  unprivileged_groups = unprivileged_groups,
                  privileged_groups = privileged_groups)

Privileged and unprivileged groups specified will not be used. The protected attributes are directly specified in the data preprocessing function. The current implementation automatically adjusts for discrimination across all groups. This can be changed by changing the optimization code.


In [6]:

## TODO: Fairboost does not support Reweighing!!
pp1 = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)


In [7]:
pp3 = LFR(unprivileged_groups=unprivileged_groups,
         privileged_groups=privileged_groups,
         k=10, Ax=0.1, Ay=1.0, Az=2.0,
         verbose=1
        )

In [8]:
pp4 = DisparateImpactRemover(repair_level=.5)
pp4.transform = pp4.fit_transform

In [9]:
pp = (pp1,pp2,pp4)

In [10]:
model = LogisticRegression()

ens = FairBoost(model, pp, bootstrap_type=Bootstrap_type.DEFAULT)
ens = ens.fit(dataset_orig_train)
y_pred = ens.predict(dataset_orig_test)
accuracy_score(y_pred, dataset_orig_test.labels)

Reweighing
OptimPreproc
DisparateImpactRemover
(34189, 18), (34189, 1), (34189, 1)
(34189, 18), (34189, 1), (34189, 1)
(34189, 18), (34189, 1), (34189, 1)
Reweighing
OptimPreproc
DisparateImpactRemover


0.8031802361291203

In [16]:
model = LogisticRegression()
model = model.fit(dataset_orig_train.features, dataset_orig_train.labels.ravel())
y_pred = model.predict(dataset_orig_test.features)
accuracy_score(y_pred, dataset_orig_test.labels)

0.8041356718760664

DIR

In [4]:
pp_ = DisparateImpactRemover(repair_level=1)
dataset_orig_train_m = pp_.fit_transform(dataset_orig_train)
y = deepcopy(dataset_orig_test.labels)
dataset_orig_test_m = pp_.fit_transform(dataset_orig_test)
model = LogisticRegression()
model = model.fit(dataset_orig_train_m.features, dataset_orig_train_m.labels.ravel())
y_pred = model.predict(dataset_orig_test_m.features)
accuracy_score(y_pred, y)

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8014741008667167

In [5]:
(dataset_orig_test.labels == y).all()

True

In [6]:
dataset_orig_test.labels is y

False

OPTIM


In [None]:
optim_options = {
    "distortion_fun": get_distortion_adult,
    "epsilon": 0.05,
    "clist": [0.99, 1.99, 2.99],
    "dlist": [.1, 0.05, 0] 
    }   


privileged_groups = [{'sex': 1.0}]
unprivileged_groups = [{'sex': 0.0}]
pp_ = OptimPreproc(OptTools, optim_options,
                  unprivileged_groups = unprivileged_groups,
                  privileged_groups = privileged_groups)

dataset_orig_train_m = pp_.fit_transform(dataset_orig_train)
y = deepcopy(dataset_orig_test.labels)
dataset_orig_test_m = pp_.transform(dataset_orig_test)
model = LogisticRegression()
model = model.fit(dataset_orig_train_m.features, dataset_orig_train_m.labels.ravel())
y_pred = model.predict(dataset_orig_test_m.features)
accuracy_score(y_pred, y)

Privileged and unprivileged groups specified will not be used. The protected attributes are directly specified in the data preprocessing function. The current implementation automatically adjusts for discrimination across all groups. This can be changed by changing the optimization code.

This use of ``*`` has resulted in matrix multiplication.
Using ``*`` for matrix multiplication has been deprecated since CVXPY 1.1.
    Use ``*`` for matrix-scalar and vector-scalar multiplication.
    Use ``@`` for matrix-matrix and matrix-vector multiplication.
    Use ``multiply`` for elementwise multiplication.
This code path has been hit 21 times so far.


This use of ``*`` has resulted in matrix multiplication.
Using ``*`` for matrix multiplication has been deprecated since CVXPY 1.1.
    Use ``*`` for matrix-scalar and vector-scalar multiplication.
    Use ``@`` for matrix-matrix and matrix-vector multiplication.
    Use ``multiply`` for elementwise multiplication.
This code path has been hit 2

0.7928069337337064

In [None]:
print((dataset_orig_test.labels == y).all())
print(dataset_orig_test.labels is y)

LFR

In [9]:
privileged_groups = [{'sex': 1.0}]
unprivileged_groups = [{'sex': 0.0}]
pp_ = LFR(unprivileged_groups=unprivileged_groups,
         privileged_groups=privileged_groups,
         k=10, Ax=0.1, Ay=1.0, Az=2.0,
         verbose=1
        )
dataset_orig_train_m = pp_.fit_transform(dataset_orig_train)
y = deepcopy(dataset_orig_test.labels)
dataset_orig_test_m = pp_.transform(dataset_orig_test)
model = LogisticRegression()
model = model.fit(dataset_orig_train_m.features, dataset_orig_train_m.labels.ravel())
y_pred = model.predict(dataset_orig_test_m.features)
accuracy_score(y_pred, y)

step: 0, loss: 0.8979977405808632, L_x: 0.5146691135723371,  L_y: 0.8261957803771842,  L_z: 0.010167524423222666
step: 250, loss: 0.7154116975733398, L_x: 0.5136321847105362,  L_y: 0.6444896083516051,  L_z: 0.009779435375340522
step: 500, loss: 0.6260708559143251, L_x: 0.5086164425080072,  L_y: 0.5577051214516938,  L_z: 0.008752045105915275
step: 750, loss: 0.6144027227772167, L_x: 0.5027025394672887,  L_y: 0.5490162492231774,  L_z: 0.007558109803655221
step: 1000, loss: 0.6048071116585004, L_x: 0.48236644135739387,  L_y: 0.54600181770085,  L_z: 0.005284324910955532
step: 1250, loss: 0.5917195492667402, L_x: 0.4554178623806014,  L_y: 0.536531519369358,  L_z: 0.004823121829661031
step: 1500, loss: 0.5829306019709901, L_x: 0.38684488350261836,  L_y: 0.5257616473260712,  L_z: 0.009242233147328503
step: 1750, loss: 0.5571488066201193, L_x: 0.3503511417119411,  L_y: 0.4974340117509254,  L_z: 0.012339840348999917
step: 2000, loss: 0.5475890690464027, L_x: 0.3337642112794511,  L_y: 0.49150589

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0

In [None]:
print((dataset_orig_test.labels == y).all())
print(dataset_orig_test.labels is y)

True

In [None]:
import numpy as np 
np.argwhere(dataset_orig_test.labels != y.ravel())

array([[    9],
       [   10],
       [   11],
       ...,
       [14646],
       [14647],
       [14648]])