In [1]:
from sklearn.datasets import fetch_openml
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import fairsd.subgroupdiscovery as dsd

# Load OpenML dataset 1590 (the Adult census income data) as pandas objects.
d = fetch_openml(data_id=1590, as_frame=True)
dataset = d.data
# One-hot encode the non-numeric columns so the decision tree can consume them.
d_train = pd.get_dummies(dataset)
# Binary ground truth: 1 where the income label is '>50K', 0 otherwise.
y_true = (d.target == '>50K') * 1

# Train the classifier.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed, ties between equally good splits may be broken
# differently from run to run, which would change y_pred and every subgroup
# result computed from it.
classifier = DecisionTreeClassifier(min_samples_leaf=10, max_depth=4, random_state=0)
classifier.fit(d_train, y_true)

# Produce y_pred on the same data the model was fitted on.
y_pred = classifier.predict(d_train)



# Use of the FairSD package
* the BeamSearch algorithm is used to find the most discriminated subgroups
* **a function of the fairlearn.metrics module is used as a quality function**
* the execute method returns a **ResultSet object**

In [3]:
from fairlearn.metrics import demographic_parity_difference

# Describe the discovery task: the raw features, the true and predicted labels,
# and the fairlearn metric that serves as the quality function.
task = dsd.SubgroupDiscoveryTask(
    dataset,
    y_true,
    y_pred,
    qf=demographic_parity_difference,
)

# Run a beam search (beam width 10) on the task and keep what execute() returns.
result_set = (
    dsd.BeamSearch(beam_width=10)
    .execute(task)
)

print(type(result_set))


<class 'fairsd.subgroupdiscovery.ResultSet'>


## What can we do with this returned ResultSet object?

1) We can easily transform this object into a dataframe as shown below. Each row of this dataframe represents a subgroup.

In [4]:
# Convert the result set into a DataFrame and render it with rich display.
df = result_set.to_dataframe()
display(df)

Unnamed: 0,quality,description,size_sg,relative_size_sg,target_share_sg
sg0,0.814013,"capital-gain = '(None, 5013.0]'",944,0.944,0.199153
sg1,0.659905,"capital-gain = '(None, 5013.0]' AND capital-lo...",917,0.917,0.183206
sg2,0.645292,"capital-gain = '(None, 5013.0]' AND fnlwgt = '...",932,0.932,0.201717
sg3,0.581699,"education-num = '(None, 11.0]' AND capital-gai...",694,0.694,0.138329
sg4,0.568322,"education-num = '(11.0, None]' AND age = '(26....",235,0.235,0.480851
sg5,0.565667,"education-num = '(11.0, None]' AND age = '(26....",236,0.236,0.478814
sg6,0.564394,"education-num = '(11.0, None]' AND age = '(26....",208,0.208,0.490385
sg7,0.563291,"education-num = '(None, 11.0]' AND fnlwgt = '(...",684,0.684,0.140351
sg8,0.559349,"capital-gain = '(None, 5013.0]' AND fnlwgt = '...",905,0.905,0.185635
sg9,0.554517,"education-num = '(None, 11.0]' AND capital-los...",679,0.679,0.12813


2) **We can select a subgroup X from the result set and automatically generate the feature "Belongs to subgroup X"**. This is very useful for deepening the analysis of this subgroup; for example, we can use the Fairlearn library for this purpose.
An example is shown below:

In [5]:
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate

# Generate the feature "Belongs to subgroup n. 0".
# The result is a pandas Series named "sg0" with one element per instance of
# the dataset; each element is True iff the instance belongs to subgroup sg0.
sg_feature = result_set.extract_sg_feature(sg_number=0, data=dataset)

# Use the Fairlearn library to further analyze subgroup sg0.
# The MetricFrame is bound to its own name rather than to `selection_rate`, so
# the imported metric function is not overwritten and the cell can be re-run.
selection_rate_frame = MetricFrame(selection_rate, y_true, y_pred, sensitive_features=sg_feature)
print(selection_rate_frame.by_group)

sg0
False    0.946429
True     0.132415
Name: selection_rate, dtype: object
