# Decoupled Classifiers Case Study 3

This notebook will follow a similar approach to what was done in the notebook [Decoupled Classifiers Case Study 2](./case_2.ipynb).

In [1]:
import sys
sys.path.append('../../../../notebooks')

import pandas as pd
import numpy as np
import random

from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier

from raimitigations.utils import split_data
import raimitigations.dataprocessing as dp
from raimitigations.cohort import DecoupledClass, CohortDefinition, CohortManager, fetch_cohort_results, plot_value_counts_cohort
from sklearn.pipeline import Pipeline
from download import download_datasets

SEED = 100

Load and split the data into train and test sets:

In [2]:
data_dir = '../../../datasets/'
download_datasets(data_dir)
df = pd.read_csv(data_dir + 'hr_promotion/train.csv')
df.drop(columns=['employee_id'], inplace=True)
label_col = 'is_promoted'

X_train, X_test, y_train, y_test = split_data(df, label_col, test_size=0.3, random_state=SEED)

df


Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54803,Technology,region_14,Bachelor's,m,sourcing,1,48,3.0,17,0,0,78,0
54804,Operations,region_27,Master's & above,f,other,1,37,2.0,6,0,0,56,0
54805,Analytics,region_1,Bachelor's,m,other,1,27,5.0,3,1,0,79,0
54806,Sales & Marketing,region_9,,m,sourcing,1,29,1.0,2,0,0,45,0


In [3]:
def get_model():
    model = DecisionTreeClassifier(max_features="sqrt")
    #model = LGBMClassifier(random_state=SEED)
    return model

let's begin by creating a simple pipeline for our baselines:

In [4]:
pipe = Pipeline([
    ("imputer", dp.BasicImputer(verbose=False)),
    ("scaler", dp.DataStandardScaler(verbose=False)),
    ("encoder", dp.EncoderOHE(verbose=False)),
    ("estimator", get_model())
])

pipe.fit(X_train, y_train)
pred = pipe.predict_proba(X_test)

Starting with the baseline cohorts, let's look at a few feature columns in the data and find one where we can use the `DecoupledClass` to make an improvement over its cohorts:

### The *"education"* cohorts

In [5]:
pred_train = pipe.predict_proba(X_train)
_, th_dict = fetch_cohort_results(X_train, y_train, pred_train, cohort_col=["education"], return_th_dict=True)
fetch_cohort_results(X_test, y_test, pred, cohort_col=["education"], fixed_th=th_dict)


Unnamed: 0,cohort,cht_query,roc,precision,recall,f1,accuracy,threshold,num_pos,%_pos,cht_size
0,all,all,0.661747,0.659996,0.661755,0.660869,0.893754,0.5,1417,0.086176,16443
1,cohort_0,"(`education` == ""Bachelor's"")",0.662243,0.657236,0.662257,0.65969,0.897732,0.5,914,0.083235,10981
2,cohort_1,"(`education` == ""Below Secondary"")",0.537547,0.542072,0.537547,0.539439,0.835443,1.0,22,0.092827,237
3,cohort_2,"(`education` == ""Master's & above"")",0.67167,0.680482,0.67167,0.675907,0.884022,1.0,432,0.096536,4475
4,cohort_3,(`education`.isnull()),0.600599,0.573292,0.600599,0.583838,0.912,1.0,49,0.065333,750


### The *"recruitement_channel"* cohorts:

In [7]:
pred_train = pipe.predict_proba(X_train)
_, th_dict = fetch_cohort_results(X_train, y_train, pred_train, cohort_col=["recruitment_channel"], return_th_dict=True)
fetch_cohort_results(X_test, y_test, pred, cohort_col=["recruitment_channel"], fixed_th=th_dict)


Unnamed: 0,cohort,cht_query,roc,precision,recall,f1,accuracy,threshold,num_pos,%_pos,cht_size
0,all,all,0.661747,0.659996,0.661755,0.660869,0.893754,0.5,1417,0.086176,16443
1,cohort_0,"(`recruitment_channel` == ""other"")",0.652845,0.653555,0.652823,0.653188,0.894335,0.5,759,0.082851,9161
2,cohort_1,"(`recruitment_channel` == ""referred"")",0.677889,0.701244,0.677889,0.688362,0.867052,1.0,39,0.112717,346
3,cohort_2,"(`recruitment_channel` == ""sourcing"")",0.671858,0.665113,0.671909,0.66841,0.894319,0.5,619,0.089245,6936


### The "department" cohorts

In [11]:
pred_train = pipe.predict_proba(X_train)
_, th_dict = fetch_cohort_results(X_train, y_train, pred_train, cohort_col=["department"], return_th_dict=True)
fetch_cohort_results(X_test, y_test, pred, cohort_col=["department"], fixed_th=th_dict)

Unnamed: 0,cohort,cht_query,roc,precision,recall,f1,accuracy,threshold,num_pos,%_pos,cht_size
0,all,all,0.661747,0.659996,0.661755,0.660869,0.893754,0.5,1417,0.086176,16443
1,cohort_0,"(`department` == ""Analytics"")",0.593289,0.592893,0.593456,0.593173,0.861742,0.5,149,0.094066,1584
2,cohort_1,"(`department` == ""Finance"")",0.654926,0.686629,0.654926,0.668602,0.891102,0.5,61,0.081009,753
3,cohort_2,"(`department` == ""HR"")",0.592792,0.636306,0.592792,0.609139,0.926136,1.0,28,0.039773,704
4,cohort_3,"(`department` == ""Legal"")",0.549439,0.545026,0.549439,0.547039,0.895062,1.0,21,0.064815,324
5,cohort_4,"(`department` == ""Operations"")",0.695779,0.693521,0.695779,0.69464,0.898465,1.0,312,0.09209,3388
6,cohort_5,"(`department` == ""Procurement"")",0.635658,0.634419,0.635658,0.635034,0.878844,1.0,197,0.091799,2146
7,cohort_6,"(`department` == ""R&D"")",0.500977,0.500777,0.500977,0.500157,0.847751,1.0,27,0.093426,289
8,cohort_7,"(`department` == ""Sales & Marketing"")",0.70314,0.693611,0.70314,0.698223,0.917038,1.0,393,0.076178,5159
9,cohort_8,"(`department` == ""Technology"")",0.648056,0.642349,0.648056,0.645108,0.864504,0.5,229,0.109256,2096


In [12]:
rebalance = dp.Rebalance(verbose=False, strategy_over=0.2)
new_X_train, new_y_train = rebalance.fit_resample(X_train, y_train)

In [14]:
# Post-rebalance
pred_train = pipe.predict_proba(new_X_train)
_, th_dict = fetch_cohort_results(new_X_train, new_y_train, pred_train, cohort_col=["department"], return_th_dict=True)
fetch_cohort_results(X_test, y_test, pred, cohort_col=["department"], fixed_th=th_dict)

Unnamed: 0,cohort,cht_query,roc,precision,recall,f1,accuracy,threshold,num_pos,%_pos,cht_size
0,all,all,0.661747,0.659996,0.661755,0.660869,0.893754,0.5,1417,0.086176,16443
1,cohort_0,"(`department` == ""Analytics"")",0.593289,0.592893,0.593456,0.593173,0.861742,0.5,149,0.094066,1584
2,cohort_1,"(`department` == ""Finance"")",0.654926,0.686629,0.654926,0.668602,0.891102,0.5,61,0.081009,753
3,cohort_2,"(`department` == ""HR"")",0.592792,0.636306,0.592792,0.609139,0.926136,1.0,28,0.039773,704
4,cohort_3,"(`department` == ""Legal"")",0.549439,0.545026,0.549439,0.547039,0.895062,1.0,21,0.064815,324
5,cohort_4,"(`department` == ""Operations"")",0.695779,0.693521,0.695779,0.69464,0.898465,1.0,312,0.09209,3388
6,cohort_5,"(`department` == ""Procurement"")",0.635658,0.634419,0.635658,0.635034,0.878844,1.0,197,0.091799,2146
7,cohort_6,"(`department` == ""R&D"")",0.500977,0.500777,0.500977,0.500157,0.847751,1.0,27,0.093426,289
8,cohort_7,"(`department` == ""Sales & Marketing"")",0.70314,0.693611,0.70314,0.698223,0.917038,1.0,393,0.076178,5159
9,cohort_8,"(`department` == ""Technology"")",0.648056,0.642349,0.648056,0.645108,0.864504,0.5,229,0.109256,2096


Let's see if we can improve the *"department"* cohorts using the `DecoupledClass`. Since this isn't a sensitive attribute and we have a larger number of cohorts, we slightly improve the label distribution by merging invalid cohorts and applying fairness optimization using the fairness metric `dem_parity`.

### Merging Cohorts

In [15]:
preprocessing = [dp.BasicImputer(verbose=False), dp.DataMinMaxScaler(verbose=False), dp.EncoderOHE(drop=False, unknown_err=False, verbose=False)]

dec_class = DecoupledClass(
    cohort_col=["department"],
    transform_pipe=preprocessing,
    estimator=get_model(),
    minority_min_rate=0.1,
    min_cohort_pct=0.1,
    theta=False,
    fairness_loss="dem_parity",
    lambda_coef=0.5,
    max_joint_loss_time=2000
)
dec_class.fit(new_X_train, new_y_train)

pred = dec_class.predict_proba(X_test)
fetch_cohort_results(X_test, y_test, pred, cohort_def=dec_class, fixed_th=True)


Unnamed: 0,cohort,cht_query,roc,precision,recall,f1,accuracy,threshold,num_pos,%_pos,cht_size
0,all,all,0.686997,0.636966,0.68689,0.655803,0.872773,0.5,1988,0.120903,16443
1,cohort_2,"(((((`department` == ""HR"")) or ((`department` ...",0.62173,0.598863,0.621534,0.608199,0.866174,0.5,384,0.10509,3654
2,cohort_4,"(`department` == ""Operations"")",0.717631,0.64989,0.717857,0.673653,0.866883,1.0,473,0.13961,3388
3,cohort_5,"(`department` == ""Procurement"")",0.69259,0.634567,0.69259,0.654963,0.860671,1.0,294,0.136999,2146
4,cohort_7,"(`department` == ""Sales & Marketing"")",0.727807,0.682784,0.727807,0.701917,0.91006,0.5,475,0.092072,5159
5,cohort_8,"(`department` == ""Technology"")",0.649119,0.598299,0.649119,0.612687,0.81584,1.0,359,0.171279,2096
