# Cohort Case Study 1

In [1]:
import random
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

from raimitigations.utils import split_data, fetch_cohort_results
import raimitigations.dataprocessing as dp
from raimitigations.cohort import CohortDefinition, CohortManager

# Seed handed to np.random.seed() and random.seed() by the dataset generator.
# Set SEED = None to have both generators seeded non-deterministically.
SEED = 42

def _create_country_df(samples: int, sectors: dict, country_name: str):
    """Build the synthetic investment records of a single country.

    For every sector in ``sectors`` the function:

    1. draws ``int(samples * prob_occur)`` investments uniformly between the
       sector's ``min`` and ``max``;
    2. labels an investment as not bankrupt (0) when it is greater than
       ``prob_success * (largest draw - smallest draw)`` and as bankrupt (1)
       otherwise;
    3. flips the label of 10% of the rows (label noise);
    4. replaces the investment of 10% of the rows with NaN (missing values).

    The global ``np.random`` state is used, in the order listed above, so the
    caller controls reproducibility through ``np.random.seed``.

    NOTE(review): the threshold in step 2 is a fraction of the sampled range
    and is not offset by the smallest draw, so the resulting share of
    non-bankrupt rows is not necessarily ``prob_success``. Kept as-is so the
    generated data does not change.

    Args:
        samples: Number of rows requested for the country. The per-sector row
            count is truncated with ``int()``, so the total may be smaller.
        sectors: Mapping ``sector name -> {"prob_occur", "prob_success",
            "min", "max"}``.
        country_name: Value written to the ``country`` column of every row.

    Returns:
        A DataFrame with the columns ``investment``, ``sector``, ``country``
        and ``bankrupt``. The index restarts at 0 for every sector. ``None``
        is returned when no row was generated at all.
    """
    sector_frames = []
    for key, cfg in sectors.items():
        size = int(samples * cfg["prob_occur"])
        if size == 0:
            # An empty sample has no min/max to derive the threshold from;
            # such a sector simply contributes no rows.
            continue

        invest = np.random.uniform(low=cfg["min"], high=cfg["max"], size=size)
        range_invest = invest.max() - invest.min()
        bankrupt_th = cfg["prob_success"] * range_invest
        # Labels are computed before any NaN is injected into `invest`.
        bankrupt = np.where(invest > bankrupt_th, 0, 1).astype(np.int64)

        n_noise = int(size * 0.1)
        # Label noise: invert the label of a random 10% of the rows.
        flip_ind = np.random.choice(range(size), n_noise, replace=False)
        bankrupt[flip_ind] = 1 - bankrupt[flip_ind]
        # Missing values: blank the investment of another random 10%.
        nan_ind = np.random.choice(range(size), n_noise, replace=False)
        invest[nan_ind] = np.nan

        sector_frames.append(
            pd.DataFrame(
                {
                    "investment": invest,
                    "sector": [key] * size,
                    "country": [country_name] * size,
                    "bankrupt": bankrupt,
                }
            )
        )

    if not sector_frames:
        return None
    # Single concatenation instead of growing the frame inside the loop.
    return pd.concat(sector_frames, axis=0)

In [2]:
def create_df_multiple_distributions(samples: int):
    """Create a synthetic bankruptcy dataset spanning three countries.

    Both ``np.random`` and ``random`` are re-seeded with the module-level
    ``SEED`` on every call. Each country receives
    ``int(samples * sample_rate)`` rows, which are generated sector by sector
    by ``_create_country_df``.

    Args:
        samples: Total number of rows requested. Because the per-country and
            per-sector counts are truncated with ``int()``, the returned frame
            may hold slightly fewer rows.

    Returns:
        A DataFrame with the columns ``investment``, ``sector``, ``country``
        and ``bankrupt`` and a fresh 0..n-1 index.
    """
    np.random.seed(SEED)
    random.seed(SEED)
    sectors_c1 = {
        "s1": {"prob_occur":0.2, "prob_success":0.8, "min":20000, "max":100000},
        "s2": {"prob_occur":0.4, "prob_success":0.6, "min":1000, "max":15000},
        "s3": {"prob_occur":0.1, "prob_success":0.9, "min":1000000, "max":10000000},
        "s4": {"prob_occur":0.3, "prob_success":0.7, "min":4000000, "max":900000000},
    }
    sectors_c2 = {
        "s1": {"prob_occur":0.05, "prob_success":0.6, "min":1000, "max":5000},
        "s2": {"prob_occur":0.6, "prob_success":0.9, "min":100000, "max":1500000},
        "s3": {"prob_occur":0.2, "prob_success":0.5, "min":50000, "max":300000},
        "s4": {"prob_occur":0.15, "prob_success":0.8, "min":1000000, "max":10000000},
    }
    # NOTE(review): `sectors_c3` is never referenced - country "C" below reuses
    # `sectors_c2`, so countries "B" and "C" share the same sector
    # distributions. This looks like a copy-paste slip, but pointing "C" at
    # `sectors_c3` changes the generated dataset and every metric computed
    # from it, so the mapping is left untouched here. Confirm the intent
    # before changing it and re-run the notebook afterwards.
    sectors_c3 = {
        "s1": {"prob_occur":0.3, "prob_success":0.9, "min":300, "max":600},
        "s2": {"prob_occur":0.6, "prob_success":0.7, "min":5000, "max":9000},
        "s3": {"prob_occur":0.07, "prob_success":0.8, "min":4000, "max":20000},
        "s4": {"prob_occur":0.03, "prob_success":0.5, "min":600000, "max":1300000},
    }
    countries = {
        "A":{"sectors":sectors_c1, "sample_rate":0.75},
        "B":{"sectors":sectors_c2, "sample_rate":0.1},
        "C":{"sectors":sectors_c2, "sample_rate":0.15}
    }

    # Countries are generated in dict order so the random stream is consumed
    # in the same sequence on every run.
    country_frames = []
    for name, cfg in countries.items():
        n_sample = int(samples * cfg["sample_rate"])
        df_c = _create_country_df(n_sample, cfg["sectors"], name)
        if df_c is not None:
            country_frames.append(df_c)

    # One concatenation; ignore_index replaces the per-sector indices that
    # restart at 0 with a single 0..n-1 index.
    return pd.concat(country_frames, axis=0, ignore_index=True)

In [3]:
# Build the synthetic dataset, requesting 3000 rows (per-country / per-sector
# counts are truncated, so slightly fewer rows may come back), then display it.
df = create_df_multiple_distributions(3000)
df

Unnamed: 0,investment,sector,country,bankrupt
0,4.996321e+04,s1,A,1
1,9.605714e+04,s1,A,0
2,7.855952e+04,s1,A,0
3,6.789268e+04,s1,A,0
4,3.248149e+04,s1,A,1
...,...,...,...,...
2994,3.292908e+06,s4,C,1
2995,9.230735e+06,s4,C,0
2996,2.017477e+06,s4,C,1
2997,7.816826e+06,s4,C,0


In [4]:
# Relative frequency of each label among sector "s1" rows whose investment is
# below 60000 (rows with a missing investment never satisfy the comparison).
low_s1_mask = (df["sector"] == "s1") & (df["investment"] < 60000)
subset = df[low_s1_mask]
subset["bankrupt"].value_counts(normalize=True)

1    0.829787
0    0.170213
Name: bankrupt, dtype: float64

In [5]:
# Hold out 15% of the rows for evaluation; "bankrupt" is the label column.
X_train, X_test, y_train, y_test = split_data(df, label="bankrupt", test_size=0.15)

# Baseline: a single imputer -> scaler -> encoder -> estimator chain fitted on
# the whole training set. LGBMClassifier(random_state=SEED) was tried as an
# alternative estimator.
model = LogisticRegression()
baseline_steps = [
    ("imputer", dp.BasicImputer(verbose=False)),
    ("scaler", dp.DataMinMaxScaler(verbose=False)),
    ("encoder", dp.EncoderOHE(verbose=False)),
    ("estimator", model),
]
pipe = Pipeline(baseline_steps)
pipe.fit(X_train, y_train)
pred = pipe.predict_proba(X_test)

# `experiments` collects one metrics table per experiment, always broken down
# by the "country" column.
experiments = {
    "Baseline": fetch_cohort_results(X_test, y_test, pred, cohort_col=["country"]),
}
experiments["Baseline"]

Unnamed: 0,cohort,cht_query,cht_size,roc,pr,recall,f1,acc
0,all,all,450,0.637731,0.675484,0.669915,0.637375,0.637778
1,cohort_0,"(`country` == ""A"")",344,0.667306,0.718684,0.705273,0.678668,0.680233
2,cohort_1,"(`country` == ""B"")",46,0.807527,0.782609,0.821505,0.775828,0.782609
3,cohort_2,"(`country` == ""C"")",60,0.59371,0.649832,0.671374,0.647474,0.666667


In [6]:
model = LogisticRegression()

# Imputer and scaler are wrapped in a CohortManager keyed on "country";
# the encoder and the estimator stay outside of it.
cohort_transforms = [
    dp.BasicImputer(verbose=False),
    dp.DataMinMaxScaler(verbose=False),
]
cht_manager = CohortManager(transform_pipe=cohort_transforms, cohort_col=["country"])

cohort_steps = [
    ("imputer", cht_manager),
    ("encoder", dp.EncoderOHE(verbose=False)),
    ("estimator", model),
]
pipe = Pipeline(cohort_steps)
pipe.fit(X_train, y_train)
pred = pipe.predict_proba(X_test)

experiment_name = "Imputer for each Cohort"
experiments[experiment_name] = fetch_cohort_results(
    X_test, y_test, pred, cohort_col=["country"]
)
experiments[experiment_name]

Unnamed: 0,cohort,cht_query,cht_size,roc,pr,recall,f1,acc
0,all,all,450,0.672092,0.704011,0.708096,0.686405,0.686667
1,cohort_0,"(`country` == ""A"")",344,0.658224,0.724443,0.712887,0.68779,0.688953
2,cohort_1,"(`country` == ""B"")",46,0.7,0.7,0.723656,0.670014,0.673913
3,cohort_2,"(`country` == ""C"")",60,0.749037,0.720539,0.752246,0.69457,0.7


In [7]:
model = LogisticRegression()

# The estimator is now part of the CohortManager pipeline as well, with
# cohorts defined by the "country" column.
per_cohort_steps = [
    dp.BasicImputer(verbose=False),
    dp.DataMinMaxScaler(verbose=False),
    dp.EncoderOHE(verbose=False),
    model,
]
cht_manager = CohortManager(transform_pipe=per_cohort_steps, cohort_col=["country"])
cht_manager.fit(X_train, y_train)
pred = cht_manager.predict_proba(X_test)

experiment_name = "Decoupled Classifiers (country)"
experiments[experiment_name] = fetch_cohort_results(
    X_test, y_test, pred, cohort_col=["country"]
)
experiments[experiment_name]

Unnamed: 0,cohort,cht_query,cht_size,roc,pr,recall,f1,acc
0,all,all,450,0.719926,0.742963,0.714853,0.720782,0.744444
1,cohort_0,"(`country` == ""A"")",344,0.711713,0.761347,0.726475,0.730741,0.75
2,cohort_1,"(`country` == ""B"")",46,0.841935,0.8,0.83871,0.778846,0.782609
3,cohort_2,"(`country` == ""C"")",60,0.777279,0.729911,0.764442,0.710145,0.716667


In [8]:
model = LogisticRegression()

# Same full chain inside the CohortManager, but with cohorts defined by the
# "sector" column. Results are still reported per "country" so the table is
# comparable with the other experiments.
per_cohort_steps = [
    dp.BasicImputer(verbose=False),
    dp.DataMinMaxScaler(verbose=False),
    dp.EncoderOHE(verbose=False),
    model,
]
cht_manager = CohortManager(transform_pipe=per_cohort_steps, cohort_col=["sector"])
cht_manager.fit(X_train, y_train)
pred = cht_manager.predict_proba(X_test)

experiment_name = "Decoupled Classifiers (sector)"
experiments[experiment_name] = fetch_cohort_results(
    X_test, y_test, pred, cohort_col=["country"]
)
experiments[experiment_name]

Unnamed: 0,cohort,cht_query,cht_size,roc,pr,recall,f1,acc
0,all,all,450,0.769938,0.784946,0.787375,0.786055,0.793333
1,cohort_0,"(`country` == ""A"")",344,0.790549,0.814216,0.809886,0.81169,0.81686
2,cohort_1,"(`country` == ""B"")",46,0.762366,0.73913,0.772043,0.730994,0.73913
3,cohort_2,"(`country` == ""C"")",60,0.768293,0.7375,0.743902,0.649903,0.65


In [9]:
model = LogisticRegression()

# Same full chain inside the CohortManager, with cohorts defined by the
# combination of the "sector" and "country" columns. Results are still
# reported per "country".
per_cohort_steps = [
    dp.BasicImputer(verbose=False),
    dp.DataMinMaxScaler(verbose=False),
    dp.EncoderOHE(verbose=False),
    model,
]
cht_manager = CohortManager(
    transform_pipe=per_cohort_steps,
    cohort_col=["sector", "country"],
)
cht_manager.fit(X_train, y_train)
pred = cht_manager.predict_proba(X_test)

experiment_name = "Decoupled Classifiers (sector,country)"
experiments[experiment_name] = fetch_cohort_results(
    X_test, y_test, pred, cohort_col=["country"]
)
experiments[experiment_name]

Unnamed: 0,cohort,cht_query,cht_size,roc,pr,recall,f1,acc
0,all,all,450,0.842418,0.812736,0.813295,0.813011,0.82
1,cohort_0,"(`country` == ""A"")",344,0.840309,0.851995,0.824148,0.831215,0.840116
2,cohort_1,"(`country` == ""B"")",46,0.891398,0.849903,0.886022,0.860041,0.869565
3,cohort_2,"(`country` == ""C"")",60,0.842747,0.784512,0.825417,0.788484,0.8
