# Cohort Case Study 1

In [25]:
import random
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

import raimitigations.dataprocessing as dp
from raimitigations.cohort import CohortDefinition, CohortManager

SEED = 42
#SEED = None

def _create_country_df(samples: int, sectors: dict, country_name: str):
    df = None
    for key in sectors.keys():
        size = int(samples * sectors[key]["prob_occur"])
        invest = np.random.uniform(low=sectors[key]["min"], high=sectors[key]["max"], size=size)
        min_invest = min(invest)
        max_invest = max(invest)
        range_invest = max_invest - min_invest
        bankrupt_th = sectors[key]["prob_success"] * range_invest
        bankrupt = []
        for i in range(invest.shape[0]):
            if invest[i] > bankrupt_th:
                bankrupt.append(0)
            else:
                bankrupt.append(1)
        noise_ind = np.random.choice(range(size), int(size*0.1), replace=False)
        for ind in noise_ind:
            bankrupt[ind] = int(not bankrupt[ind])
        noise_ind = np.random.choice(range(size), int(size*0.1), replace=False)
        for ind in noise_ind:
            invest[ind] = np.nan
        
        country_col = [country_name for _ in range(size)]
        sector_col = [key for _ in range(size)]
        df_sector = pd.DataFrame({
            "investment":invest,
            "sector":sector_col,
            "country":country_col,
            "bankrupt":bankrupt
        })
        
        if df is None:
            df = df_sector
        else:
            df = pd.concat([df, df_sector], axis=0)
    return df

In [26]:
def create_df_multiple_distributions(samples: list):
    np.random.seed(SEED)
    random.seed(SEED)
    sectors_c1 = {
        "s1": {"prob_occur":0.2, "prob_success":0.8, "min":20000, "max":100000},
        "s2": {"prob_occur":0.4, "prob_success":0.6, "min":1000, "max":15000},
        "s3": {"prob_occur":0.1, "prob_success":0.9, "min":1000000, "max":10000000},
        "s4": {"prob_occur":0.3, "prob_success":0.7, "min":4000000, "max":900000000},
    }
    sectors_c2 = {
        "s1": {"prob_occur":0.05, "prob_success":0.6, "min":1000, "max":5000},
        "s2": {"prob_occur":0.6, "prob_success":0.9, "min":100000, "max":1500000},
        "s3": {"prob_occur":0.2, "prob_success":0.5, "min":50000, "max":300000},
        "s4": {"prob_occur":0.15, "prob_success":0.8, "min":1000000, "max":10000000},
    }
    sectors_c3 = {
        "s1": {"prob_occur":0.3, "prob_success":0.9, "min":300, "max":600},
        "s2": {"prob_occur":0.6, "prob_success":0.7, "min":5000, "max":9000},
        "s3": {"prob_occur":0.07, "prob_success":0.8, "min":4000, "max":20000},
        "s4": {"prob_occur":0.03, "prob_success":0.5, "min":600000, "max":1300000},
    }
    countries = {
        "A":{"sectors":sectors_c1, "sample_rate":0.75},
        "B":{"sectors":sectors_c2, "sample_rate":0.1},
        "C":{"sectors":sectors_c2, "sample_rate":0.15}
    }
    df = None
    for key in countries.keys():
        n_sample = int(samples * countries[key]["sample_rate"])
        df_c = _create_country_df(n_sample, countries[key]["sectors"], key)
        if df is None:
            df = df_c
        else:
            df = pd.concat([df, df_c], axis=0)
    
    idx = pd.Index([i for i in range(df.shape[0])])
    df = df.set_index(idx)
    return df

In [27]:
df = create_df_multiple_distributions(3000)
df

Unnamed: 0,investment,sector,country,bankrupt
0,4.996321e+04,s1,A,1
1,9.605714e+04,s1,A,0
2,7.855952e+04,s1,A,0
3,6.789268e+04,s1,A,0
4,3.248149e+04,s1,A,1
...,...,...,...,...
2994,3.292908e+06,s4,C,1
2995,9.230735e+06,s4,C,0
2996,2.017477e+06,s4,C,1
2997,7.816826e+06,s4,C,0


In [28]:
subset = df.query("sector == 's1' and investment < 60000")
subset['bankrupt'].value_counts(normalize=True)

1    0.829787
0    0.170213
Name: bankrupt, dtype: float64

In [29]:
def fetch_cohort_results(X, y_true, y_pred, cohort_col):
    def _metric_tuple_to_dict(metric_tuple):
        metric_dict = {
            "roc":metric_tuple[0],
            "pr":metric_tuple[2],
            "recall":metric_tuple[3],
            "f1":metric_tuple[4],
            "acc":metric_tuple[5],
        }
        return metric_dict
        
    metrics = {}
    metrics['all'] = _metric_tuple_to_dict( dp.fetch_results(y_true, y_pred, best_th_auc=True) )
    
    cht_manager = CohortManager(cohort_col=cohort_col)
    cht_manager.fit(X, y_true)
    subsets = cht_manager.get_subsets(X, y_pred)
    y_pred_dict = {}
    for cht_name in subsets.keys():
        y_pred_dict[cht_name] = subsets[cht_name]['y']
        
    subsets = cht_manager.get_subsets(X, y_true)
    for cht_name in subsets.keys():
        x_subset = subsets[cht_name]['X']
        y_subset = subsets[cht_name]['y']
        y_pred_subset = y_pred_dict[cht_name]
        metrics[cht_name] = _metric_tuple_to_dict( dp.fetch_results(y_subset, y_pred_subset, best_th_auc=True) )
        
    queries = cht_manager.get_queries()
        
    df_dict = {"cohort":[], "roc":[], "pr":[], "recall":[], "f1":[], "acc":[], "cht_query":[]}
    for key in metrics.keys():
        df_dict["cohort"].append(key)
        df_dict["roc"].append(metrics[key]["roc"])
        df_dict["pr"].append(metrics[key]["pr"].mean())
        df_dict["recall"].append(metrics[key]["recall"].mean())
        df_dict["f1"].append(metrics[key]["f1"].mean())
        df_dict["acc"].append(metrics[key]["acc"])
        if key == "all":
            df_dict["cht_query"].append("all")
        else:
            df_dict["cht_query"].append(queries[key])
    
    df = pd.DataFrame(df_dict)
    return df
        
    

In [30]:
X_train, X_test, y_train, y_test = dp.split_data(df, label="bankrupt", test_size=0.15)

#model = LGBMClassifier(random_state=SEED)
model = LogisticRegression()
pipe = Pipeline([
            ("imputer", dp.BasicImputer(verbose=False)),
            ("scaler", dp.DataMinMaxScaler(verbose=False)),
            ("encoder", dp.EncoderOHE(verbose=False)),
            ("estimator", model),
        ])
pipe.fit(X_train, y_train)
pred = pipe.predict_proba(X_test)
    
experiments = {}
experiments["Baseline"] = fetch_cohort_results(X_test, y_test, pred, cohort_col=["country"])
experiments["Baseline"]

Unnamed: 0,cohort,roc,pr,recall,f1,acc,cht_query
0,all,0.637731,0.675484,0.669915,0.637375,0.637778,all
1,cohort_0,0.667306,0.718684,0.705273,0.678668,0.680233,(`country` == 'A')
2,cohort_1,0.807527,0.782609,0.821505,0.775828,0.782609,(`country` == 'B')
3,cohort_2,0.59371,0.649832,0.671374,0.647474,0.666667,(`country` == 'C')


In [31]:
#model = LGBMClassifier(random_state=SEED)
model = LogisticRegression()

cht_manager = CohortManager(
    transform_pipe=[
        dp.BasicImputer(verbose=False),
        dp.DataMinMaxScaler(verbose=False),
    ],
    cohort_col=["country"]
)

pipe = Pipeline([
            ("imputer", cht_manager),
            ("encoder", dp.EncoderOHE(verbose=False)),
            ("estimator", model),
        ])
pipe.fit(X_train, y_train)
pred = pipe.predict_proba(X_test)

experiments["Imputer for each Cohort"] = fetch_cohort_results(X_test, y_test, pred, cohort_col=["country"])
experiments["Imputer for each Cohort"]

Unnamed: 0,cohort,roc,pr,recall,f1,acc,cht_query
0,all,0.672092,0.704011,0.708096,0.686405,0.686667,all
1,cohort_0,0.658224,0.724443,0.712887,0.68779,0.688953,(`country` == 'A')
2,cohort_1,0.7,0.7,0.723656,0.670014,0.673913,(`country` == 'B')
3,cohort_2,0.749037,0.720539,0.752246,0.69457,0.7,(`country` == 'C')


In [32]:
#model = LGBMClassifier(random_state=SEED)
model = LogisticRegression()

cht_manager = CohortManager(
    transform_pipe=[
        dp.BasicImputer(verbose=False),
        dp.DataMinMaxScaler(verbose=False),
        dp.EncoderOHE(verbose=False),
        model
    ],
    cohort_col=["country"]
)
cht_manager.fit(X_train, y_train)
pred = cht_manager.predict_proba(X_test)

experiments["Decoupled Classifiers"] = fetch_cohort_results(X_test, y_test, pred, cohort_col=["country"])
experiments["Decoupled Classifiers"]

Unnamed: 0,cohort,roc,pr,recall,f1,acc,cht_query
0,all,0.719926,0.742963,0.714853,0.720782,0.744444,all
1,cohort_0,0.711713,0.761347,0.726475,0.730741,0.75,(`country` == 'A')
2,cohort_1,0.841935,0.8,0.83871,0.778846,0.782609,(`country` == 'B')
3,cohort_2,0.777279,0.729911,0.764442,0.710145,0.716667,(`country` == 'C')
