# Cohort Case Study 1

In [43]:
import random
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.pipeline import Pipeline

import raimitigations.dataprocessing as dp
from raimitigations.cohort import CohortDefinition, CohortManager

# Seed value meant for the random number generators.
# NOTE(review): no live code in this section applies it -- confirm where
# (or whether) the generators are actually seeded.
SEED = 42

def _create_country_df(samples: int, sectors: dict, country_name: str):
    """
    Create a synthetic investment dataset for a single country.

    For every sector, ``int(samples * prob_occur)`` investment values are
    drawn uniformly between the sector's ``min`` and ``max``. A row is
    labeled 0 (not bankrupt) when its investment is greater than
    ``prob_success * (largest drawn value - smallest drawn value)`` and
    1 (bankrupt) otherwise. Noise is then injected: 10% of the labels are
    flipped and, independently, 10% of the investment values are replaced
    with NaN.

    The global NumPy random state (``np.random``) is used, so seed it
    beforehand if reproducible output is required.

    :param samples: number of rows requested for the whole country;
    :param sectors: dictionary mapping a sector name to a dictionary with
        the keys "prob_occur", "prob_success", "min", and "max";
    :param country_name: value stored in the "country" column;
    :return: a data frame with the columns "investment", "sector",
        "country", and "bankrupt", or None when no sector produced any row.
        Each sector keeps its own 0-based index, so index labels repeat
        across sectors.
    """
    frames = []
    for sector_name, cfg in sectors.items():
        size = int(samples * cfg["prob_occur"])
        if size <= 0:
            # Nothing to draw for this sector. Taking the min/max of an
            # empty sample below would raise a ValueError, so skip it.
            continue

        invest = np.random.uniform(low=cfg["min"], high=cfg["max"], size=size)

        # NOTE(review): the threshold is a fraction of the sampled range and
        # is not offset by the smallest value -- confirm this is intended.
        bankrupt_th = cfg["prob_success"] * (invest.max() - invest.min())
        bankrupt = np.where(invest > bankrupt_th, 0, 1)

        # Label noise: flip 10% of the labels (indices drawn without
        # replacement, so every selected label is flipped exactly once).
        n_noise = int(size * 0.1)
        flip_idx = np.random.choice(range(size), n_noise, replace=False)
        bankrupt[flip_idx] = 1 - bankrupt[flip_idx]

        # Missing values: blank out 10% of the investments. This happens
        # after labeling, so the labels of these rows are already set.
        nan_idx = np.random.choice(range(size), n_noise, replace=False)
        invest[nan_idx] = np.nan

        frames.append(
            pd.DataFrame({
                "investment": invest,
                "sector": [sector_name] * size,
                "country": [country_name] * size,
                "bankrupt": bankrupt,
            })
        )

    if not frames:
        return None
    # A single concat at the end avoids re-copying the accumulated frame
    # on every iteration.
    return pd.concat(frames, axis=0)

In [48]:
def create_df_two_distributions(samples: int):
    """
    Create a synthetic dataset covering two countries, "A" and "B", whose
    sectors follow different investment distributions.

    Country "A" receives 80% of the requested samples and country "B" the
    remaining 20%. Each country defines the sectors "s1" to "s4" with its
    own occurrence rate, success factor, and investment range; the rows
    themselves are generated by ``_create_country_df``.

    The random number generators are not seeded here, so consecutive calls
    return different data. Seed them before calling this function if a
    reproducible dataset is needed.

    :param samples: total number of samples requested (split between the
        two countries according to their sample rate);
    :return: the data frames of both countries concatenated row-wise, or
        None if no country produced any data.
    """
    sectors_c1 = {
        "s1": {"prob_occur":0.2, "prob_success":0.8, "min":20000, "max":100000},
        "s2": {"prob_occur":0.4, "prob_success":0.6, "min":1000, "max":15000},
        "s3": {"prob_occur":0.1, "prob_success":0.9, "min":1000000, "max":10000000},
        "s4": {"prob_occur":0.3, "prob_success":0.7, "min":4000000, "max":900000000},
    }
    sectors_c2 = {
        "s1": {"prob_occur":0.05, "prob_success":0.6, "min":1000, "max":5000},
        "s2": {"prob_occur":0.6, "prob_success":0.9, "min":100000, "max":1500000},
        "s3": {"prob_occur":0.2, "prob_success":0.5, "min":50000, "max":300000},
        "s4": {"prob_occur":0.15, "prob_success":0.8, "min":1000000, "max":10000000},
    }
    countries = {
        "A":{"sectors":sectors_c1, "sample_rate":0.8},
        "B":{"sectors":sectors_c2, "sample_rate":0.2}
    }

    frames = []
    for country_name, cfg in countries.items():
        n_sample = int(samples * cfg["sample_rate"])
        df_c = _create_country_df(n_sample, cfg["sectors"], country_name)
        # Skip a country for which the helper returned no data frame.
        if df_c is not None:
            frames.append(df_c)

    if not frames:
        return None
    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(frames, axis=0)
    
# Build the synthetic dataset, requesting 1000 samples in total; the bare
# `df` on the last line makes the notebook cell display the data frame.
df = create_df_two_distributions(1000)
df

Unnamed: 0,investment,sector,country,bankrupt
0,3.642943e+04,s1,A,1
1,5.996472e+04,s1,A,1
2,9.683313e+04,s1,A,0
3,4.748264e+04,s1,A,1
4,,s1,A,1
...,...,...,...,...
25,,s4,B,0
26,3.017386e+06,s4,B,1
27,3.726918e+06,s4,B,1
28,1.878654e+06,s4,B,1


In [36]:
# Relative frequency of each "bankrupt" label among the sector "s1" rows
# whose investment is below 60000 (rows with a missing investment never
# satisfy the comparison, so they are left out).
subset = df[(df["sector"] == "s1") & (df["investment"] < 60000)]
subset["bankrupt"].value_counts(normalize=True)

1    1.0
Name: bankrupt, dtype: float64