In [66]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoost, CatBoostClassifier

In [67]:
train_df = pd.read_parquet("../data/train_data.pqt")
test_df = pd.read_parquet("../data/test_data.pqt")
train_df.head(3)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_1,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,0,month_2,1.049605,0.831916,2.458609,1.053805,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.948812,0.499716,0.785029,0.551904,0.696576,0.990157,0.298873,0.945969,"{α, γ}",{other}
2,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.946458,0.442244,0.87705,0.551044,0.663243,0.810065,0.294829,0.956958,"{α, γ}",{other}


In [68]:
test_df.head(5)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,200000,month_4,-0.096224,0.335496,-0.125995,-0.095578,channel_code_12,city_14,city_type_0,,...,0.010952,0.946066,0.407762,-0.15395,0.548895,0.54102,0.031742,0.257278,0.561353,{α}
1,200000,month_5,-0.024255,-0.059806,-0.124295,-0.023381,channel_code_12,city_14,city_type_0,,...,0.006812,0.945281,0.396267,-0.150505,0.549468,0.552131,0.237817,0.264211,0.715199,{α}
2,200000,month_6,0.045988,0.049418,-0.125995,0.047079,channel_code_12,city_14,city_type_0,,...,0.006812,0.945281,0.396267,-0.1528,0.549468,0.54102,0.387566,0.268543,0.836079,
3,200001,month_4,-0.156722,-0.20492,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,,...,-0.028584,,,-0.165588,,,-0.201123,,,{α}
4,200001,month_5,-0.156722,-0.20492,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,,...,-0.028584,,,-0.165588,,,-0.201123,,,{α}


In [69]:
for col in train_df.columns:
    if train_df[col].dtype in ['int64', 'float64']:
        mean_val = train_df[col].mean()
        train_df[col] = train_df[col].fillna(mean_val)
    else:
        mod_val = train_df[train_df[col].notna()][col].mode()
        train_df[col] = train_df[col].fillna(mod_val[0])
        train_df[col] = train_df[col].astype("category")
for col in test_df.columns:
    if test_df[col].dtype in ['int64', 'float64']:
        mean_val = test_df[col].mean()
        test_df[col] = test_df[col].fillna(mean_val)
    else:
        if col != 'start_cluster':
            mod_val = test_df[test_df[col].notna()][col].mode()
            test_df[col] = test_df[col].fillna(mod_val[0])
            test_df[col] = test_df[col].astype("category")

In [70]:
m1_df = train_df[train_df['date'] == 'month_1']
m2_df = train_df[train_df['date'] == 'month_2']
m3_df = train_df[train_df['date'] == 'month_3']

m1_df = m1_df.merge(m2_df[['id','start_cluster']],how='left',on='id')
m2_df = m2_df.merge(m3_df[['id','start_cluster']],how='left',on='id')
mdf = test_df[test_df['date'] == 'month_4']

mdf = mdf.merge(test_df[test_df['date'] == 'month_5'][['id','start_cluster']],how='left',on='id')
cluster_train = pd.concat([m1_df,m2_df,mdf])
cluster_train = cluster_train.rename(columns={'start_cluster_y':'target'})
cluster_train = cluster_train.drop(['date','end_cluster'],axis=1)
cluster_train.head(5)

Unnamed: 0,id,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,ogrn_days_end_month,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster_x,target
0,0,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,-0.488553,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}","{α, γ}"
1,1,-0.081586,-0.09186,-0.11404,-0.08089,channel_code_2,city_14,city_type_0,index_city_code_46,0.324343,...,0.945281,0.407762,0.369318,0.567093,0.785465,-0.184002,0.253523,0.462452,{other},{other}
2,2,-0.154685,-0.186795,-0.122805,-0.154215,channel_code_12,city_613,city_type_306,index_city_code_46,-0.256297,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.178674,0.252657,0.440474,{α},{α}
3,3,-0.156643,-0.204861,-0.12566,-0.156179,channel_code_14,city_21,city_type_0,index_city_code_46,-1.185321,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.37454,{α},{α}
4,4,-0.138847,-0.182486,-0.12563,-0.138328,channel_code_8,city_21,city_type_0,index_city_code_46,-1.417577,...,0.957443,0.672129,-0.078233,0.558209,0.707687,-0.178408,0.252946,0.440474,{α},{α}


In [43]:
X = cluster_train.drop(['id','target'],axis=1)
y = cluster_train["target"]
cat_cols = X.select_dtypes(exclude=['number']).columns.to_list()
cat_f = [X.columns.get_loc(column_name) for column_name in cat_cols if column_name in X.columns.tolist()]
x_train, x_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.3,
                                                  random_state=42)

In [44]:
cl_model  = CatBoostClassifier(random_state= 42,
                                 cat_features = cat_f,
                                 iterations=30, 
                                 thread_count=8,
                                 learning_rate=0.3,
                                 l2_leaf_reg= 6.769230769230769,
                                 depth= 8,
                                 custom_metric=['AUC:hints=skip_train~false'])
cl_model.fit(x_train,y_train, 
              eval_set=(x_val, y_val),
              plot=True,
              verbose=25
              )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.5055891	test: 0.4793383	best: 0.4793383 (0)	total: 23.8s	remaining: 11m 30s
25:	learn: 0.3048624	test: 0.2909394	best: 0.2909394 (25)	total: 6m 35s	remaining: 1m
29:	learn: 0.3024364	test: 0.2892138	best: 0.2892138 (29)	total: 7m 34s	remaining: 0us

bestTest = 0.2892138392
bestIteration = 29


<catboost.core.CatBoostClassifier at 0x1e16f58faa0>

In [71]:
cluster_test = test_df[test_df['date'] =='month_5']
cluster_test = cluster_test.rename(columns={'start_cluster':'start_cluster_x'})
cluster_test.head(3)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster_x
1,200000,month_5,-0.024255,-0.059806,-0.124295,-0.023381,channel_code_12,city_14,city_type_0,index_city_code_46,...,0.006812,0.945281,0.396267,-0.150505,0.549468,0.552131,0.237817,0.264211,0.715199,{α}
4,200001,month_5,-0.156722,-0.20492,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,index_city_code_46,...,-0.028584,0.946254,0.411172,-0.165588,0.551103,0.507374,-0.201123,0.255138,0.451548,{α}
7,200002,month_5,0.572242,1.502779,-0.125995,0.574963,channel_code_12,city_14,city_type_0,index_city_code_78,...,0.499912,0.949989,0.522704,2.442243,0.56394,0.84102,3.313686,0.256701,0.550364,{other}


In [72]:
yp_proba = cl_model.predict_proba(cluster_test.drop(['date','id'],axis=1))
y_pred =  yp_proba / yp_proba.sum(axis=1, keepdims=True)
max_column_indices = np.argmax(y_pred, axis=1)
max_column_names = [cl_model.classes_[idx] for idx in max_column_indices]
max_column_names

['{α}',
 '{α}',
 '{other}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α, μ}',
 '{α}',
 '{α}',
 '{}',
 '{α}',
 '{α}',
 '{α}',
 '{}',
 '{α}',
 '{α}',
 '{α, γ}',
 '{α}',
 '{α}',
 '{α}',
 '{α, β}',
 '{α}',
 '{α}',
 '{α}',
 '{α, β}',
 '{α}',
 '{α}',
 '{α}',
 '{α, ε, η}',
 '{α}',
 '{α, η}',
 '{α, δ}',
 '{α}',
 '{α, γ}',
 '{α, ε}',
 '{α}',
 '{other}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α, η}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α, η}',
 '{α}',
 '{α, η}',
 '{α}',
 '{α}',
 '{α}',
 '{α, η}',
 '{α}',
 '{α}',
 '{α}',
 '{α, η}',
 '{other}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{}',
 '{α}',
 '{α}',
 '{α}',
 '{}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α, β}',
 '{α}',
 '{α, δ}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{α}',
 '{}',
 '{α, γ}',
 '{α}',
 '{α, ε, η}',
 '{α}',
 '{α}',
 '{α, η}',
 '{α}',
 '{α}',
 '{α}',
 '{other}',
 '{α}',
 '{α}',
 '{other}',
 '{α, β}',
 '{α, μ}',
 '{α, η}',
 '{α}',
 '{other}',
 '{α}',
 '{α}',
 '{α, δ}',
 '{α, γ}',
 '{α, γ

In [73]:
mcr_df = test_df[test_df['date'] =='month_6']
mcr_df = mcr_df.reset_index(drop=True)
mcr_df['start_cluster'] = max_column_names

In [74]:
mcr_df

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,200000,month_6,0.045988,0.049418,-0.125995,0.047079,channel_code_12,city_14,city_type_0,index_city_code_46,...,0.006812,0.945281,0.396267,-0.152800,0.549468,0.541020,0.387566,0.268543,0.836079,{α}
1,200001,month_6,-0.156722,-0.204920,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,index_city_code_46,...,-0.028584,0.946254,0.411172,-0.165588,0.551103,0.507374,-0.201123,0.255138,0.451548,{α}
2,200002,month_6,-0.048015,0.448252,-0.125995,-0.047215,channel_code_12,city_14,city_type_0,index_city_code_78,...,0.123154,0.946850,0.453739,2.614870,0.565087,0.818798,4.449125,0.258723,0.627287,{other}
3,200003,month_6,-0.156579,-0.204813,-0.125501,-0.156115,channel_code_7,city_31,city_type_0,index_city_code_46,...,-0.028584,0.946254,0.411172,-0.165588,0.551103,0.507374,-0.201123,0.255138,0.451548,{α}
4,200004,month_6,-0.153379,-0.201932,-0.125995,-0.154155,channel_code_7,city_0,city_type_0,index_city_code_46,...,-0.027573,0.944889,0.396267,-0.165324,0.547032,0.418798,-0.201123,0.250924,0.374540,{α}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,299995,month_6,-0.153707,-0.202806,-0.125498,-0.153234,channel_code_14,city_22,city_type_0,index_city_code_29,...,-0.028584,0.946254,0.411172,-0.165588,0.551103,0.507374,-0.201123,0.255138,0.451548,{α}
99996,299996,month_6,-0.154929,-0.197878,-0.125873,-0.154459,channel_code_1,city_96,city_type_0,index_city_code_66,...,-0.028584,0.944497,0.384773,-0.155776,0.549755,0.507687,-0.191186,0.252657,0.440474,{α}
99997,299997,month_6,-0.105294,-0.141429,-0.104590,-0.104671,channel_code_17,city_85,city_type_0,index_city_code_103,...,-0.028584,0.944497,0.384773,0.087901,0.551904,0.685465,0.090620,0.258723,0.594320,{α}
99998,299998,month_6,-0.155350,-0.203711,-0.125995,-0.155980,channel_code_9,city_25,city_type_0,index_city_code_30,...,-0.028584,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.374540,{α}


In [75]:
test_df = pd.concat([test_df[test_df['date'] != 'month_6'],mcr_df])
test_df

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,200000,month_4,-0.096224,0.335496,-0.125995,-0.095578,channel_code_12,city_14,city_type_0,index_city_code_46,...,0.010952,0.946066,0.407762,-0.153950,0.548895,0.541020,0.031742,0.257278,0.561353,{α}
1,200000,month_5,-0.024255,-0.059806,-0.124295,-0.023381,channel_code_12,city_14,city_type_0,index_city_code_46,...,0.006812,0.945281,0.396267,-0.150505,0.549468,0.552131,0.237817,0.264211,0.715199,{α}
3,200001,month_4,-0.156722,-0.204920,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,index_city_code_46,...,-0.028584,0.946254,0.411172,-0.165588,0.551103,0.507374,-0.201123,0.255138,0.451548,{α}
4,200001,month_5,-0.156722,-0.204920,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,index_city_code_46,...,-0.028584,0.946254,0.411172,-0.165588,0.551103,0.507374,-0.201123,0.255138,0.451548,{α}
6,200002,month_4,-0.096506,0.185905,-0.125995,-0.095856,channel_code_12,city_14,city_type_0,index_city_code_78,...,0.510730,0.950774,0.545693,0.715525,0.554913,0.718798,0.445811,0.254968,0.495419,{other}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,299995,month_6,-0.153707,-0.202806,-0.125498,-0.153234,channel_code_14,city_22,city_type_0,index_city_code_29,...,-0.028584,0.946254,0.411172,-0.165588,0.551103,0.507374,-0.201123,0.255138,0.451548,{α}
99996,299996,month_6,-0.154929,-0.197878,-0.125873,-0.154459,channel_code_1,city_96,city_type_0,index_city_code_66,...,-0.028584,0.944497,0.384773,-0.155776,0.549755,0.507687,-0.191186,0.252657,0.440474,{α}
99997,299997,month_6,-0.105294,-0.141429,-0.104590,-0.104671,channel_code_17,city_85,city_type_0,index_city_code_103,...,-0.028584,0.944497,0.384773,0.087901,0.551904,0.685465,0.090620,0.258723,0.594320,{α}
99998,299998,month_6,-0.155350,-0.203711,-0.125995,-0.155980,channel_code_9,city_25,city_type_0,index_city_code_30,...,-0.028584,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.374540,{α}


In [76]:
for col in test_df.columns:
    if test_df[col].dtype in ['int64', 'float64']:
        mean_val = test_df[col].mean()
        test_df[col] = test_df[col].fillna(mean_val)
    else:
        if col != 'start_cluster':
            mod_val = test_df[test_df[col].notna()][col].mode()
            test_df[col] = test_df[col].fillna(mod_val[0])
            test_df[col] = test_df[col].astype("category")
test_df['id'] = test_df['id'].astype(int)
test_df.head(5)

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,200000,month_4,-0.096224,0.335496,-0.125995,-0.095578,channel_code_12,city_14,city_type_0,index_city_code_46,...,0.010952,0.946066,0.407762,-0.15395,0.548895,0.54102,0.031742,0.257278,0.561353,{α}
1,200000,month_5,-0.024255,-0.059806,-0.124295,-0.023381,channel_code_12,city_14,city_type_0,index_city_code_46,...,0.006812,0.945281,0.396267,-0.150505,0.549468,0.552131,0.237817,0.264211,0.715199,{α}
3,200001,month_4,-0.156722,-0.20492,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,index_city_code_46,...,-0.028584,0.946254,0.411172,-0.165588,0.551103,0.507374,-0.201123,0.255138,0.451548,{α}
4,200001,month_5,-0.156722,-0.20492,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,index_city_code_46,...,-0.028584,0.946254,0.411172,-0.165588,0.551103,0.507374,-0.201123,0.255138,0.451548,{α}
6,200002,month_4,-0.096506,0.185905,-0.125995,-0.095856,channel_code_12,city_14,city_type_0,index_city_code_78,...,0.51073,0.950774,0.545693,0.715525,0.554913,0.718798,0.445811,0.254968,0.495419,{other}


In [77]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    return sum(weights * classes_roc_auc)

In [78]:
def calcEntropy(x):
    #Function to calculate entropy
    #-sum(p_i log_2(p_i)) where pi is each possible value in dataset 

    stat = pd.Series(x)
    stat = stat.value_counts().reset_index()
    stat = stat.assign(perc_cnt = stat['count']/ sum(stat['count']))

    stat = stat.assign(entropy = -stat['perc_cnt'] * np.log2(stat['perc_cnt']))

    stat['total_entropy'] =sum(stat['entropy'])
    return sum(stat['entropy'])


def calcIG(x, y):
    #1.Calc original entropy
    entropy_origin = calcEntropy(y)

    #2. Calculate entropy after deviding features
    x = pd.Series(x)
    x_df = x.value_counts().reset_index()
    x_df['percCount'] = x_df['count'] / sum(x_df['count'])

    #2.1 Check is binarycolumn
    if x_df.shape[0]<=2:
        x_df['entropy'] = x.apply(lambda bin: calcEntropy(y[x[x==bin].index])[1])
    else:

        df_ =pd.DataFrame(data=np.array([x,y]).transpose(), columns=['x', 'y']).sort_values(by='x')
        df_['moving_average'] = df_['x'].rolling(window=2).mean()
        min_entropy, min_aver,min_left,min_right = 1000, 0, 0, 0
        print(len(df_['moving_average'][1:].unique()))
        aveg_move = df_['moving_average'][1:].unique()
        if len(aveg_move) > 100:
            _, aveg_move = pd.qcut(df_['moving_average'][1:].unique(), 15,retbins='bins')
        for aver in aveg_move:
            left_node = df_.query('x < @aver')['y']
            right_node = df_.query('x > @aver')['y']
            left_node_entropy = calcEntropy(left_node)
            right_node_entropy = calcEntropy(right_node)

            total_entropy = left_node_entropy * (left_node.shape[0] / df_.shape[0])  + right_node_entropy * (right_node.shape[0] / df_.shape[0])
            if total_entropy < min_entropy:
                min_entropy = total_entropy
                min_left, min_right = left_node_entropy, right_node_entropy
                min_aver = aver

            #print(f"С treshold {aver} total weighted entropy=  {total_entropy}")
        left_df = df_[df_['x'] < min_aver]
        right_df =df_[df_['x'] >= min_aver]
        return pd.DataFrame({"diff":[f"<{min_aver}", f">={min_aver}"],
                            "count":[left_df.shape[0], right_df.shape[0]] ,
                             "percCount":[left_df.shape[0] / df_.shape[0], right_df.shape[0]/ df_.shape[0]] ,
                             "entropy":[min_left, min_right],
                             "weighted_entropy":[min_left*(left_df.shape[0] / df_.shape[0]), min_right *( right_df.shape[0]/ df_.shape[0])],
                            "name" : [x.name,x.name]
                            
                            }),entropy_origin - min_entropy,min_aver

    # 3. Calculate weighted average
    x_df['weighted_entropy'] = x_df['entropy'] * x_df['percCount']

    #4. Calculate Information gain
    x_df['IG'] = entropy_origin - sum(x_df['weighted_entropy'] / x_df.shape[0])

    return x_df, x_df['IG'][0], 1

In [79]:
cluster_weights = pd.read_excel("cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()
weights_dict

{'{}': 1,
 '{α, ε, η}': 1,
 '{α, ε, θ}': 1,
 '{α, θ}': 1,
 '{α, π}': 1,
 '{α, ε}': 2,
 '{α, η}': 2,
 '{α, μ}': 2,
 '{α}': 2,
 '{λ}': 2,
 '{other}': 3,
 '{α, β}': 3,
 '{α, γ}': 3,
 '{α, δ}': 3,
 '{α, ε, ψ}': 3,
 '{α, λ}': 3,
 '{α, ψ}': 3}

In [80]:
m1_df = train_df[train_df['date'] == 'month_1']
m2_df = train_df[train_df['date'] == 'month_2']
m3_df = train_df[train_df['date'] == 'month_3']
df = m3_df.merge(m2_df,on='id',how='left')
df = df.merge(m1_df,on='id',how='left')
df.head(5)

Unnamed: 0,id,date_x,balance_amt_avg_x,balance_amt_max_x,balance_amt_min_x,balance_amt_day_avg_x,channel_code_x,city_x,city_type_x,index_city_code_x,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,1,month_3,-0.090605,-0.114275,-0.114119,-0.089937,channel_code_2,city_14,city_type_0,index_city_code_46,...,0.945281,0.407762,0.369318,0.567093,0.785465,-0.184002,0.253523,0.462452,{other},{other}
2,2,month_3,-0.148737,-0.187003,-0.112416,-0.148249,channel_code_12,city_613,city_type_306,index_city_code_46,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.178674,0.252657,0.440474,{α},{α}
3,3,month_3,-0.156522,-0.204718,-0.125759,-0.156058,channel_code_14,city_21,city_type_0,index_city_code_46,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.37454,{α},{α}
4,4,month_3,-0.141798,-0.170262,-0.125672,-0.141289,channel_code_8,city_21,city_type_0,index_city_code_46,...,0.957443,0.672129,-0.078233,0.558209,0.707687,-0.178408,0.252946,0.440474,{α},{α}


In [81]:
train_df = df
train_df = train_df.drop(['date','date_x','date_y'],axis=1)
train_df.head(5)

Unnamed: 0,id,balance_amt_avg_x,balance_amt_max_x,balance_amt_min_x,balance_amt_day_avg_x,channel_code_x,city_x,city_type_x,index_city_code_x,ogrn_days_end_month_x,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,-0.488553,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,1,-0.090605,-0.114275,-0.114119,-0.089937,channel_code_2,city_14,city_type_0,index_city_code_46,0.324343,...,0.945281,0.407762,0.369318,0.567093,0.785465,-0.184002,0.253523,0.462452,{other},{other}
2,2,-0.148737,-0.187003,-0.112416,-0.148249,channel_code_12,city_613,city_type_306,index_city_code_46,-0.256297,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.178674,0.252657,0.440474,{α},{α}
3,3,-0.156522,-0.204718,-0.125759,-0.156058,channel_code_14,city_21,city_type_0,index_city_code_46,-1.185321,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.37454,{α},{α}
4,4,-0.141798,-0.170262,-0.125672,-0.141289,channel_code_8,city_21,city_type_0,index_city_code_46,-1.417577,...,0.957443,0.672129,-0.078233,0.558209,0.707687,-0.178408,0.252946,0.440474,{α},{α}


In [82]:
m1_df = test_df[test_df['date'] == 'month_4']
m2_df = test_df[test_df['date'] == 'month_5']
m3_df = test_df[test_df['date'] == 'month_6']
df = m3_df.merge(m2_df,on='id',how='left')
df = df.merge(m1_df,on='id',how='left')
df.head(5)

Unnamed: 0,id,date_x,balance_amt_avg_x,balance_amt_max_x,balance_amt_min_x,balance_amt_day_avg_x,channel_code_x,city_x,city_type_x,index_city_code_x,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,200000,month_6,0.045988,0.049418,-0.125995,0.047079,channel_code_12,city_14,city_type_0,index_city_code_46,...,0.010952,0.946066,0.407762,-0.15395,0.548895,0.54102,0.031742,0.257278,0.561353,{α}
1,200001,month_6,-0.156722,-0.20492,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,index_city_code_46,...,-0.028584,0.946254,0.411172,-0.165588,0.551103,0.507374,-0.201123,0.255138,0.451548,{α}
2,200002,month_6,-0.048015,0.448252,-0.125995,-0.047215,channel_code_12,city_14,city_type_0,index_city_code_78,...,0.51073,0.950774,0.545693,0.715525,0.554913,0.718798,0.445811,0.254968,0.495419,{other}
3,200003,month_6,-0.156579,-0.204813,-0.125501,-0.156115,channel_code_7,city_31,city_type_0,index_city_code_46,...,-0.028584,0.946254,0.411172,-0.165588,0.551103,0.507374,-0.201123,0.255138,0.451548,{α}
4,200004,month_6,-0.153379,-0.201932,-0.125995,-0.154155,channel_code_7,city_0,city_type_0,index_city_code_46,...,,,,,,,,,,


In [83]:
test_df = df 
test_df = test_df.drop(['date','date_x','date_y'],axis=1)

In [84]:
train_df.head(5)

Unnamed: 0,id,balance_amt_avg_x,balance_amt_max_x,balance_amt_min_x,balance_amt_day_avg_x,channel_code_x,city_x,city_type_x,index_city_code_x,ogrn_days_end_month_x,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,-0.488553,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,1,-0.090605,-0.114275,-0.114119,-0.089937,channel_code_2,city_14,city_type_0,index_city_code_46,0.324343,...,0.945281,0.407762,0.369318,0.567093,0.785465,-0.184002,0.253523,0.462452,{other},{other}
2,2,-0.148737,-0.187003,-0.112416,-0.148249,channel_code_12,city_613,city_type_306,index_city_code_46,-0.256297,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.178674,0.252657,0.440474,{α},{α}
3,3,-0.156522,-0.204718,-0.125759,-0.156058,channel_code_14,city_21,city_type_0,index_city_code_46,-1.185321,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.37454,{α},{α}
4,4,-0.141798,-0.170262,-0.125672,-0.141289,channel_code_8,city_21,city_type_0,index_city_code_46,-1.417577,...,0.957443,0.672129,-0.078233,0.558209,0.707687,-0.178408,0.252946,0.440474,{α},{α}


In [85]:
test_df.head(5)

Unnamed: 0,id,balance_amt_avg_x,balance_amt_max_x,balance_amt_min_x,balance_amt_day_avg_x,channel_code_x,city_x,city_type_x,index_city_code_x,ogrn_days_end_month_x,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,200000,0.045988,0.049418,-0.125995,0.047079,channel_code_12,city_14,city_type_0,index_city_code_46,-1.533705,...,0.010952,0.946066,0.407762,-0.15395,0.548895,0.54102,0.031742,0.257278,0.561353,{α}
1,200001,-0.156722,-0.20492,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,index_city_code_46,0.092087,...,-0.028584,0.946254,0.411172,-0.165588,0.551103,0.507374,-0.201123,0.255138,0.451548,{α}
2,200002,-0.048015,0.448252,-0.125995,-0.047215,channel_code_12,city_14,city_type_0,index_city_code_78,-1.069193,...,0.51073,0.950774,0.545693,0.715525,0.554913,0.718798,0.445811,0.254968,0.495419,{other}
3,200003,-0.156579,-0.204813,-0.125501,-0.156115,channel_code_7,city_31,city_type_0,index_city_code_46,-0.256297,...,-0.028584,0.946254,0.411172,-0.165588,0.551103,0.507374,-0.201123,0.255138,0.451548,{α}
4,200004,-0.153379,-0.201932,-0.125995,-0.154155,channel_code_7,city_0,city_type_0,index_city_code_46,0.672727,...,,,,,,,,,,


In [86]:
for col in train_df.columns:
    if train_df[col].dtype in ['int64', 'float64']:
        mean_val = train_df[col].mean()
        train_df[col] = train_df[col].fillna(mean_val)
    else:
        mod_val = train_df[train_df[col].notna()][col].mode()
        train_df[col] = train_df[col].fillna(mod_val[0])
        train_df[col] = train_df[col].astype("category")
for col in test_df.columns:
    if test_df[col].dtype in ['int64', 'float64']:
        mean_val = test_df[col].mean()
        test_df[col] = test_df[col].fillna(mean_val)
    else:
            mod_val = test_df[test_df[col].notna()][col].mode()
            test_df[col] = test_df[col].fillna(mod_val[0])
            test_df[col] = test_df[col].astype("category")

In [87]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df['end_cluster_x'])
mypdf = pd.DataFrame()
trend = []
mina = []
info = []
col_list = []
for col in train_df.columns:
    if col in ['id',"end_cluster_x",'end_cluster_y','end_cluster']:continue
    x = train_df[col]
    print(col,end=' ')
    if x.dtype in ('category','object'):
        x = label_encoder.fit_transform(x)
    stat,trend_entr,min_aver = calcIG(x,y)
    trend.append(trend_entr)
    mina.append(min_aver)
    col_list.append(col)
    mypdf = pd.concat([mypdf, stat])

balance_amt_avg_x 166712
balance_amt_max_x 155324
balance_amt_min_x 121862
balance_amt_day_avg_x 165846
channel_code_x 85
city_x 12829
city_type_x 10909
index_city_code_x 455
ogrn_days_end_month_x 63
ogrn_days_end_quarter_x 185
ogrn_month_x 23
ogrn_year_x 41
ft_registration_date_x 9998
max_founderpres_x 9497
min_founderpres_x 9284
ogrn_exist_months_x 475
okved_x 173
segment_x 7
sum_of_paym_2m_x 118985
sum_of_paym_6m_x 136719
sum_of_paym_1y_x 147160
sum_a_oper_1m_x 6095
cnt_a_oper_1m_x 138
sum_b_oper_1m_x 1619
cnt_b_oper_1m_x 60
sum_c_oper_1m_x 29450
cnt_c_oper_1m_x 647
sum_deb_d_oper_1m_x 33720
cnt_deb_d_oper_1m_x 192
sum_cred_d_oper_1m_x 7506
cnt_cred_d_oper_1m_x 195
sum_deb_e_oper_1m_x 121350
cnt_deb_e_oper_1m_x 1640
cnt_days_deb_e_oper_1m_x 65
sum_cred_e_oper_1m_x 100953
cnt_cred_e_oper_1m_x 1251
cnt_days_cred_e_oper_1m_x 65
sum_deb_f_oper_1m_x 48848
cnt_deb_f_oper_1m_x 412
cnt_days_deb_f_oper_1m_x 65
sum_cred_f_oper_1m_x 4076
cnt_cred_f_oper_1m_x 195
cnt_days_cred_f_oper_1m_x 63
su

In [88]:
entropy_df = pd.DataFrame(columns = ['entropy_change','min_aver','feature'])
entropy_df['entropy_change'] = trend
entropy_df['min_aver'] = mina 
entropy_df['feature'] = col_list
entropy_df

Unnamed: 0,entropy_change,min_aver,feature
0,0.096960,-0.152721,balance_amt_avg_x
1,0.124642,-0.202255,balance_amt_max_x
2,0.388460,-0.125995,balance_amt_min_x
3,0.096390,-0.152208,balance_amt_day_avg_x
4,0.435250,43.000000,channel_code_x
...,...,...,...
265,0.464091,0.407687,cnt_days_deb_h_oper_3m
266,0.131432,-0.196819,sum_cred_h_oper_3m
267,0.548466,0.250924,cnt_cred_h_oper_3m
268,0.548466,0.374540,cnt_days_cred_h_oper_3m


In [89]:
cols_features = entropy_df.sort_values(by='entropy_change', ascending=False).iloc[:60]['feature'].tolist()
cols_features

['city_type',
 'city_type_y',
 'city_type_x',
 'cnt_b_oper_3m_x',
 'cnt_b_oper_1m_x',
 'cnt_cred_f_oper_3m_x',
 'cnt_days_cred_f_oper_3m_x',
 'cnt_days_cred_f_oper_1m_x',
 'cnt_cred_f_oper_1m_x',
 'cnt_cred_d_oper_3m_x',
 'cnt_b_oper_3m_y',
 'cnt_cred_d_oper_1m_x',
 'cnt_b_oper_1m_y',
 'cnt_cred_f_oper_3m_y',
 'cnt_days_cred_f_oper_3m_y',
 'cnt_b_oper_3m',
 'cnt_days_cred_f_oper_1m_y',
 'cnt_cred_f_oper_1m_y',
 'cnt_cred_d_oper_3m_y',
 'cnt_cred_d_oper_1m_y',
 'cnt_b_oper_1m',
 'cnt_days_cred_f_oper_3m',
 'cnt_cred_f_oper_3m',
 'cnt_days_cred_f_oper_1m',
 'cnt_cred_f_oper_1m',
 'cnt_a_oper_1m_x',
 'cnt_cred_d_oper_3m',
 'cnt_a_oper_3m_x',
 'cnt_cred_d_oper_1m',
 'cnt_a_oper_1m_y',
 'cnt_a_oper_3m_y',
 'cnt_a_oper_3m',
 'cnt_a_oper_1m',
 'segment',
 'cnt_c_oper_1m_x',
 'cnt_c_oper_3m_x',
 'cnt_days_cred_g_oper_1m_x',
 'cnt_cred_g_oper_1m_x',
 'segment_y',
 'segment_x',
 'cnt_days_cred_g_oper_1m_y',
 'cnt_cred_g_oper_1m_y',
 'cnt_c_oper_1m_y',
 'cnt_c_oper_3m_y',
 'start_cluster_x',
 'cn

In [90]:
train_df = train_df[cols_features + ['end_cluster_x']]
test_df = test_df[cols_features]

In [91]:
X = train_df.drop(['end_cluster_x'],axis= 1)
Z = test_df[cols_features]
y = train_df["end_cluster_x"]
n_models = 15
k = X.shape[1] // n_models
d_model = {}
for i in range(n_models):
    X_train = X.iloc[:,i*k:(i+1)*k]
    X_test = Z.iloc[:,i*k:(i+1)*k]
    Y = y
    cat_f = X_train.select_dtypes(exclude=['number']).columns.to_list()
    d_model[i] = (X_train, Y,X_test,cat_f)


In [92]:
for n_m in range(n_models):
    x_train, y_train,x_test,cat_f = d_model[n_m]
    cb = CatBoostClassifier(random_state= 42,
                                 cat_features=cat_f,
                                 iterations=50, 
                                 thread_count=8,
                                 learning_rate=0.3,
                                 depth=2,
                                 custom_metric=['AUC:hints=skip_train~false'])
    xfit, x_val, yfit, y_val = train_test_split(x_train, y_train,
                                                  test_size=0.2,
                                                  random_state=42)
    cb.fit(xfit,yfit)
    

    yp_proba = cb.predict_proba(x_train)
    y_pred =  yp_proba / yp_proba.sum(axis=1, keepdims=True)
    max_column_indices = np.argmax(y_pred, axis=1)
    max_column_names = [cb.classes_[idx] for idx in max_column_indices]
    
    train_pred_proba_df = pd.DataFrame(max_column_names, columns=['Answer_'])
    train_df = train_df.join(train_pred_proba_df,how='left',rsuffix=f'{n_m}_mod')
    
    yp_proba = cb.predict_proba(x_test)
    y_pred =  yp_proba / yp_proba.sum(axis=1, keepdims=True)
    max_column_indices = np.argmax(y_pred, axis=1)
    max_column_names = [cb.classes_[idx] for idx in max_column_indices]
    
    test_pred_proba_df = pd.DataFrame(max_column_names, columns=['Answer_'])
    test_df = test_df.join(test_pred_proba_df,how='left',rsuffix=f'{n_m}_mod')


0:	learn: 1.6367321	total: 1.18s	remaining: 57.7s
1:	learn: 1.5176970	total: 1.96s	remaining: 47s
2:	learn: 1.4547825	total: 2.77s	remaining: 43.4s
3:	learn: 1.4172414	total: 3.55s	remaining: 40.8s
4:	learn: 1.3933184	total: 4.49s	remaining: 40.4s
5:	learn: 1.3782799	total: 5.4s	remaining: 39.6s
6:	learn: 1.3687695	total: 6.22s	remaining: 38.2s
7:	learn: 1.3612642	total: 7.13s	remaining: 37.4s
8:	learn: 1.3566261	total: 7.94s	remaining: 36.2s
9:	learn: 1.3518204	total: 8.9s	remaining: 35.6s
10:	learn: 1.3493288	total: 9.78s	remaining: 34.7s
11:	learn: 1.3477907	total: 10.9s	remaining: 34.4s
12:	learn: 1.3467838	total: 11.9s	remaining: 33.8s
13:	learn: 1.3456712	total: 12.9s	remaining: 33.1s
14:	learn: 1.3452322	total: 14s	remaining: 32.6s
15:	learn: 1.3447886	total: 15s	remaining: 31.8s
16:	learn: 1.3445193	total: 16.2s	remaining: 31.5s
17:	learn: 1.3444114	total: 17.1s	remaining: 30.3s
18:	learn: 1.3442744	total: 18s	remaining: 29.3s
19:	learn: 1.3438086	total: 19s	remaining: 28.5s
20

In [93]:
X = train_df.drop(['end_cluster_x'],axis=1)
y = train_df["end_cluster_x"]
cat_cols = X.select_dtypes(exclude=['number']).columns.to_list()
cat_f = [X.columns.get_loc(column_name) for column_name in cat_cols if column_name in X.columns.tolist()]
x_train, x_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.3,
                                                  random_state=42)

In [111]:
MetaMind  = CatBoostClassifier(random_state= 42,
                                 cat_features = cat_f,
                                 iterations=125, 
                                 thread_count=8,
                                 learning_rate=0.15,
                                 l2_leaf_reg= 6.769230769230769,
                                 min_data_in_leaf=675,
                                 max_depth= 8,
                                 custom_metric=['AUC:hints=skip_train~false'])
MetaMind.fit(x_train,y_train, 
              eval_set=(x_val, y_val),
              plot=True,
              verbose=25
              )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.7797312	test: 1.6819629	best: 1.6819629 (0)	total: 20.2s	remaining: 41m 47s
25:	learn: 0.9370318	test: 0.8368584	best: 0.8368584 (25)	total: 5m 46s	remaining: 21m 59s
50:	learn: 0.9152392	test: 0.8196874	best: 0.8196874 (50)	total: 10m 53s	remaining: 15m 47s
75:	learn: 0.9071763	test: 0.8156447	best: 0.8156447 (75)	total: 16m 24s	remaining: 10m 34s
100:	learn: 0.8997975	test: 0.8132868	best: 0.8132868 (100)	total: 21m 55s	remaining: 5m 12s
124:	learn: 0.8956988	test: 0.8124989	best: 0.8124989 (124)	total: 27m 14s	remaining: 0us

bestTest = 0.8124988868
bestIteration = 124


<catboost.core.CatBoostClassifier at 0x1e1a344c740>

In [112]:
cluster_weights = pd.read_excel("cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()

In [113]:
yp_proba = MetaMind.predict_proba(x_val)
y_pred_normalized = yp_proba / yp_proba.sum(axis=1, keepdims=True)
weighted_roc_auc(y_val, y_pred_normalized, MetaMind.classes_, weights_dict)

0.8988554714941756

In [97]:
sample_submission_df = pd.read_csv("sample_submission.csv")

In [99]:
test_pred_proba = MetaMind.predict_proba(test_df)
test_pred_proba = test_pred_proba / test_pred_proba.sum(axis=1, keepdims=True)
test_pred_proba_df = pd.DataFrame(test_pred_proba, columns=MetaMind.classes_)
sorted_classes = sorted(test_pred_proba_df.columns.to_list())
test_pred_proba_df = test_pred_proba_df[sorted_classes]

In [100]:
test_pred_proba_df.shape

(100000, 17)

In [101]:
test_pred_proba_df.head(100)

Unnamed: 0,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,0.008555,0.057142,0.008965,0.015077,0.006017,0.000577,0.001355,0.000398,0.005865,0.006609,0.008883,0.001053,0.002327,0.000292,0.001802,0.874715,0.000370
1,0.038277,0.453397,0.012124,0.016726,0.007589,0.003383,0.003876,0.002349,0.009495,0.022345,0.011489,0.003356,0.007578,0.001988,0.006715,0.396826,0.002486
2,0.690063,0.009170,0.008866,0.072778,0.008021,0.010711,0.007523,0.005955,0.016627,0.044562,0.015932,0.006166,0.007366,0.002605,0.023226,0.067663,0.002766
3,0.038277,0.453397,0.012124,0.016726,0.007589,0.003383,0.003876,0.002349,0.009495,0.022345,0.011489,0.003356,0.007578,0.001988,0.006715,0.396826,0.002486
4,0.036779,0.116260,0.015664,0.037397,0.009710,0.002159,0.002380,0.001136,0.009622,0.034866,0.008337,0.002224,0.008146,0.000957,0.004110,0.709159,0.001093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.289381,0.149092,0.004138,0.005483,0.003660,0.010941,0.004597,0.002791,0.010324,0.136318,0.010108,0.002523,0.004431,0.002314,0.003505,0.358048,0.002347
96,0.038277,0.453397,0.012124,0.016726,0.007589,0.003383,0.003876,0.002349,0.009495,0.022345,0.011489,0.003356,0.007578,0.001988,0.006715,0.396826,0.002486
97,0.038277,0.453397,0.012124,0.016726,0.007589,0.003383,0.003876,0.002349,0.009495,0.022345,0.011489,0.003356,0.007578,0.001988,0.006715,0.396826,0.002486
98,0.014112,0.072318,0.013725,0.019936,0.006934,0.000461,0.000604,0.000271,0.006080,0.009584,0.004578,0.000952,0.002903,0.000182,0.001485,0.845620,0.000255


In [106]:
def recalibrate_probabilities(df, weights):
    new_df = df.copy()
    for col in df.columns:
        if col in weights:
            new_df[col] = df[col] * weights[col]
  
    row_sums = new_df.sum(axis=1)
    new_df = new_df.div(row_sums, axis=0)
    return new_df
weights = {k: v for k, v in sorted(weights_dict.items(), key=lambda item: item[0])}
sample_submission_df[sorted_classes] = recalibrate_probabilities(test_pred_proba_df, weights)
sample_submission_df.to_csv("f1_submission.csv", index=False)

In [107]:
feat = MetaMind.get_feature_importance(prettified=True)


In [108]:
feat

Unnamed: 0,Feature Id,Importances
0,Answer_11_mod,32.840289
1,start_cluster_x,13.775087
2,Answer_9_mod,10.382218
3,Answer_12_mod,5.287236
4,start_cluster,4.632949
...,...,...
70,cnt_c_oper_3m_y,0.000000
71,cnt_c_oper_1m,0.000000
72,cnt_cred_g_oper_1m,0.000000
73,cnt_days_cred_g_oper_1m,0.000000
