# Baseline

In [39]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoost, CatBoostClassifier

## Загрузка данных

In [40]:
train_df = pd.read_parquet("../data/train_data.pqt")
test_df = pd.read_parquet("../datatest_data.pqt")

In [43]:
for col in train_df.columns:
    if train_df[col].dtype in ['int64', 'float64']:
        mean_val = train_df[col].mean()
        train_df[col] = train_df[col].fillna(mean_val)
    else:
        mod_val = train_df[train_df[col].notna()][col].mode()
        train_df[col] = train_df[col].fillna(mod_val[0])
        train_df[col] = train_df[col].astype("category")
for col in test_df.columns:
    if test_df[col].dtype in ['int64', 'float64']:
        mean_val = test_df[col].mean()
        test_df[col] = test_df[col].fillna(mean_val)
    else:
        if col != 'start_cluster':
            mod_val = test_df[test_df[col].notna()][col].mode()
            test_df[col] = test_df[col].fillna(mod_val[0])
            test_df[col] = test_df[col].astype("category")

In [45]:
m1_df = train_df[train_df['date'] == 'month_1']
m2_df = train_df[train_df['date'] == 'month_2']
m3_df = train_df[train_df['date'] == 'month_3']
df = m3_df.merge(m2_df,on='id',how='left')
df = df.merge(m1_df,on='id',how='left')
train_df = df
train_df = train_df.drop(['date','date_x','date_y'],axis=1)

In [46]:
m3_df = pd.read_csv("test_6m.csv")

In [47]:
m1_df = test_df[test_df['date'] == 'month_4']
m2_df = test_df[test_df['date'] == 'month_5']
df = m3_df.merge(m2_df,on='id',how='left')
df = df.merge(m1_df,on='id',how='left')
test_df = df 
test_df = test_df.drop(['date','date_x','date_y'],axis=1)

In [49]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    return sum(weights * classes_roc_auc)

In [50]:
train_df['b_amt_avg_trend'] = train_df['balance_amt_day_avg_x'] - train_df[['balance_amt_day_avg','balance_amt_day_avg_y','balance_amt_day_avg_x']].mean(axis=1)

train_df['b_amt_min_trend'] = train_df['balance_amt_min_x'] - train_df[['balance_amt_min_y','balance_amt_min','balance_amt_min_x']].mean(axis=1)

train_df['mx_founder_trend'] = train_df['max_founderpres_x'] - train_df[['max_founderpres_x','max_founderpres','max_founderpres_y']].mean(axis=1)

train_df['sm_cred_e_op_trend_3m'] = train_df['sum_cred_e_oper_3m_x'] - train_df[['sum_cred_e_oper_3m_y','sum_cred_e_oper_3m_x','sum_cred_e_oper_3m']].mean(axis=1)

train_df['sm_deb_e_op_trend_1m'] = train_df['sum_deb_e_oper_1m_x'] - train_df[['sum_deb_e_oper_1m_y','sum_deb_e_oper_1m','sum_deb_e_oper_1m_x']].mean(axis=1)

In [51]:
train_df['min_founderpres_trend_long'] =train_df['min_founderpres_x'] - train_df['min_founderpres_y']

train_df['sum_deb_e_oper_3m_trend_long'] =train_df['sum_deb_e_oper_3m_x'] - train_df['sum_deb_e_oper_3m_y']

train_df['sum_cred_e_oper_1m_trend_short'] = train_df['sum_cred_e_oper_1m_x'] - train_df['sum_cred_e_oper_1m_y']

In [52]:
def calcEntropy(x):
    #Function to calculate entropy
    #-sum(p_i log_2(p_i)) where pi is each possible value in dataset 

    stat = pd.Series(x)
    stat = stat.value_counts().reset_index()
    stat = stat.assign(perc_cnt = stat['count']/ sum(stat['count']))

    stat = stat.assign(entropy = -stat['perc_cnt'] * np.log2(stat['perc_cnt']))

    stat['total_entropy'] =sum(stat['entropy'])
    return sum(stat['entropy'])


def calcIG(x, y):
    #1.Calc original entropy
    entropy_origin = calcEntropy(y)

    #2. Calculate entropy after deviding features
    x = pd.Series(x)
    x_df = x.value_counts().reset_index()
    x_df['percCount'] = x_df['count'] / sum(x_df['count'])

    #2.1 Check is binarycolumn
    if x_df.shape[0]<=2:
        x_df['entropy'] = x.apply(lambda bin: calcEntropy(y[x[x==bin].index])[1])
    else:

        df_ =pd.DataFrame(data=np.array([x,y]).transpose(), columns=['x', 'y']).sort_values(by='x')
        df_['moving_average'] = df_['x'].rolling(window=2).mean()
        min_entropy, min_aver,min_left,min_right = 1000, 0, 0, 0
        print(len(df_['moving_average'][1:].unique()))
        aveg_move = df_['moving_average'][1:].unique()
        if len(aveg_move) > 100:
            _, aveg_move = pd.qcut(df_['moving_average'][1:].unique(), 15,retbins='bins')
        for aver in aveg_move:
            left_node = df_.query('x < @aver')['y']
            right_node = df_.query('x > @aver')['y']
            left_node_entropy = calcEntropy(left_node)
            right_node_entropy = calcEntropy(right_node)

            total_entropy = left_node_entropy * (left_node.shape[0] / df_.shape[0])  + right_node_entropy * (right_node.shape[0] / df_.shape[0])
            if total_entropy < min_entropy:
                min_entropy = total_entropy
                min_left, min_right = left_node_entropy, right_node_entropy
                min_aver = aver

            #print(f"С treshold {aver} total weighted entropy=  {total_entropy}")
        left_df = df_[df_['x'] < min_aver]
        right_df =df_[df_['x'] >= min_aver]
        return pd.DataFrame({"diff":[f"<{min_aver}", f">={min_aver}"],
                            "count":[left_df.shape[0], right_df.shape[0]] ,
                             "percCount":[left_df.shape[0] / df_.shape[0], right_df.shape[0]/ df_.shape[0]] ,
                             "entropy":[min_left, min_right],
                             "weighted_entropy":[min_left*(left_df.shape[0] / df_.shape[0]), min_right *( right_df.shape[0]/ df_.shape[0])],
                            "name" : [x.name,x.name]
                            
                            }),entropy_origin - min_entropy,min_aver

    # 3. Calculate weighted average
    x_df['weighted_entropy'] = x_df['entropy'] * x_df['percCount']

    #4. Calculate Information gain
    x_df['IG'] = entropy_origin - sum(x_df['weighted_entropy'] / x_df.shape[0])

    return x_df, x_df['IG'][0], 1

In [53]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df['end_cluster_x'])
mypdf = pd.DataFrame()
trend = []
mina = []
info = []
for col in train_df.columns:
    if col in ['id','date_x','date_y','date',"end_cluster_x","end_cluster_y","end_cluster"]:continue
    x = train_df[col]
    print(col,end=' ')
    if x.dtype in ('category','object'):
        x = label_encoder.fit_transform(x)
    stat,trend_entr,min_aver = calcIG(x,y)
    trend.append(trend_entr)
    mina.append(min_aver)
    mypdf = pd.concat([mypdf, stat])

balance_amt_avg_x 166712
balance_amt_max_x 155324
balance_amt_min_x 121862
balance_amt_day_avg_x 165846
channel_code_x 85
city_x 12829
city_type_x 10909
index_city_code_x 455
ogrn_days_end_month_x 63
ogrn_days_end_quarter_x 185
ogrn_month_x 23
ogrn_year_x 41
ft_registration_date_x 9998
max_founderpres_x 9497
min_founderpres_x 9284
ogrn_exist_months_x 475
okved_x 173
segment_x 7
sum_of_paym_2m_x 118985
sum_of_paym_6m_x 136719
sum_of_paym_1y_x 147160
sum_a_oper_1m_x 6095
cnt_a_oper_1m_x 138
sum_b_oper_1m_x 1619
cnt_b_oper_1m_x 60
sum_c_oper_1m_x 29450
cnt_c_oper_1m_x 647
sum_deb_d_oper_1m_x 33720
cnt_deb_d_oper_1m_x 192
sum_cred_d_oper_1m_x 7506
cnt_cred_d_oper_1m_x 195
sum_deb_e_oper_1m_x 121350
cnt_deb_e_oper_1m_x 1640
cnt_days_deb_e_oper_1m_x 65
sum_cred_e_oper_1m_x 100953
cnt_cred_e_oper_1m_x 1251
cnt_days_cred_e_oper_1m_x 65
sum_deb_f_oper_1m_x 48848
cnt_deb_f_oper_1m_x 412
cnt_days_deb_f_oper_1m_x 65
sum_cred_f_oper_1m_x 4076
cnt_cred_f_oper_1m_x 195
cnt_days_cred_f_oper_1m_x 63
su

In [56]:
col_list = train_df.columns.tolist()
for key in ['id',"end_cluster_x","end_cluster_y","end_cluster"]:
    col_list.pop(col_list.index(key))
col_list    

['balance_amt_avg_x',
 'balance_amt_max_x',
 'balance_amt_min_x',
 'balance_amt_day_avg_x',
 'channel_code_x',
 'city_x',
 'city_type_x',
 'index_city_code_x',
 'ogrn_days_end_month_x',
 'ogrn_days_end_quarter_x',
 'ogrn_month_x',
 'ogrn_year_x',
 'ft_registration_date_x',
 'max_founderpres_x',
 'min_founderpres_x',
 'ogrn_exist_months_x',
 'okved_x',
 'segment_x',
 'sum_of_paym_2m_x',
 'sum_of_paym_6m_x',
 'sum_of_paym_1y_x',
 'sum_a_oper_1m_x',
 'cnt_a_oper_1m_x',
 'sum_b_oper_1m_x',
 'cnt_b_oper_1m_x',
 'sum_c_oper_1m_x',
 'cnt_c_oper_1m_x',
 'sum_deb_d_oper_1m_x',
 'cnt_deb_d_oper_1m_x',
 'sum_cred_d_oper_1m_x',
 'cnt_cred_d_oper_1m_x',
 'sum_deb_e_oper_1m_x',
 'cnt_deb_e_oper_1m_x',
 'cnt_days_deb_e_oper_1m_x',
 'sum_cred_e_oper_1m_x',
 'cnt_cred_e_oper_1m_x',
 'cnt_days_cred_e_oper_1m_x',
 'sum_deb_f_oper_1m_x',
 'cnt_deb_f_oper_1m_x',
 'cnt_days_deb_f_oper_1m_x',
 'sum_cred_f_oper_1m_x',
 'cnt_cred_f_oper_1m_x',
 'cnt_days_cred_f_oper_1m_x',
 'sum_deb_g_oper_1m_x',
 'cnt_deb_g_o

In [57]:
entropy_df = pd.DataFrame(columns = ['entropy_change','min_aver','feature'])
entropy_df['entropy_change'] = trend
entropy_df['min_aver'] = mina 
entropy_df['feature'] = col_list
entropy_df

Unnamed: 0,entropy_change,min_aver,feature
0,0.096960,-0.152721,balance_amt_avg_x
1,0.124642,-0.202255,balance_amt_max_x
2,0.388460,-0.125995,balance_amt_min_x
3,0.096390,-0.152208,balance_amt_day_avg_x
4,0.435250,43.000000,channel_code_x
...,...,...,...
273,0.073531,0.000054,sm_cred_e_op_trend_3m
274,0.083247,0.000113,sm_deb_e_op_trend_1m
275,0.001248,-0.987255,min_founderpres_trend_long
276,0.092806,0.000057,sum_deb_e_oper_3m_trend_long


In [58]:
features_df = mypdf.merge(entropy_df,left_on='name',right_on='feature',how='right').drop(['name'],axis=1)
features_df

Unnamed: 0,diff,count,percCount,entropy,weighted_entropy,entropy_change,min_aver,feature
0,<-0.15272112436026009,87212.0,0.436060,1.752514,0.764201,0.096960,-0.152721,balance_amt_avg_x
1,>=-0.15272112436026009,112788.0,0.563940,2.319149,1.307861,0.096960,-0.152721,balance_amt_avg_x
2,<-0.20225474053088924,72436.0,0.362180,1.612460,0.584001,0.124642,-0.202255,balance_amt_max_x
3,>=-0.20225474053088924,127564.0,0.637820,2.289643,1.460380,0.124642,-0.202255,balance_amt_max_x
4,<-0.1259950191492185,0.0,0.000000,0.000000,0.000000,0.388460,-0.125995,balance_amt_min_x
...,...,...,...,...,...,...,...,...
524,>=-0.9872548992821699,197507.0,0.987535,2.171167,2.144103,0.001248,-0.987255,min_founderpres_trend_long
525,<5.6632264807271376e-05,128303.0,0.641515,1.977609,1.268666,0.092806,0.000057,sum_deb_e_oper_3m_trend_long
526,>=5.6632264807271376e-05,71697.0,0.358485,2.252679,0.807552,0.092806,0.000057,sum_deb_e_oper_3m_trend_long
527,<0.0009396285107861022,127072.0,0.635360,1.906735,1.211463,0.122843,0.000940,sum_cred_e_oper_1m_trend_short


In [109]:
cols_features = entropy_df.sort_values(by='entropy_change', ascending=False).iloc[:90]['feature'].tolist()
cols_features

['city_type',
 'city_type_y',
 'city_type_x',
 'cnt_b_oper_3m_x',
 'cnt_b_oper_1m_x',
 'cnt_days_cred_f_oper_3m_x',
 'cnt_cred_f_oper_3m_x',
 'cnt_days_cred_f_oper_1m_x',
 'cnt_cred_f_oper_1m_x',
 'cnt_cred_d_oper_3m_x',
 'cnt_b_oper_3m_y',
 'cnt_cred_d_oper_1m_x',
 'cnt_b_oper_1m_y',
 'cnt_cred_f_oper_3m_y',
 'cnt_days_cred_f_oper_3m_y',
 'cnt_b_oper_3m',
 'cnt_days_cred_f_oper_1m_y',
 'cnt_cred_f_oper_1m_y',
 'cnt_cred_d_oper_3m_y',
 'cnt_cred_d_oper_1m_y',
 'cnt_b_oper_1m',
 'cnt_days_cred_f_oper_3m',
 'cnt_cred_f_oper_3m',
 'cnt_days_cred_f_oper_1m',
 'cnt_cred_f_oper_1m',
 'cnt_a_oper_1m_x',
 'cnt_cred_d_oper_3m',
 'cnt_a_oper_3m_x',
 'cnt_cred_d_oper_1m',
 'cnt_a_oper_1m_y',
 'cnt_a_oper_3m_y',
 'cnt_a_oper_3m',
 'cnt_a_oper_1m',
 'segment',
 'cnt_c_oper_1m_x',
 'cnt_c_oper_3m_x',
 'cnt_cred_g_oper_1m_x',
 'cnt_days_cred_g_oper_1m_x',
 'segment_y',
 'segment_x',
 'cnt_cred_g_oper_1m_y',
 'cnt_days_cred_g_oper_1m_y',
 'cnt_c_oper_1m_y',
 'cnt_c_oper_3m_y',
 'start_cluster_x',
 'cn

In [110]:
entropy_df[entropy_df['feature'] == 'end_cluster_x']

Unnamed: 0,entropy_change,min_aver,feature


In [111]:
X = train_df[cols_features]
y = train_df["end_cluster_x"]
cat_cols = X.select_dtypes(exclude=['number']).columns.to_list()
cat_f = [X.columns.get_loc(column_name) for column_name in cat_cols if column_name in X.columns.tolist()]
x_train, x_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=42)


In [113]:
GIGA_model  = CatBoostClassifier(random_state= 42,
                                 cat_features = cat_f,
                                 thread_count=8,
                                 iterations=100, 
                                 learning_rate=0.2,
                                 max_depth=8,
                                 custom_metric=['AUC:hints=skip_train~false'])
GIGA_model.fit(x_train,y_train, 
              eval_set=(x_val, y_val),
              plot=True,
              verbose=25
              )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.5028023	test: 1.3846421	best: 1.3846421 (0)	total: 11.4s	remaining: 18m 44s
25:	learn: 0.8837031	test: 0.7927182	best: 0.7927182 (25)	total: 3m 5s	remaining: 8m 47s
50:	learn: 0.8625452	test: 0.7833818	best: 0.7833818 (50)	total: 5m 57s	remaining: 5m 43s
75:	learn: 0.8453891	test: 0.7793663	best: 0.7793663 (75)	total: 8m 49s	remaining: 2m 47s
99:	learn: 0.8333151	test: 0.7778664	best: 0.7778664 (99)	total: 11m 45s	remaining: 0us

bestTest = 0.7778663699
bestIteration = 99


<catboost.core.CatBoostClassifier at 0x2173ec5deb0>

In [130]:
feat = GIGA_model.get_feature_importance(prettified=True)
feat = feat.iloc[:80,:]
feat

Unnamed: 0,Feature Id,Importances
0,start_cluster_x,21.415879
1,cnt_b_oper_1m_x,13.048624
2,start_cluster,7.729423
3,start_cluster_y,7.665197
4,segment_x,4.154427
...,...,...
75,cnt_c_oper_1m,0.080428
76,cnt_b_oper_3m,0.072559
77,cnt_days_cred_g_oper_3m_x,0.050731
78,cnt_b_oper_3m_x,0.027969


In [131]:
feat.to_csv('../data/feat_l.csv',index=False)