In [16]:
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd
import os.path as path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [None]:
data_path = 'Alfa x FinU Hack - Материалы'

In [19]:
def fillna(df):
    num_features = df.drop('id', axis=1).select_dtypes(include=['float64', 'int64']).columns.to_list()
    
    df_fillna = df.copy()

    cat_features = df_fillna.select_dtypes(include=['object']).drop('start_cluster', axis=1).columns.to_list()

    df_fillna[cat_features] = df_fillna[cat_features].fillna('Unknown')
    df_fillna[num_features] = df_fillna[num_features].ffill()

    return df_fillna


In [20]:
df_train = fillna(pd.read_parquet(path.join(data_path, 'train_data.pqt')))

In [21]:
df_test = fillna(pd.read_parquet(path.join(data_path, 'test_data.pqt')))

In [22]:
df_month_concated = pd.merge(pd.merge(df_train[df_train['date'] == 'month_1'].drop(['end_cluster','date'], axis=1), \
                                            df_train[df_train['date'] == 'month_2'].drop(['end_cluster','date'], axis=1), how='left', on='id'),  \
                                                df_train[df_train['date'] == 'month_3'].drop(['end_cluster','date'], axis=1), how='outer', on='id')
df_month_concated = df_month_concated.sort_values(by='id').reset_index(drop=True)
df_month_concated = df_month_concated.fillna('nan')
df_month_concated = df_month_concated.drop('id', axis=1)

In [23]:
cat_features = ['channel_code_x',
 'city_x',
 'city_type_x',
 'index_city_code_x',
 'ogrn_month_x',
 'ogrn_year_x',
 'okved_x',
 'segment_x',
 'start_cluster_x',
 'channel_code_y',
 'city_y',
 'city_type_y',
 'index_city_code_y',
 'ogrn_month_y',
 'ogrn_year_y',
 'okved_y',
 'segment_y',
 'start_cluster_y',
 'channel_code',
 'city',
 'city_type',
 'index_city_code',
 'ogrn_month',
 'ogrn_year',
 'okved',
 'segment']


In [31]:
kf = StratifiedKFold(n_splits = 5, random_state=42, shuffle=True)

In [32]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    return sum(weights * classes_roc_auc)

In [33]:
cluster_weights = pd.read_excel(path.join(data_path, "cluster_weights.xlsx")).set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()

In [34]:
df_month_concated

Unnamed: 0,balance_amt_avg_x,balance_amt_max_x,balance_amt_min_x,balance_amt_day_avg_x,channel_code_x,city_x,city_type_x,index_city_code_x,ogrn_days_end_month_x,ogrn_days_end_quarter_x,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,-0.488553,-0.135063,...,0.033607,0.946458,0.442244,0.877050,0.551044,0.663243,0.810065,0.294829,0.956958,"{α, γ}"
1,-0.081586,-0.091860,-0.114040,-0.080890,channel_code_2,city_14,city_type_0,Unknown,0.324343,1.258747,...,-0.019422,0.948027,0.488221,0.043221,0.560788,0.707687,-0.167905,0.259011,0.605309,{other}
2,-0.154685,-0.186795,-0.122805,-0.154215,channel_code_12,city_613,city_type_306,Unknown,-0.256297,-1.257854,...,-0.028584,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.176302,0.252368,0.429485,{α}
3,-0.156643,-0.204861,-0.125660,-0.156179,channel_code_14,city_21,city_type_0,index_city_code_46,-1.185321,-0.367365,...,-0.028584,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.374540,{α}
4,-0.138847,-0.182486,-0.125630,-0.138328,channel_code_8,city_21,city_type_0,Unknown,-1.417577,-0.444799,...,0.053048,0.957051,0.637647,-0.078297,0.556919,0.641020,-0.172533,0.253523,0.462452,{α}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,-0.067129,-0.108082,-0.031401,-0.066388,channel_code_9,city_20,city_type_0,index_city_code_28,1.601751,-0.638383,...,-0.028584,0.944497,0.384773,-0.078382,0.549755,0.518798,-0.201123,0.250924,0.374540,{other}
199996,0.543173,0.382148,1.484547,0.545804,channel_code_14,city_3595,city_type_2778,index_city_code_48,-0.953065,-0.328648,...,-0.027923,0.944889,0.396267,-0.153030,0.552334,0.518798,-0.201123,0.250924,0.374540,"{α, γ}"
199997,-0.082159,-0.028263,-0.120011,-0.081465,channel_code_8,city_0,city_type_0,index_city_code_58,1.485623,-0.677100,...,-0.021165,0.944889,0.396267,0.680322,0.568239,0.841020,1.044968,0.263633,0.693221,{other}
199998,-0.156775,-0.204960,-0.125987,-0.156311,channel_code_9,city_14,city_type_0,Unknown,0.324343,-1.064270,...,-0.028584,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.374540,{α}


In [35]:
best_model_start_cluster = 'no_model'
max_score = float("-inf")

for i, (train_index, test_index) in enumerate(kf.split(df_month_concated, df_month_concated['start_cluster'])):
    print(f"Fold {i}:")

    train = df_month_concated.iloc[train_index]
    test = df_month_concated.iloc[test_index]

    X_train, X_test, y_train, y_test = train.drop('start_cluster', axis=1), test.drop('start_cluster', axis=1), \
                                        train['start_cluster'], test['start_cluster']
    
    model = CatBoostClassifier(cat_features=cat_features, task_type="GPU", devices='0', verbose=500)

    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)
    try:
        score = weighted_roc_auc(y_test, y_pred_proba, model.classes_, weights_dict)
        print(score)
        if score > max_score:
            max_score = score
            best_model_start_cluster = model
    except:
        print('Попались все объекты одного класса')
    

Fold 0:
Learning rate set to 0.182778
0:	learn: 0.9229812	total: 99.7ms	remaining: 1m 39s
500:	learn: 0.1801282	total: 23.5s	remaining: 23.4s
999:	learn: 0.1497698	total: 47.2s	remaining: 0us
0.9906235737855751
Fold 1:
Learning rate set to 0.182778
0:	learn: 0.9248346	total: 177ms	remaining: 2m 57s
500:	learn: 0.1807076	total: 22.8s	remaining: 22.7s
999:	learn: 0.1485612	total: 45.6s	remaining: 0us
0.9896511826454208
Fold 2:
Learning rate set to 0.182778
0:	learn: 0.9241884	total: 221ms	remaining: 3m 41s
500:	learn: 0.1831428	total: 22.8s	remaining: 22.7s
999:	learn: 0.1495598	total: 45.9s	remaining: 0us
0.9899906712506437
Fold 3:
Learning rate set to 0.182778
0:	learn: 0.9250683	total: 101ms	remaining: 1m 41s
500:	learn: 0.1838824	total: 22.8s	remaining: 22.7s
999:	learn: 0.1519314	total: 46.1s	remaining: 0us
0.9902012845275691
Fold 4:
Learning rate set to 0.182778
0:	learn: 0.9261810	total: 309ms	remaining: 5m 9s
500:	learn: 0.1835195	total: 23.1s	remaining: 23s
999:	learn: 0.1512208

In [36]:
df_month_concated2 = pd.merge(pd.merge(df_train[df_train['date'] == 'month_1'].drop(['end_cluster','date'], axis=1), \
                                            df_train[df_train['date'] == 'month_2'].drop(['end_cluster','date'], axis=1), how='left', on='id'),  \
                                                df_train[df_train['date'] == 'month_3'].drop(['date'], axis=1), how='outer', on='id')
df_month_concated2 = df_month_concated2.sort_values(by='id').reset_index(drop=True)
df_month_concated2 = df_month_concated2.fillna('nan')
df_month_concated2 = df_month_concated2.drop('id', axis=1)

In [37]:
df_month_concated2

Unnamed: 0,balance_amt_avg_x,balance_amt_max_x,balance_amt_min_x,balance_amt_day_avg_x,channel_code_x,city_x,city_type_x,index_city_code_x,ogrn_days_end_month_x,ogrn_days_end_quarter_x,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,-0.488553,-0.135063,...,0.946458,0.442244,0.877050,0.551044,0.663243,0.810065,0.294829,0.956958,"{α, γ}",{other}
1,-0.081586,-0.091860,-0.114040,-0.080890,channel_code_2,city_14,city_type_0,Unknown,0.324343,1.258747,...,0.948027,0.488221,0.043221,0.560788,0.707687,-0.167905,0.259011,0.605309,{other},{other}
2,-0.154685,-0.186795,-0.122805,-0.154215,channel_code_12,city_613,city_type_306,Unknown,-0.256297,-1.257854,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.176302,0.252368,0.429485,{α},{α}
3,-0.156643,-0.204861,-0.125660,-0.156179,channel_code_14,city_21,city_type_0,index_city_code_46,-1.185321,-0.367365,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.374540,{α},"{α, γ}"
4,-0.138847,-0.182486,-0.125630,-0.138328,channel_code_8,city_21,city_type_0,Unknown,-1.417577,-0.444799,...,0.957051,0.637647,-0.078297,0.556919,0.641020,-0.172533,0.253523,0.462452,{α},{α}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,-0.067129,-0.108082,-0.031401,-0.066388,channel_code_9,city_20,city_type_0,index_city_code_28,1.601751,-0.638383,...,0.944497,0.384773,-0.078382,0.549755,0.518798,-0.201123,0.250924,0.374540,{other},{other}
199996,0.543173,0.382148,1.484547,0.545804,channel_code_14,city_3595,city_type_2778,index_city_code_48,-0.953065,-0.328648,...,0.944889,0.396267,-0.153030,0.552334,0.518798,-0.201123,0.250924,0.374540,"{α, γ}","{α, γ}"
199997,-0.082159,-0.028263,-0.120011,-0.081465,channel_code_8,city_0,city_type_0,index_city_code_58,1.485623,-0.677100,...,0.944889,0.396267,0.680322,0.568239,0.841020,1.044968,0.263633,0.693221,{other},{other}
199998,-0.156775,-0.204960,-0.125987,-0.156311,channel_code_9,city_14,city_type_0,Unknown,0.324343,-1.064270,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.374540,{α},{}


In [38]:
cat_features2 = ['channel_code_x',
 'city_x',
 'city_type_x',
 'index_city_code_x',
 'ogrn_month_x',
 'ogrn_year_x',
 'okved_x',
 'segment_x',
 'start_cluster_x',
 'channel_code_y',
 'city_y',
 'city_type_y',
 'index_city_code_y',
 'ogrn_month_y',
 'ogrn_year_y',
 'okved_y',
 'segment_y',
 'start_cluster_y',
 'channel_code',
 'city',
 'city_type',
 'index_city_code',
 'ogrn_month',
 'ogrn_year',
 'okved',
 'segment',
 'start_cluster']

In [39]:
best_model_end_cluster = 'no_model'
max_score2 = float("-inf")

for i, (train_index, test_index) in enumerate(kf.split(df_month_concated2, df_month_concated2['end_cluster'])):
    print(f"Fold {i}:")

    train = df_month_concated2.iloc[train_index]
    test = df_month_concated2.iloc[test_index]

    train.loc[:, cat_features2] = train[cat_features2].astype("category")
    test.loc[:, cat_features2] = test[cat_features2].astype("category")

    X_train, X_test, y_train, y_test = train.drop('end_cluster', axis=1), test.drop('end_cluster', axis=1), \
                                        train['end_cluster'], test['end_cluster']
    
    model = CatBoostClassifier(cat_features=cat_features2, task_type="GPU", devices='0', verbose=500)

    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)

    try:
        score = weighted_roc_auc(y_test, y_pred_proba, model.classes_, weights_dict)

        print(score)

        if score > max_score2:
            max_score2 = score
            best_model_end_cluster = model
    except:
        print("Попались все объекты одного класса")

    

Fold 0:
Learning rate set to 0.182778
0:	learn: 1.5635558	total: 51.2ms	remaining: 51.1s
500:	learn: 0.7058932	total: 22.2s	remaining: 22.1s
999:	learn: 0.6239935	total: 45.1s	remaining: 0us
0.9051606964459138
Fold 1:
Learning rate set to 0.182778
0:	learn: 1.5630030	total: 110ms	remaining: 1m 49s
500:	learn: 0.7115282	total: 23.1s	remaining: 23s
999:	learn: 0.6291155	total: 48.3s	remaining: 0us
0.9032012841745947
Fold 2:
Learning rate set to 0.182778
0:	learn: 1.5652692	total: 190ms	remaining: 3m 10s
500:	learn: 0.7087315	total: 26.3s	remaining: 26.2s
999:	learn: 0.6273945	total: 51.3s	remaining: 0us
0.9051391637410016
Fold 3:
Learning rate set to 0.182778
0:	learn: 1.5641902	total: 112ms	remaining: 1m 51s
500:	learn: 0.7064430	total: 24.5s	remaining: 24.4s
999:	learn: 0.6259970	total: 48.1s	remaining: 0us
0.8942609938557301
Fold 4:
Learning rate set to 0.182778
0:	learn: 1.5656004	total: 113ms	remaining: 1m 53s
500:	learn: 0.7064091	total: 23.3s	remaining: 23.3s
999:	learn: 0.6277509

In [40]:
# important_features = pd.DataFrame({'feature':best_model_end_cluster.feature_names_, 'coef':best_model_end_cluster.feature_importances_}).sort_values(by='coef', ascending=False).reset_index(drop='index').head(20)
# important_features

In [41]:
df_test_month_concated2 = pd.merge(pd.merge(df_test[df_test['date'] == 'month_4'].drop(['date'], axis=1), \
                                            df_test[df_test['date'] == 'month_5'].drop(['date'], axis=1), how='left', on='id'),  \
                                                df_test[df_test['date'] == 'month_6'].drop(['date'], axis=1), how='outer', on='id')
df_test_month_concated2 = df_test_month_concated2.sort_values(by='id').reset_index(drop=True)
df_test_month_concated2 = df_test_month_concated2.fillna('nan')
df_test_month_concated2 = df_test_month_concated2.drop('id', axis=1)

In [42]:
df_test_predicted_start_cluster = df_test_month_concated2.copy()

In [43]:
for index, row in df_test_predicted_start_cluster.iterrows():
    df_test_predicted_start_cluster.at[index, 'start_cluster'] = best_model_start_cluster.predict(row)


In [44]:
df_test_predicted_end_cluster = df_test_predicted_start_cluster.copy()

In [45]:
df_test_predicted_end_cluster.loc[:, 'start_cluster'] = df_test_predicted_end_cluster['start_cluster'].astype("string")

In [46]:
submission = pd.DataFrame(columns=["id","{other}","{}","{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}","{α}","{λ}"])

In [47]:
for index, row in df_test_predicted_end_cluster.iterrows():
    prediction = list([index + 200000])
    prediction.extend(best_model_end_cluster.predict_proba(row))
    submission.loc[index] = prediction


In [None]:
submission['id'] = submission['id'].astype(int)

In [None]:
submission

Unnamed: 0,id,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,200000,0.010532,0.035543,0.009840,0.019539,0.004924,0.000272,0.002329,0.000225,0.008936,0.004597,0.014399,0.000473,0.001412,0.000011,0.001779,0.885182,0.000006
1,200001,0.008035,0.580232,0.000550,0.002964,0.000600,0.000164,0.000384,0.000045,0.001750,0.006563,0.001780,0.000142,0.000476,0.000003,0.000637,0.395657,0.000019
2,200002,0.617629,0.011135,0.006262,0.089237,0.019737,0.004461,0.005087,0.023595,0.036153,0.011162,0.014641,0.007086,0.003673,0.000035,0.043672,0.106408,0.000026
3,200003,0.029502,0.602549,0.000196,0.001209,0.000844,0.000423,0.000320,0.000125,0.002380,0.013916,0.003398,0.000049,0.000555,0.000004,0.000482,0.344041,0.000009
4,200004,0.306393,0.030241,0.015032,0.003834,0.004578,0.023270,0.007445,0.001961,0.035250,0.060496,0.006077,0.005342,0.284762,0.000725,0.010519,0.199516,0.004559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,299995,0.013175,0.327491,0.002733,0.005835,0.002399,0.000054,0.000399,0.000014,0.000965,0.001796,0.002019,0.000817,0.000237,0.000032,0.001925,0.639059,0.001050
99996,299996,0.014908,0.046652,0.011939,0.031074,0.006556,0.000210,0.001106,0.000126,0.007744,0.004092,0.008181,0.002276,0.001548,0.000044,0.002022,0.860358,0.001165
99997,299997,0.025090,0.031913,0.027902,0.057031,0.014573,0.000110,0.001360,0.000431,0.009618,0.003766,0.007021,0.001088,0.002509,0.000077,0.012115,0.805381,0.000015
99998,299998,0.151611,0.015235,0.001580,0.003931,0.001486,0.000958,0.003478,0.080628,0.010194,0.002184,0.003552,0.003840,0.001383,0.000303,0.609807,0.100751,0.009079


In [None]:
submission.to_csv(path.join(data_path, 'submission5.csv'), index=False)