In [26]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoost, CatBoostClassifier

## Загрузка данных

In [27]:
train_df = pd.read_parquet("../data/train_data.pqt")
test_df = pd.read_parquet("../data/test_data.pqt")

In [39]:
train_df

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_1,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,0,month_2,1.049605,0.831916,2.458609,1.053805,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.948812,0.499716,0.785029,0.551904,0.696576,0.990157,0.298873,0.945969,"{α, γ}",{other}
2,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.946458,0.442244,0.877050,0.551044,0.663243,0.810065,0.294829,0.956958,"{α, γ}",{other}
3,1,month_1,-0.081586,-0.091860,-0.114040,-0.080890,channel_code_2,city_14,city_type_0,,...,0.945281,0.407762,0.369318,0.567093,0.785465,-0.184002,0.253523,0.462452,{other},{other}
4,1,month_2,-0.094962,-0.100504,-0.119302,-0.094307,channel_code_2,city_14,city_type_0,,...,0.946066,0.430750,0.067275,0.559928,0.696576,-0.183854,0.255545,0.495419,{other},{other}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,199998,month_2,-0.156775,-0.204960,-0.125987,-0.156311,channel_code_9,city_14,city_type_0,,...,,,-0.165588,,,-0.201123,,,{},{}
599996,199998,month_3,-0.156776,-0.204960,-0.125995,-0.156312,channel_code_9,city_14,city_type_0,,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.374540,{α},{}
599997,199999,month_1,-0.156712,-0.204913,-0.125831,-0.156248,channel_code_14,city_1876,city_type_0,index_city_code_195,...,,,-0.165588,,,-0.201123,,,{},{}
599998,199999,month_2,-0.156712,-0.204913,-0.125831,-0.156248,channel_code_14,city_1876,city_type_0,index_city_code_195,...,,,-0.165588,,,-0.201123,,,{},{}


In [28]:
train_df[['date', 'start_cluster']]

Unnamed: 0,date,start_cluster
0,month_1,"{α, γ}"
1,month_2,"{α, γ}"
2,month_3,"{α, γ}"
3,month_1,{other}
4,month_2,{other}
...,...,...
599995,month_2,{}
599996,month_3,{α}
599997,month_1,{}
599998,month_2,{}


In [29]:
cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
    "channel_code_x", "city_x", "city_type_x",
    "okved_x", "segment_x",
    "index_city_code_x", "ogrn_month_x", "ogrn_year_x",
    "channel_code_y", "city_y", "city_type_y",
    "okved_y", "segment_y", "start_cluster_y",
    "index_city_code_y", "ogrn_month_y", "ogrn_year_y",
]

## Создание расширенного датасета для тестовой и обучающих выборок, датасеты хранят данные о пользователе за три последовательных месяца

In [30]:
m1_df = train_df[train_df['date'] == 'month_1']
m2_df = train_df[train_df['date'] == 'month_2']
m3_df = train_df[train_df['date'] == 'month_3']
df = m3_df.merge(m2_df,on='id',how='left')
df = df.merge(m1_df,on='id',how='left')
train_df = df
m1_df = test_df[test_df['date'] == 'month_4']
m2_df = test_df[test_df['date'] == 'month_5']
m3_df = test_df[test_df['date'] == 'month_6']
df = m3_df.merge(m2_df,on='id',how='left')
df = df.merge(m1_df,on='id',how='left')
test_df = df 

In [31]:
train_df

Unnamed: 0,id,date_x,balance_amt_avg_x,balance_amt_max_x,balance_amt_min_x,balance_amt_day_avg_x,channel_code_x,city_x,city_type_x,index_city_code_x,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,1,month_3,-0.090605,-0.114275,-0.114119,-0.089937,channel_code_2,city_14,city_type_0,,...,0.945281,0.407762,0.369318,0.567093,0.785465,-0.184002,0.253523,0.462452,{other},{other}
2,2,month_3,-0.148737,-0.187003,-0.112416,-0.148249,channel_code_12,city_613,city_type_306,,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.178674,0.252657,0.440474,{α},{α}
3,3,month_3,-0.156522,-0.204718,-0.125759,-0.156058,channel_code_14,city_21,city_type_0,index_city_code_46,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.374540,{α},{α}
4,4,month_3,-0.141798,-0.170262,-0.125672,-0.141289,channel_code_8,city_21,city_type_0,,...,0.957443,0.672129,-0.078233,0.558209,0.707687,-0.178408,0.252946,0.440474,{α},{α}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,199995,month_3,-0.039281,-0.053694,-0.044193,-0.038454,channel_code_9,city_20,city_type_0,index_city_code_28,...,0.944497,0.384773,-0.161095,0.547319,0.429909,-0.201123,0.250924,0.374540,"{α, η}",{other}
199996,199996,month_3,0.293117,0.189316,0.857952,0.294974,channel_code_14,city_3595,city_type_2778,index_city_code_48,...,0.944889,0.396267,-0.151699,0.553767,0.574354,-0.201123,0.250924,0.374540,"{α, γ}","{α, γ}"
199997,199997,month_3,0.032941,0.140726,-0.125362,0.033992,channel_code_8,city_0,city_type_0,index_city_code_58,...,0.944889,0.396267,1.185234,0.571535,0.918798,0.955885,0.264788,0.693221,{other},{other}
199998,199998,month_3,-0.156776,-0.204960,-0.125995,-0.156312,channel_code_9,city_14,city_type_0,,...,,,-0.165588,,,-0.201123,,,{},{}


### столбцы с постфиксом _x означают данные по клиенту за третий месяц, столбцы с постфиксом _y озночают данные по клиенту за второй месяц, столбцы без постфикса - данные за первый месяц

## Создание трендовых переменных, которые учитывают изменения признаков в разрезе нескольких месяцев:

In [32]:
train_df['b_amt_avg_trend'] = train_df['balance_amt_day_avg_x'] - train_df[['balance_amt_day_avg','balance_amt_day_avg_y','balance_amt_day_avg_x']].mean(axis=1)

train_df['b_amt_min_trend'] = train_df['balance_amt_min_x'] - train_df[['balance_amt_min_y','balance_amt_min','balance_amt_min_x']].mean(axis=1)

train_df['mx_founder_trend'] = train_df['max_founderpres_x'] - train_df[['max_founderpres_x','max_founderpres','max_founderpres_y']].mean(axis=1)

train_df['sm_cred_e_op_trend_3m'] = train_df['sum_cred_e_oper_3m_x'] - train_df[['sum_cred_e_oper_3m_y','sum_cred_e_oper_3m_x','sum_cred_e_oper_3m']].mean(axis=1)

train_df['sm_deb_e_op_trend_1m'] = train_df['sum_deb_e_oper_1m_x'] - train_df[['sum_deb_e_oper_1m_y','sum_deb_e_oper_1m','sum_deb_e_oper_1m_x']].mean(axis=1)

In [33]:
train_df['min_founderpres_trend_long'] =train_df['min_founderpres_x'] - train_df['min_founderpres_y']

train_df['sum_deb_e_oper_3m_trend_long'] =train_df['sum_deb_e_oper_3m_x'] - train_df['sum_deb_e_oper_3m_y']

train_df['sum_cred_e_oper_1m_trend_short'] = train_df['sum_cred_e_oper_1m_x'] - train_df['sum_cred_e_oper_1m_y']

In [34]:
test_df['b_amt_avg_trend'] = test_df['balance_amt_day_avg_x'] - test_df[['balance_amt_day_avg','balance_amt_day_avg_y','balance_amt_day_avg_x']].mean(axis=1)

test_df['b_amt_min_trend'] = test_df['balance_amt_min_x'] - test_df[['balance_amt_min_y','balance_amt_min','balance_amt_min_x']].mean(axis=1)

test_df['mx_founder_trend'] = test_df['max_founderpres_x'] - test_df[['max_founderpres_x','max_founderpres','max_founderpres_y']].mean(axis=1)

test_df['sm_cred_e_op_trend_3m'] = test_df['sum_cred_e_oper_3m_x'] - test_df[['sum_cred_e_oper_3m_y','sum_cred_e_oper_3m_x','sum_cred_e_oper_3m']].mean(axis=1)

test_df['sm_deb_e_op_trend_1m'] = test_df['sum_deb_e_oper_1m_x'] - test_df[['sum_deb_e_oper_1m_y','sum_deb_e_oper_1m','sum_deb_e_oper_1m_x']].mean(axis=1)

In [37]:
test_df['min_founderpres_trend_long'] =test_df['min_founderpres_x'] - test_df['min_founderpres_y']

test_df['sum_deb_e_oper_3m_trend_long'] =test_df['sum_deb_e_oper_3m_x'] - test_df['sum_deb_e_oper_3m_y']

test_df['sum_cred_e_oper_1m_trend_short'] = test_df['sum_cred_e_oper_1m_x'] - test_df['sum_cred_e_oper_1m_y']

In [38]:
full_df = pd.concat([train_df, test_df])

## Обработка пропущенных значений:

In [39]:
for col in full_df.columns:
    print(col, full_df[col].isna().sum())

id 0
date_x 0
balance_amt_avg_x 17638
balance_amt_max_x 17638
balance_amt_min_x 17638
balance_amt_day_avg_x 17638
channel_code_x 6765
city_x 39002
city_type_x 39241
index_city_code_x 156893
ogrn_days_end_month_x 1046
ogrn_days_end_quarter_x 1046
ogrn_month_x 1046
ogrn_year_x 1046
ft_registration_date_x 596
max_founderpres_x 162092
min_founderpres_x 162092
ogrn_exist_months_x 1046
okved_x 1251
segment_x 235
sum_of_paym_2m_x 64956
sum_of_paym_6m_x 64956
sum_of_paym_1y_x 64956
sum_a_oper_1m_x 0
cnt_a_oper_1m_x 86857
sum_b_oper_1m_x 0
cnt_b_oper_1m_x 86857
sum_c_oper_1m_x 0
cnt_c_oper_1m_x 86857
sum_deb_d_oper_1m_x 0
cnt_deb_d_oper_1m_x 86857
sum_cred_d_oper_1m_x 0
cnt_cred_d_oper_1m_x 86857
sum_deb_e_oper_1m_x 0
cnt_deb_e_oper_1m_x 86857
cnt_days_deb_e_oper_1m_x 86857
sum_cred_e_oper_1m_x 0
cnt_cred_e_oper_1m_x 86857
cnt_days_cred_e_oper_1m_x 86857
sum_deb_f_oper_1m_x 0
cnt_deb_f_oper_1m_x 86857
cnt_days_deb_f_oper_1m_x 86857
sum_cred_f_oper_1m_x 0
cnt_cred_f_oper_1m_x 86857
cnt_days_cred

## Загрузка файла с заполнениными переменными из полного датасета:

In [41]:
knn_imputer_data = pd.read_csv('../data/knn_imputer7_data.csv')

In [43]:
knn_imputer_data

Unnamed: 0.1,Unnamed: 0,balance_amt_max_x,sum_cred_e_oper_1m_x,balance_amt_min_x,sum_cred_e_oper_3m_x,balance_amt_day_avg_x,cnt_b_oper_1m_x,balance_amt_min
0,0,0.810881,0.152623,0.474374,0.289020,0.771144,-0.095666,1.447917
1,1,-0.101677,-0.008444,-0.111107,0.029518,-0.081938,-0.095666,-0.113128
2,2,-0.179344,-0.134646,-0.109275,-0.148490,-0.145252,-0.095666,-0.122892
3,3,-0.198263,-0.146314,-0.123631,-0.158134,-0.153731,-0.095666,-0.126072
4,4,-0.161466,-0.100780,-0.123538,-0.091140,-0.137695,-0.095666,-0.126039
...,...,...,...,...,...,...,...,...
299995,299995,-0.196220,-0.146314,-0.123350,-0.158134,-0.150664,-0.095666,-0.125892
299996,299996,-0.190957,-0.134396,-0.123753,-0.153028,-0.151995,-0.095666,-0.126446
299997,299997,-0.130675,-0.003603,-0.100854,-0.016931,-0.097936,-0.095666,-0.124941
299998,299998,-0.197187,-0.145753,-0.123885,-0.157925,-0.153647,-0.095666,-0.126380


In [44]:
knn_imputer_data.isna().sum().sum()

0

In [45]:
columns_from_imputer_data = ['balance_amt_max_x', 'sum_cred_e_oper_1m_x', 'balance_amt_min_x', 'sum_cred_e_oper_3m_x', 'balance_amt_day_avg_x', 'cnt_b_oper_1m_x', 'balance_amt_min'] 

### Заполнение пропущенных значений на основе knn_imputer

In [46]:
for column in columns_from_imputer_data:
    full_df[column] = full_df[column].fillna(knn_imputer_data[column])

## Заполенение пропущенных значений на основе внутригрупповых характеристик:

In [47]:
gr = full_df.groupby(['okved_x', 'start_cluster_y', 'start_cluster'])

In [48]:
#Функия, которая реализует поиск моды в массиве данных
def calculate_mode(data):
    value_counts = {}
    for value in data:
        value_counts[value] = value_counts.get(value, 0) + 1
    mode_value = max(value_counts, key=value_counts.get)
    return mode_value

### Категориальные переменные заполнятся модой, а численные - средним значением

In [49]:
for col in full_df.columns:
    if full_df[col].dtypes in ['category', 'object']:
        full_df[col].fillna(gr[col].transform(lambda x: calculate_mode(x)), inplace=True)
    elif full_df[col].dtypes in ['float64']:
        full_df[col].fillna(gr[col].transform(lambda x: round(np.mean(x), 6)), inplace=True)
    

In [50]:
for col in full_df.columns:
    print(col, full_df[col].isna().sum(), full_df[col].dtypes)

id 0 int64
date_x 0 object
balance_amt_avg_x 852 float64
balance_amt_max_x 0 float64
balance_amt_min_x 0 float64
balance_amt_day_avg_x 0 float64
channel_code_x 440 object
city_x 19636 object
city_type_x 5810 object
index_city_code_x 156505 object
ogrn_days_end_month_x 1046 float64
ogrn_days_end_quarter_x 1046 float64
ogrn_month_x 1046 object
ogrn_year_x 1046 object
ft_registration_date_x 252 float64
max_founderpres_x 8156 float64
min_founderpres_x 8156 float64
ogrn_exist_months_x 1046 float64
okved_x 1251 object
segment_x 235 object
sum_of_paym_2m_x 4232 float64
sum_of_paym_6m_x 4232 float64
sum_of_paym_1y_x 4232 float64
sum_a_oper_1m_x 0 float64
cnt_a_oper_1m_x 1504 float64
sum_b_oper_1m_x 0 float64
cnt_b_oper_1m_x 0 float64
sum_c_oper_1m_x 0 float64
cnt_c_oper_1m_x 1504 float64
sum_deb_d_oper_1m_x 0 float64
cnt_deb_d_oper_1m_x 1504 float64
sum_cred_d_oper_1m_x 0 float64
cnt_cred_d_oper_1m_x 1504 float64
sum_deb_e_oper_1m_x 0 float64
cnt_deb_e_oper_1m_x 1504 float64
cnt_days_deb_e_ope

### пропущенных данных стало сильно меньше в датасете, но в некоторых столбцах их все еще много

In [51]:
new_gr = full_df.groupby(['start_cluster_y', 'segment_x'])

In [52]:
for col in full_df.columns:
    if full_df[col].dtypes in ['category', 'object']:
        full_df[col].fillna(new_gr[col].transform(lambda x: calculate_mode(x)), inplace=True)
    elif full_df[col].dtypes in ['float64']:
        full_df[col].fillna(new_gr[col].transform(lambda x: round(np.mean(x), 6)), inplace=True)

In [53]:
for col in full_df.columns:
    print(col, full_df[col].isna().sum(), full_df[col].dtypes)

id 0 int64
date_x 0 object
balance_amt_avg_x 231 float64
balance_amt_max_x 0 float64
balance_amt_min_x 0 float64
balance_amt_day_avg_x 0 float64
channel_code_x 225 object
city_x 7351 object
city_type_x 232 object
index_city_code_x 23854 object
ogrn_days_end_month_x 225 float64
ogrn_days_end_quarter_x 225 float64
ogrn_month_x 225 object
ogrn_year_x 225 object
ft_registration_date_x 225 float64
max_founderpres_x 234 float64
min_founderpres_x 234 float64
ogrn_exist_months_x 225 float64
okved_x 225 object
segment_x 235 object
sum_of_paym_2m_x 233 float64
sum_of_paym_6m_x 233 float64
sum_of_paym_1y_x 233 float64
sum_a_oper_1m_x 0 float64
cnt_a_oper_1m_x 230 float64
sum_b_oper_1m_x 0 float64
cnt_b_oper_1m_x 0 float64
sum_c_oper_1m_x 0 float64
cnt_c_oper_1m_x 230 float64
sum_deb_d_oper_1m_x 0 float64
cnt_deb_d_oper_1m_x 230 float64
sum_cred_d_oper_1m_x 0 float64
cnt_cred_d_oper_1m_x 230 float64
sum_deb_e_oper_1m_x 0 float64
cnt_deb_e_oper_1m_x 230 float64
cnt_days_deb_e_oper_1m_x 230 float64


### пропущенных данных стало совсем мало, относительно общего числа данных, оставшиеся значения можно заполнить без расчета внутригрупповых характеристик

In [54]:
for col in full_df.columns:
    if full_df[col].dtypes in ['category', 'object']:
        m = full_df[col].mode()[0]
        full_df[col].fillna(m, inplace=True)
    elif full_df[col].dtypes in ['float64']:
        mean_value = full_df[col].mean()
        full_df[col].fillna(round(mean_value, 6), inplace=True)

In [55]:
full_df.isna().sum().sum()

0

## Формирование данных на обучающую и тестовую выборки:

In [56]:
train =  full_df.iloc[0:200000, :]

In [57]:
test = full_df.iloc[200000:, :]

In [69]:
train

Unnamed: 0,id,date_x,balance_amt_avg_x,balance_amt_max_x,balance_amt_min_x,balance_amt_day_avg_x,channel_code_x,city_x,city_type_x,index_city_code_x,...,start_cluster,end_cluster,b_amt_avg_trend,b_amt_min_trend,mx_founder_trend,sm_cred_e_op_trend_3m,sm_deb_e_op_trend_1m,min_founderpres_trend_long,sum_deb_e_oper_3m_trend_long,sum_cred_e_oper_1m_trend_short
0,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,"{α, γ}",{other},-1.368036e-01,-0.961911,0.018291,-0.095323,1.478963e-01,0.018214,4.459154e-02,0.053812
1,1,month_3,-0.090605,-0.114275,-0.114119,-0.089937,channel_code_2,city_14,city_type_0,index_city_code_46,...,{other},{other},-1.558725e-03,0.001702,0.019162,-0.066901,-2.515012e-02,0.015135,-4.046813e-02,0.027058
2,2,month_3,-0.148737,-0.187003,-0.112416,-0.148249,channel_code_12,city_613,city_type_306,index_city_code_0,...,{α},{α},3.341753e-03,0.006926,0.018655,0.000579,2.963236e-03,0.016502,1.290530e-04,0.005142
3,3,month_3,-0.156522,-0.204718,-0.125759,-0.156058,channel_code_14,city_21,city_type_0,index_city_code_46,...,{α},{α},8.857290e-05,0.000046,0.018291,0.000000,-1.497786e-05,0.018214,-2.817038e-05,0.000000
4,4,month_3,-0.141798,-0.170262,-0.125672,-0.141289,channel_code_8,city_21,city_type_0,index_city_code_34,...,{α},{α},-5.188196e-03,-0.005667,0.018610,-0.010794,-1.402492e-02,0.016777,-1.524840e-02,-0.017025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,199995,month_3,-0.039281,-0.053694,-0.044193,-0.038454,channel_code_9,city_20,city_type_0,index_city_code_28,...,"{α, η}",{other},-2.200766e-03,-0.081787,0.018291,0.025172,4.190134e-02,0.018214,3.586864e-02,0.016276
199996,199996,month_3,0.293117,0.189316,0.857952,0.294974,channel_code_14,city_3595,city_type_2778,index_city_code_48,...,"{α, γ}","{α, γ}",-1.238400e-01,-0.310421,0.018291,0.000304,-1.157151e-03,0.018214,-1.271225e-03,-0.002461
199997,199997,month_3,0.032941,0.140726,-0.125362,0.033992,channel_code_8,city_0,city_type_0,index_city_code_58,...,{other},{other},8.667113e-02,-0.001784,0.018291,-0.037429,4.115517e-01,0.018214,3.356986e-02,0.748860
199998,199998,month_3,-0.156776,-0.204960,-0.125995,-0.156312,channel_code_9,city_14,city_type_0,index_city_code_0,...,{},{},-8.089765e-07,-0.000005,0.012870,0.000000,6.672784e-07,0.011652,3.986086e-07,0.000000


In [70]:
test

Unnamed: 0,id,date_x,balance_amt_avg_x,balance_amt_max_x,balance_amt_min_x,balance_amt_day_avg_x,channel_code_x,city_x,city_type_x,index_city_code_x,...,start_cluster,end_cluster,b_amt_avg_trend,b_amt_min_trend,mx_founder_trend,sm_cred_e_op_trend_3m,sm_deb_e_op_trend_1m,min_founderpres_trend_long,sum_deb_e_oper_3m_trend_long,sum_cred_e_oper_1m_trend_short
0,200000,month_6,0.045988,0.049418,-0.125995,0.047079,channel_code_12,city_14,city_type_0,index_city_code_0,...,{α},{α},0.071039,-0.000567,0.018662,0.012607,-0.133965,0.010949,-0.045721,-0.041005
1,200001,month_6,-0.156722,-0.204920,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,index_city_code_0,...,{α},{α},0.000000,0.000000,0.018614,0.000000,0.000000,0.018814,0.000000,0.000000
2,200002,month_6,-0.048015,0.448252,-0.125995,-0.047215,channel_code_12,city_14,city_type_0,index_city_code_78,...,{other},{other},-0.191179,0.000000,0.019342,0.440499,-0.602453,0.020166,0.161884,-1.598759
3,200003,month_6,-0.156579,-0.204813,-0.125501,-0.156115,channel_code_7,city_31,city_type_0,index_city_code_0,...,{α},{α},0.000015,0.000055,0.018707,0.000000,0.000000,0.015216,0.000000,0.000000
4,200004,month_6,-0.153379,-0.201932,-0.125995,-0.154155,channel_code_7,city_0,city_type_0,index_city_code_0,...,{},{},0.000000,0.000000,0.012373,0.000306,0.000161,0.014558,0.000118,0.001648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,299995,month_6,-0.153707,-0.202806,-0.125498,-0.153234,channel_code_14,city_22,city_type_0,index_city_code_29,...,{α},{α},0.000004,0.000000,0.019342,0.000000,0.000000,0.020166,0.000000,0.000000
99996,299996,month_6,-0.154929,-0.197878,-0.125873,-0.154459,channel_code_1,city_96,city_type_0,index_city_code_66,...,{α},{α},0.000985,0.000081,0.019342,0.002709,0.007263,0.020166,0.004075,0.009893
99997,299997,month_6,-0.105294,-0.141429,-0.104590,-0.104671,channel_code_17,city_85,city_type_0,index_city_code_103,...,{α},{α},0.012125,0.012284,0.019342,0.018949,-0.002418,0.020166,-0.000645,0.005714
99998,299998,month_6,-0.155350,-0.203711,-0.125995,-0.155980,channel_code_9,city_25,city_type_0,index_city_code_30,...,{},{},0.000000,0.000000,0.000000,0.000102,0.000000,0.013292,0.000000,0.000549


## Удаление ненужных признаков

In [58]:
X = train.drop(columns=['id', 'date_x', 'date', 'date_y', 'end_cluster', 'end_cluster_y', 'end_cluster_x', 'start_cluster_x'])
test = test.drop(columns=['id', 'date_x', 'date', 'date_y', 'end_cluster', 'end_cluster_y', 'end_cluster_x', 'start_cluster_x'])
y = train['start_cluster_x']

In [59]:
#Окончательное формирование выборок для обучения и тестирования,
#данных мало, поэтому размер тестирующей выборки увеличен до 30%
x_train, x_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.3,
                                                  random_state=42)

## Байесовский подбор гиперпараметров

In [73]:
import optuna
def objective(trial):
    params = {
        "iterations": trial.suggest_int('iterations', 50, 1200),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.7, log=True),
        "depth": trial.suggest_int("depth", 2, 16),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 1700),
        "l2_leaf_reg":trial.suggest_float('l2_leaf_reg', 0.3, 40)
    }

    model = CatBoostClassifier(**params, silent=True, task_type='GPU', cat_features = cat_cols )
    model.fit(x_train, y_train)
    predictions = model.predict_proba(x_val)
    y_pred_normalized = predictions / predictions.sum(axis=1, keepdims=True)
    metric = weighted_roc_auc(y_val, y_pred_normalized, model.classes_, weights_dict)
    return metric

In [60]:
#оптимизируемая функция:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    return sum(weights * classes_roc_auc)
cluster_weights = pd.read_excel("cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=7)

[I 2024-03-14 03:58:02,990] A new study created in memory with name: no-name-6c19a0b6-b351-46ba-bd7b-4c8563f139c5
[I 2024-03-14 03:58:35,248] Trial 0 finished with value: 0.9852698009536496 and parameters: {'iterations': 403, 'learning_rate': 0.022386385013344536, 'depth': 7, 'min_data_in_leaf': 1499, 'l2_leaf_reg': 14.897860482626347}. Best is trial 0 with value: 0.9852698009536496.
[I 2024-03-14 03:59:55,700] Trial 1 finished with value: 0.9820631277378724 and parameters: {'iterations': 470, 'learning_rate': 0.010928314778967491, 'depth': 9, 'min_data_in_leaf': 1514, 'l2_leaf_reg': 29.15905656172346}. Best is trial 0 with value: 0.9852698009536496.
[I 2024-03-14 04:00:56,282] Trial 2 finished with value: 0.9830574310209172 and parameters: {'iterations': 828, 'learning_rate': 0.008889042611731787, 'depth': 7, 'min_data_in_leaf': 1088, 'l2_leaf_reg': 30.175488242997503}. Best is trial 0 with value: 0.9852698009536496.
[I 2024-03-14 04:01:14,970] Trial 3 finished with value: 0.987433866

In [75]:
## Обучение модели с наилучшими гиперпараметрами
#{'iterations': 208, 'learning_rate': 0.429760652915373, 'depth': 7, 'min_data_in_leaf': 1333, 'l2_leaf_reg': 11.36230611320782}

In [63]:
#создание списка категориальных переменных для обработки катбустом
cat_f = []
for col in X.columns:
    if X[col].dtypes in ['category', 'object']:
        cat_f.append(col)

In [None]:
GIGA_model  = CatBoostClassifier(random_state= 42,
                                 cat_features = cat_f,                                
                                 iterations=208,
                                 task_type='GPU',
                                 learning_rate=0.429760652915373,
                                 l2_leaf_reg =11.36230611320782,
                                 depth=7,
                                 min_data_in_leaf = 1333,
                                 loss_function='MultiClassOneVsAll',
                                 custom_metric=['AUC:hints=skip_train~false'])
GIGA_model.fit(x_train,y_train, 
              eval_set=(x_val, y_val),
              plot=True,
              verbose=25
              )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [45]:
#предсказание стартового кластера клиента на начало шестого месяца, 
#основываясь на данных за 4, 5 и шестые месяца
yp_proba = GIGA_model.predict_proba(test)
y_pred =  yp_proba / yp_proba.sum(axis=1, keepdims=True)
max_column_indices = np.argmax(y_pred, axis=1)
max_column_names = [GIGA_model.classes_[idx] for idx in max_column_indices]
test['start_cluster_x'] = max_column_names

## Распределение стартового кластера на шестой месяц: 

In [46]:
test['start_cluster_x'].value_counts()

start_cluster_x
{α}          68044
{α, η}        8309
{}            6668
{other}       5694
{α, γ}        5139
{α, β}        1970
{α, δ}        1365
{α, ε}         800
{α, θ}         759
{α, ψ}         443
{α, μ}         287
{α, ε, η}      206
{α, λ}         147
{α, ε, θ}      117
{α, ε, ψ}       46
{λ}              6
Name: count, dtype: int64

## Сохранение датасетов с заполнением пропущенных значений

In [47]:
test.to_csv('../data/test_df_without_nan.csv', index=False)

In [48]:
train.to_csv('../data/train_df_without_nan.csv', index=False)