In [15]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoost, CatBoostClassifier

## Загрузка данных

In [16]:
# Read the train and test parquet files into DataFrames in one pass.
train_df, test_df = map(
    pd.read_parquet,
    ("../data/train_data.pqt", "../data/test_data.pqt"),
)

## Создание расширенного датасета для обучающей и тестовой выборок: каждая строка хранит данные о пользователе за три последовательных месяца

In [17]:
def _merge_months(monthly_df, months):
    """Pivot a long per-month frame into one wide row per latest-month record.

    Parameters
    ----------
    monthly_df : pd.DataFrame
        Frame with an ``id`` column and a ``date`` column holding month labels.
    months : sequence of three str
        Month labels ordered from earliest to latest.

    Returns
    -------
    pd.DataFrame
        Rows of the latest month, left-joined on ``id`` with the middle month
        and then with the earliest month. With pandas' default merge suffixes
        the latest-month columns end in ``_x``, the middle-month columns end
        in ``_y`` and the earliest-month columns keep their original names.
    """
    # Idempotence guard: the cell overwrites its own inputs, so on a re-run the
    # frame is already wide. Filtering it by month again would give an empty or
    # failing merge, so hand it back untouched instead.
    if 'date_x' in monthly_df.columns:
        return monthly_df

    earliest, middle, latest = (
        monthly_df[monthly_df['date'] == month] for month in months
    )
    return (
        latest
        .merge(middle, on='id', how='left')
        .merge(earliest, on='id', how='left')
    )


train_df = _merge_months(train_df, ('month_1', 'month_2', 'month_3'))
test_df = _merge_months(test_df, ('month_4', 'month_5', 'month_6'))

In [18]:
# Display the merged (wide) training frame built in the previous cell.
train_df

Unnamed: 0,id,date_x,balance_amt_avg_x,balance_amt_max_x,balance_amt_min_x,balance_amt_day_avg_x,channel_code_x,city_x,city_type_x,index_city_code_x,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,1,month_3,-0.090605,-0.114275,-0.114119,-0.089937,channel_code_2,city_14,city_type_0,,...,0.945281,0.407762,0.369318,0.567093,0.785465,-0.184002,0.253523,0.462452,{other},{other}
2,2,month_3,-0.148737,-0.187003,-0.112416,-0.148249,channel_code_12,city_613,city_type_306,,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.178674,0.252657,0.440474,{α},{α}
3,3,month_3,-0.156522,-0.204718,-0.125759,-0.156058,channel_code_14,city_21,city_type_0,index_city_code_46,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.374540,{α},{α}
4,4,month_3,-0.141798,-0.170262,-0.125672,-0.141289,channel_code_8,city_21,city_type_0,,...,0.957443,0.672129,-0.078233,0.558209,0.707687,-0.178408,0.252946,0.440474,{α},{α}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,199995,month_3,-0.039281,-0.053694,-0.044193,-0.038454,channel_code_9,city_20,city_type_0,index_city_code_28,...,0.944497,0.384773,-0.161095,0.547319,0.429909,-0.201123,0.250924,0.374540,"{α, η}",{other}
199996,199996,month_3,0.293117,0.189316,0.857952,0.294974,channel_code_14,city_3595,city_type_2778,index_city_code_48,...,0.944889,0.396267,-0.151699,0.553767,0.574354,-0.201123,0.250924,0.374540,"{α, γ}","{α, γ}"
199997,199997,month_3,0.032941,0.140726,-0.125362,0.033992,channel_code_8,city_0,city_type_0,index_city_code_58,...,0.944889,0.396267,1.185234,0.571535,0.918798,0.955885,0.264788,0.693221,{other},{other}
199998,199998,month_3,-0.156776,-0.204960,-0.125995,-0.156312,channel_code_9,city_14,city_type_0,,...,,,-0.165588,,,-0.201123,,,{},{}


## Столбцы с постфиксом _x означают данные по клиенту за третий месяц, столбцы с постфиксом _y означают данные по клиенту за второй месяц, столбцы без постфикса — данные за первый месяц (для тестовой выборки — шестой, пятый и четвёртый месяцы соответственно)

## Создание трендовых переменных, которые учитывают изменения признаков в разрезе нескольких месяцев:

In [19]:
# Mean-deviation trends for the training frame: the `_x` value of a feature
# minus the row-wise mean of that feature over its three monthly columns.
# Each entry maps the new column to (minuend column, columns to average).
mean_trend_specs = {
    'b_amt_avg_trend': (
        'balance_amt_day_avg_x',
        ['balance_amt_day_avg', 'balance_amt_day_avg_y', 'balance_amt_day_avg_x'],
    ),
    'b_amt_min_trend': (
        'balance_amt_min_x',
        ['balance_amt_min_y', 'balance_amt_min', 'balance_amt_min_x'],
    ),
    'mx_founder_trend': (
        'max_founderpres_x',
        ['max_founderpres_x', 'max_founderpres', 'max_founderpres_y'],
    ),
    'sm_cred_e_op_trend_3m': (
        'sum_cred_e_oper_3m_x',
        ['sum_cred_e_oper_3m_y', 'sum_cred_e_oper_3m_x', 'sum_cred_e_oper_3m'],
    ),
    'sm_deb_e_op_trend_1m': (
        'sum_deb_e_oper_1m_x',
        ['sum_deb_e_oper_1m_y', 'sum_deb_e_oper_1m', 'sum_deb_e_oper_1m_x'],
    ),
}

for trend_name, (latest_col, window_cols) in mean_trend_specs.items():
    window_mean = train_df[window_cols].mean(axis=1)
    train_df[trend_name] = train_df[latest_col] - window_mean

In [20]:
# Pairwise deltas for the training frame: `_x` column minus `_y` column.
# NOTE(review): the two "*_trend_long" features use the same `_x - _y`
# difference as the "*_trend_short" one; confirm whether "long" was meant to
# compare against the unsuffixed column instead.
delta_specs = [
    ('min_founderpres_trend_long', 'min_founderpres_x', 'min_founderpres_y'),
    ('sum_deb_e_oper_3m_trend_long', 'sum_deb_e_oper_3m_x', 'sum_deb_e_oper_3m_y'),
    ('sum_cred_e_oper_1m_trend_short', 'sum_cred_e_oper_1m_x', 'sum_cred_e_oper_1m_y'),
]

for delta_name, latest_col, previous_col in delta_specs:
    train_df[delta_name] = train_df[latest_col].sub(train_df[previous_col])

In [21]:
# Mean-deviation trends for the test frame: the `_x` value of a feature minus
# the row-wise mean of that feature over its three monthly columns.
# Each entry maps the new column to (minuend column, columns to average).
mean_trend_specs = {
    'b_amt_avg_trend': (
        'balance_amt_day_avg_x',
        ['balance_amt_day_avg', 'balance_amt_day_avg_y', 'balance_amt_day_avg_x'],
    ),
    'b_amt_min_trend': (
        'balance_amt_min_x',
        ['balance_amt_min_y', 'balance_amt_min', 'balance_amt_min_x'],
    ),
    'mx_founder_trend': (
        'max_founderpres_x',
        ['max_founderpres_x', 'max_founderpres', 'max_founderpres_y'],
    ),
    'sm_cred_e_op_trend_3m': (
        'sum_cred_e_oper_3m_x',
        ['sum_cred_e_oper_3m_y', 'sum_cred_e_oper_3m_x', 'sum_cred_e_oper_3m'],
    ),
    'sm_deb_e_op_trend_1m': (
        'sum_deb_e_oper_1m_x',
        ['sum_deb_e_oper_1m_y', 'sum_deb_e_oper_1m', 'sum_deb_e_oper_1m_x'],
    ),
}

for trend_name, (latest_col, window_cols) in mean_trend_specs.items():
    window_mean = test_df[window_cols].mean(axis=1)
    test_df[trend_name] = test_df[latest_col] - window_mean

In [22]:
# Pairwise deltas for the test frame: `_x` column minus `_y` column.
# NOTE(review): the two "*_trend_long" features use the same `_x - _y`
# difference as the "*_trend_short" one; confirm whether "long" was meant to
# compare against the unsuffixed column instead.
delta_specs = [
    ('min_founderpres_trend_long', 'min_founderpres_x', 'min_founderpres_y'),
    ('sum_deb_e_oper_3m_trend_long', 'sum_deb_e_oper_3m_x', 'sum_deb_e_oper_3m_y'),
    ('sum_cred_e_oper_1m_trend_short', 'sum_cred_e_oper_1m_x', 'sum_cred_e_oper_1m_y'),
]

for delta_name, latest_col, previous_col in delta_specs:
    test_df[delta_name] = test_df[latest_col].sub(test_df[previous_col])

In [23]:
# Stack train and test rows into one frame (train first, then test).
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# halves keep their own labels, so every test label duplicates a train label
# and label-based alignment with frames built from the raw values breaks.
full_df = pd.concat([train_df, test_df], ignore_index=True)

In [24]:
# Columns handed to the KNN imputer. The name suggests "columns with NaNs";
# the count output in the next cell shows that two of them have none.
nan_columns = [
    'balance_amt_max_x',
    'sum_cred_e_oper_1m_x',
    'balance_amt_min_x',
    'sum_cred_e_oper_3m_x',
    'balance_amt_day_avg_x',
    'cnt_b_oper_1m_x',
    'balance_amt_min',
]

# Sub-frame with only those columns, taken from the combined train+test data.
data_for_knn = full_df.loc[:, nan_columns]

In [25]:
# Count missing values per column in the KNN input frame.
data_for_knn.isna().sum()

balance_amt_max_x        17638
sum_cred_e_oper_1m_x         0
balance_amt_min_x        17638
sum_cred_e_oper_3m_x         0
balance_amt_day_avg_x    17638
cnt_b_oper_1m_x          86857
balance_amt_min          49299
dtype: int64

In [26]:
# Display the KNN input columns as they appear in the combined frame.
full_df[nan_columns]

Unnamed: 0,balance_amt_max_x,sum_cred_e_oper_1m_x,balance_amt_min_x,sum_cred_e_oper_3m_x,balance_amt_day_avg_x,cnt_b_oper_1m_x,balance_amt_min
0,0.740253,0.148894,0.430042,0.280609,0.695747,0.435995,1.287207
1,-0.114275,-0.008843,-0.114119,0.026920,-0.089937,0.435995,-0.114040
2,-0.187003,-0.132435,-0.112416,-0.147099,-0.148249,0.435995,-0.122805
3,-0.204718,-0.143862,-0.125759,-0.156528,-0.156058,,-0.125660
4,-0.170262,-0.099270,-0.125672,-0.091034,-0.141289,0.435995,-0.125630
...,...,...,...,...,...,...,...
99995,-0.202806,-0.143862,-0.125498,-0.156528,-0.153234,,-0.125498
99996,-0.197878,-0.132191,-0.125873,-0.151536,-0.154459,0.435995,-0.125995
99997,-0.141429,-0.004102,-0.104590,-0.018488,-0.104671,0.435995,-0.124644
99998,-0.203711,-0.143313,-0.125995,-0.156324,-0.155980,0.435995,


## Для точечного заполнения пропущенных значений обучим KNN находить похожие векторы объектов

### Важно до обучения KNN отмасштабировать данные (обучаться модель будет долго, но все работает корректно)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

# Work on a copy of the selected columns.
knn = data_for_knn.copy()

# Standardise every column before the neighbour search so that no single
# column dominates the distance; the result is re-wrapped with the same
# column names (the index is a fresh 0..n-1 range).
scaler = StandardScaler()
scaled_values = scaler.fit_transform(knn)
knn = pd.DataFrame(scaled_values, columns=knn.columns)

# Fill each gap from the 10 nearest rows, all neighbours weighted equally.
knn_imputer = KNNImputer(weights='uniform', n_neighbors=10)
imputed_values = knn_imputer.fit_transform(knn)

# NOTE(review): `fd` stays in standardised units (no inverse_transform is
# applied) and carries a fresh 0..n-1 index — confirm that downstream
# consumers of the saved file expect both.
fd = pd.DataFrame(imputed_values, columns=knn.columns)


In [37]:
# Check how many missing values remain per column after imputation.
fd.isna().sum()

balance_amt_max_x        0
sum_cred_e_oper_1m_x     0
balance_amt_min_x        0
sum_cred_e_oper_3m_x     0
balance_amt_day_avg_x    0
cnt_b_oper_1m_x          0
balance_amt_min          0
dtype: int64

In [None]:
# Persist the imputed columns; the row index is written as the first CSV column.
fd.to_csv(path_or_buf='../data/knn_imputer7_data.csv', index=True)

## Мы очень хотели применить этот метод и для других столбцов датасета, но, к сожалению, даже на малых данных модель работает долго