#### Welcome to *whisker max* solution
##### Импорт библиотек и чтение исходного датасета

In [1]:
!pip3 install catboost optuna xgboost lightgbm



In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("dataset.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130818 entries, 0 to 130817
Data columns (total 5 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   clientbankpartner_pin   130818 non-null  int64 
 1   client_pin              130818 non-null  int64 
 2   partner_src_type_ccode  130818 non-null  int64 
 3   client_start_date       130818 non-null  object
 4   partnerrolestart_date   130818 non-null  object
dtypes: int64(3), object(2)
memory usage: 5.0+ MB


In [5]:
df["client_start_date"] = pd.to_datetime(df["client_start_date"])
df["partnerrolestart_date"] = pd.to_datetime(df["partnerrolestart_date"])
df.describe()

Unnamed: 0,clientbankpartner_pin,client_pin,partner_src_type_ccode,client_start_date,partnerrolestart_date
count,130818.0,130818.0,130818.0,130818,130818
mean,126038.565006,185136.559197,3.63729,2020-04-02 21:34:45.896435712,2019-09-17 20:55:53.153235712
min,1.0,5579.0,0.0,2019-03-01 00:00:00,2018-12-01 00:00:00
25%,10093.0,71995.75,4.0,2019-12-14 00:00:00,2019-05-01 00:00:00
50%,125573.0,205454.5,4.0,2020-04-26 00:00:00,2019-09-01 00:00:00
75%,191599.0,272905.5,4.0,2020-08-11 00:00:00,2020-02-01 00:00:00
max,333515.0,333513.0,5.0,2020-11-30 00:00:00,2020-11-01 00:00:00
std,93263.794349,101297.060675,1.328811,,


In [6]:
# Тестовая выборка
test_date = pd.to_datetime("2020-09-01")
test_partner_pins = df[df["client_start_date"] >= test_date]["clientbankpartner_pin"].unique()

### Обучающий датасет

**Итоговые признаки:**
* Общее кол-во привлечений за все время
* Кол-во привлечений в среднем в месяц/неделю
* Сколько дней проходит между первым-вторым, предпоследним-последним привлечением, медиана от всех разниц
* Сколько дней прошло с последнего/медианного привлечения
* Сколько дней партнер всего привлекает людей
* Сколько людей привлекли за последние 30/60/90/180/270/365 дней

In [7]:
def days_between_invites(days: list):
    if len(days) == 1:
        return -100,-100,-100,-100,-100
    diff_days = []
    for i in range(1, len(days)):
        diff_days.append((days[i] - days[i-1]).days)
    if len(diff_days) == 1:
        return diff_days[0], -100, sum(diff_days) / len(days), diff_days[-1], -100
    return diff_days[0], diff_days[1], sum(diff_days) / len(days), diff_days[-1], diff_days[-2]

def clients_invited_last_n_days(days, end_date):
      return lambda x: sum((end_date - d).days <= days for d in x)

def make_dataset(input_df:pd.DataFrame, end_date, is_train=False):
    input_df = input_df.copy()
    output_df = input_df.groupby("clientbankpartner_pin").agg({"client_pin": "count", "partnerrolestart_date": "min", "client_start_date": ["min", "max", "median", lambda x: sorted(list(x))]})

    # Кол-во привлечений в неделю/месяц
    output_df["average_month_invites"] = output_df["client_pin"]["count"]/ output_df["client_start_date"]["<lambda_0>"].agg(lambda x: len(set(map(lambda y: y.month, x))) )
    output_df["average_week_invites"] = output_df["client_pin"]["count"]/ output_df["client_start_date"]["<lambda_0>"].agg(lambda x: len(set(map(lambda y: y.week, x))) )
    # Сколько дней проходит между первым-вторым, предпоследним-последним привлечением
    output_df[["diff_day_first", "diff_day_second", "diff_days_mean", "diff_days_last", "diff_days_prelast"]] = output_df["client_start_date"]["<lambda_0>"].apply(days_between_invites).tolist()
    # Сколько дней прошло с последнего/медианного привлечения
    output_df["diff_test_date_and_last_invite"] = end_date - output_df["client_start_date"]["max"]
    output_df["end_date_clientstart_median"] = end_date - output_df["client_start_date"]["median"]
    # Сколько дней партнер всего привлекает людей
    output_df["end_date_clientstart_max-min"] = output_df["client_start_date"]["max"] - output_df["client_start_date"]["min"]
    output_df["difference_median_and_max-min"] = output_df["end_date_clientstart_max-min"] - output_df["end_date_clientstart_median"]

    # Перевод дат в целые числа
    output_df["diff_first_lastdt"] = output_df["diff_day_first"] - output_df["diff_days_last"]
    output_df["diff_between_end_date_and_first_client"] = end_date - output_df["client_start_date"]["min"]
    output_df["diff_test_date_and_last_invite"] =output_df["diff_test_date_and_last_invite"].dt.days
    output_df["end_date_clientstart_median"] =output_df["end_date_clientstart_median"].dt.days
    output_df["end_date_clientstart_max-min"] =output_df["end_date_clientstart_max-min"].dt.days
    output_df["difference_median_and_max-min"] =output_df["difference_median_and_max-min"].dt.days
    output_df["diff_between_end_date_and_first_client"] = output_df["diff_between_end_date_and_first_client"].dt.days

    # Сколько людей привлекли за последние 30/60/90/180/270/365 дней
    output_df["clients_per_day"] = output_df["diff_between_end_date_and_first_client"] / output_df["client_pin"]["count"]
    output_df["clients_invited_last_30_days"] = output_df["client_start_date"]["<lambda_0>"].apply(clients_invited_last_n_days(30, end_date))
    output_df["clients_invited_last_60_days"] = output_df["client_start_date"]["<lambda_0>"].apply(clients_invited_last_n_days(60, end_date))
    output_df["clients_invited_last_90_days"] = output_df["client_start_date"]["<lambda_0>"].apply(clients_invited_last_n_days(90, end_date))
    output_df["clients_invited_last_180_days"] = output_df["client_start_date"]["<lambda_0>"].apply(clients_invited_last_n_days(180, end_date))
    output_df["clients_invited_last_270_days"] = output_df["client_start_date"]["<lambda_0>"].apply(clients_invited_last_n_days(270, end_date))
    output_df["clients_invited_last_365_days"] = output_df["client_start_date"]["<lambda_0>"].apply(clients_invited_last_n_days(365, end_date))

    output_df = output_df.drop(columns=[("client_start_date", "<lambda_0>"), ("partnerrolestart_date", "min"), ("client_start_date", "min"), ("client_start_date", "max"), ("client_start_date", "median"), ('client_pin','count')])

    if is_train:
        output_df["churn"] = list(map(lambda x: int(x in test_partner_pins), output_df.index.values))
    return output_df

train_dataset = make_dataset(df[df["client_start_date"] < test_date], pd.to_datetime("2020-09-01"), True)
test_dataset = make_dataset(df[df["clientbankpartner_pin"].isin(test_partner_pins)], pd.to_datetime("2020-12-01"))

In [8]:
train_dataset

Unnamed: 0_level_0,average_month_invites,average_week_invites,diff_day_first,diff_day_second,diff_days_mean,diff_days_last,diff_days_prelast,diff_test_date_and_last_invite,end_date_clientstart_median,end_date_clientstart_max-min,...,diff_first_lastdt,diff_between_end_date_and_first_client,clients_per_day,clients_invited_last_30_days,clients_invited_last_60_days,clients_invited_last_90_days,clients_invited_last_180_days,clients_invited_last_270_days,clients_invited_last_365_days,churn
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
clientbankpartner_pin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,1.00,1.0,187.0,7.0,64.666667,7.0,187.0,88,95,194,...,180.0,282,94.000000,0,0,1,2,2,3,1
2,1.75,1.4,2.0,26.0,58.714286,22.0,114.0,63,279,411,...,-20.0,474,67.714286,0,0,2,2,3,4,1
5,1.00,1.0,17.0,32.0,31.400000,30.0,78.0,225,333,157,...,-13.0,382,76.400000,0,0,0,0,2,4,0
6,1.00,1.0,-100.0,-100.0,-100.000000,-100.0,-100.0,337,337,0,...,0.0,337,337.000000,0,0,0,0,0,1,0
9,1.00,1.0,23.0,-100.0,11.500000,23.0,-100.0,224,235,23,...,0.0,247,123.500000,0,0,0,0,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333400,1.00,1.0,-100.0,-100.0,-100.000000,-100.0,-100.0,33,33,0,...,0.0,33,33.000000,0,1,1,1,1,1,1
333459,1.00,1.0,-100.0,-100.0,-100.000000,-100.0,-100.0,29,29,0,...,0.0,29,29.000000,1,1,1,1,1,1,1
333492,1.00,1.0,-100.0,-100.0,-100.000000,-100.0,-100.0,12,12,0,...,0.0,12,12.000000,1,1,1,1,1,1,1
333511,4.00,2.0,0.0,2.0,2.125000,1.0,6.0,22,33,17,...,-1.0,39,4.875000,3,8,8,8,8,8,0


### Простые модели

In [9]:
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [10]:
x_train, x_test, y_train, y_test = train_test_split(train_dataset.drop(["churn"], axis=1), train_dataset['churn'], random_state=20)

In [11]:
def estimate_model(my_model):
    pred = my_model.predict(x_test)
    rmse = (np.sqrt(mean_squared_error(y_test, pred)))
    r2 = r2_score(y_test, pred)
    roc_auc = roc_auc_score(y_test, pred)
    score = my_model.score(x_test, y_test)
    local_score = my_model.score(x_train, y_train)
    print("Testing performance")
    print("RMSE: {:.3f}".format(rmse))
    print("R2: {:.3f}".format(r2))
    print("ROC AUC: {:.5f}".format(roc_auc))
    print("Score: {:.4f}".format(score))
    print("Local Score: {:.4f}".format(local_score))

    print("Best params: ", my_model.get_params())

#### Catboost

In [12]:
from catboost import CatBoostClassifier

In [13]:
model1 = CatBoostClassifier()
model1.fit(x_train, y_train)

Learning rate set to 0.022007
0:	learn: 0.6815034	total: 66.1ms	remaining: 1m 6s
1:	learn: 0.6715729	total: 79.1ms	remaining: 39.5s
2:	learn: 0.6607198	total: 90.8ms	remaining: 30.2s
3:	learn: 0.6505033	total: 104ms	remaining: 25.8s
4:	learn: 0.6404601	total: 129ms	remaining: 25.7s
5:	learn: 0.6313640	total: 144ms	remaining: 23.9s
6:	learn: 0.6228389	total: 158ms	remaining: 22.5s
7:	learn: 0.6143779	total: 175ms	remaining: 21.7s
8:	learn: 0.6071310	total: 190ms	remaining: 20.9s
9:	learn: 0.6002430	total: 213ms	remaining: 21.1s
10:	learn: 0.5932430	total: 229ms	remaining: 20.6s
11:	learn: 0.5862178	total: 255ms	remaining: 21s
12:	learn: 0.5801465	total: 280ms	remaining: 21.3s
13:	learn: 0.5738401	total: 299ms	remaining: 21.1s
14:	learn: 0.5687066	total: 329ms	remaining: 21.6s
15:	learn: 0.5622294	total: 352ms	remaining: 21.7s
16:	learn: 0.5567883	total: 368ms	remaining: 21.3s
17:	learn: 0.5516727	total: 393ms	remaining: 21.5s
18:	learn: 0.5467433	total: 438ms	remaining: 22.6s
19:	learn:

<catboost.core.CatBoostClassifier at 0x7c3155852170>

In [14]:
estimate_model(model1)

Testing performance
RMSE: 0.437
R2: 0.171
ROC AUC: 0.79055
Score: 0.8093
Local Score: 0.8619
Best params:  {}


In [15]:
model2 = CatBoostClassifier(iterations=3000, loss_function="MultiClassOneVsAll", learning_rate=0.05, l2_leaf_reg=14, max_depth=8)
model2.fit(x_train, y_train)

0:	learn: 0.6763611	total: 120ms	remaining: 5m 59s
1:	learn: 0.6600647	total: 224ms	remaining: 5m 35s
2:	learn: 0.6467230	total: 330ms	remaining: 5m 29s
3:	learn: 0.6336712	total: 462ms	remaining: 5m 45s
4:	learn: 0.6213586	total: 527ms	remaining: 5m 15s
5:	learn: 0.6101490	total: 607ms	remaining: 5m 2s
6:	learn: 0.5996576	total: 726ms	remaining: 5m 10s
7:	learn: 0.5897288	total: 853ms	remaining: 5m 18s
8:	learn: 0.5812944	total: 1.01s	remaining: 5m 36s
9:	learn: 0.5724540	total: 1.12s	remaining: 5m 35s
10:	learn: 0.5644483	total: 1.24s	remaining: 5m 35s
11:	learn: 0.5575787	total: 1.36s	remaining: 5m 39s
12:	learn: 0.5508336	total: 1.47s	remaining: 5m 37s
13:	learn: 0.5444774	total: 1.55s	remaining: 5m 31s
14:	learn: 0.5385806	total: 1.7s	remaining: 5m 37s
15:	learn: 0.5329335	total: 1.87s	remaining: 5m 48s
16:	learn: 0.5274868	total: 1.98s	remaining: 5m 48s
17:	learn: 0.5224734	total: 2.11s	remaining: 5m 50s
18:	learn: 0.5183725	total: 2.28s	remaining: 5m 58s
19:	learn: 0.5142797	tot

<catboost.core.CatBoostClassifier at 0x7c3155889c60>

In [16]:
estimate_model(model2)

Testing performance
RMSE: 0.434
R2: 0.182
ROC AUC: 0.79222
Score: 0.8119
Local Score: 0.9090
Best params:  {'iterations': 3000, 'learning_rate': 0.05, 'l2_leaf_reg': 14, 'loss_function': 'MultiClassOneVsAll', 'max_depth': 8}


#### LightGBM

In [17]:
from lightgbm import LGBMClassifier

In [18]:
model3 = LGBMClassifier()
model3.fit(x_train.to_numpy(), y_train.to_numpy())

[LightGBM] [Info] Number of positive: 2217, number of negative: 3698
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000754 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3745
[LightGBM] [Info] Number of data points in the train set: 5915, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374810 -> initscore=-0.511637
[LightGBM] [Info] Start training from score -0.511637


In [19]:
estimate_model(model3)

Testing performance
RMSE: 0.436
R2: 0.173
ROC AUC: 0.79064
Score: 0.8098
Local Score: 0.8898
Best params:  {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}


#### XgBoost

In [20]:
from xgboost import XGBClassifier

In [21]:
model4 = XGBClassifier()
model4.fit(x_train, y_train)

In [22]:
estimate_model(model4)

Testing performance
RMSE: 0.452
R2: 0.114
ROC AUC: 0.77435
Score: 0.7961
Local Score: 0.9319
Best params:  {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity':

#### Подбор параметров с GridSearch

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
params = {
    "depth": [6,8,10],
    "iterations": [200, 400, 1000],
    "learning_rate": [0.1, 0.08, 0.2],
    "min_data_in_leaf": [12, 40, 80]
}
model5 = CatBoostClassifier()
search = GridSearchCV(model5, params)
search.fit(x_train, y_train)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
200:	learn: 0.1561242	total: 11.5s	remaining: 45.7s
201:	learn: 0.1556093	total: 11.6s	remaining: 45.6s
202:	learn: 0.1551172	total: 11.6s	remaining: 45.6s
203:	learn: 0.1546893	total: 11.6s	remaining: 45.5s
204:	learn: 0.1543465	total: 11.7s	remaining: 45.4s
205:	learn: 0.1536584	total: 11.8s	remaining: 45.3s
206:	learn: 0.1533774	total: 11.8s	remaining: 45.2s
207:	learn: 0.1528047	total: 11.8s	remaining: 45.1s
208:	learn: 0.1526940	total: 11.9s	remaining: 45s
209:	learn: 0.1525546	total: 11.9s	remaining: 45s
210:	learn: 0.1520329	total: 12s	remaining: 44.9s
211:	learn: 0.1514918	total: 12s	remaining: 44.8s
212:	learn: 0.1508919	total: 12.1s	remaining: 44.7s
213:	learn: 0.1503103	total: 12.1s	remaining: 44.6s
214:	learn: 0.1499454	total: 12.2s	remaining: 44.6s
215:	learn: 0.1493076	total: 12.2s	remaining: 44.5s
216:	learn: 0.1487388	total: 12.3s	remaining: 44.4s
217:	learn: 0.1484673	total: 12.3s	remaini

In [25]:
estimate_model(search)

Testing performance
RMSE: 0.435
R2: 0.178
ROC AUC: 0.79205
Score: 0.8109
Local Score: 0.8651
Best params:  {'cv': None, 'error_score': nan, 'estimator': <catboost.core.CatBoostClassifier object at 0x7c316d519ab0>, 'n_jobs': None, 'param_grid': {'depth': [6, 8, 10], 'iterations': [200, 400, 1000], 'learning_rate': [0.1, 0.08, 0.2], 'min_data_in_leaf': [12, 40, 80]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 0}


In [26]:
model5 = search.best_estimator_
estimate_model(model5)

Testing performance
RMSE: 0.435
R2: 0.178
ROC AUC: 0.79205
Score: 0.8109
Local Score: 0.8651
Best params:  {'iterations': 200, 'learning_rate': 0.1, 'depth': 6, 'min_data_in_leaf': 12}


In [27]:
params = {
    'learning_rate': [0.005, 0.01],
    'n_estimators': [100, 400, 600],
    "max_depth": [6, 8, 10, -1],
    'num_leaves': [6,8,12,16],
    'boosting_type' : ['gbdt', 'dart'],
    'objective' : ['binary'],
    }
model6 = LGBMClassifier()
search = GridSearchCV(model6, params, scoring='roc_auc')
search.fit(x_train.to_numpy(), y_train.to_numpy())

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
[LightGBM] [Info] Total Bins 3649
[LightGBM] [Info] Number of data points in the train set: 4732, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374683 -> initscore=-0.512178
[LightGBM] [Info] Start training from score -0.512178
[LightGBM] [Info] Number of positive: 1774, number of negative: 2958
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000631 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3664
[LightGBM] [Info] Number of data points in the train set: 4732, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374894 -> initscore=-0.511276
[LightGBM] [Info] Start training from score -0.511276
[LightGBM] [Info] Number of positive: 1774, number of negative: 2958
[LightGBM] [Info] Auto-choosing ro

In [28]:
model6 = search.best_estimator_
estimate_model(model6)

Testing performance
RMSE: 0.433
R2: 0.186
ROC AUC: 0.79207
Score: 0.8129
Local Score: 0.8051
Best params:  {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.005, 'max_depth': 6, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 600, 'n_jobs': None, 'num_leaves': 8, 'objective': 'binary', 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}


In [29]:
params = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}
model7 = XGBClassifier()
search = GridSearchCV(model7, params, scoring='roc_auc')
search.fit(x_train, y_train)

In [30]:
model7 = search.best_estimator_
estimate_model(model7)

Testing performance
RMSE: 0.434
R2: 0.180
ROC AUC: 0.78683
Score: 0.8114
Local Score: 0.8044
Best params:  {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.01, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 4, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 180, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': Non

- Мы посчитали нужным также провести tuning моделей с помощью optuna вместо GridSearchCV так как это более продвинутым методом, использующим байесовскую оптимизацию для эффективного поиска наилучшего набора гиперпараметров. Таким образом мы избавимся от избыточного перебора и сэкономим время
- Лучше всего себя показали модели Catboost и LightGBM. Их мы и используем в финальном решении

#### Подбор параметров с optuna

In [31]:
import optuna

In [32]:
def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1600),
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 0.01, 18),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15),
        "used_ram_limit": "16gb",
        # "max_leaves": trial.suggest_int("max_leaves", 15, 63),
    }

    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    optuna_model = CatBoostClassifier(**params)
    optuna_model.fit(x_train, y_train, verbose=0)

    y_pred = optuna_model.predict(x_test)
    accuracy = roc_auc_score(y_test, y_pred)
    return accuracy

In [33]:
study = optuna.create_study(study_name="catboost_optuna", storage="sqlite:///db.sqlite3", direction='maximize')
study.optimize(objective, n_trials=1000)

[I 2024-05-15 21:16:30,689] A new study created in RDB with name: catboost_optuna
[I 2024-05-15 21:16:32,621] Trial 0 finished with value: 0.7889087666530629 and parameters: {'iterations': 206, 'objective': 'Logloss', 'colsample_bylevel': 0.02400232924353648, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'l2_leaf_reg': 16, 'learning_rate': 0.11444688261280059}. Best is trial 0 with value: 0.7889087666530629.
[I 2024-05-15 21:16:42,653] Trial 1 finished with value: 0.7948163760475426 and parameters: {'iterations': 1246, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.04066995390495744, 'depth': 2, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'l2_leaf_reg': 12, 'learning_rate': 0.07125695214429512, 'subsample': 0.6287745854157244}. Best is trial 1 with value: 0.7948163760475426.
[I 2024-05-15 21:16:48,652] Trial 2 finished with value: 0.7956068898815348 and parameters: {'iterations': 643, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.0446269421

In [34]:
params = study.best_params
model8 = CatBoostClassifier(**params)
model8.fit(x_train, y_train)

0:	learn: 0.6824264	total: 1.47ms	remaining: 282ms
1:	learn: 0.6609275	total: 10.5ms	remaining: 998ms
2:	learn: 0.6589184	total: 11.8ms	remaining: 743ms
3:	learn: 0.6414663	total: 14.8ms	remaining: 697ms
4:	learn: 0.6263693	total: 16.8ms	remaining: 629ms
5:	learn: 0.6100118	total: 40.3ms	remaining: 1.25s
6:	learn: 0.5967095	total: 44ms	remaining: 1.16s
7:	learn: 0.5956027	total: 45.1ms	remaining: 1.04s
8:	learn: 0.5846592	total: 46.7ms	remaining: 949ms
9:	learn: 0.5776898	total: 48.5ms	remaining: 883ms
10:	learn: 0.5659615	total: 50.5ms	remaining: 831ms
11:	learn: 0.5587905	total: 52.3ms	remaining: 784ms
12:	learn: 0.5521252	total: 53.9ms	remaining: 742ms
13:	learn: 0.5410820	total: 81.1ms	remaining: 1.03s
14:	learn: 0.5344731	total: 83ms	remaining: 980ms
15:	learn: 0.5306864	total: 85.8ms	remaining: 943ms
16:	learn: 0.5243201	total: 89.8ms	remaining: 925ms
17:	learn: 0.5197967	total: 95.3ms	remaining: 921ms
18:	learn: 0.5123116	total: 136ms	remaining: 1.24s
19:	learn: 0.5081119	total:

<catboost.core.CatBoostClassifier at 0x7c3132ce0ac0>

In [35]:
estimate_model(model8)

Testing performance
RMSE: 0.422
R2: 0.226
ROC AUC: 0.80449
Score: 0.8220
Local Score: 0.8313
Best params:  {'iterations': 192, 'learning_rate': 0.03966323710608413, 'depth': 12, 'l2_leaf_reg': 6, 'bagging_temperature': 2.016588205604849, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'colsample_bylevel': 0.09853109024911047, 'objective': 'CrossEntropy'}


In [36]:
def objective(trial):
    param = {
        "iterations": trial.suggest_int("iterations", 300, 1200),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 40, 3000, step=10),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 99, step=3),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 99, step=3),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1, step=0.1),
        "objective": "binary",
        "boosting_type": "dart"
    }

    optuna_model = LGBMClassifier(**param,verbosity=-1)
    optuna_model.fit(x_train.to_numpy(), y_train.to_numpy())

    preds = optuna_model.predict(x_test)
    accuracy = roc_auc_score(y_test, preds)
    return accuracy

In [37]:
study = optuna.create_study(study_name="lgbm_optuna", storage="sqlite:///db.sqlite3", direction='maximize')
study.optimize(objective, n_trials=1600)

[I 2024-05-15 21:57:26,071] A new study created in RDB with name: lgbm_optuna
[I 2024-05-15 21:57:26,512] Trial 0 finished with value: 0.79263267941701 and parameters: {'iterations': 1178, 'learning_rate': 0.14717009938815415, 'num_leaves': 1630, 'max_depth': 8, 'lambda_l1': 63, 'lambda_l2': 39, 'feature_fraction': 0.6000000000000001}. Best is trial 0 with value: 0.79263267941701.
[I 2024-05-15 21:57:26,937] Trial 1 finished with value: 0.7913015525155 and parameters: {'iterations': 524, 'learning_rate': 0.1674024629691993, 'num_leaves': 1030, 'max_depth': 9, 'lambda_l1': 72, 'lambda_l2': 39, 'feature_fraction': 0.8}. Best is trial 0 with value: 0.79263267941701.
[I 2024-05-15 21:57:27,319] Trial 2 finished with value: 0.7934231932510021 and parameters: {'iterations': 536, 'learning_rate': 0.1911093969009781, 'num_leaves': 570, 'max_depth': 12, 'lambda_l1': 60, 'lambda_l2': 45, 'feature_fraction': 0.4}. Best is trial 2 with value: 0.7934231932510021.
[I 2024-05-15 21:57:27,730] Trial 3

In [38]:
params = study.best_params
model9 = LGBMClassifier(**params)
model9.fit(x_train.to_numpy(), y_train.to_numpy())

[LightGBM] [Info] Number of positive: 2217, number of negative: 3698
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3745
[LightGBM] [Info] Number of data points in the train set: 5915, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374810 -> initscore=-0.511637
[LightGBM] [Info] Start training from score -0.511637


In [39]:
estimate_model(model9)

Testing performance
RMSE: 0.428
R2: 0.204
ROC AUC: 0.79399
Score: 0.8169
Local Score: 0.8221
Best params:  {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.25472064827104673, 'max_depth': 11, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 800, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'iterations': 830, 'lambda_l1': 9, 'lambda_l2': 93, 'feature_fraction': 1.0}


#### Предсказание

In [40]:
!mkdir results

In [41]:
val = model5.predict_proba(test_dataset)[:, 0]
preds = pd.DataFrame({"clientbankpartner_pin": test_dataset.index, "score": val})
preds.to_csv("results/model5_pred.csv", index=False)

In [42]:
val = model6.predict_proba(test_dataset)[:, 0]
preds = pd.DataFrame({"clientbankpartner_pin": test_dataset.index, "score": val})
preds.to_csv("results/model6_pred.csv", index=False)

In [43]:
val = model7.predict_proba(test_dataset)[:, 0]
preds = pd.DataFrame({"clientbankpartner_pin": test_dataset.index, "score": val})
preds.to_csv("results/model7_pred.csv", index=False)

In [44]:
val = model8.predict_proba(test_dataset)[:, 0]
preds = pd.DataFrame({"clientbankpartner_pin": test_dataset.index, "score": val})
preds.to_csv("results/model8_pred.csv", index=False)

#### KFold prediction

In [45]:
from sklearn.model_selection import KFold

In [46]:
preds = np.zeros(test_dataset.shape[0], dtype=np.float64)
n_splits =10
kf = KFold(n_splits=n_splits,random_state=44,shuffle=True)
roc_auc = []
n=0
my_x, my_y = train_dataset.drop(["churn"], axis=1), train_dataset['churn']
for train_index, test_index in kf.split(my_x):
    train_x, test_x = my_x.iloc[train_index], my_x.iloc[test_index]
    train_y, test_y = my_y.iloc[train_index], my_y.iloc[test_index]

    model = CatBoostClassifier(**model8.get_params())
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],verbose=False)
    preds+=model.predict_proba(test_dataset)[:, 0]
    roc_auc.append(roc_auc_score(test_y, model.predict(test_x)))

    print(f"fold: {n+1} ==> roc_auc: {roc_auc[n]}")
    n+=1
print("average roc_auc", sum(roc_auc) / n_splits)

fold: 1 ==> roc_auc: 0.7642154357270636
fold: 2 ==> roc_auc: 0.7802736290525962
fold: 3 ==> roc_auc: 0.7733678300121105
fold: 4 ==> roc_auc: 0.7698304498269896
fold: 5 ==> roc_auc: 0.7523892548107968
fold: 6 ==> roc_auc: 0.7809713200484422
fold: 7 ==> roc_auc: 0.7869726327992381
fold: 8 ==> roc_auc: 0.7831626848691695
fold: 9 ==> roc_auc: 0.7961991158842513
fold: 10 ==> roc_auc: 0.7820775805391189
average roc_auc 0.7769459933569778


In [47]:
val = preds / n_splits
preds = pd.DataFrame({"clientbankpartner_pin": test_dataset.index, "score": val})
preds.to_csv("results/model8_10folds.csv", index=False)

#### Тоже самое для LGBM

In [48]:
preds = np.zeros(test_dataset.shape[0], dtype=np.float64)
n_splits =10
kf = KFold(n_splits=n_splits,random_state=35,shuffle=True)
roc_auc = []
n=0
my_x, my_y = train_dataset.drop(["churn"], axis=1), train_dataset['churn']
for train_index, test_index in kf.split(my_x):
    train_x, test_x = my_x.iloc[train_index], my_x.iloc[test_index]
    train_y, test_y = my_y.iloc[train_index], my_y.iloc[test_index]

    model = LGBMClassifier(**model9.get_params(), verbosity=-1)
    model.fit(train_x.to_numpy(),train_y.to_numpy(),eval_set=[(test_x.to_numpy(),test_y.to_numpy())])
    preds+=model.predict_proba(test_dataset)[:, 0]
    roc_auc.append(roc_auc_score(test_y, model.predict(test_x)))

    print(f"fold: {n+1} ==> roc_auc: {roc_auc[n]}")
    n+=1
print("average roc_auc", sum(roc_auc) / n_splits)

fold: 1 ==> roc_auc: 0.7963662535310256
fold: 2 ==> roc_auc: 0.7911158672206134
fold: 3 ==> roc_auc: 0.754597801665568
fold: 4 ==> roc_auc: 0.7699043597068688
fold: 5 ==> roc_auc: 0.7589015857243407
fold: 6 ==> roc_auc: 0.7480010062961503
fold: 7 ==> roc_auc: 0.789115670321896
fold: 8 ==> roc_auc: 0.7533292828530452
fold: 9 ==> roc_auc: 0.7883974839707858
fold: 10 ==> roc_auc: 0.7707571948651866
average roc_auc 0.7720486506155481


In [49]:
val = preds / n_splits
preds = pd.DataFrame({"clientbankpartner_pin": test_dataset.index, "score": val})
preds.to_csv("results/model9_10folds.csv", index=False)

#### Объединение предсказаний разных моделей

In [51]:
file1 = pd.read_csv("results/model8_10folds.csv")
file2 = pd.read_csv("results/model9_10folds.csv")
file1["score"] = file1["score"] * 0.7 + file2["score"] * 0.3
file1.to_csv("results/united_model8_model9.csv", index=False)