In [37]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.metrics import mean_squared_error

# Для визуализации графиков CatBoost надо установить ipywidgets
#!pip install ipywidgets
# jupyter nbextension enable --py widgetsnbextension
import ipywidgets

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): this also hides genuine chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None

In [71]:
# Directory prefix (relative to the working directory) for the input CSV files;
# the final result file is written under the same prefix.
PATH = "datasets/train_dataset_VK/"
# File name of the existing submission, read from under PATH.
submission_filename = "submission.csv"

## Загрузка данных

In [72]:
# The training sample is split across several CSV files.
datset_filenames = [
    "rich_test_1M_1.csv",
    "rich_test_1M_2.csv",
    "rich_test_1M_3.csv",
    "rich_test_1M_4.csv",
    "rich_test_1M_5.csv",
]
# Read every part first and concatenate once: growing a DataFrame with
# pd.concat inside a loop re-copies all previously loaded rows on each pass.
# Each part keeps its own row index, exactly as before (no ignore_index).
datset_df = pd.concat(
    [pd.read_csv(PATH + filename) for filename in datset_filenames],
    axis=0,
)
datset_df.shape

(4315675, 21)

In [73]:
# Display the combined frame; the notebook renders it as a truncated preview.
datset_df

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,is_t_exist,friendship,age_x,...,sex_2_x,city_freq_x,university_freq_x,school_freq_x,age_y,sex_1_y,sex_2_y,city_freq_y,university_freq_y,school_freq_y
0,42949673176,0,244,169.2,6.706259e-05,0.00000,0.0,1,0.181489,35,...,1,13,154,101,121,1,0,13,154,101
1,42949673176,0,98,225.9,,0.00000,0.0,1,0.228191,35,...,1,13,154,101,30,0,1,98,154,16
2,42949673176,97,98,221.2,,0.00000,0.0,1,0.569724,32,...,1,98,154,5,30,0,1,98,154,16
3,42949673176,9,98,0.0,4.284449e-02,0.00000,0.0,0,0.477254,29,...,1,98,7,101,30,0,1,98,154,16
4,42949673176,33,98,148.3,1.011391e+00,0.00000,0.0,1,0.637653,32,...,0,98,154,16,30,0,1,98,154,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
870616,266287972376,249,250,286.8,7.138040e-03,0.00000,0.0,1,0.022189,35,...,0,14,89,93,35,1,0,1,1,93
870617,266287972376,274,269,205.7,1.794510e+00,1.94591,1.0,1,1.094208,19,...,1,127,89,1,26,0,1,127,89,1
870618,266287972376,277,269,156.2,8.951802e-01,0.00000,1.0,1,1.076704,20,...,1,127,89,1,26,0,1,127,89,1
870619,266287972376,242,243,303.5,,0.00000,1.0,1,0.008248,35,...,1,127,89,1,34,1,0,127,10,1


### Готовим данные для модели

In [74]:
# Rows with a known `x1` form the training data; rows where `x1` is missing
# are the ones the model has to predict.
target_is_known = datset_df["x1"].notna()
train_df = datset_df[target_is_known]
test_df = datset_df[~target_is_known]
test_df.shape, train_df.shape

((863413, 21), (3452262, 21))

In [75]:
# Column the model learns to predict.
target_column = "x1"

# Columns passed to the model as features.
data_columns = [
    'u', 'v', 't', 'x2', 'x3', 'is_t_exist', 'friendship',
    'age_x', 'sex_1_x', 'sex_2_x',
    'city_freq_x', 'university_freq_x', 'school_freq_x',
    'age_y', 'sex_1_y', 'sex_2_y',
    'city_freq_y', 'university_freq_y', 'school_freq_y',
]

# Feature matrix and target vector for the labelled rows.
x_train_df = train_df[data_columns]
y_train_df = train_df[target_column]
# NOTE: despite its name, `y_test` holds the *features* of the unlabelled rows
# (the input for prediction), not target values.
y_test = test_df[data_columns]

x_train_df.shape, y_train_df.shape

((3452262, 19), (3452262,))

In [76]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the shuffled split repeatable.
split_parts = train_test_split(
    x_train_df,
    y_train_df,
    test_size=0.2,
    random_state=53,
    shuffle=True,
)
x_train, x_val, y_train, y_val = split_parts
tuple(part.shape for part in split_parts)

((2761809, 19), (690453, 19), (2761809,), (690453,))

### Обучаем модель

In [77]:
from catboost import CatBoostRegressor, Pool, cv

# Regressor created with no explicit hyperparameters.
cat = CatBoostRegressor()

# Fit on the training split and evaluate on the validation split;
# `verbose=False` and `plot=True` are passed through to CatBoost unchanged.
cat.fit(
    x_train,
    y_train,
    eval_set=(x_val, y_val),
    verbose=False,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x20d271f8508>

In [78]:
# Minimal cross-validation: a small, shallow model evaluated on two folds.
params = {
    "iterations": 100,
    "depth": 2,
    "loss_function": "RMSE",
    "verbose": False,
}
cv_dataset = Pool(data=x_train, label=y_train)
# `plot` is a boolean flag: pass a real bool rather than the string "True",
# which only behaved like True because any non-empty string is truthy.
scores = cv(
    cv_dataset,
    params,
    fold_count=2,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/2]

bestTest = 0.9223846089
bestIteration = 99

Training on fold [1/2]

bestTest = 0.9230336149
bestIteration = 99



In [79]:
# Predict the target for the rows whose `x1` is missing.
# NOTE: despite its name, `y_test` holds the test feature matrix.
# `assign` builds a new frame with the extra column instead of writing a
# column into the filtered slice of `datset_df`, so the cell does not rely on
# the chained-assignment warning being silenced and can be re-run safely.
test_df = test_df.assign(predict=cat.predict(y_test))
test_df

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,is_t_exist,friendship,age_x,...,city_freq_x,university_freq_x,school_freq_x,age_y,sex_1_y,sex_2_y,city_freq_y,university_freq_y,school_freq_y,predict
1,42949673176,0,98,225.9,,0.0,0.0,1,0.228191,35,...,13,154,101,30,0,1,98,154,16,0.169366
2,42949673176,97,98,221.2,,0.0,0.0,1,0.569724,32,...,98,154,5,30,0,1,98,154,16,0.374552
5,42949673176,121,98,233.5,,0.0,0.0,1,0.538926,25,...,9,1,16,30,0,1,98,154,16,0.310310
21,42949673176,35,298,299.6,,0.0,0.0,1,0.316065,30,...,98,154,16,32,0,1,98,154,101,0.215561
22,42949673176,94,298,253.9,,0.0,0.0,1,0.165649,28,...,98,154,4,32,0,1,98,154,101,0.162873
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
870595,266287972376,183,214,392.9,,0.0,0.0,1,0.618966,35,...,22,89,93,38,0,1,14,19,3,0.437776
870598,266287972376,85,94,12.8,,0.0,0.0,1,0.202401,103,...,127,10,1,41,0,1,127,10,3,0.231767
870608,266287972376,220,219,260.3,,0.0,0.0,1,0.059710,36,...,20,89,93,34,1,0,22,3,1,0.068618
870611,266287972376,144,220,54.4,,0.0,0.0,1,0.027634,35,...,20,89,2,36,0,1,20,89,93,0.050701


### Обновляем значения в submission.csv

In [80]:
# Load the current submission file from disk.
submission_path = PATH + submission_filename
submission = pd.read_csv(submission_path)
submission.shape

(810976, 4)

In [81]:
# Attach model predictions to the submission rows, matching on
# (ego_id, u, v). The left join keeps every submission row; rows without a
# matching prediction get NaN in `predict`.
merge_keys = ["ego_id", "u", "v"]
predictions = test_df[merge_keys + ["predict"]]
result_submission = submission.merge(predictions, on=merge_keys, how='left')
result_submission.shape

(810976, 5)

In [82]:
# Overwrite the submission's `x1` with the model prediction wherever one
# exists; rows without a prediction keep their current `x1`.
# Vectorized equivalent of a row-wise `apply`: take `predict` and fall back to
# `x1` where `predict` is NaN.
result_submission["x1"] = result_submission["predict"].fillna(result_submission["x1"])
# Keep only the columns of the submission format.
result_submission = result_submission[["ego_id", "u", "v", "x1"]]

In [83]:
# Save the final submission under PATH, without writing the row index.
result_path = PATH + "result_submission.csv"
result_submission.to_csv(result_path, index=False)