# Import libs

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from catboost import Pool, CatBoostRegressor
from scipy.signal import savgol_filter
from sklearn.model_selection import train_test_split

import os, sys
sys.path.append("../..")

import warnings
warnings.filterwarnings("ignore")

# Import Data

#### define paths

In [2]:
# Location of the clusters dataset, relative to this notebook's directory.
path_clusters_data = "../../data/clusters.parquet"

#### load datasets

In [3]:
# Read the clusters dataset from the parquet file configured above.
df_clusters = pd.read_parquet(path=path_clusters_data)

In [4]:
# Discard the 'int_index' column; it is not used anywhere below.
df_clusters = df_clusters.drop(columns=['int_index'])

In [5]:
# Lag feature: real_weight three rows (months, going by the column name)
# earlier within the same HASH series.
# The frame stacks many HASH series on top of each other, so a plain
# .shift(3) lets the first rows of one series inherit the last values of the
# series stored above it; shifting per HASH leaves those rows as NaN instead.
# .to_numpy() assigns by position because the date index repeats per HASH.
df_clusters['real_weight_3_month_ago'] = (
    df_clusters.groupby('HASH', sort=False)['real_weight'].shift(3).to_numpy()
)

In [6]:
# Lag feature: real_weight two rows (months, going by the column name)
# earlier within the same HASH series.
# Shifting per HASH (rather than over the whole stacked frame) keeps the
# first rows of one series from inheriting values of the series stored above
# it. .to_numpy() assigns by position because the date index repeats per HASH.
df_clusters['real_weight_2_month_ago'] = (
    df_clusters.groupby('HASH', sort=False)['real_weight'].shift(2).to_numpy()
)

In [7]:
# Lag feature: real_weight one row (month, going by the column name)
# earlier within the same HASH series.
# Shifting per HASH (rather than over the whole stacked frame) keeps the
# first row of one series from inheriting the last value of the series stored
# above it. .to_numpy() assigns by position because the date index repeats
# per HASH.
df_clusters['real_weight_1_month_ago'] = (
    df_clusters.groupby('HASH', sort=False)['real_weight'].shift(1).to_numpy()
)

In [8]:
# Show the frame with the three lag columns added (pandas truncates the
# display to the first and last rows).
df_clusters

Unnamed: 0,HASH,real_weight,real_wagon_count,price_urals_usd,rub-usd,rub-uan,density,k_density_before,k_density_after,cluster,real_weight_3_month_ago,real_weight_2_month_ago,real_weight_1_month_ago
2020-03-01,0,17490.0,249.0,29.51,73,10,70.240964,-0.038446,-0.047423,1,,,
2020-04-01,0,16559.0,237.0,16.61,74,10,69.869198,-0.038446,-0.047423,1,,,17490.0
2020-05-01,0,5443.0,78.0,30.65,72,10,69.782051,-0.038446,-0.047423,1,,17490.0,16559.0
2020-06-01,0,10975.0,157.0,42.36,69,9,69.904459,-0.038446,-0.047423,1,17490.0,16559.0,5443.0
2020-07-01,0,14957.0,214.0,44.28,71,10,69.892523,-0.038446,-0.047423,1,16559.0,5443.0,10975.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-01,75088,0.0,0.0,67.98,60,8,0.000000,0.000000,2.181818,1,0.0,0.0,0.0
2022-12-01,75088,0.0,0.0,52.23,65,9,0.000000,0.000000,2.181818,1,0.0,0.0,0.0
2023-01-01,75088,0.0,0.0,52.21,68,10,0.000000,0.000000,2.181818,1,0.0,0.0,0.0
2023-02-01,75088,0.0,0.0,51.91,72,10,0.000000,0.000000,2.181818,1,0.0,0.0,0.0


In [9]:
# Median real_weight per HASH over the rows dated 2023. The year is kept as
# a second, unnamed group key, so the result carries a two-level index.
rows_2023 = df_clusters[df_clusters.index.year == 2023]
test = rows_2023.groupby(['HASH', rows_2023.index.year])[['real_weight']].median()

In [10]:
# Display the 2023 medians as a flat table: discard the (constant) year level
# of the index and turn HASH back into an ordinary column.
test.droplevel(1).reset_index()

Unnamed: 0,HASH,real_weight
0,0,18324.0
1,1,636.0
2,2,1327.0
3,3,867.0
4,4,334.0
...,...,...
75079,75084,0.0
75080,75085,0.0
75081,75086,0.0
75082,75087,0.0


In [11]:
# NOTE(review): this is chained indexing and has no effect on df_clusters.
# The boolean filter `df_clusters[df_clusters.HASH == 0]` yields a separate
# object, so the assignment lands on that temporary; none of the frames
# displayed further down contain a 'mean_val' column. The resulting
# SettingWithCopyWarning is hidden by warnings.filterwarnings("ignore") in
# the import cell.
# 18324 matches the 2023 median real_weight shown for HASH 0 in the previous
# cell's output. If the column is really wanted, write it through
# `df_clusters.loc[df_clusters.HASH == 0, 'mean_val']` -- but be aware it
# would then flow into X as a feature built from 2023 data.
df_clusters[df_clusters.HASH == 0]['mean_val'] = 18324

In [12]:
# Placeholder column (all None); it is overwritten with the computed
# trailing means a few cells below.
df_clusters['mean_last_10_month_weight'] = None

In [13]:
# Placeholder column (all None); it is overwritten with the computed
# trailing means a few cells below.
df_clusters['mean_last_10_month_wagon'] = None

In [14]:
# Trailing mean of real_weight: sum of the current row and up to nine
# preceding rows of the same HASH, always divided by 10 (a short history
# counts as zeros, which is what the previous zero-filled buffer did).
#
# Changes versus the row-by-row loop this replaces:
#   * no `series[int]` lookups, which on a date index rely on pandas'
#     deprecated positional fallback;
#   * the window restarts for every HASH, so one series no longer averages in
#     the tail of the series stored above it;
#   * vectorised, so it scales to the full frame.
# Missing values inside a window are skipped by rolling().sum() rather than
# propagated.
#
# NOTE(review): the window includes the current row, so this feature contains
# the same month's real_weight. Shift by one row per HASH before rolling if
# only past information should be used.
window = 10
weight_by_pos = df_clusters['real_weight'].reset_index(drop=True)
hash_keys = df_clusters['HASH'].to_numpy()
window_sum = (
    weight_by_pos.groupby(hash_keys, sort=False)
    .rolling(window, min_periods=1)
    .sum()
    .droplevel(0)   # drop the HASH level, leaving the original row positions
    .sort_index()   # restore the frame's row order
)
mem = (window_sum / window).tolist()

In [15]:
# Store the trailing real_weight means computed in the previous cell.
# `mem` is reused for the wagon means further down, so this cell must run
# directly after the real_weight computation.
df_clusters['mean_last_10_month_weight'] = mem

In [16]:
# Trailing mean of real_wagon_count: sum of the current row and up to nine
# preceding rows of the same HASH, always divided by 10 (a short history
# counts as zeros, which is what the previous zero-filled buffer did).
#
# Changes versus the row-by-row loop this replaces:
#   * no `series[int]` lookups, which on a date index rely on pandas'
#     deprecated positional fallback;
#   * the window restarts for every HASH, so one series no longer averages in
#     the tail of the series stored above it;
#   * vectorised, so it scales to the full frame.
# Missing values inside a window are skipped by rolling().sum() rather than
# propagated.
#
# NOTE(review): the window includes the current row, i.e. the current
# real_wagon_count, which the "Train test split" section uses as the target
# `y`. As written this feature leaks the target; shift by one row per HASH
# before rolling if only past information should be used.
window = 10
wagon_by_pos = df_clusters['real_wagon_count'].reset_index(drop=True)
hash_keys = df_clusters['HASH'].to_numpy()
window_sum = (
    wagon_by_pos.groupby(hash_keys, sort=False)
    .rolling(window, min_periods=1)
    .sum()
    .droplevel(0)   # drop the HASH level, leaving the original row positions
    .sort_index()   # restore the frame's row order
)
mem = (window_sum / window).tolist()

In [17]:
# Store the trailing real_wagon_count means computed in the previous cell.
# `mem` was also used for the real_weight means, so this cell must run
# directly after the real_wagon_count computation.
df_clusters['mean_last_10_month_wagon'] = mem

In [18]:
# Drop every row dated 2020 -- presumably because the earliest rows of each
# series have incomplete lag / trailing-window features (TODO confirm).
is_2020 = df_clusters.index.year == 2020
df_clusters = df_clusters[~is_2020]

In [19]:
# Pairwise Pearson correlation between all columns of the filtered frame.
df_clusters.corr(method='pearson')

Unnamed: 0,HASH,real_weight,real_wagon_count,price_urals_usd,rub-usd,rub-uan,density,k_density_before,k_density_after,cluster,real_weight_3_month_ago,real_weight_2_month_ago,real_weight_1_month_ago,mean_last_10_month_weight,mean_last_10_month_wagon
HASH,1.0,-0.061916,-0.063812,-1.069441e-13,7.492039e-13,-1.126186e-13,-0.261499,0.04296545,0.02284606,0.03738008,-0.062969,-0.062534,-0.062284,-0.065239,-0.067148
real_weight,-0.06191644,1.0,0.998855,-3.243156e-06,0.000453035,0.0004462677,0.114739,-0.001212237,-0.003853949,0.0001575912,0.94088,0.953042,0.971436,0.964777,0.963733
real_wagon_count,-0.06381219,0.998855,1.0,-0.0001733794,0.0004577889,0.0004270685,0.11518,-0.002619296,-0.002590879,0.0002441738,0.940244,0.952249,0.970444,0.964234,0.964671
price_urals_usd,-1.069441e-13,-3e-06,-0.000173,1.0,0.08916184,0.1735996,0.006564,1.462176e-15,-3.065509e-15,3.020811e-15,0.000402,0.000595,0.000295,0.000464,0.000416
rub-usd,7.492039e-13,0.000453,0.000458,0.08916184,1.0,0.9802044,0.012746,1.750856e-14,7.412751e-15,1.584605e-14,0.000478,-0.00018,-0.000472,0.000392,0.000417
rub-uan,-1.126186e-13,0.000446,0.000427,0.1735996,0.9802044,1.0,0.014086,-8.564703e-16,-6.531053e-15,4.992715e-16,0.000649,2.1e-05,-0.000214,0.000507,0.000529
density,-0.2614993,0.114739,0.11518,0.006563892,0.01274609,0.01408601,1.0,0.06287438,-0.06829303,0.02036936,0.106748,0.108007,0.110365,0.109117,0.109983
k_density_before,0.04296545,-0.001212,-0.002619,1.462176e-15,1.750856e-14,-8.564703e-16,0.062874,1.0,-0.05838578,0.7411117,-0.003119,-0.00245,-0.001831,-0.004227,-0.005543
k_density_after,0.02284606,-0.003854,-0.002591,-3.065509e-15,7.412751e-15,-6.531053e-15,-0.068293,-0.05838578,1.0,-0.1797615,-0.005666,-0.00509,-0.004516,-0.006002,-0.004364
cluster,0.03738008,0.000158,0.000244,3.020811e-15,1.584605e-14,4.992715e-16,0.020369,0.7411117,-0.1797615,1.0,-0.000945,-0.000559,-0.000192,-0.001686,-0.001846


In [20]:
# Peek at the first five rows after the 2020 rows were removed.
df_clusters.head(n=5)

Unnamed: 0,HASH,real_weight,real_wagon_count,price_urals_usd,rub-usd,rub-uan,density,k_density_before,k_density_after,cluster,real_weight_3_month_ago,real_weight_2_month_ago,real_weight_1_month_ago,mean_last_10_month_weight,mean_last_10_month_wagon
2021-01-01,0,11189.0,155.0,54.89,74,11,72.187097,-0.038446,-0.047423,1,16098.0,17213.0,13516.0,12785.7,182.4
2021-02-01,0,8968.0,127.0,61.47,74,11,70.614173,-0.038446,-0.047423,1,17213.0,13516.0,11189.0,12026.6,171.4
2021-03-01,0,11867.0,171.0,64.29,74,11,69.397661,-0.038446,-0.047423,1,13516.0,11189.0,8968.0,12669.0,180.7
2021-04-01,0,14138.0,203.0,63.02,76,11,69.64532,-0.038446,-0.047423,1,11189.0,8968.0,11867.0,12985.3,185.3
2021-05-01,0,10179.0,146.0,67.26,74,11,69.719178,-0.038446,-0.047423,1,8968.0,11867.0,14138.0,12507.5,178.5


In [21]:
# Feature matrix: everything except the series id and the two same-month
# measurements, with the date index replaced by a plain 0..n-1 index.
feature_frame = df_clusters.drop(columns=['HASH', 'real_weight', 'real_wagon_count'])
X = feature_frame.reset_index(drop=True)

# Apply t-SNE over dataset

In [22]:
# from sklearn.manifold import TSNE

In [23]:
# train_X = X.sample(250000)

In [24]:
# model_tsne = TSNE(n_components=3, learning_rate='auto',
#                  init='pca', perplexity=3).fit_transform(train_X)

# Train test split

In [25]:
# Regression target: the wagon count of each row, as a NumPy array.
y = df_clusters['real_wagon_count'].to_numpy()

In [26]:
# Feature matrix: everything except the series id and the two same-month
# measurements, with the date index replaced by a plain 0..n-1 index.
# NOTE(review): 'mean_last_10_month_wagon' and 'mean_last_10_month_weight'
# stay in X, and both are averaged over a window that includes the current
# row -- i.e. the target (real_wagon_count) and the real_weight column that
# the correlation table shows is ~0.999 correlated with it. Reported errors
# are therefore optimistic.
feature_frame = df_clusters.drop(columns=['HASH', 'real_weight', 'real_wagon_count'])
X = feature_frame.reset_index(drop=True)

In [27]:
# Random 67 / 33 split with a fixed seed for repeatability.
# NOTE(review): rows are shuffled regardless of date or HASH, so months of
# the same series end up on both sides of the split; a cut-off by date would
# give a more honest estimate for forecasting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    shuffle=True,
)

In [38]:
# Wrap both splits in CatBoost Pools; 'cluster' is declared as categorical.
categorical_columns = ['cluster']
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_columns)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_columns)

# Specify model params

In [56]:
# CatBoost hyper-parameters: 500 shallow trees (depth 4), a small learning
# rate, optimising mean absolute error.
params = dict(
    iterations=500,
    depth=4,
    learning_rate=0.02,
    loss_function='MAE',
)

In [57]:
# Build the (still untrained) regressor from the parameter dict above.
model = CatBoostRegressor(**params)

In [None]:
# Train on the training pool while tracking the metric on test_pool, keep
# the best iteration, and draw the interactive learning-curve widget.
# NOTE(review): test_pool is used for best-iteration selection, so the
# "test" error reported here is not an untouched hold-out estimate.
model.fit(
    train_pool,
    eval_set=test_pool,
    use_best_model=True,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 12.2309518	test: 12.8438448	best: 12.8438448 (0)	total: 674ms	remaining: 5m 36s
1:	learn: 12.2169447	test: 12.8298158	best: 12.8298158 (1)	total: 1.34s	remaining: 5m 34s
2:	learn: 12.1934675	test: 12.8063206	best: 12.8063206 (2)	total: 2.06s	remaining: 5m 41s
3:	learn: 12.1312951	test: 12.7437149	best: 12.7437149 (3)	total: 2.8s	remaining: 5m 46s
4:	learn: 12.0999786	test: 12.7122254	best: 12.7122254 (4)	total: 3.48s	remaining: 5m 44s
5:	learn: 12.0775805	test: 12.6897736	best: 12.6897736 (5)	total: 4.09s	remaining: 5m 36s
6:	learn: 12.0650237	test: 12.6771872	best: 12.6771872 (6)	total: 4.62s	remaining: 5m 25s
7:	learn: 12.0529199	test: 12.6650571	best: 12.6650571 (7)	total: 5.27s	remaining: 5m 23s
8:	learn: 12.0198154	test: 12.6319084	best: 12.6319084 (8)	total: 5.84s	remaining: 5m 18s
9:	learn: 11.9954159	test: 12.6074185	best: 12.6074185 (9)	total: 6.34s	remaining: 5m 10s
10:	learn: 11.9814988	test: 12.5934821	best: 12.5934821 (10)	total: 6.92s	remaining: 5m 7s
11:	learn:

90:	learn: 9.4175217	test: 10.0067375	best: 10.0067375 (90)	total: 48.2s	remaining: 3m 36s
91:	learn: 9.3699353	test: 9.9586414	best: 9.9586414 (91)	total: 48.7s	remaining: 3m 36s
92:	learn: 9.3654956	test: 9.9541248	best: 9.9541248 (92)	total: 49.2s	remaining: 3m 35s
93:	learn: 9.3096748	test: 9.8982280	best: 9.8982280 (93)	total: 49.9s	remaining: 3m 35s
94:	learn: 9.3050184	test: 9.8934685	best: 9.8934685 (94)	total: 50.4s	remaining: 3m 34s
95:	learn: 9.2986293	test: 9.8869555	best: 9.8869555 (95)	total: 50.9s	remaining: 3m 34s
96:	learn: 9.2940801	test: 9.8823827	best: 9.8823827 (96)	total: 51.4s	remaining: 3m 33s
97:	learn: 9.2923219	test: 9.8806031	best: 9.8806031 (97)	total: 51.9s	remaining: 3m 33s
98:	learn: 9.2903339	test: 9.8786010	best: 9.8786010 (98)	total: 52.5s	remaining: 3m 32s
99:	learn: 9.2368232	test: 9.8249253	best: 9.8249253 (99)	total: 53s	remaining: 3m 31s
100:	learn: 9.2328568	test: 9.8209079	best: 9.8209079 (100)	total: 53.5s	remaining: 3m 31s
101:	learn: 9.22877