In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from joblib import dump

In [3]:
sns.set_theme(style="whitegrid")

In [4]:
X_train = np.load('data/X_train.npy')
y_train = np.load('data/y_train.npy')
X_val = np.load('data/X_val.npy')
y_val = np.load('data/y_val.npy')
X_test = np.load('data/X_test.npy')

In [None]:
X_train = X_train[]

In [5]:
def custom_root_mean_squared_log_error(y_true, y_pred):
    return np.sqrt(np.sum(np.square(np.log1p(1 + y_pred) - np.log1p(1 + y_true))) / len(y_true))

In [6]:
rf_model = RandomForestRegressor(n_estimators=300, max_depth=10, min_samples_leaf=4, min_samples_split=5)

In [7]:
xgboost_model = XGBRegressor(
    n_estimators=300,
    max_depth=None,
    learning_rate=0.1,
    min_samples_leaf=2,
    min_samples_split=5
)
base_model = XGBRegressor(
    n_estimators=300,
    max_depth=None,
    learning_rate=0.1,
    min_samples_leaf=2,
    min_samples_split=5
)

In [9]:
catboost_model = CatBoostRegressor(
    n_estimators=300,
    max_depth=None,
    learning_rate=0.01,
)

In [10]:
lightboost_model = LGBMRegressor(
    n_estimators=300,
    max_depth=None,
    learning_rate=0.1,
    min_samples_leaf=2,
    min_samples_split=5
)

In [11]:
stacking_model = StackingRegressor(
    estimators=[
        ("XGBoost", xgboost_model),
        ("LightGBM", lightboost_model),
        ("CatBoost", catboost_model),
        ("Random_forest", rf_model)
    ],
    final_estimator=base_model
)

In [12]:
stacking_model.fit(X_train, y_train)

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.



0:	learn: 3.1529037	total: 130ms	remaining: 38.9s
1:	learn: 3.1370768	total: 143ms	remaining: 21.3s
2:	learn: 3.1211624	total: 154ms	remaining: 15.2s
3:	learn: 3.1056616	total: 161ms	remaining: 11.9s
4:	learn: 3.0900948	total: 166ms	remaining: 9.81s
5:	learn: 3.0749488	total: 170ms	remaining: 8.33s
6:	learn: 3.0598134	total: 175ms	remaining: 7.31s
7:	learn: 3.0450194	total: 180ms	remaining: 6.56s
8:	learn: 3.0305875	total: 184ms	remaining: 5.96s
9:	learn: 3.0158589	total: 188ms	remaining: 5.46s
10:	learn: 3.0022565	total: 192ms	remaining: 5.04s
11:	learn: 2.9883510	total: 195ms	remaining: 4.68s
12:	learn: 2.9743416	total: 199ms	remaining: 4.39s
13:	learn: 2.9609421	total: 203ms	remaining: 4.14s
14:	learn: 2.9479474	total: 207ms	remaining: 3.94s
15:	learn: 2.9344364	total: 212ms	remaining: 3.76s
16:	learn: 2.9217967	total: 216ms	remaining: 3.59s
17:	learn: 2.9086584	total: 219ms	remaining: 3.43s
18:	learn: 2.8956672	total: 222ms	remaining: 3.29s
19:	learn: 2.8831598	total: 226ms	remaini

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.



0:	learn: 3.1566034	total: 10.8ms	remaining: 3.24s
1:	learn: 3.1408589	total: 15ms	remaining: 2.23s
2:	learn: 3.1257833	total: 18.4ms	remaining: 1.82s
3:	learn: 3.1102505	total: 22.7ms	remaining: 1.68s
4:	learn: 3.0946545	total: 29.4ms	remaining: 1.73s
5:	learn: 3.0797161	total: 34ms	remaining: 1.67s
6:	learn: 3.0649732	total: 37.6ms	remaining: 1.57s
7:	learn: 3.0498048	total: 41ms	remaining: 1.5s
8:	learn: 3.0350916	total: 44.3ms	remaining: 1.43s
9:	learn: 3.0211385	total: 47.4ms	remaining: 1.38s
10:	learn: 3.0076358	total: 50.6ms	remaining: 1.33s
11:	learn: 2.9937733	total: 53.7ms	remaining: 1.29s
12:	learn: 2.9798094	total: 57.8ms	remaining: 1.27s
13:	learn: 2.9665462	total: 61.9ms	remaining: 1.26s
14:	learn: 2.9528509	total: 65.4ms	remaining: 1.24s
15:	learn: 2.9394412	total: 68.4ms	remaining: 1.21s
16:	learn: 2.9263158	total: 71.8ms	remaining: 1.2s
17:	learn: 2.9133818	total: 75ms	remaining: 1.17s
18:	learn: 2.9006721	total: 78.1ms	remaining: 1.15s
19:	learn: 2.8886031	total: 81ms

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.



In [13]:
y_pred = stacking_model.predict(X_val)
y_pred = np.array(y_pred, dtype=np.int32)
custom_root_mean_squared_log_error(y_val, y_pred)

0.14636913400163598

In [14]:
mean_squared_log_error(y_val, y_pred)

0.025790431340044388

In [15]:
y_pred_test = stacking_model.predict(X_test)
y_pred_test = np.array(y_pred_test, dtype=np.int32)

In [16]:
test_pd = pd.read_csv('data/test.csv')

In [25]:
result = pd.concat([test_pd['id'], pd.DataFrame(y_pred_test, columns=["Rings"])], axis=1)

In [27]:
result.to_csv('data/stacking_version/result_with_stacking.csv', index=False)

In [21]:
dump(stacking_model, 'models/stacking_model.joblib')

['models/stacking_model.joblib']