In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from joblib import dump

In [20]:
sns.set_theme(style="whitegrid")

In [21]:
X_train = np.load('data/X_train.npy')
y_train = np.load('data/y_train.npy')
X_val = np.load('data/X_val.npy')
y_val = np.load('data/y_val.npy')
X_test = np.load('data/X_test.npy')

In [22]:
# Drop only column number 1
X_train = np.delete(X_train, 1, axis=1)
X_val = np.delete(X_val, 1, axis=1)
X_test = np.delete(X_test, 1, axis=1)

In [23]:
def custom_root_mean_squared_log_error(y_true, y_pred):
    return np.sqrt(np.sum(np.square(np.log1p(1 + y_pred) - np.log1p(1 + y_true))) / len(y_true))

In [24]:
rf_model = RandomForestRegressor(n_estimators=300, max_depth=10, min_samples_leaf=4, min_samples_split=5)

In [25]:
xgboost_model = XGBRegressor(
    n_estimators=300,
    max_depth=None,
    learning_rate=0.01
)
base_model = XGBRegressor(
    n_estimators=300,
    max_depth=None,
    learning_rate=0.01
)

In [26]:
catboost_model = CatBoostRegressor(
    n_estimators=300,
    max_depth=None,
    learning_rate=0.01,
)

In [27]:
lightboost_model = LGBMRegressor(
    n_estimators=300,
    max_depth=None,
    learning_rate=0.01
)

In [28]:
stacking_model = StackingRegressor(
    estimators=[
        ("XGBoost", xgboost_model),
        ("LightGBM", lightboost_model),
        ("CatBoost", catboost_model),
        ("Random_forest", rf_model)
    ],
    final_estimator=base_model
)

In [29]:
stacking_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1220
[LightGBM] [Info] Number of data points in the train set: 72492, number of used features: 9
[LightGBM] [Info] Start training from score 9.697111
0:	learn: 3.1528368	total: 4.84ms	remaining: 1.45s
1:	learn: 3.1366480	total: 8.69ms	remaining: 1.29s
2:	learn: 3.1207907	total: 12.4ms	remaining: 1.23s
3:	learn: 3.1051154	total: 16.1ms	remaining: 1.19s
4:	learn: 3.0897177	total: 19.8ms	remaining: 1.17s
5:	learn: 3.0741365	total: 23.5ms	remaining: 1.15s
6:	learn: 3.0594529	total: 28ms	remaining: 1.17s
7:	learn: 3.0445483	total: 32.5ms	remaining: 1.19s
8:	learn: 3.0297558	total: 36.4ms	remaining: 1.18s
9:	learn: 3.0153476	total: 40.7ms	remaining: 1.18s
10:	learn: 3.0011475	total: 44.7ms	remaining: 1.17s
11:	learn: 2.9871800	total: 48.7ms	remaining: 1.17s
12:	learn: 2.9739739	total: 52.4ms	remaining: 1

In [30]:
y_pred = stacking_model.predict(X_val)
y_pred = np.array(y_pred, dtype=np.int32)
custom_root_mean_squared_log_error(y_val, y_pred)

0.14559541587624023

In [31]:
mean_squared_log_error(y_val, y_pred)

0.025475679053777

In [32]:
y_pred_test = stacking_model.predict(X_test)
y_pred_test = np.array(y_pred_test, dtype=np.int32)

In [33]:
test_pd = pd.read_csv('data/test.csv')

In [34]:
result = pd.concat([test_pd['id'], pd.DataFrame(y_pred_test, columns=["Rings"])], axis=1)

In [35]:
result.to_csv('data/stacking_version/result_with_stacking.csv', index=False)

In [36]:
dump(stacking_model, 'models/stacking_model.joblib')

['models/stacking_model.joblib']