In [1]:
%load_ext autoreload
%autoreload 2
import copy
import gc
import sys

sys.path.append('..')

import numpy as np
import pandas as pd
import scipy.stats as sps

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV, 
                                     cross_val_score)
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

from hyperopt import hp, tpe, STATUS_OK, Trials
from hyperopt.fmin import fmin

from src.utils.cross_validation import TimeSeriesGroupSplit
from src.utils.downcasting import downcast_dtypes

sns.set(font_scale=1.2)
%matplotlib inline

In [2]:
random_state = 42

# Stacking

In this notebook we will produce predictions by stacking of LightGBM, XGBoost, Ridge.

## Preparing datasets

In [3]:
# for column date_block_num load original train
train_original = pd.read_feather('../data/processed/train.ftr')

train_lgb = pd.read_csv('../models/oof/lgb/train.csv')
test_lgb = pd.read_csv('../models/oof/lgb/test.csv')

train_ridge = pd.read_csv('../models/oof/ridge/train.csv')
test_ridge = pd.read_csv('../models/oof/ridge/test.csv')

In [4]:
train_ridge.drop(columns=['target'], inplace=True)

In [5]:
X_train = pd.concat((train_lgb, train_ridge), axis=1)
X_test = pd.concat((test_lgb, test_ridge), axis=1)
X_train.head()

Unnamed: 0,lgb,lgb_clipped,target,ridge,ridge_clipped
0,0.699932,0.699932,3.0,0.718979,0.718979
1,0.421344,0.421344,1.0,0.373143,0.373143
2,0.754161,0.754161,1.0,0.661532,0.661532
3,0.057733,0.057733,1.0,0.449149,0.449149
4,0.460093,0.460093,2.0,0.68201,0.68201


In [6]:
train_size = train_lgb.shape[0]

X_train['date_block_num'] = train_original.iloc[-train_size:].date_block_num.values
X_test['date_block_num'] = X_train.date_block_num.max() + 1
del train_original
gc.collect();

### Removing target

In [7]:
y = X_train.target
X_train.drop(columns=['target'], inplace=True)

### Creation of validation split

In [8]:
y_valid = y[X_train.date_block_num == 33]
X_valid = X_train[X_train.date_block_num == 33]

y_train = y[X_train.date_block_num < 33]
X_train = X_train[X_train.date_block_num < 33]

## Hyperparameters tuning

In [9]:
ts = TimeSeriesGroupSplit(n_splits=5)

In [10]:
ridge_params = {
    'random_state': random_state
}

In [11]:
# pipe_ridge = Pipeline(
#     [('standartization', StandardScaler()),
#      ('regression', Ridge(**ridge_params))]
# )

# default_score = cross_val_score(
#     pipe_ridge, 
#     X_train, y_train, groups=X_train.date_block_num,
#     n_jobs=1, 
#     scoring='neg_root_mean_squared_error', 
#     verbose=0,
#     cv=ts
# ).mean()
# print(f'Current score: {-default_score:.5f}')

Score: $0.87776$.

In [12]:
pipe_ridge = Pipeline([('standartization', StandardScaler()), 
                       ('regression', Ridge(**ridge_params))])

param_grid = {
    'regression__alpha': np.logspace(4, 6, 10),
}

gs = GridSearchCV(
    pipe_ridge, 
    param_grid,          
    n_jobs=1, 
    scoring='neg_root_mean_squared_error', 
    verbose=10,
    refit=False,
    cv=ts
)

gs.fit(X_train, y_train, groups=X_train.date_block_num)

gs.best_params_

In [13]:
ridge_params['alpha'] = 129155.0

In [14]:
# pipe_ridge = Pipeline([('standartization', StandardScaler()), 
#                        ('regression', Ridge(**ridge_params))])

# current_score = cross_val_score(
#     pipe_ridge, 
#     X_train, y_train, groups=X_train.date_block_num,
#     n_jobs=1, 
#     scoring='neg_root_mean_squared_error', 
#     verbose=0,
#     cv=ts
# ).mean()
# print(f'Current score: {-current_score:.5f}')

Score: $0.87694$.

In [15]:
ridge_params

{'random_state': 42, 'alpha': 129155.0}

## Validation

In [None]:
model = Pipeline([('standartization', StandardScaler()), 
                  ('regression', Ridge(**ridge_params))])
model.fit(X_train, y_train)

In [None]:
y_predicted = np.clip(model.predict(X_valid), 0, 20)
validation_score = mean_squared_error(y_valid, y_predicted)
print(f'Validation score: {validation_score:.5f}')

Score: $0.84454$

Let's look at predicted values charasteristics.

In [None]:
pd.Series(y_predicted).describe()

Let's look at feature importances.

In [None]:
ridge_coefs = model.named_steps['regression'].coef_
top_size = 10

sorted_indices = np.argsort(ridge_coefs)
sorted_names = X_train.columns[sorted_indices]

In [None]:
plt.figure(figsize=(16, 9))
sns.barplot(y=sorted_names[::-1], 
            x=ridge_coefs[sorted_indices][::-1])
plt.xlabel('Коэффициент перед признаком')
plt.title(f'Значимости признаков для Ridge-регрессии')
plt.savefig('../reports/figures/stacking/importances.png', 
            facecolor='white', bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
plt.figure(figsize=(16, 9))
image = plt.imread('../reports/figures/stacking/importances.png')
plt.imshow(image, interpolation='spline36')
plt.axis('off')
plt.show()

## Submit

In this section we will train result model and submit prediction. Don't forget to clip values according to [evaluation tab](https://www.kaggle.com/c/competitive-data-science-predict-future-sales/overview/evaluation) (but ay be for tree-based methods it is not necessary).

In [16]:
X_train = pd.concat((X_train, X_valid))
y_train = pd.concat((y_train, y_valid))
gc.collect();

In [17]:
X_train.shape

(4704879, 5)

In [22]:
model = Pipeline([('standartization', StandardScaler()), 
                  ('regression', Ridge(**ridge_params))])
bags = 5

bagged_predictions = np.zeros(X_test.shape[0])
for n in tqdm(range(bags)):
    model.set_params(**{'regression__random_state': random_state+n})
    model.fit(X_train, y_train)
    bagged_predictions += np.clip(model.predict(X_test), 0, 20)
    gc.collect()
    
bagged_predictions /= bags

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




Create submission.

In [23]:
pd.Series(bagged_predictions).describe()

count    214200.000000
mean          0.354726
std           0.675709
min           0.000000
25%           0.086238
50%           0.189018
75%           0.390521
max          20.000000
dtype: float64

In [26]:
submission = pd.read_csv('../data/raw/sample_submission.csv')
submission['item_cnt_month'] = bagged_predictions
submission.to_csv('../models/stacking/submission.csv', index=False)

!kaggle competitions submit competitive-data-science-predict-future-sales -f ../models/ridge/submission.csv -m "Stacking of LightGBM, Ridge and "

Result is $0.99535$. It is pretty far from top positions.