This notebook loads X_train, y_train and X_test. X_train and y_train are used for cross validating the model performance and X_test is used for creating predictions to be submitted.

In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
import joblib

In [2]:
%%time
X_train = pd.read_pickle("../data/interim/X_train.pkl")

Wall time: 4.01 s


In [3]:
%%time
y_train = pd.read_pickle("../data/interim/y_train.pkl")

Wall time: 2.44 s


In [4]:
TRAIN_START = 0
TRAIN_END = 365
VAL_START = 366
VAL_END = 365 + 28

In [5]:
X_tr = X_train[(X_train.index.get_level_values(1) >= TRAIN_START) & (X_train.index.get_level_values(1) <= TRAIN_END)]
X_tr.shape

(11128850, 17)

In [6]:
y_tr = y_train[(y_train.index.get_level_values(1) >= TRAIN_START) & (y_train.index.get_level_values(1) <= TRAIN_END)]
y_tr.shape

(11128850,)

In [7]:
X_val = X_train[(X_train.index.get_level_values(1) >= VAL_START) & (X_train.index.get_level_values(1) <= VAL_END)]
X_val.shape

(853720, 17)

In [8]:
y_val = y_train[(y_train.index.get_level_values(1) >= VAL_START) & (y_train.index.get_level_values(1) <= VAL_END)]
y_val.shape

(853720,)

In [9]:
CATEGORICAL_COLS = X_tr.select_dtypes(exclude=['category']).columns.tolist()

In [10]:
params = {"num_leaves" : 2**5 - 1,
          "max_bin" : 100,
          "subsample" : 0.5,
          "colsample" : 0.5,}

In [16]:
model = LGBMRegressor(n_estimators = 1000,
                      learning_rate = 0.1,
                      objective =  "tweedie",
                      tweedie_variance_power = 1.1,
                      n_jobs = -1,
                      silent = -1,
                      verbose = -1)
model.set_params(**params)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample=0.5,
              colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
              max_bin=100, max_depth=-1, min_child_samples=20,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=1000,
              n_jobs=-1, num_leaves=31, objective='tweedie', random_state=None,
              reg_alpha=0.0, reg_lambda=0.0, silent=-1, subsample=0.5,
              subsample_for_bin=200000, subsample_freq=0,
              tweedie_variance_power=1.1, verbose=-1)

In [17]:
%%time
model.fit(X_tr, y_tr, 
          eval_set = [(X_tr, y_tr), (X_val, y_val)], 
          categorical_feature = CATEGORICAL_COLS,
          eval_metric = 'l2',
          verbose = 10,
          early_stopping_rounds = 100)

New categorical_feature is ['month', 'sell_price', 'snap_CA', 'snap_TX', 'snap_WI', 'wday', 'weekday', 'year']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds
[10]	training's l2: 11.575	training's tweedie: 9.00302	valid_1's l2: 14.3771	valid_1's tweedie: 10.4788
[20]	training's l2: 11.0179	training's tweedie: 8.83699	valid_1's l2: 13.707	valid_1's tweedie: 10.3166
[30]	training's l2: 10.7991	training's tweedie: 8.77598	valid_1's l2: 13.4548	valid_1's tweedie: 10.2616
[40]	training's l2: 10.6919	training's tweedie: 8.75141	valid_1's l2: 13.3297	valid_1's tweedie: 10.2398
[50]	training's l2: 10.6349	training's tweedie: 8.73857	valid_1's l2: 13.2657	valid_1's tweedie: 10.2287
[60]	training's l2: 10.5674	training's tweedie: 8.73024	valid_1's l2: 13.2137	valid_1's tweedie: 10.2226
[70]	training's l2: 10.5135	training's tweedie: 8.72397	valid_1's l2: 13.1737	valid_1's tweedie: 10.2176
[80]	training's l2: 10.4578	training's tweedie: 8.71838	valid_1's l2: 13.1345	valid_1's tweedie: 10.2136
[90]	training's l2: 10.411	training's tweedie: 8.71381	valid_1's l2: 13.1126	valid_1's tweedie: 10.2

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample=0.5,
              colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
              max_bin=100, max_depth=-1, min_child_samples=20,
              min_child_weight=0.001, min_split_gain=0.0, n_estimators=1000,
              n_jobs=-1, num_leaves=31, objective='tweedie', random_state=None,
              reg_alpha=0.0, reg_lambda=0.0, silent=-1, subsample=0.5,
              subsample_for_bin=200000, subsample_freq=0,
              tweedie_variance_power=1.1, verbose=-1)

In [None]:
#joblib.dump(model, "../models/model.pkl")