In [7]:
from itertools import product
import time
start_time = time.time()
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
import numpy as np
import gc
from tqdm import tqdm

from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import (LinearRegression, SGDRegressor)
import lightgbm as lgb
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.linear_model import (LinearRegression, SGDRegressor)

from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings('ignore')

Validation = False
reduce_size = False

seed = 0

# Data path
data_path = '../readonly/final_project_data/'
submission_path = './'

In [8]:
import types
def imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            yield val
list(imports())

[<module 'builtins' (built-in)>,
 <module 'time' (built-in)>,
 <module 'numpy' from '/home/zed/miniconda3/envs/courseproj/lib/python3.5/site-packages/numpy/__init__.py'>,
 <module 'pandas' from '/home/zed/miniconda3/envs/courseproj/lib/python3.5/site-packages/pandas/__init__.py'>,
 <module 'builtins' (built-in)>,
 <module 'lightgbm' from '/home/zed/miniconda3/envs/courseproj/lib/python3.5/site-packages/lightgbm/__init__.py'>,
 <module 'pip' from '/home/zed/miniconda3/envs/courseproj/lib/python3.5/site-packages/pip/__init__.py'>,
 <module 'gc' (built-in)>,
 <module 'sklearn.preprocessing' from '/home/zed/miniconda3/envs/courseproj/lib/python3.5/site-packages/sklearn/preprocessing/__init__.py'>,
 <module 'types' from '/home/zed/miniconda3/envs/courseproj/lib/python3.5/types.py'>]

In [9]:
def downcast_dtypes(df):
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype in ["int64", "int32"]]
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int16)
    return df

In [10]:
print('%0.2f min: Start loading data'%((time.time() - start_time)/60))

sale_train = pd.read_csv('%s/sales_train.csv.gz' % data_path)
test  = pd.read_csv('%s/test.csv.gz' % data_path)

0.02 min: Start loading data


## Strange Outliers

In [11]:
# Look at line 2909818, very low item_price and high item_cnt_day
sale_train[sale_train['item_id'] == 11373].sort_values(['item_price'])

# Look at line 885138, very high item_price
sale_train[sale_train['item_id'] == 11365].sort_values(['item_price'])

# Correct sale_train values

# Replace with median
sale_train['item_price'][2909818] = np.nan

sale_train['item_cnt_day'][2909818] = np.nan

sale_train['item_price'][2909818] = sale_train[(sale_train['shop_id'] ==12) & (sale_train['item_id'] == 11373) & (sale_train['date_block_num'] == 33)]['item_price'].median()

sale_train['item_cnt_day'][2909818] = round(sale_train[(sale_train['shop_id'] ==12) & (sale_train['item_id'] == 11373) & (sale_train['date_block_num'] == 33)]['item_cnt_day'].median())

sale_train['item_price'][885138] = np.nan

sale_train['item_price'][885138] = sale_train[(sale_train['item_id'] == 11365) & (sale_train['shop_id'] ==12) & (sale_train['date_block_num'] == 8)]['item_price'].median()


In [12]:
# Consider only shops in test set
test_nrow = test.shape[0]

sale_train = sale_train.merge(test[['shop_id']].drop_duplicates(), how = 'inner')
sale_train['date'] = pd.to_datetime(sale_train['date'], format = '%d.%m.%Y')

print('%0.2f min: Finish loading data'%((time.time() - start_time)/60))

0.12 min: Finish loading data


## Data Aggregation

In [13]:
# For every month we create a grid from all shops/items combinations from that month

grid = []
for block_num in sale_train['date_block_num'].unique():
    cur_shops = sale_train[sale_train['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sale_train[sale_train['date_block_num']==block_num]['item_id'].unique()
    # All combinations
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

#Turn the grid into pandas dataframe
index_cols = ['shop_id', 'item_id', 'date_block_num']
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

print('%0.2f min: Finish creating the grid'%((time.time() - start_time)/60))

0.17 min: Finish creating the grid


In [14]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# Clip the label between 0 and 20
sale_train['item_cnt_day'] = sale_train['item_cnt_day'].clip(0,20)
# Group the number of items sold by month rather than by day
gb_cnt = sale_train.groupby(index_cols)['item_cnt_day'].agg(['sum']).reset_index().rename(columns = {'sum': 'item_cnt_month'})

gb_cnt['item_cnt_month'] = gb_cnt['item_cnt_month'].clip(0,20).astype(np.uint8)

#Join aggregated data to the grid
train = pd.merge(grid,gb_cnt,how='left',on=index_cols).fillna(0)
train['item_cnt_month'] = train['item_cnt_month'].astype(np.uint8)
train = downcast_dtypes(train)

#Sort the data
train.sort_values(['date_block_num','shop_id','item_id'],inplace=True)
print('%0.2f min: Finish joining gb_cnt'%((time.time() - start_time)/60))

# Sanity check
print(sale_train['item_cnt_day'].sum())
print(train['item_cnt_month'].sum())
print(gb_cnt['item_cnt_month'].sum())

0.24 min: Finish joining gb_cnt
2934456.0
2671279
2671279


In [15]:
# Merge items with train

item = pd.read_csv('%s/items.csv' % data_path)

train = train.merge(item[['item_id', 'item_category_id']], on = ['item_id'], how = 'left')
test = test.merge(item[['item_id', 'item_category_id']], on = ['item_id'], how = 'left')

print('%0.2f min: Finish adding item_category_id'%((time.time() - start_time)/60))

0.26 min: Finish adding item_category_id


In [16]:
# Translate item categories from Russian
item_cat = pd.read_csv('%s/item_categories.csv' % data_path)

l_cat = list(item_cat.item_category_name)
for ind in range(0,1):
    l_cat[ind] = 'PC Headsets / Headphones'

for ind in range(1,8):
    l_cat[ind] = 'Access'
    
l_cat[8] = 'Tickets (figure)'
l_cat[9] = 'Delivery of goods'

for ind in range(10,18):
    l_cat[ind] = 'Consoles'

for ind in range(18,25):
    l_cat[ind] = 'Consoles Games'

l_cat[25] = 'Accessories for games'

for ind in range(26,28):
    l_cat[ind] = 'phone games'

for ind in range(28,32):
    l_cat[ind] = 'CD games'

for ind in range(32,37):
    l_cat[ind] = 'Card'

for ind in range(37,43):
    l_cat[ind] = 'Movie'

for ind in range(43,55):
    l_cat[ind] = 'Books'

for ind in range(55,61):
    l_cat[ind] = 'Music'

for ind in range(61,73):
    l_cat[ind] = 'Gifts'

for ind in range(73,79):
    l_cat[ind] = 'Soft'

for ind in range(79,81):
    l_cat[ind] = 'Office'

for ind in range(81,83):
    l_cat[ind] = 'Clean'

l_cat[83] = 'Elements of a food'

In [17]:
# Merge item category encoding with train
lb = preprocessing.LabelEncoder()
item_cat['item_cat_id_fix'] = lb.fit_transform(l_cat)

train = train.merge(item_cat[['item_cat_id_fix', 'item_category_id']], on = ['item_category_id'], how = 'left')
test = test.merge(item_cat[['item_cat_id_fix', 'item_category_id']], on = ['item_category_id'], how = 'left')

del item, item_cat, grid, gb_cnt

gc.collect()

print('%0.2f min: Finish adding item_cat_id_fix'%((time.time() - start_time)/60))

0.27 min: Finish adding item_cat_id_fix


## Mean Encodings

In [18]:
# For Trainset
print('%0.2f min: Start adding mean-encoding for item_cnt_month'%((time.time() - start_time)/60))

target = 'item_cnt_month'
global_mean =  train[target].mean()
y_tr = train[target].values

mean_encoded_col = ['shop_id', 'item_id', 'item_category_id', 'item_cat_id_fix']

for col in tqdm(mean_encoded_col):
    col_tr = train[[col] + [target]]
    corrcoefs = pd.DataFrame(columns = ['Cor'])

    # Mean encodings - KFold scheme
    kf = KFold(n_splits = 5, shuffle = False, random_state = seed)
    col_tr[col + '_cnt_month_mean_Kfold'] = global_mean

    for tr_ind, val_ind in kf.split(col_tr):
        # Identify train and test rows based on indexes
        X_tr, X_val = col_tr.iloc[tr_ind], col_tr.iloc[val_ind]
        # Calculate mean and save it
        means = X_val[col].map(X_tr.groupby(col)[target].mean())
        X_val[col + '_cnt_month_mean_Kfold'] = means
        col_tr.iloc[val_ind] = X_val
        
    col_tr.fillna(global_mean, inplace = True)

    corrcoefs.loc[col + '_cnt_month_mean_Kfold'] = np.corrcoef(y_tr, col_tr[col + '_cnt_month_mean_Kfold'])[0][1]
    
    # Mean encodings - Leave-one-out scheme

    item_id_target_sum = col_tr.groupby(col)[target].sum()
    item_id_target_count = col_tr.groupby(col)[target].count()

    col_tr[col + '_cnt_month_sum'] = col_tr[col].map(item_id_target_sum)
    col_tr[col + '_cnt_month_count'] = col_tr[col].map(item_id_target_count)
    col_tr[col + '_target_mean_LOO'] = (col_tr[col + '_cnt_month_sum'] - col_tr[target]) / (col_tr[col + '_cnt_month_count'] - 1)

    col_tr.fillna(global_mean, inplace = True)
    corrcoefs.loc[col + '_target_mean_LOO'] = np.corrcoef(y_tr, col_tr[col + '_target_mean_LOO'])[0][1]
    
    # Mean encodings - Smoothing

    item_id_target_mean = col_tr.groupby(col)[target].mean()
    item_id_target_count = col_tr.groupby(col)[target].count()
    
    col_tr[col + '_cnt_month_mean'] = col_tr[col].map(item_id_target_mean)
    col_tr[col + '_cnt_month_count'] = col_tr[col].map(item_id_target_count)

    alpha = 100
    col_tr[col + '_cnt_month_mean_Smooth'] = (col_tr[col + '_cnt_month_mean'] *  col_tr[col + '_cnt_month_count'] + global_mean * alpha) / (alpha + col_tr[col + '_cnt_month_count'])
    col_tr[col + '_cnt_month_mean_Smooth'].fillna(global_mean, inplace=True)
    corrcoefs.loc[col + '_cnt_month_mean_Smooth'] = np.corrcoef(y_tr, col_tr[col + '_cnt_month_mean_Smooth'])[0][1]
    
    # Mean encodings - Expanding mean scheme

    cumsum = col_tr.groupby(col)[target].cumsum() - col_tr[target]
    sumcnt = col_tr.groupby(col).cumcount()

    col_tr[col + '_cnt_month_mean_Expanding'] = cumsum / sumcnt
    col_tr[col + '_cnt_month_mean_Expanding'].fillna(global_mean, inplace=True)
    corrcoefs.loc[col + '_cnt_month_mean_Expanding'] = np.corrcoef(y_tr, col_tr[col + '_cnt_month_mean_Expanding'])[0][1]

    # Add the best encoding out of the 4 to the train set
    train = pd.concat([train, col_tr[corrcoefs['Cor'].idxmax()]], axis = 1)

    print(corrcoefs.sort_values('Cor'))
    print('%0.2f min: Finish encoding %s'%((time.time() - start_time)/60, col))

print('%0.2f min: Finish adding mean-encoding'%((time.time() - start_time)/60))    

  0%|          | 0/4 [00:00<?, ?it/s]

0.27 min: Start adding mean-encoding for item_cnt_month


 25%|██▌       | 1/4 [00:07<00:23,  7.98s/it]

                                       Cor
shop_id_cnt_month_mean_Kfold      0.173370
shop_id_target_mean_LOO           0.175547
shop_id_cnt_month_mean_Smooth     0.175572
shop_id_cnt_month_mean_Expanding  0.175746
0.40 min: Finish encoding shop_id


 50%|█████     | 2/4 [00:17<00:16,  8.34s/it]

                                       Cor
item_id_cnt_month_mean_Kfold      0.315862
item_id_cnt_month_mean_Smooth     0.479840
item_id_target_mean_LOO           0.481937
item_id_cnt_month_mean_Expanding  0.565646
0.55 min: Finish encoding item_id


 75%|███████▌  | 3/4 [00:25<00:08,  8.39s/it]

                                                Cor
item_category_id_cnt_month_mean_Kfold      0.274072
item_category_id_cnt_month_mean_Smooth     0.292732
item_category_id_target_mean_LOO           0.292778
item_category_id_cnt_month_mean_Expanding  0.296104
0.70 min: Finish encoding item_category_id


100%|██████████| 4/4 [00:34<00:00,  8.61s/it]

                                               Cor
item_cat_id_fix_cnt_month_mean_Kfold      0.157323
item_cat_id_fix_target_mean_LOO           0.171593
item_cat_id_fix_cnt_month_mean_Smooth     0.171639
item_cat_id_fix_cnt_month_mean_Expanding  0.176845
0.84 min: Finish encoding item_cat_id_fix
0.84 min: Finish adding mean-encoding





## Feature Engineering

In [19]:
# Combine trainset and testset 
print('%0.2f min: Start combining data'%((time.time() - start_time)/60))

# If I do not use a val set, I combine month 34 with the train set
if Validation == False:
    test['date_block_num'] = 34
    all_data = pd.concat([train, test], axis = 0)
    all_data = all_data.drop(columns = ['ID'])

else:
    all_data = train

del train, test, col_tr

gc.collect()

all_data = downcast_dtypes(all_data)

0.84 min: Start combining data


### Lag based Features

In [20]:
# Creating item/shop pair lags lag-based features 

print('%0.2f min: Start adding lag-based feature'%((time.time() - start_time)/60))

index_cols = ['shop_id', 'item_id', 'item_category_id', 'item_cat_id_fix', 'date_block_num']
cols_to_rename = list(all_data.columns.difference(index_cols))
print('Features that will be shifted:')
print(cols_to_rename)
# Lag range
shift_range = [1, 2, 3, 4, 6, 12]

# this loop just adds the future months
for month_shift in tqdm(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    # Rename the shifted feature
    train_shift = train_shift.rename(columns=foo)
    # the shift works because date_block num is considered as index; the block nums outside the range are not considered given the left join
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

gc.collect()

# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12] 
# Take all the lag columns (the ones which end with a lag number)
lag_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]]

all_data = downcast_dtypes(all_data)

print('%0.2f min: Finish generating lag features'%((time.time() - start_time)/60))

  0%|          | 0/6 [00:00<?, ?it/s]

0.87 min: Start adding lag-based feature
Features that will be shifted:
['item_cat_id_fix_cnt_month_mean_Expanding', 'item_category_id_cnt_month_mean_Expanding', 'item_cnt_month', 'item_id_cnt_month_mean_Expanding', 'shop_id_cnt_month_mean_Expanding']


100%|██████████| 6/6 [00:27<00:00,  4.59s/it]


1.34 min: Finish generating lag features


In [21]:
# Creating date features 

print('%0.2f min: Start getting date features'%((time.time() - start_time)/60))

dates_train = sale_train[['date', 'date_block_num']].drop_duplicates()

dates_test = dates_train[dates_train['date_block_num'] == 34-12]


dates_test['date_block_num'] = 34
dates_test['date'] = dates_test['date'] + pd.DateOffset(years=1)

dates_all = pd.concat([dates_train, dates_test])

dates_all['dow'] = dates_all['date'].dt.dayofweek
dates_all['year'] = dates_all['date'].dt.year
dates_all['month'] = dates_all['date'].dt.month
dates_all = pd.get_dummies(dates_all, columns=['dow'])

dow_col = ['dow_' + str(x) for x in range(7)]

date_features = dates_all.groupby(['year', 'month', 'date_block_num'])[dow_col].agg('sum').reset_index()
date_features['days_of_month'] = date_features[dow_col].sum(axis=1)
date_features['year'] = date_features['year'] - 2013

date_features = date_features[['month', 'year', 'days_of_month', 'date_block_num']]
all_data = all_data.merge(date_features, on = 'date_block_num', how = 'left')
date_columns = date_features.columns.difference(set(index_cols))

print('%0.2f min: Finish getting date features'%((time.time() - start_time)/60))

1.34 min: Start getting date features
1.36 min: Finish getting date features


## Scale Feature Columns

In [22]:
train = all_data[all_data['date_block_num']!= all_data['date_block_num'].max()]
test = all_data[all_data['date_block_num']== all_data['date_block_num'].max()]

sc = StandardScaler()

to_drop_cols = ['date_block_num']
feature_columns = list(set(lag_cols + index_cols + list(date_columns)).difference(to_drop_cols))

# Scale the test set based on the Scaler fitted for the train
train[feature_columns] = sc.fit_transform(train[feature_columns])
test[feature_columns] = sc.transform(test[feature_columns])

all_data_scaled = pd.concat([train, test], axis = 0)
all_data_scaled = downcast_dtypes(all_data)

del train, test, date_features, sale_train

gc.collect()

print('%0.2f min: Finish scaling features'%((time.time() - start_time)/60))

1.92 min: Finish scaling features


## First- Level Model

In [23]:
# Save date_block_num, as it can't use them as features, but it will be needed to split the dataset into parts

dates = all_data['date_block_num']
last_block = dates.max()

print('%0.2f min: Start training First level models'%((time.time() - start_time)/60))

start_first_level_total = time.perf_counter()

scoringMethod = 'r2'

1.92 min: Start training First level models


In [24]:
# Train meta-features M = 15 (12 + 15 = 27)
num_first_level_models = 5

months_to_generate_meta_features = range(27,last_block +1)

mask = dates.isin(months_to_generate_meta_features)

target = 'item_cnt_month'

y_all_level2 = all_data[target][mask].values
X_all_level2 = np.zeros([y_all_level2.shape[0], num_first_level_models])

# Now fill X_train_level2 with metafeatures
slice_start = 0

for cur_block_num in tqdm(months_to_generate_meta_features):

    print('-' * 50)
    print('Start training for month%d'% cur_block_num)

    start_cur_month = time.perf_counter()
    # train: all data until current block (excluded), test: current block
    cur_X_train_scaled = all_data_scaled.loc[dates <  cur_block_num][feature_columns]
    cur_X_test_scaled =  all_data_scaled.loc[dates == cur_block_num][feature_columns]

    cur_y_train_scaled = all_data_scaled.loc[dates <  cur_block_num, target].values
    cur_y_test_scaled =  all_data_scaled.loc[dates == cur_block_num, target].values
    
    cur_X_train = all_data.loc[dates <  cur_block_num][feature_columns]
    cur_X_test =  all_data.loc[dates == cur_block_num][feature_columns]

    cur_y_train = all_data.loc[dates <  cur_block_num, target].values
    cur_y_test =  all_data.loc[dates == cur_block_num, target].values

    # Create Numpy arrays of train, test and target dataframes to feed into models

    train_x_scaled = cur_X_train_scaled.values
    train_y_scaled = cur_y_train_scaled.ravel()
    test_x_scaled = cur_X_test_scaled.values
    test_y_scaled = cur_y_test_scaled.ravel()
    
    train_x = cur_X_train.values
    train_y = cur_y_train.ravel()
    test_x = cur_X_test.values
    test_y = cur_y_test.ravel()

    preds = []

    sgdr= SGDRegressor(
        penalty = 'l2' ,
        random_state = seed )

    lgb_params = {
                  'feature_fraction': 0.75,
                  'metric': 'rmse',
                  'nthread':1,
                  'min_data_in_leaf': 2**7,
                  'bagging_fraction': 0.75,
                  'learning_rate': 0.03,
                  'objective': 'mse',
                  'bagging_seed': 2**7,
                  'num_leaves': 2**7,
                  'bagging_freq':1,
                  'verbose':0
                  }
    print('Training Model %d: %s'%(len(preds), 'sgdr'))

    start = time.perf_counter()

    sgdr.fit(train_x_scaled, train_y_scaled)
    pred_test = sgdr.predict(test_x_scaled)
    preds.append(pred_test)

    run = time.perf_counter() - start
    print('{} runs for {:.2f} seconds.'.format(sgdr.__class__.__name__, run))
    print()

    print('Training Model %d: %s'%(len(preds), 'lightgbm'))

    start = time.perf_counter()

    estimator = lgb.train(lgb_params, lgb.Dataset(train_x_scaled, label=train_y_scaled), 300)
    pred_test = estimator.predict(test_x_scaled)
    preds.append(pred_test)

    run = time.perf_counter() - start

    print('{} runs for {:.2f} seconds.'.format('lightgbm', run))
    print()

    print('Training Model %d: %s'%(len(preds), 'keras'))
    
    start = time.perf_counter()

    def baseline_model():
        # create model
        model = Sequential()
        model.add(Dense(20, input_dim=train_x.shape[1], kernel_initializer='uniform', activation='softplus'))
        model.add(Dense(1, kernel_initializer='uniform', activation = 'relu'))

        # Compile model
        # Nadam = Adam RMSprop with Nesterov momentum.
        model.compile(loss='mse', optimizer='Nadam', metrics=['mse'])

        return model

    estimator = KerasRegressor(build_fn=baseline_model, verbose=1, epochs=5, batch_size = 55000)
    estimator.fit(train_x_scaled, train_y_scaled)
    pred_test = estimator.predict(test_x_scaled)
    preds.append(pred_test)

    run = time.perf_counter() - start

    print('{} runs for {:.2f} seconds.'.format('keras', run))

    cur_month_run_total = time.perf_counter() - start_cur_month

    print('Total running time was {:.2f} minutes.'.format(cur_month_run_total/60))
    
    print('Training Model %d: %s'%(len(preds), 'catboost'))
    
    start = time.perf_counter()    
    #cat_features = [0, 1, 3, 6, 8, 40, 41, 42]
    catboost_model = CatBoostRegressor(
    iterations=2000,
    max_ctr_complexity=4,
    random_seed=0,
    od_type='Iter',
    od_wait=25,
    verbose=50,
    depth=4
    )
    
    catboost_model.fit(
    train_x_scaled, train_y_scaled,
    )
    
    pred_test = catboost_model.predict(test_x_scaled)
    preds.append(pred_test)

    run = time.perf_counter() - start

    print('{} runs for {:.2f} seconds.'.format('catboost', run))
    
    print('Training Model %d: %s'%(len(preds), 'random forest'))
    
    start = time.perf_counter()
    
    rf_model = RandomForestRegressor(n_estimators=50, max_depth=7, random_state=0, n_jobs=-1)
    rf_model.fit(train_x_scaled, train_y_scaled)
    
    pred_test = rf_model.predict(test_x_scaled)
    preds.append(pred_test)

    run = time.perf_counter() - start

    print('{} runs for {:.2f} seconds.'.format('random forest', run))

    cur_month_run_total = time.perf_counter() - start_cur_month

    print('Total running time was {:.2f} minutes.'.format(cur_month_run_total/60))

    print('-' * 50)

    slice_end = slice_start + cur_X_test.shape[0]
    X_all_level2[ slice_start : slice_end , :] = np.c_[preds].transpose()
    slice_start = slice_end

  0%|          | 0/8 [00:00<?, ?it/s]

--------------------------------------------------
Start training for month27
Training Model 0: sgdr
SGDRegressor runs for 4.78 seconds.

Training Model 1: lightgbm
lightgbm runs for 123.80 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
keras runs for 13.69 seconds.
Total running time was 2.39 minutes.
Training Model 3: catboost
0:	learn: 1.2636855	total: 245ms	remaining: 8m 10s
50:	learn: 0.9845347	total: 9.27s	remaining: 5m 54s
100:	learn: 0.9467891	total: 18.9s	remaining: 5m 54s
150:	learn: 0.9350999	total: 28.6s	remaining: 5m 50s
200:	learn: 0.9289819	total: 38.6s	remaining: 5m 45s
250:	learn: 0.9248226	total: 48.7s	remaining: 5m 39s
300:	learn: 0.9193514	total: 59s	remaining: 5m 33s
350:	learn: 0.9152618	total: 1m 9s	remaining: 5m 26s
400:	learn: 0.9125111	total: 1m 19s	remaining: 5m 18s
450:	learn: 0.9090242	total: 1m 30s	remaining: 5m 9s
500:	learn: 0.9064601	total: 1m 40s	remaining: 5m
550:	learn: 0.9043502	total: 1m 50s	remaining: 4m 50s
60

 12%|█▎        | 1/8 [15:05<1:45:35, 905.02s/it]

random forest runs for 346.99 seconds.
Total running time was 15.08 minutes.
--------------------------------------------------
--------------------------------------------------
Start training for month28
Training Model 0: sgdr
SGDRegressor runs for 5.11 seconds.

Training Model 1: lightgbm
lightgbm runs for 131.56 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
keras runs for 11.79 seconds.
Total running time was 2.50 minutes.
Training Model 3: catboost
0:	learn: 1.2574700	total: 196ms	remaining: 6m 31s
50:	learn: 0.9834394	total: 9.73s	remaining: 6m 11s
100:	learn: 0.9464049	total: 19.7s	remaining: 6m 11s
150:	learn: 0.9353168	total: 30.1s	remaining: 6m 8s
200:	learn: 0.9290600	total: 41s	remaining: 6m 6s
250:	learn: 0.9238613	total: 52.1s	remaining: 6m 3s
300:	learn: 0.9193515	total: 1m 3s	remaining: 5m 58s
350:	learn: 0.9153783	total: 1m 14s	remaining: 5m 52s
400:	learn: 0.9119612	total: 1m 25s	remaining: 5m 42s
450:	learn: 0.9095185	total: 1m 3

 25%|██▌       | 2/8 [31:58<1:33:45, 937.53s/it]

random forest runs for 414.38 seconds.
Total running time was 16.89 minutes.
--------------------------------------------------
--------------------------------------------------
Start training for month29
Training Model 0: sgdr
SGDRegressor runs for 5.46 seconds.

Training Model 1: lightgbm
lightgbm runs for 145.55 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
keras runs for 12.49 seconds.
Total running time was 2.75 minutes.
Training Model 3: catboost
0:	learn: 1.2501455	total: 256ms	remaining: 8m 31s
50:	learn: 0.9801081	total: 14.4s	remaining: 9m 9s
100:	learn: 0.9435125	total: 28.4s	remaining: 8m 54s
150:	learn: 0.9323570	total: 42.5s	remaining: 8m 40s
200:	learn: 0.9263091	total: 56.6s	remaining: 8m 26s
250:	learn: 0.9211305	total: 1m 10s	remaining: 8m 13s
300:	learn: 0.9170376	total: 1m 24s	remaining: 7m 57s
350:	learn: 0.9120739	total: 1m 38s	remaining: 7m 44s
400:	learn: 0.9086450	total: 1m 53s	remaining: 7m 32s
450:	learn: 0.9060953	total

 38%|███▊      | 3/8 [52:16<1:25:07, 1021.59s/it]

random forest runs for 437.42 seconds.
Total running time was 20.30 minutes.
--------------------------------------------------
--------------------------------------------------
Start training for month30
Training Model 0: sgdr
SGDRegressor runs for 6.83 seconds.

Training Model 1: lightgbm
lightgbm runs for 163.01 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
keras runs for 14.11 seconds.
Total running time was 3.09 minutes.
Training Model 3: catboost
0:	learn: 1.2423586	total: 269ms	remaining: 8m 57s
50:	learn: 0.9741294	total: 13s	remaining: 8m 16s
100:	learn: 0.9383155	total: 26.1s	remaining: 8m 11s
150:	learn: 0.9273342	total: 39.8s	remaining: 8m 6s
200:	learn: 0.9213312	total: 54.2s	remaining: 8m 5s
250:	learn: 0.9169896	total: 1m 8s	remaining: 7m 56s
300:	learn: 0.9115606	total: 1m 22s	remaining: 7m 46s
350:	learn: 0.9073029	total: 1m 37s	remaining: 7m 37s
400:	learn: 0.9034487	total: 1m 52s	remaining: 7m 26s
450:	learn: 0.9005806	total: 2m

 50%|█████     | 4/8 [1:12:43<1:12:13, 1083.30s/it]

random forest runs for 464.71 seconds.
Total running time was 20.45 minutes.
--------------------------------------------------
--------------------------------------------------
Start training for month31
Training Model 0: sgdr
SGDRegressor runs for 7.18 seconds.

Training Model 1: lightgbm
lightgbm runs for 156.34 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
keras runs for 13.57 seconds.
Total running time was 2.98 minutes.
Training Model 3: catboost
0:	learn: 1.2326826	total: 255ms	remaining: 8m 29s
50:	learn: 0.9654811	total: 12.9s	remaining: 8m 13s
100:	learn: 0.9298075	total: 25.9s	remaining: 8m 6s
150:	learn: 0.9185771	total: 39.1s	remaining: 7m 58s
200:	learn: 0.9130789	total: 52.6s	remaining: 7m 50s
250:	learn: 0.9089173	total: 1m 6s	remaining: 7m 42s
300:	learn: 0.9046505	total: 1m 20s	remaining: 7m 32s
350:	learn: 0.8997569	total: 1m 34s	remaining: 7m 22s
400:	learn: 0.8967851	total: 1m 47s	remaining: 7m 8s
450:	learn: 0.8944587	total: 

 62%|██████▎   | 5/8 [1:32:43<55:55, 1118.46s/it]  

random forest runs for 460.95 seconds.
Total running time was 20.01 minutes.
--------------------------------------------------
--------------------------------------------------
Start training for month32
Training Model 0: sgdr
SGDRegressor runs for 6.96 seconds.

Training Model 1: lightgbm
lightgbm runs for 165.39 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
keras runs for 14.21 seconds.
Total running time was 3.14 minutes.
Training Model 3: catboost
0:	learn: 1.2272734	total: 280ms	remaining: 9m 19s
50:	learn: 0.9614218	total: 13.1s	remaining: 8m 18s
100:	learn: 0.9259372	total: 26.6s	remaining: 8m 19s
150:	learn: 0.9148786	total: 40.4s	remaining: 8m 15s
200:	learn: 0.9092729	total: 54.8s	remaining: 8m 10s
250:	learn: 0.9050834	total: 1m 10s	remaining: 8m 12s
300:	learn: 0.9015660	total: 1m 26s	remaining: 8m 6s
350:	learn: 0.8974736	total: 1m 41s	remaining: 7m 58s
400:	learn: 0.8936530	total: 1m 56s	remaining: 7m 44s
450:	learn: 0.8906903	total

 75%|███████▌  | 6/8 [1:53:39<38:39, 1159.54s/it]

random forest runs for 473.12 seconds.
Total running time was 20.92 minutes.
--------------------------------------------------
--------------------------------------------------
Start training for month33
Training Model 0: sgdr
SGDRegressor runs for 6.81 seconds.

Training Model 1: lightgbm
lightgbm runs for 173.46 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
keras runs for 15.54 seconds.
Total running time was 3.29 minutes.
Training Model 3: catboost
0:	learn: 1.2261800	total: 266ms	remaining: 8m 52s
50:	learn: 0.9626543	total: 14.2s	remaining: 9m 2s
100:	learn: 0.9277334	total: 29.1s	remaining: 9m 7s
150:	learn: 0.9171047	total: 45.2s	remaining: 9m 13s
200:	learn: 0.9113385	total: 1m 1s	remaining: 9m 8s
250:	learn: 0.9062300	total: 1m 17s	remaining: 8m 57s
300:	learn: 0.9022438	total: 1m 32s	remaining: 8m 43s
350:	learn: 0.8979668	total: 1m 47s	remaining: 8m 27s
400:	learn: 0.8947017	total: 2m 3s	remaining: 8m 11s
450:	learn: 0.8916455	total: 2

 88%|████████▊ | 7/8 [2:15:36<20:06, 1206.98s/it]

random forest runs for 509.83 seconds.
Total running time was 21.96 minutes.
--------------------------------------------------
--------------------------------------------------
Start training for month34
Training Model 0: sgdr
SGDRegressor runs for 7.14 seconds.

Training Model 1: lightgbm
lightgbm runs for 176.09 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
keras runs for 16.11 seconds.
Total running time was 3.35 minutes.
Training Model 3: catboost
0:	learn: 1.2211179	total: 277ms	remaining: 9m 13s
50:	learn: 0.9604053	total: 13.6s	remaining: 8m 39s
100:	learn: 0.9260022	total: 27.6s	remaining: 8m 38s
150:	learn: 0.9152031	total: 42s	remaining: 8m 34s
200:	learn: 0.9096075	total: 56.7s	remaining: 8m 27s
250:	learn: 0.9043921	total: 1m 11s	remaining: 8m 21s
300:	learn: 0.8995935	total: 1m 27s	remaining: 8m 14s
350:	learn: 0.8960139	total: 1m 43s	remaining: 8m 6s
400:	learn: 0.8929815	total: 1m 58s	remaining: 7m 53s
450:	learn: 0.8899923	total: 

100%|██████████| 8/8 [2:37:33<00:00, 1181.73s/it]

random forest runs for 507.86 seconds.
Total running time was 21.95 minutes.
--------------------------------------------------





## Train and Test Split

In [25]:
# Split train and test
test_nrow = len(preds[0])

X_train_level2 = X_all_level2[ : -test_nrow, :]
X_test_level2 = X_all_level2[ -test_nrow: , :]
y_train_level2 = y_all_level2[ : -test_nrow]
y_test_level2 = y_all_level2[ -test_nrow : ]

print('%0.2f min: Finish training First level models'%((time.perf_counter() - start_first_level_total)/60))

157.57 min: Finish training First level models


## Ensembling

In [26]:
pred_list = {}

# Second level learning model via linear regression

print('Training Second level learning model via linear regression')

lr = LinearRegression()
lr.fit(X_train_level2, y_train_level2)

# Compute R-squared on the train and test sets.

test_preds_lr_stacking = lr.predict(X_test_level2)
train_preds_lr_stacking = lr.predict(X_train_level2)

print('Train R-squared for %s is %f' %('train_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, train_preds_lr_stacking))))

pred_list['test_preds_lr_stacking'] = test_preds_lr_stacking
if Validation:
    print('Test R-squared for %s is %f' %('test_preds_lr_stacking', sqrt(mean_squared_error(y_test_level2, test_preds_lr_stacking))))

Training Second level learning model via linear regression
Train R-squared for train_preds_lr_stacking is 1.086452


In [27]:
# Second level learning model via SGDRegressor

print('Training Second level learning model via SGDRegressor')

sgdr= SGDRegressor(
    penalty = 'l2' ,
    random_state = seed )

sgdr.fit(X_train_level2, y_train_level2)

test_preds_sgdr_stacking = sgdr.predict(X_test_level2)
train_preds_sgdr_stacking = sgdr.predict(X_train_level2)

print('Train R-squared for %s is %f' %('train_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, train_preds_sgdr_stacking))))
pred_list['test_preds_sgdr_stacking'] = test_preds_sgdr_stacking

if Validation:
    print('Test R-squared for %s is %f' %('test_preds_sgdr_stacking', sqrt(mean_squared_error(y_test_level2, test_preds_sgdr_stacking))))

print('%0.2f min: Finish training second level model'%((time.time() - start_time)/60))

Training Second level learning model via SGDRegressor
Train R-squared for train_preds_lr_stacking is 48607830241593840283272034282433702526976.000000
159.51 min: Finish training second level model


## Submission

In [28]:
if not Validation:
    submission = pd.read_csv('%s/sample_submission.csv' % data_path)
    ver = 6
    for pred_ver in ['lr_stacking', 'sgdr_stacking']:
        print(pred_list['test_preds_' + pred_ver].clip(0,20).mean())
        submission['item_cnt_month'] = pred_list['test_preds_' + pred_ver].clip(0,20)
        submission[['ID', 'item_cnt_month']].to_csv('%s/ver%d_%s.csv' % (submission_path, ver, pred_ver), index = False)

print('%0.2f min: Finish running scripts'%((time.time() - start_time)/60))

0.2748479954636271
0.015032679738562092
159.52 min: Finish running scripts
