In [1]:
import pandas as  pd
import numpy as np
import warnings
import itertools
import xgboost as xgb
from tqdm import tqdm
from numpy import loadtxt
import time
import gc
warnings.filterwarnings('ignore')
kernel_with_output = True
np.random.seed(10)

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 231)
pd.set_option('display.max_columns', 100)

In [3]:
Validation = False
start_time = time.time()

In [4]:
def downcast_dtypes(df):
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype in ["int64", "int32"]]
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int16)
    return df

In [5]:
if kernel_with_output:
    item =pd.read_csv("../data/items.csv")
    item_cat = pd.read_csv("../data/item_categories.csv")
#     df_shops = pd.read_csv("../data/shops.csv")
    sale_train = pd.read_csv("../data/sales_train.csv.gz")
    test = pd.read_csv("../data/test.csv.gz")
    temp = pd.read_csv("../data/sample_submission.csv.gz")


In [6]:
import pickle as pkl
train = pkl.load(open("../results/06/train.pkl", "rb"))
test = pkl.load(open("../results/06/test.pkl","rb"))

In [7]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8598044 entries, 0 to 8598043
Data columns (total 6 columns):
shop_id             int16
item_id             int16
date_block_num      int16
item_cnt_month      int16
item_category_id    int16
item_cat_id_fix     int16
dtypes: int16(6)
memory usage: 164.0 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 0 to 214199
Data columns (total 5 columns):
ID                  214200 non-null int16
shop_id             214200 non-null int16
item_id             214200 non-null int16
item_category_id    214200 non-null int16
item_cat_id_fix     214200 non-null int16
dtypes: int16(5)
memory usage: 3.7 MB


### 2. Add item/shop pair mean-encodings

##### 2.1 Combine trainset and testset

In [8]:
%time
if Validation == False:
    test['date_block_num'] = 34
    all_data = pd.concat([train, test], axis = 0)
    print(all_data.head())
    all_data = all_data.drop(columns = ['ID'])
else:
    all_data = train

Wall time: 0 ns
   ID  date_block_num  item_cat_id_fix  item_category_id  item_cnt_month  \
0 NaN               0               11                37             1.0   
1 NaN               0               11                40             2.0   
2 NaN               0                0                 5             1.0   
3 NaN               0                0                 5             2.0   
4 NaN               0                0                 2             1.0   

   item_id  shop_id  
0    22154       59  
1    22151       59  
2     5603       59  
3     5587       59  
4     5613       59  


In [9]:
all_data.shape
all_data.isnull().sum() # test has only 5 cols, so when merge, these cols will be na

(8812244, 6)

date_block_num           0
item_cat_id_fix          0
item_category_id         0
item_cnt_month      214200
item_id                  0
shop_id                  0
dtype: int64

In [10]:
all_data = downcast_dtypes(all_data)

In [11]:
all_data.head()

Unnamed: 0,date_block_num,item_cat_id_fix,item_category_id,item_cnt_month,item_id,shop_id
0,0,11,37,1.0,22154,59
1,0,11,40,2.0,22151,59
2,0,0,5,1.0,5603,59
3,0,0,5,2.0,5587,59
4,0,0,2,1.0,5613,59


In [12]:
all_data.loc[all_data.date_block_num==1].head()

Unnamed: 0,date_block_num,item_cat_id_fix,item_category_id,item_cnt_month,item_id,shop_id
247698,1,7,19,1.0,4906,59
247699,1,7,23,1.0,4907,59
247700,1,7,19,1.0,4909,59
247701,1,7,21,1.0,4910,59
247702,1,15,75,2.0,4920,59


##### 2.2 Creating item/shop pair lags lag-based featuers

In [13]:
%time
index_cols = ['shop_id', 'item_id', 'item_category_id', 'item_cat_id_fix', 'date_block_num']
cols_to_rename = list(all_data.columns.difference(index_cols))
print(cols_to_rename)
shift_range = [1, 2, 3, 4, 5, 6, 7, 8,12]

Wall time: 0 ns
['item_cnt_month']


In [14]:
# add int_cnt_month of x month before
for month_shift in tqdm(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)
del train_shift
gc.collect()

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:50<00:00,  6.02s/it]


284

In [15]:
all_data.head(100)
all_data.date_block_num.value_counts()

Unnamed: 0,date_block_num,item_cat_id_fix,item_category_id,item_cnt_month,item_id,shop_id,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_4,item_cnt_month_lag_5,item_cnt_month_lag_6,item_cnt_month_lag_7,item_cnt_month_lag_8,item_cnt_month_lag_12
0,0,11,37,1.0,22154,59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,11,40,2.0,22151,59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,5,1.0,5603,59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,5,2.0,5587,59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,2,1.0,5613,59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0,0,2,2.0,5623,59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0,0,2,1.0,5629,59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0,0,2,4.0,5643,59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,3,30,1.0,5994,59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,7,19,1.0,5992,59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


11    303772
5     297813
6     295445
4     289636
2     283140
7     280016
10    277722
3     276228
9     273874
8     273134
12    267406
23    261129
14    260208
13    256044
18    255922
15    254680
1     254456
16    252640
17    252360
0     247698
19    247025
22    246697
21    241449
26    241285
24    239194
20    236775
25    234000
33    221802
27    221482
30    215496
34    214200
28    212503
29    210494
31    208444
32    208075
Name: date_block_num, dtype: int64

In [16]:
all_data[all_data.date_block_num<12].shape
all_data[all_data.date_block_num<12].shape[0] / all_data.shape[0]
all_data[all_data.date_block_num>=12].shape

(3352934, 15)

0.38048583312037204

(5459310, 15)

In [17]:
%time
all_data = all_data[all_data['date_block_num'] >= 12] # Don't use old data from year 2013, remove 3352934(38%) data
lag_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]]
all_data = downcast_dtypes(all_data)


Wall time: 0 ns


In [18]:
lag_cols

['item_cnt_month_lag_1',
 'item_cnt_month_lag_2',
 'item_cnt_month_lag_3',
 'item_cnt_month_lag_4',
 'item_cnt_month_lag_5',
 'item_cnt_month_lag_6',
 'item_cnt_month_lag_7',
 'item_cnt_month_lag_8',
 'item_cnt_month_lag_12']

In [19]:
all_data.shape

(5459310, 15)

##### 2.3 Creating date features

In [20]:
dates_train = sale_train[['date', 'date_block_num']].drop_duplicates()

In [21]:
dates_train["date"] = pd.to_datetime(dates_train["date"])

In [22]:
# choose 22 because date_block_num is Nov 2014, We will make this data to be Nov 2015
dates_test = dates_train[dates_train['date_block_num'] == 34-12] 

In [23]:
dates_train.head()
dates_train.info()
dates_train.date_block_num.value_counts()
dates_test.head()
dates_test.info()
dates_test.date_block_num.value_counts()

Unnamed: 0,date,date_block_num
0,2013-02-01,0
1,2013-03-01,0
2,2013-05-01,0
3,2013-06-01,0
4,2013-01-15,0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1034 entries, 0 to 2882514
Data columns (total 2 columns):
date              1034 non-null datetime64[ns]
date_block_num    1034 non-null int64
dtypes: datetime64[ns](1), int64(1)
memory usage: 24.2 KB


33    31
21    31
2     31
4     31
6     31
7     31
9     31
11    31
12    31
14    31
18    31
19    31
16    31
0     31
23    31
31    31
30    31
24    31
26    31
28    31
8     30
3     30
5     30
29    30
20    30
27    30
22    30
15    30
32    30
17    30
10    30
25    28
13    28
1     28
Name: date_block_num, dtype: int64

Unnamed: 0,date,date_block_num
2106209,2014-11-19,22
2106210,2014-03-11,22
2106211,2014-01-11,22
2106212,2014-04-11,22
2106213,2014-11-22,22


<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 2106209 to 2106402
Data columns (total 2 columns):
date              30 non-null datetime64[ns]
date_block_num    30 non-null int64
dtypes: datetime64[ns](1), int64(1)
memory usage: 720.0 bytes


22    30
Name: date_block_num, dtype: int64

In [24]:
dates_test['date_block_num'] = 34

In [25]:
dates_test['date'] = dates_test['date'] + pd.DateOffset(years=1) # increase 1 year

In [26]:
dates_test.head()
dates_test.info()
dates_test.date_block_num.value_counts()

Unnamed: 0,date,date_block_num
2106209,2015-11-19,34
2106210,2015-03-11,34
2106211,2015-01-11,34
2106212,2015-04-11,34
2106213,2015-11-22,34


<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 2106209 to 2106402
Data columns (total 2 columns):
date              30 non-null datetime64[ns]
date_block_num    30 non-null int64
dtypes: datetime64[ns](1), int64(1)
memory usage: 720.0 bytes


34    30
Name: date_block_num, dtype: int64

In [27]:
dates_all = pd.concat([dates_train, dates_test])

In [28]:
dates_all['dow'] = dates_all['date'].dt.dayofweek
dates_all['year'] = dates_all['date'].dt.year
dates_all['month'] = dates_all['date'].dt.month

In [29]:
dates_all = pd.get_dummies(dates_all, columns=['dow'])

In [30]:
dow_col = ['dow_' + str(x) for x in range(7)]

dow_col

['dow_0', 'dow_1', 'dow_2', 'dow_3', 'dow_4', 'dow_5', 'dow_6']

In [31]:
date_features = dates_all.groupby(['year', 'month', 'date_block_num'])[dow_col].agg('sum').reset_index()

In [32]:
dates_all.groupby(['year', 'month', 'date_block_num'])[dow_col].agg('sum').reset_index()

Unnamed: 0,year,month,date_block_num,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
0,2013,1,0,3,4,3,3,2,2,3
1,2013,1,1,0,0,1,0,0,0,0
2,2013,1,2,0,0,0,1,0,0,0
3,2013,1,3,0,0,0,0,1,0,0
4,2013,1,4,0,0,0,0,0,1,0
5,2013,1,5,0,0,0,0,0,0,1
6,2013,1,6,1,0,0,0,0,0,0
7,2013,1,7,0,1,0,0,0,0,0
8,2013,1,8,0,0,1,0,0,0,0
9,2013,1,9,0,0,0,1,0,0,0


In [33]:
date_features['days_of_month'] = date_features[dow_col].sum(axis=1)

In [34]:
date_features["days_of_month"].describe()

count    420.000000
mean       2.533333
std        5.097740
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max       20.000000
Name: days_of_month, dtype: float64

In [35]:
date_features['year'] = date_features['year'] - 2013 # We already choose data from 2013, so we do it
# to make year start from 0. Maybe this helps to make the value of "year" smaller

In [36]:
date_features = date_features[['month', 'year', 'days_of_month', 'date_block_num']]

In [37]:
all_data = all_data.merge(date_features, on = 'date_block_num', how = 'left')

In [38]:
all_data.head()

Unnamed: 0,date_block_num,item_cat_id_fix,item_category_id,item_cnt_month,item_id,shop_id,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_4,item_cnt_month_lag_5,item_cnt_month_lag_6,item_cnt_month_lag_7,item_cnt_month_lag_8,item_cnt_month_lag_12,month,year,days_of_month
0,12,11,40,2.0,5325,59,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,20
1,12,11,40,2.0,5325,59,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1,1
2,12,11,40,2.0,5325,59,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,1,1
3,12,11,40,2.0,5325,59,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1,1
4,12,11,40,2.0,5325,59,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,1,1


In [39]:
date_columns = date_features.columns.difference(set(index_cols))

In [40]:
date_columns

Index(['days_of_month', 'month', 'year'], dtype='object')

In [41]:
# %time
# dates_train = sale_train[['date', 'date_block_num']].drop_duplicates()
# dates_train["date"] = pd.to_datetime(dates_train["date"])
# dates_test = dates_train[dates_train['date_block_num'] == 34-12]
# dates_test['date_block_num'] = 34
# dates_test['date'] = dates_test['date'] + pd.DateOffset(years=1) # increase 1 year

In [42]:
# dates_all = pd.concat([dates_train, dates_test])
# dates_all['dow'] = dates_all['date'].dt.dayofweek
# dates_all['year'] = dates_all['date'].dt.year
# dates_all['month'] = dates_all['date'].dt.month
# dates_all = pd.get_dummies(dates_all, columns=['dow'])
# dow_col = ['dow_' + str(x) for x in range(7)]
# date_features = dates_all.groupby(['year', 'month', 'date_block_num'])[dow_col].agg('sum').reset_index()
# date_features['days_of_month'] = date_features[dow_col].sum(axis=1)
# date_features['year'] = date_features['year'] - 2013
# date_features = date_features[['month', 'year', 'days_of_month', 'date_block_num']]
# all_data = all_data.merge(date_features, on = 'date_block_num', how = 'left')
# date_columns = date_features.columns.difference(set(index_cols))
# print('%0.2f min: Finish getting date features'%((time.time() - start_time)/60))

In [43]:
import gc
gc.collect()

222

#### 2.4 Scale feature columns

In [44]:
from sklearn.preprocessing import StandardScaler

In [53]:
all_data = downcast_dtypes(all_data)

In [54]:
train = all_data[all_data["date_block_num"]!= all_data["date_block_num"].max()]
test = all_data[all_data["date_block_num"]== all_data["date_block_num"].max()]

In [55]:
test.head()

Unnamed: 0,date_block_num,item_cat_id_fix,item_category_id,item_cnt_month,item_id,shop_id,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_4,item_cnt_month_lag_5,item_cnt_month_lag_6,item_cnt_month_lag_7,item_cnt_month_lag_8,item_cnt_month_lag_12,month,year,days_of_month
62941320,34,7,19,0.0,5037,5,0.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,1.0,1,2,1
62941321,34,7,19,0.0,5037,5,0.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,1.0,2,2,1
62941322,34,7,19,0.0,5037,5,0.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,1.0,3,2,1
62941323,34,7,19,0.0,5037,5,0.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,1.0,4,2,1
62941324,34,7,19,0.0,5037,5,0.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,1.0,5,2,1


In [56]:
sc = StandardScaler()

In [57]:
to_drop_cols = ["date_block_num"]

In [58]:
index_cols

['shop_id', 'item_id', 'item_category_id', 'item_cat_id_fix', 'date_block_num']

In [59]:
feature_columns = list(set(lag_cols + index_cols + list(date_columns)).difference(to_drop_cols))

In [60]:
feature_columns

['shop_id',
 'item_category_id',
 'item_cnt_month_lag_1',
 'item_cnt_month_lag_4',
 'item_cnt_month_lag_2',
 'month',
 'year',
 'item_cnt_month_lag_12',
 'item_cnt_month_lag_6',
 'item_cnt_month_lag_5',
 'item_cnt_month_lag_7',
 'item_cnt_month_lag_3',
 'item_id',
 'item_cnt_month_lag_8',
 'days_of_month',
 'item_cat_id_fix']

In [61]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62941320 entries, 0 to 62941319
Data columns (total 18 columns):
date_block_num           int16
item_cat_id_fix          int16
item_category_id         int16
item_cnt_month           float32
item_id                  int16
shop_id                  int16
item_cnt_month_lag_1     float32
item_cnt_month_lag_2     float32
item_cnt_month_lag_3     float32
item_cnt_month_lag_4     float32
item_cnt_month_lag_5     float32
item_cnt_month_lag_6     float32
item_cnt_month_lag_7     float32
item_cnt_month_lag_8     float32
item_cnt_month_lag_12    float32
month                    int16
year                     int16
days_of_month            int16
dtypes: float32(10), int16(8)
memory usage: 3.8 GB


In [62]:
train[feature_columns] = sc.fit_transform(train[feature_columns])

MemoryError: 

In [63]:
gc.collect()

20

In [None]:
test[feature_columns] = sc.fit_transform(test[feature_columns])

In [None]:
all_data = pd.concat([train, test], axis = 0)
all_data = downcast_dtypes(all_data)

In [None]:
import pickle as pkl
pkl.dump(all_data, open("../results/08/step2.pkl","wb"))

In [None]:
gc.collect()
print('%0.2f min: Finish scaling features'%((time.time() - start_time)/60))

### 3. First-level model

###### Save `date_block_num`, as we can't use them as features, but will need them to split the dataset into parts

In [85]:
dates = all_data["date_block_num"]
last_block = dates.max() # last_block 34

In [86]:
print("%0.2f min: Start training First level models" %((time.time() - start_time)/60))

401.24 min: Start training First level models


In [87]:
start_first_level_total = time.perf_counter()
scoringMethod = 'r2'
num_first_level_models = 3 

In [88]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [89]:
# Train meta-features M = 15 (12 + 15 = 27)

In [90]:
months_to_generate_meta_features = range(27,last_block +1)
mask = dates.isin(months_to_generate_meta_features)
Target = 'item_cnt_month'
y_all_level2 = all_data[Target][mask].values # choose item_cnt_month where date_block_num from 27..34
X_all_level2 = np.zeros([y_all_level2.shape[0], num_first_level_models])

In [91]:
slice_start = 0

In [None]:
%time
SEED = 0
for cur_block_num in tqdm(months_to_generate_meta_features):
    print('-' * 50)
    print('Start training for month%d'% cur_block_num)
    start_cur_month = time.perf_counter()
    cur_X_train = all_data.loc[dates <  cur_block_num][feature_columns]
    cur_X_test =  all_data.loc[dates == cur_block_num][feature_columns]
    cur_y_train = all_data.loc[dates <  cur_block_num, Target].values
    cur_y_test =  all_data.loc[dates == cur_block_num, Target].values
    # Create Numpy arrays of train, test and target dataframes to feed into models
    train_x = cur_X_train.values
    train_y = cur_y_train.ravel()
    test_x = cur_X_test.values
    test_y = cur_y_test.ravel()
    
    preds = []
    from sklearn.linear_model import (LinearRegression, SGDRegressor)
    import lightgbm as lgb
    sgdr= SGDRegressor(
        penalty = 'l2' ,
        random_state = SEED )
    lgb_params = {
        'feature_fraction': 0.75,
        'metric': 'rmse',
        'nthread':1,
        'min_data_in_leaf': 2**7,
        'bagging_fraction': 0.75,
        'learning_rate': 0.03,
        'objective': 'mse',
        'bagging_seed': 2**7,
        'num_leaves': 2**7,
        'bagging_freq':1,
        'verbose':0}
    estimators = [sgdr]
    for estimator in estimators:
        print('Training Model %d: %s'%(len(preds), estimator.__class__.__name__))
        start = time.perf_counter()
        estimator.fit(train_x, train_y)
        pred_test = estimator.predict(test_x)
        preds.append(pred_test)

        pred_train = estimator.predict(train_x)
        print('Train RMSE for %s is %f' % (estimator.__class__.__name__, sqrt(mean_squared_error(cur_y_train, pred_train))))
        print('Test RMSE for %s is %f' % (estimator.__class__.__name__, sqrt(mean_squared_error(cur_y_test, pred_test))))

        run = time.perf_counter() - start
        print('{} runs for {:.2f} seconds.'.format(estimator.__class__.__name__, run))
        print()
#         import pickle
#         pickle.dump(estimator, open(filename, 'wb'))

    print('Training Model %d: %s'%(len(preds), 'lightgbm'))
    start = time.perf_counter()
    estimator = lgb.train(lgb_params, lgb.Dataset(train_x, label=train_y), 300)
    pred_test = estimator.predict(test_x)
    preds.append(pred_test)

    pred_train = estimator.predict(train_x)
    print('Train RMSE for %s is %f' % ('lightgbm', sqrt(mean_squared_error(cur_y_train, pred_train))))
    print('Test RMSE for %s is %f' % ('lightgbm', sqrt(mean_squared_error(cur_y_test, pred_test))))

    run = time.perf_counter() - start
    print('{} runs for {:.2f} seconds.'.format('lightgbm', run))
    print()

    print('Training Model %d: %s'%(len(preds), 'keras'))
    start = time.perf_counter()
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.wrappers.scikit_learn import KerasRegressor

    def baseline_model():
        # create model
        model = Sequential()
        model.add(Dense(20, input_dim=train_x.shape[1], kernel_initializer='uniform', activation='softplus'))
        model.add(Dense(1, kernel_initializer='uniform', activation = 'relu'))
        # Compile model
        model.compile(loss='mse', optimizer='Nadam', metrics=['mse'])
        # model.compile(loss='mean_squared_error', optimizer='adam')
        return model

    estimator = KerasRegressor(build_fn=baseline_model, verbose=1, epochs=5, batch_size = 55000)
    estimator.fit(train_x, train_y)
    pred_test = estimator.predict(test_x)
    preds.append(pred_test)
    run = time.perf_counter() - start
    print('{} runs for {:.2f} seconds.'.format('lightgbm', run))

    cur_month_run_total = time.perf_counter() - start_cur_month
    print('Total running time was {:.2f} minutes.'.format(cur_month_run_total/60))
    print('-' * 50)

    slice_end = slice_start + cur_X_test.shape[0]
    X_all_level2[ slice_start : slice_end , :] = np.c_[preds].transpose()
    slice_start = slice_end

  0%|                                                                                            | 0/8 [00:00<?, ?it/s]

--------------------------------------------------
Start training for month27
Training Model 0: SGDRegressor


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=0, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

Train RMSE for SGDRegressor is 0.974260
Test RMSE for SGDRegressor is 0.992076
SGDRegressor runs for 9.56 seconds.

Training Model 1: lightgbm
Train RMSE for lightgbm is 0.857592
Test RMSE for lightgbm is 0.925844
lightgbm runs for 418.89 seconds.

Training Model 2: keras


Using TensorFlow backend.








Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


Epoch 5/5


<keras.callbacks.History at 0x1e1bbce6ef0>

lightgbm runs for 52.30 seconds.
Total running time was 8.04 minutes.
--------------------------------------------------


 12%|██████████▍                                                                        | 1/8 [08:02<56:16, 482.42s/it]

--------------------------------------------------
Start training for month28
Training Model 0: SGDRegressor


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=0, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

Train RMSE for SGDRegressor is 0.974763
Test RMSE for SGDRegressor is 0.901237
SGDRegressor runs for 11.06 seconds.

Training Model 1: lightgbm
Train RMSE for lightgbm is 0.859644
Test RMSE for lightgbm is 0.868869
lightgbm runs for 415.68 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




<keras.callbacks.History at 0x1e18259b2e8>

lightgbm runs for 44.59 seconds.
Total running time was 7.88 minutes.
--------------------------------------------------


 25%|████████████████████▊                                                              | 2/8 [15:55<47:57, 479.51s/it]

--------------------------------------------------
Start training for month29
Training Model 0: SGDRegressor


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=0, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

Train RMSE for SGDRegressor is 0.971967
Test RMSE for SGDRegressor is 0.825820
SGDRegressor runs for 12.17 seconds.

Training Model 1: lightgbm
Train RMSE for lightgbm is 0.855837
Test RMSE for lightgbm is 0.805780
lightgbm runs for 324.87 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1e188898438>

lightgbm runs for 11.37 seconds.
Total running time was 5.83 minutes.
--------------------------------------------------


 38%|███████████████████████████████▏                                                   | 3/8 [21:45<36:43, 440.66s/it]

--------------------------------------------------
Start training for month30
Training Model 0: SGDRegressor


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=0, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

Train RMSE for SGDRegressor is 0.967234
Test RMSE for SGDRegressor is 0.758693
SGDRegressor runs for 4.44 seconds.

Training Model 1: lightgbm
Train RMSE for lightgbm is 0.852244
Test RMSE for lightgbm is 0.730400
lightgbm runs for 341.95 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5


Epoch 4/5
Epoch 5/5




<keras.callbacks.History at 0x1e188dabf60>

lightgbm runs for 49.86 seconds.
Total running time was 6.61 minutes.
--------------------------------------------------


 50%|█████████████████████████████████████████▌                                         | 4/8 [28:21<28:29, 427.48s/it]

--------------------------------------------------
Start training for month31
Training Model 0: SGDRegressor


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=0, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

Train RMSE for SGDRegressor is 0.949632
Test RMSE for SGDRegressor is 0.833529
SGDRegressor runs for 10.48 seconds.

Training Model 1: lightgbm
Train RMSE for lightgbm is 0.844984
Test RMSE for lightgbm is 0.803555
lightgbm runs for 454.64 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5


Epoch 4/5
Epoch 5/5




<keras.callbacks.History at 0x1e188f6b978>

lightgbm runs for 51.58 seconds.
Total running time was 8.63 minutes.
--------------------------------------------------


 62%|███████████████████████████████████████████████████▉                               | 5/8 [36:59<22:43, 454.66s/it]

--------------------------------------------------
Start training for month32
Training Model 0: SGDRegressor


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=0, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

Train RMSE for SGDRegressor is 0.945583
Test RMSE for SGDRegressor is 0.984203
SGDRegressor runs for 12.83 seconds.

Training Model 1: lightgbm
Train RMSE for lightgbm is 0.843797
Test RMSE for lightgbm is 0.938377
lightgbm runs for 517.39 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5


Epoch 4/5
Epoch 5/5




<keras.callbacks.History at 0x1e1a2af8e48>

lightgbm runs for 55.19 seconds.
Total running time was 9.78 minutes.
--------------------------------------------------


 75%|██████████████████████████████████████████████████████████████▎                    | 6/8 [46:47<16:28, 494.39s/it]

--------------------------------------------------
Start training for month33
Training Model 0: SGDRegressor


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=0, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

Train RMSE for SGDRegressor is 0.951246
Test RMSE for SGDRegressor is 0.908068
SGDRegressor runs for 11.84 seconds.

Training Model 1: lightgbm
Train RMSE for lightgbm is 0.844980
Test RMSE for lightgbm is 0.835610
lightgbm runs for 504.42 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1e1a2c759b0>

lightgbm runs for 13.29 seconds.
Total running time was 8.85 minutes.
--------------------------------------------------


 88%|████████████████████████████████████████████████████████████████████████▋          | 7/8 [55:38<08:25, 505.41s/it]

--------------------------------------------------
Start training for month34
Training Model 0: SGDRegressor


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=0, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

Train RMSE for SGDRegressor is 0.948541
Test RMSE for SGDRegressor is 0.866021
SGDRegressor runs for 5.97 seconds.

Training Model 1: lightgbm
Train RMSE for lightgbm is 0.843268
Test RMSE for lightgbm is 0.829173
lightgbm runs for 480.82 seconds.

Training Model 2: keras
Epoch 1/5

In [None]:
# Split train and test
test_nrow = len(preds[0])
X_train_level2 = X_all_level2[ : -test_nrow, :]
X_test_level2 = X_all_level2[ -test_nrow: , :]
y_train_level2 = y_all_level2[ : -test_nrow]
y_test_level2 = y_all_level2[ -test_nrow : ]
print('%0.2f min: Finish training First level models'%((time.perf_counter() - start_first_level_total)/60))

### 4. Ensembling

In [None]:
pred_list = {}

In [None]:
# A. Second level learning model via linear regression
print('Training Second level learning model via linear regression')
from sklearn.linear_model import (LinearRegression, SGDRegressor)
lr = LinearRegression()
lr.fit(X_train_level2, y_train_level2)
# Compute R-squared on the train and test sets.
# print('Train R-squared for %s is %f' %('test_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, lr.predict(X_train_level2)))))
test_preds_lr_stacking = lr.predict(X_test_level2)
train_preds_lr_stacking = lr.predict(X_train_level2)
print('Train R-squared for %s is %f' %('train_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, train_preds_lr_stacking))))
pred_list['test_preds_lr_stacking'] = test_preds_lr_stacking
if Validation:
    print('Test R-squared for %s is %f' %('test_preds_lr_stacking', sqrt(mean_squared_error(y_test_level2, test_preds_lr_stacking))))

In [None]:
# B. Second level learning model via SGDRegressor
print('Training Second level learning model via SGDRegressor')
sgdr= SGDRegressor(
    penalty = 'l2' ,
    random_state = SEED )
sgdr.fit(X_train_level2, y_train_level2)
# Compute R-squared on the train and test sets.
# print('Train R-squared for %s is %f' %('test_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, lr.predict(X_train_level2)))))
test_preds_sgdr_stacking = sgdr.predict(X_test_level2)
train_preds_sgdr_stacking = sgdr.predict(X_train_level2)
print('Train R-squared for %s is %f' %('train_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, train_preds_sgdr_stacking))))
pred_list['test_preds_sgdr_stacking'] = test_preds_sgdr_stacking
if Validation:
    print('Test R-squared for %s is %f' %('test_preds_sgdr_stacking', sqrt(mean_squared_error(y_test_level2, test_preds_sgdr_stacking))))
print('%0.2f min: Finish training second level model'%((time.time() - start_time)/60))

In [None]:
# Submission -------------------------------------------------------------------
if not Validation:
    submission = pd.read_csv('../data/sample_submission.csv')
    ver = 6
    for pred_ver in ['lr_stacking', 'sgdr_stacking']:
        print(pred_list['test_preds_' + pred_ver].clip(0,20).mean())
        submission['item_cnt_month'] = pred_list['test_preds_' + pred_ver].clip(0,20)
        submission[['ID', 'item_cnt_month']].to_csv("../results/08.csv", index = False)
print('%0.2f min: Finish running scripts'%((time.time() - start_time)/60))

In [None]:
!shutdown -s 