In [1]:
import pandas as  pd
import numpy as np
import warnings
import itertools
import xgboost as xgb
from tqdm import tqdm
from numpy import loadtxt
import time
import gc
warnings.filterwarnings('ignore')
kernel_with_output = True
np.random.seed(10)

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 231)
pd.set_option('display.max_columns', 100)

In [3]:
Validation = False
start_time = time.time()

In [4]:
def downcast_dtypes(df):
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype in ["int64", "int32"]]
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int16)
    return df

In [5]:
if kernel_with_output:
    item =pd.read_csv("../data/items.csv")
    item_cat = pd.read_csv("../data/item_categories.csv")
#     df_shops = pd.read_csv("../data/shops.csv")
    sale_train = pd.read_csv("../data/sales_train.csv.gz")
    test = pd.read_csv("../data/test.csv.gz")
    temp = pd.read_csv("../data/sample_submission.csv.gz")


In [6]:
sale_train.loc[2909818] # id = 2909818 seems outliers

date              28.10.2015
date_block_num            33
shop_id                   12
item_id                11373
item_price          0.908714
item_cnt_day            2169
Name: 2909818, dtype: object

In [7]:
sale_train['item_price'][2909818] = sale_train[(sale_train['shop_id'] ==12) & (sale_train['item_id'] == 11373) & (sale_train['date_block_num'] == 33)]['item_price'].median()
sale_train['item_cnt_day'][2909818] = round(sale_train[(sale_train['shop_id'] ==12) & (sale_train['item_id'] == 11373) & (sale_train['date_block_num'] == 33)]['item_cnt_day'].median())

In [8]:
sale_train.loc[2909818] 

date              28.10.2015
date_block_num            33
shop_id                   12
item_id                11373
item_price               317
item_cnt_day               4
Name: 2909818, dtype: object

In [9]:
sale_train.loc[885138] # the price seems too high

date              17.09.2013
date_block_num             8
shop_id                   12
item_id                11365
item_price             59200
item_cnt_day               1
Name: 885138, dtype: object

In [10]:
sale_train['item_price'][885138] = sale_train[(sale_train['item_id'] == 11365) & (sale_train['shop_id'] ==12) & (sale_train['date_block_num'] == 8)]['item_price'].median()

In [11]:
sale_train.loc[885138] # the price seems too high

date              17.09.2013
date_block_num             8
shop_id                   12
item_id                11365
item_price              2770
item_cnt_day               1
Name: 885138, dtype: object

In [12]:
sale_train.shape

(2935849, 6)

In [13]:
test_nrow = test.shape[0]

In [14]:
sale_train = sale_train.merge(test[['shop_id']].drop_duplicates(), how = 'inner') 
sale_train['date'] = pd.to_datetime(sale_train['date'], format = '%d.%m.%Y')

After merge, we delete 522603 records from sale_train. The reason is we want drop all transactions belong to shops in the train set but not in the test. 

In [15]:
sale_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2413246 entries, 0 to 2413245
Data columns (total 6 columns):
date              datetime64[ns]
date_block_num    int64
shop_id           int64
item_id           int64
item_price        float64
item_cnt_day      float64
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 128.9 MB


There are 4 steps:

(1) Aggregate data

(2) Add item/shop pair mean-encoding

(3) First-level model

(4) Ensembling

### 1. Aggregate data 

In [16]:
grid = []
for block_num in sale_train['date_block_num'].unique():
    cur_shops = sale_train[sale_train['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sale_train[sale_train['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(itertools.product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

#turn the grid into pandas dataframe
index_cols = ['shop_id', 'item_id', 'date_block_num']
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)
print('%0.2f min: Finish creating the grid'%((time.time() - start_time)/60))

index_cols = ['shop_id', 'item_id', 'date_block_num']
sale_train['item_cnt_day'] = sale_train['item_cnt_day'].clip(0,20)
gb_cnt = sale_train.groupby(index_cols)['item_cnt_day'].agg(['sum']).reset_index().rename(columns = {'sum': 'item_cnt_month'})
gb_cnt['item_cnt_month'] = gb_cnt['item_cnt_month'].clip(0,20).astype(np.int)
#join aggregated data to the grid
train = pd.merge(grid,gb_cnt,how='left',on=index_cols).fillna(0)
train['item_cnt_month'] = train['item_cnt_month'].astype(int)

0.67 min: Finish creating the grid


In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8598044 entries, 0 to 8598043
Data columns (total 4 columns):
shop_id           int32
item_id           int32
date_block_num    int32
item_cnt_month    int32
dtypes: int32(4)
memory usage: 196.8 MB


In [18]:
train = train.merge(item[['item_id', 'item_category_id']], on = ['item_id'], how = 'left')
test = test.merge(item[["item_id", "item_category_id"]], on = ["item_id"], how="left")

In [19]:
item_cat.head()
item_cat.info()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 2 columns):
item_category_name    84 non-null object
item_category_id      84 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.4+ KB


In [20]:
item_cat = pd.read_csv('../data/item_categories.csv')

l_cat = list(item_cat.item_category_name)

for ind in range(0,1):

    l_cat[ind] = 'PC Headsets / Headphones'

for ind in range(1,8):

    l_cat[ind] = 'Access'

l_cat[8] = 'Tickets (figure)'

l_cat[9] = 'Delivery of goods'

for ind in range(10,18):

    l_cat[ind] = 'Consoles'

for ind in range(18,25):

    l_cat[ind] = 'Consoles Games'

l_cat[25] = 'Accessories for games'

for ind in range(26,28):

    l_cat[ind] = 'phone games'

for ind in range(28,32):

    l_cat[ind] = 'CD games'

for ind in range(32,37):

    l_cat[ind] = 'Card'

for ind in range(37,43):

    l_cat[ind] = 'Movie'

for ind in range(43,55):

    l_cat[ind] = 'Books'

for ind in range(55,61):

    l_cat[ind] = 'Music'

for ind in range(61,73):

    l_cat[ind] = 'Gifts'

for ind in range(73,79):

    l_cat[ind] = 'Soft'

for ind in range(79,81):

    l_cat[ind] = 'Office'

for ind in range(81,83):

    l_cat[ind] = 'Clean'

l_cat[83] = 'Elements of a food'


In [21]:
from sklearn import preprocessing
lb = preprocessing.LabelEncoder()
item_cat['item_cat_id_fix'] = lb.fit_transform(l_cat)
train = train.merge(item_cat[['item_cat_id_fix', 'item_category_id']], on = ['item_category_id'], how = 'left')
test = test.merge(item_cat[['item_cat_id_fix', 'item_category_id']], on = ['item_category_id'], how = 'left')

In [22]:
item_cat.head(50)

Unnamed: 0,item_category_name,item_category_id,item_cat_id_fix
0,PC - Гарнитуры/Наушники,0,14
1,Аксессуары - PS2,1,0
2,Аксессуары - PS3,2,0
3,Аксессуары - PS4,3,0
4,Аксессуары - PSP,4,0
5,Аксессуары - PSVita,5,0
6,Аксессуары - XBOX 360,6,0
7,Аксессуары - XBOX ONE,7,0
8,Билеты (Цифра),8,16
9,Доставка товара,9,8


In [23]:
train = downcast_dtypes(train)
test = downcast_dtypes(test)

In [23]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10913850 entries, 0 to 10913849
Data columns (total 7 columns):
shop_id             int16
item_id             int16
date_block_num      int16
item_cnt_month      float32
item_price          float32
item_category_id    int16
item_cat_id_fix     int16
dtypes: float32(2), int16(5)
memory usage: 270.6 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 0 to 214199
Data columns (total 5 columns):
ID                  214200 non-null int16
shop_id             214200 non-null int16
item_id             214200 non-null int16
item_category_id    214200 non-null int16
item_cat_id_fix     214200 non-null int16
dtypes: int16(5)
memory usage: 3.7 MB


### 2. Add item/shop pair mean-encodings

##### 2.1 Combine trainset and testset

In [27]:
print('%0.2f min: Start combining data'%((time.time() - start_time)/60))
if Validation == False:
    test['date_block_num'] = 34
    all_data = pd.concat([train, test], axis = 0)
    all_data = all_data.drop(columns = ['ID'])
else:
    all_data = train

9.55 min: Start combining data


In [28]:
all_data.shape
all_data.isnull().sum() # test has only 5 cols, so when merge, these cols will be na

(8812244, 6)

date_block_num           0
item_cat_id_fix          0
item_category_id         0
item_cnt_month      214200
item_id                  0
shop_id                  0
dtype: int64

In [29]:
all_data = downcast_dtypes(all_data)

In [30]:
all_data.head()

Unnamed: 0,date_block_num,item_cat_id_fix,item_category_id,item_cnt_month,item_id,shop_id
0,0,11,37,1.0,22154,59
1,0,11,40,2.0,22151,59
2,0,0,5,1.0,5603,59
3,0,0,5,2.0,5587,59
4,0,0,2,1.0,5613,59


##### 2.2 Creating item/shop pair lags lag-based featuers

In [31]:
print('%0.2f min: Start adding lag-based feature'%((time.time() - start_time)/60))
index_cols = ['shop_id', 'item_id', 'item_category_id', 'item_cat_id_fix', 'date_block_num']
cols_to_rename = list(all_data.columns.difference(index_cols))
print(cols_to_rename)
shift_range = [1, 2, 3, 4, 12]

9.63 min: Start adding lag-based feature
['item_cnt_month']


In [32]:
for month_shift in tqdm(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)
del train_shift
gc.collect()

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:23<00:00, 17.13s/it]


78

In [33]:
all_data = all_data[all_data['date_block_num'] >= 12] # Don't use old data from year 2013
lag_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]]
all_data = downcast_dtypes(all_data)
print('%0.2f min: Finish generating lag features'%((time.time() - start_time)/60))

11.04 min: Finish generating lag features


In [34]:
lag_cols

['item_cnt_month_lag_1',
 'item_cnt_month_lag_2',
 'item_cnt_month_lag_3',
 'item_cnt_month_lag_4',
 'item_cnt_month_lag_12']

In [35]:
all_data.shape

(5459310, 11)

##### 2.3 Creating date features

In [36]:
dates_train = sale_train[['date', 'date_block_num']].drop_duplicates()

In [37]:
# choose 22 because date_block_num is Nov 2014, We will make this data to be Nov 2015
dates_test = dates_train[dates_train['date_block_num'] == 34-12] 

In [38]:
dates_train.head()
dates_train.info()
dates_train.date_block_num.value_counts()
dates_test.head()
dates_test.info()
dates_test.date_block_num.value_counts()

Unnamed: 0,date,date_block_num
0,2013-01-02,0
1,2013-01-10,0
2,2013-01-04,0
3,2013-01-19,0
4,2013-01-31,0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1034 entries, 0 to 281280
Data columns (total 2 columns):
date              1034 non-null datetime64[ns]
date_block_num    1034 non-null int64
dtypes: datetime64[ns](1), int64(1)
memory usage: 24.2 KB


33    31
21    31
2     31
4     31
6     31
7     31
9     31
11    31
12    31
14    31
18    31
19    31
16    31
0     31
23    31
31    31
30    31
24    31
26    31
28    31
8     30
3     30
5     30
29    30
20    30
27    30
22    30
15    30
32    30
17    30
10    30
25    28
13    28
1     28
Name: date_block_num, dtype: int64

Unnamed: 0,date,date_block_num
30384,2014-11-19,22
30385,2014-11-18,22
30386,2014-11-27,22
30387,2014-11-26,22
30388,2014-11-22,22


<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 30384 to 30546
Data columns (total 2 columns):
date              30 non-null datetime64[ns]
date_block_num    30 non-null int64
dtypes: datetime64[ns](1), int64(1)
memory usage: 720.0 bytes


22    30
Name: date_block_num, dtype: int64

In [39]:
dates_test['date_block_num'] = 34
dates_test['date'] = dates_test['date'] + pd.DateOffset(years=1) # increase 1 year

In [40]:
dates_test.head()
dates_test.info()
dates_test.date_block_num.value_counts()

Unnamed: 0,date,date_block_num
30384,2015-11-19,34
30385,2015-11-18,34
30386,2015-11-27,34
30387,2015-11-26,34
30388,2015-11-22,34


<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 30384 to 30546
Data columns (total 2 columns):
date              30 non-null datetime64[ns]
date_block_num    30 non-null int64
dtypes: datetime64[ns](1), int64(1)
memory usage: 720.0 bytes


34    30
Name: date_block_num, dtype: int64

In [41]:
dates_all = pd.concat([dates_train, dates_test])

In [42]:
dates_all.date_block_num.value_counts()

0     31
23    31
12    31
9     31
14    31
16    31
33    31
18    31
19    31
7     31
21    31
6     31
24    31
26    31
4     31
28    31
30    31
31    31
2     31
11    31
10    30
5     30
3     30
8     30
34    30
15    30
20    30
22    30
27    30
29    30
32    30
17    30
13    28
25    28
1     28
Name: date_block_num, dtype: int64

In [43]:
dates_all['dow'] = dates_all['date'].dt.dayofweek
dates_all['year'] = dates_all['date'].dt.year
dates_all['month'] = dates_all['date'].dt.month

In [44]:
dates_all = pd.get_dummies(dates_all, columns=['dow'])
dow_col = ['dow_' + str(x) for x in range(7)]

In [45]:
dow_col

['dow_0', 'dow_1', 'dow_2', 'dow_3', 'dow_4', 'dow_5', 'dow_6']

In [46]:
date_features = dates_all.groupby(['year', 'month', 'date_block_num'])[dow_col].agg('sum').reset_index()

In [47]:
dates_all.groupby(['year', 'month', 'date_block_num'])[dow_col].agg('sum').reset_index()

Unnamed: 0,year,month,date_block_num,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
0,2013,1,0,4,5,5,5,4,4,4
1,2013,2,1,4,4,4,4,4,4,4
2,2013,3,2,4,4,4,4,5,5,5
3,2013,4,3,5,5,4,4,4,4,4
4,2013,5,4,4,4,5,5,5,4,4
5,2013,6,5,4,4,4,4,4,5,5
6,2013,7,6,5,5,5,4,4,4,4
7,2013,8,7,4,4,4,5,5,5,4
8,2013,9,8,5,4,4,4,4,4,5
9,2013,10,9,4,5,5,5,4,4,4


In [48]:
date_features['days_of_month'] = date_features[dow_col].sum(axis=1)

In [49]:
date_features["days_of_month"].describe()

count    35.000000
mean     30.400000
std       0.881176
min      28.000000
25%      30.000000
50%      31.000000
75%      31.000000
max      31.000000
Name: days_of_month, dtype: float64

In [50]:
date_features['year'] = date_features['year'] - 2013 # We already choose data from 2013, so we do it
# to make year start from 0. Maybe this helps to make the value of "year" smaller

In [51]:
date_features = date_features[['month', 'year', 'days_of_month', 'date_block_num']]

In [52]:
all_data = all_data.merge(date_features, on = 'date_block_num', how = 'left')

In [53]:
all_data.head()

Unnamed: 0,date_block_num,item_cat_id_fix,item_category_id,item_cnt_month,item_id,shop_id,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_4,item_cnt_month_lag_12,month,year,days_of_month
0,12,11,40,2.0,5325,59,3.0,0.0,0.0,0.0,0.0,1,1,31
1,12,12,55,1.0,5444,59,1.0,0.0,0.0,0.0,0.0,1,1,31
2,12,12,55,1.0,5459,59,1.0,2.0,0.0,0.0,0.0,1,1,31
3,12,12,55,1.0,5511,59,0.0,1.0,0.0,0.0,0.0,1,1,31
4,12,3,30,3.0,5811,59,5.0,3.0,3.0,3.0,3.0,1,1,31


In [54]:
date_columns = date_features.columns.difference(set(index_cols))

In [55]:
date_columns

Index(['days_of_month', 'month', 'year'], dtype='object')

In [56]:
print('%0.2f min: Start getting date features'%((time.time() - start_time)/60))
dates_train = sale_train[['date', 'date_block_num']].drop_duplicates()
dates_test = dates_train[dates_train['date_block_num'] == 34-12]
dates_test['date_block_num'] = 34
dates_test['date'] = dates_test['date'] + pd.DateOffset(years=1) # increase 1 year

11.10 min: Start getting date features


In [57]:
dates_all = pd.concat([dates_train, dates_test])
dates_all['dow'] = dates_all['date'].dt.dayofweek
dates_all['year'] = dates_all['date'].dt.year
dates_all['month'] = dates_all['date'].dt.month
dates_all = pd.get_dummies(dates_all, columns=['dow'])
dow_col = ['dow_' + str(x) for x in range(7)]
date_features = dates_all.groupby(['year', 'month', 'date_block_num'])[dow_col].agg('sum').reset_index()
date_features['days_of_month'] = date_features[dow_col].sum(axis=1)
date_features['year'] = date_features['year'] - 2013
date_features = date_features[['month', 'year', 'days_of_month', 'date_block_num']]
all_data = all_data.merge(date_features, on = 'date_block_num', how = 'left')
date_columns = date_features.columns.difference(set(index_cols))
print('%0.2f min: Finish getting date features'%((time.time() - start_time)/60))

11.15 min: Finish getting date features


#### 2.4 Scale feature columns

In [58]:
from sklearn.preprocessing import StandardScaler

In [59]:
train = all_data[all_data["date_block_num"]!= all_data["date_block_num"].max()]
test = all_data[all_data["date_block_num"]== all_data["date_block_num"].max()]

In [60]:
test.head()

Unnamed: 0,date_block_num,item_cat_id_fix,item_category_id,item_cnt_month,item_id,shop_id,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_4,item_cnt_month_lag_12,month_x,year_x,days_of_month_x,month_y,year_y,days_of_month_y
5245110,34,7,19,0.0,5037,5,0.0,1.0,3.0,1.0,1.0,11,2,30,11,2,30
5245111,34,12,55,0.0,5320,5,0.0,0.0,0.0,0.0,0.0,11,2,30,11,2,30
5245112,34,7,19,0.0,5233,5,1.0,3.0,1.0,0.0,0.0,11,2,30,11,2,30
5245113,34,7,23,0.0,5232,5,0.0,0.0,1.0,0.0,0.0,11,2,30,11,2,30
5245114,34,7,20,0.0,5268,5,0.0,0.0,0.0,0.0,0.0,11,2,30,11,2,30


In [61]:
sc = StandardScaler()

In [62]:
to_drop_cols = ["date_block_num"]

In [63]:
index_cols

['shop_id', 'item_id', 'item_category_id', 'item_cat_id_fix', 'date_block_num']

In [64]:
feature_columns = list(set(lag_cols + index_cols + list(date_columns)).difference(to_drop_cols))

In [65]:
feature_columns

['item_cat_id_fix',
 'year',
 'item_cnt_month_lag_2',
 'days_of_month',
 'month',
 'item_cnt_month_lag_12',
 'item_cnt_month_lag_4',
 'shop_id',
 'item_cnt_month_lag_1',
 'item_category_id',
 'item_cnt_month_lag_3',
 'item_id']

In [78]:
train.rename(columns = {"month_x":"month", "year_x":"year", "days_of_month_x":"days_of_month"}, inplace = True)
test.rename(columns = {"month_x":"month", "year_x":"year", "days_of_month_x":"days_of_month"}, inplace = True)

In [79]:
train.drop(["month_y", "year_y", "days_of_month_y"], axis = 1, inplace = True)
test.drop(["month_y", "year_y", "days_of_month_y"], axis = 1, inplace = True)

In [80]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 5245110 to 5459309
Data columns (total 14 columns):
date_block_num           214200 non-null int16
item_cat_id_fix          214200 non-null int16
item_category_id         214200 non-null int16
item_cnt_month           214200 non-null float32
item_id                  214200 non-null int16
shop_id                  214200 non-null int16
item_cnt_month_lag_1     214200 non-null float32
item_cnt_month_lag_2     214200 non-null float32
item_cnt_month_lag_3     214200 non-null float32
item_cnt_month_lag_4     214200 non-null float32
item_cnt_month_lag_12    214200 non-null float32
month                    214200 non-null int64
year                     214200 non-null int64
days_of_month            214200 non-null int64
dtypes: float32(6), int16(5), int64(3)
memory usage: 13.5 MB


In [81]:
train[feature_columns] = sc.fit_transform(train[feature_columns])
test[feature_columns] = sc.fit_transform(test[feature_columns])

In [82]:
all_data = pd.concat([train, test], axis = 0)
all_data = downcast_dtypes(all_data)

In [83]:
all_data.to_csv("../results/06/step_2.csv")

In [84]:
gc.collect()
print('%0.2f min: Finish scaling features'%((time.time() - start_time)/60))

67

400.77 min: Finish scaling features


### 3. First-level model

###### Save `date_block_num`, as we can't use them as features, but will need them to split the dataset into parts

In [None]:
dates = all_data["date_block_num"]
last_block = dates.max() # last_block 34

In [None]:
print("%0.2f min: Start training First level models" %((time.time() - start_time)/60))

In [None]:
start_first_level_total = time.perf_counter()
scoringMethod = 'r2'
num_first_level_models = 3 

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
# Train meta-features M = 15 (12 + 15 = 27)

In [None]:
months_to_generate_meta_features = range(27,last_block +1)
mask = dates.isin(months_to_generate_meta_features)
Target = 'item_cnt_month'
y_all_level2 = all_data[Target][mask].values # choose item_cnt_month where date_block_num from 27..34
X_all_level2 = np.zeros([y_all_level2.shape[0], num_first_level_models])

In [None]:
slice_start = 0

In [None]:
SEED = 0
for cur_block_num in tqdm(months_to_generate_meta_features):
    print('-' * 50)
    print('Start training for month%d'% cur_block_num)
    start_cur_month = time.perf_counter()
    cur_X_train = all_data.loc[dates <  cur_block_num][feature_columns]
    cur_X_test =  all_data.loc[dates == cur_block_num][feature_columns]
    cur_y_train = all_data.loc[dates <  cur_block_num, Target].values
    cur_y_test =  all_data.loc[dates == cur_block_num, Target].values
    # Create Numpy arrays of train, test and target dataframes to feed into models
    train_x = cur_X_train.values
    train_y = cur_y_train.ravel()
    test_x = cur_X_test.values
    test_y = cur_y_test.ravel()
    
    preds = []
    from sklearn.linear_model import (LinearRegression, SGDRegressor)
    import lightgbm as lgb
    sgdr= SGDRegressor(
        penalty = 'l2' ,
        random_state = SEED )
    lgb_params = {
        'feature_fraction': 0.75,
        'metric': 'rmse',
        'nthread':1,
        'min_data_in_leaf': 2**7,
        'bagging_fraction': 0.75,
        'learning_rate': 0.03,
        'objective': 'mse',
        'bagging_seed': 2**7,
        'num_leaves': 2**7,
        'bagging_freq':1,
        'verbose':0}
    estimators = [sgdr]
    for estimator in estimators:
        print('Training Model %d: %s'%(len(preds), estimator.__class__.__name__))
        start = time.perf_counter()
        estimator.fit(train_x, train_y)
        pred_test = estimator.predict(test_x)
        preds.append(pred_test)

        pred_train = estimator.predict(train_x)
        print('Train RMSE for %s is %f' % (estimator.__class__.__name__, sqrt(mean_squared_error(cur_y_train, pred_train))))
        print('Test RMSE for %s is %f' % (estimator.__class__.__name__, sqrt(mean_squared_error(cur_y_test, pred_test))))

        run = time.perf_counter() - start
        print('{} runs for {:.2f} seconds.'.format(estimator.__class__.__name__, run))
        print()
#         import pickle
#         pickle.dump(estimator, open(filename, 'wb'))

    print('Training Model %d: %s'%(len(preds), 'lightgbm'))
    start = time.perf_counter()
    estimator = lgb.train(lgb_params, lgb.Dataset(train_x, label=train_y), 300)
    pred_test = estimator.predict(test_x)
    preds.append(pred_test)

    pred_train = estimator.predict(train_x)
    print('Train RMSE for %s is %f' % ('lightgbm', sqrt(mean_squared_error(cur_y_train, pred_train))))
    print('Test RMSE for %s is %f' % ('lightgbm', sqrt(mean_squared_error(cur_y_test, pred_test))))

    run = time.perf_counter() - start
    print('{} runs for {:.2f} seconds.'.format('lightgbm', run))
    print()

    print('Training Model %d: %s'%(len(preds), 'keras'))
    start = time.perf_counter()
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.wrappers.scikit_learn import KerasRegressor

    def baseline_model():
        # create model
        model = Sequential()
        model.add(Dense(20, input_dim=train_x.shape[1], kernel_initializer='uniform', activation='softplus'))
        model.add(Dense(1, kernel_initializer='uniform', activation = 'relu'))
        # Compile model
        model.compile(loss='mse', optimizer='Nadam', metrics=['mse'])
        # model.compile(loss='mean_squared_error', optimizer='adam')
        return model

    estimator = KerasRegressor(build_fn=baseline_model, verbose=1, epochs=5, batch_size = 55000)
    estimator.fit(train_x, train_y)
    pred_test = estimator.predict(test_x)
    preds.append(pred_test)
    run = time.perf_counter() - start
    print('{} runs for {:.2f} seconds.'.format('lightgbm', run))

    cur_month_run_total = time.perf_counter() - start_cur_month
    print('Total running time was {:.2f} minutes.'.format(cur_month_run_total/60))
    print('-' * 50)

    slice_end = slice_start + cur_X_test.shape[0]
    X_all_level2[ slice_start : slice_end , :] = np.c_[preds].transpose()
    slice_start = slice_end

In [None]:
# Split train and test
test_nrow = len(preds[0])
X_train_level2 = X_all_level2[ : -test_nrow, :]
X_test_level2 = X_all_level2[ -test_nrow: , :]
y_train_level2 = y_all_level2[ : -test_nrow]
y_test_level2 = y_all_level2[ -test_nrow : ]
print('%0.2f min: Finish training First level models'%((time.perf_counter() - start_first_level_total)/60))

### 4. Ensembling

In [None]:
pred_list = {}

In [None]:
# A. Second level learning model via linear regression
print('Training Second level learning model via linear regression')
from sklearn.linear_model import (LinearRegression, SGDRegressor)
lr = LinearRegression()
lr.fit(X_train_level2, y_train_level2)
# Compute R-squared on the train and test sets.
# print('Train R-squared for %s is %f' %('test_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, lr.predict(X_train_level2)))))
test_preds_lr_stacking = lr.predict(X_test_level2)
train_preds_lr_stacking = lr.predict(X_train_level2)
print('Train R-squared for %s is %f' %('train_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, train_preds_lr_stacking))))
pred_list['test_preds_lr_stacking'] = test_preds_lr_stacking
if Validation:
    print('Test R-squared for %s is %f' %('test_preds_lr_stacking', sqrt(mean_squared_error(y_test_level2, test_preds_lr_stacking))))

In [None]:
# B. Second level learning model via SGDRegressor
print('Training Second level learning model via SGDRegressor')
sgdr= SGDRegressor(
    penalty = 'l2' ,
    random_state = SEED )
sgdr.fit(X_train_level2, y_train_level2)
# Compute R-squared on the train and test sets.
# print('Train R-squared for %s is %f' %('test_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, lr.predict(X_train_level2)))))
test_preds_sgdr_stacking = sgdr.predict(X_test_level2)
train_preds_sgdr_stacking = sgdr.predict(X_train_level2)
print('Train R-squared for %s is %f' %('train_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, train_preds_sgdr_stacking))))
pred_list['test_preds_sgdr_stacking'] = test_preds_sgdr_stacking
if Validation:
    print('Test R-squared for %s is %f' %('test_preds_sgdr_stacking', sqrt(mean_squared_error(y_test_level2, test_preds_sgdr_stacking))))
print('%0.2f min: Finish training second level model'%((time.time() - start_time)/60))

In [None]:
# Submission -------------------------------------------------------------------
if not Validation:
    submission = pd.read_csv('../data/sample_submission.csv')
    ver = 6
    for pred_ver in ['lr_stacking', 'sgdr_stacking']:
        print(pred_list['test_preds_' + pred_ver].clip(0,20).mean())
        submission['item_cnt_month'] = pred_list['test_preds_' + pred_ver].clip(0,20)
        submission[['ID', 'item_cnt_month']].to_csv("../results/05.csv", index = False)
print('%0.2f min: Finish running scripts'%((time.time() - start_time)/60))

In [None]:
!shutdown -s 