## Load Data, import libs

In [1]:
import numpy as np
import pandas as pd
import gzip # for gzip files
import matplotlib.pyplot as plt
%matplotlib inline

# Load items and shops
items_df = pd.read_csv('items.csv')
shops_df = pd.read_csv('shops.csv')
# Load other
icats_df = pd.read_csv('item_categories.csv')
train_df = pd.read_csv('sales_train.csv.gz', header=0, sep=',', quotechar='"')
smpsb_df = pd.read_csv('sample_submission.csv', header=0, sep=',', quotechar='"')
test_df  = pd.read_csv('test.csv', header=0, sep=',', quotechar='"')

# Check what's in dataframes

In [2]:
items_df.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [3]:
# Check the shapes of dfs
print (items_df.shape)
print (shops_df.shape)
print (icats_df.shape)
print (train_df.shape)
print (smpsb_df.shape)
print (test_df.shape)

(22170, 3)
(60, 2)
(84, 2)
(2935849, 6)
(214200, 2)
(214200, 3)


In [4]:
shops_df.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [5]:
icats_df.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [6]:
train_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [7]:
smpsb_df.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [8]:
test_df.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


## Map Items Categorries
Map Cattegories to more narrow ones

In [9]:
l = list(icats_df.item_category_name)
l_cat = l

for ind in range(1,8):
    l_cat[ind] = 'Access'

for ind in range(10,18):
    l_cat[ind] = 'Consoles'

for ind in range(18,25):
    l_cat[ind] = 'Consoles Games'

for ind in range(26,28):
    l_cat[ind] = 'phone games'

for ind in range(28,32):
    l_cat[ind] = 'CD games'

for ind in range(32,37):
    l_cat[ind] = 'Card'

for ind in range(37,43):
    l_cat[ind] = 'Movie'

for ind in range(43,55):
    l_cat[ind] = 'Books'

for ind in range(55,61):
    l_cat[ind] = 'Music'

for ind in range(61,73):
    l_cat[ind] = 'Gifts'

for ind in range(73,79):
    l_cat[ind] = 'Soft'


icats_df['cats'] = l_cat
icats_df.head()

Unnamed: 0,item_category_name,item_category_id,cats
0,PC - Гарнитуры/Наушники,0,PC - Гарнитуры/Наушники
1,Аксессуары - PS2,1,Access
2,Аксессуары - PS3,2,Access
3,Аксессуары - PS4,3,Access
4,Аксессуары - PSP,4,Access


In [10]:
items_df = pd.merge(items_df, icats_df, on=['item_category_id'], how='left')

In [11]:
items_df = items_df[['item_id', 'cats']]
items_df.head()

Unnamed: 0,item_id,cats
0,0,Movie
1,1,Soft
2,2,Movie
3,3,Movie
4,4,Movie


## EDA

In [12]:
train_df.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.23,890.8532,1.242641
std,9.422988,16.22697,6324.297,1729.8,2.618834
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


In [13]:
print (items_df.cats.unique().shape)
print (train_df.shop_id.unique().shape)
print (test_df.shop_id.unique().shape)
print (test_df.item_id.unique().shape)
print (test_df.shop_id.unique().shape)
print (train_df.item_id.unique().shape)

(20,)
(60,)
(42,)
(5100,)
(42,)
(21807,)


## Remove Outliers

In [14]:
train_df = train_df[train_df['item_price'] < 100000]
train_df = train_df[train_df['item_cnt_day'] < 1000]
train_df.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935846.0,2935846.0,2935846.0,2935846.0,2935846.0
mean,14.5699,33.00175,10197.22,890.7492,1.241562
std,9.422985,16.22697,6324.297,1720.491,2.217636
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,59200.0,669.0


## Aggregate training data

Create grid using all the shop_id, item_id combiation for each date_block_num

In [15]:
from itertools import product
index_cols = ['shop_id', 'item_id', 'date_block_num']
grid = []
for block_num in train_df['date_block_num'].unique():
    cur_shops = train_df.loc[train_df['date_block_num'] == block_num, 'shop_id'].unique()
    cur_items = train_df.loc[train_df['date_block_num'] == block_num, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

In [16]:
train_df = train_df.groupby(['date_block_num','shop_id','item_id']).agg(
    {'item_cnt_day': np.sum, 'item_price': np.mean}).reset_index()

In [17]:
train_df.rename(columns={'item_cnt_day': 'item_cnt_month'}, inplace=True)

In [18]:
train_df = pd.merge(grid, train_df, on=index_cols, how='left')

## Add category id to each data point.

In [19]:
train_df = pd.merge(train_df, items_df, on=['item_id'], how='left')
test_df = pd.merge(test_df, items_df, on=['item_id'], how='left')

In [20]:
train_df.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_price,item_cnt_month,cats
0,59,22154,0,999.0,1.0,Movie
1,59,2552,0,,,Music
2,59,2554,0,,,Music
3,59,2555,0,,,Music
4,59,2564,0,,,Music


In [None]:
test_df.head()

Unnamed: 0,ID,shop_id,item_id,cats
0,0,5,5037,Consoles Games
1,1,5,5320,Music
2,2,5,5233,Consoles Games
3,3,5,5232,Consoles Games
4,4,5,5268,Consoles Games


## Mean encoding

In [None]:
%%time
for type_ids in [['item_id'], ['shop_id'], ['cats'], ['item_id', 'shop_id']]:
    for column_id in ['item_price', 'item_cnt_month']:
        mean_df = train_df[type_ids + [column_id]].groupby(type_ids).agg(np.mean).reset_index()
        mean_df.rename(columns={column_id: "mean_of_"+column_id+"_groupby_"+"_".join(type_ids)}, inplace=True)
        
        train_df = pd.merge(train_df, mean_df, on=type_ids, how='left')
        test_df = pd.merge(test_df, mean_df, on=type_ids, how='left')

## Fillna

In [None]:
# Test fillna
test_df['mean_of_item_price_groupby_item_id'] = test_df['mean_of_item_price_groupby_item_id'].fillna(test_df['mean_of_item_price_groupby_cats'])
test_df['mean_of_item_cnt_month_groupby_item_id'] = test_df['mean_of_item_cnt_month_groupby_item_id'].fillna(test_df['mean_of_item_cnt_month_groupby_cats'])
test_df['mean_of_item_price_groupby_item_id_shop_id'] = test_df['mean_of_item_price_groupby_item_id_shop_id'].fillna(test_df['mean_of_item_price_groupby_item_id'])
test_df['mean_of_item_cnt_month_groupby_item_id_shop_id'] = test_df['mean_of_item_cnt_month_groupby_item_id_shop_id'].fillna(test_df['mean_of_item_cnt_month_groupby_item_id'])
# Train fillna
train_df['mean_of_item_price_groupby_item_id_shop_id'] = train_df['mean_of_item_price_groupby_item_id_shop_id'].fillna(train_df['mean_of_item_price_groupby_item_id'])
train_df['mean_of_item_cnt_month_groupby_item_id_shop_id'] = train_df['mean_of_item_cnt_month_groupby_item_id_shop_id'].fillna(train_df['mean_of_item_cnt_month_groupby_item_id'])

In [None]:
for df in train_df, test_df:
    for feat in df.columns[4:]:
        if 'item_cnt' in feat:
            df[feat]=df[feat].fillna(0)
        elif 'item_price' in feat:
            df[feat]=df[feat].fillna(df[feat].median())

In [None]:
train_df['item_cnt_month'] = train_df['item_cnt_month'].fillna(0)

## Add historical sales data

In [None]:
train_df_temp = train_df.copy()
train_df = train_df[train_df['date_block_num']>=12]

In [None]:
features = ['item_cnt_month', 'item_price', 'mean_of_item_price_groupby_item_id',
       'mean_of_item_cnt_month_groupby_item_id',
       'mean_of_item_price_groupby_shop_id',
       'mean_of_item_cnt_month_groupby_shop_id',
       'mean_of_item_price_groupby_cats',
       'mean_of_item_cnt_month_groupby_cats',
       'mean_of_item_price_groupby_item_id_shop_id',
       'mean_of_item_cnt_month_groupby_item_id_shop_id']

In [None]:
def add_historical_data(df):
    for diff in (1, 2, 3, 4, 6, 12):
        train_df_copy = train_df_temp.copy()
        train_df_copy['date_block_num'] += diff
        train_df_copy = train_df_copy[['date_block_num', 'item_id', 'shop_id'] + features]
        train_df_copy.rename(columns={
            feat: feat+"_"+str(diff)+'_month_ago' for feat in features
        }, inplace=True)
        df = pd.merge(df, train_df_copy, on=['shop_id', 'item_id', 'date_block_num'], how='left')
    return df

In [None]:
test_df['date_block_num'] = 34
train_df = add_historical_data(train_df)
test_df = add_historical_data(test_df)
test_df.drop('date_block_num', axis=1, inplace=True)

In [55]:
train_df.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,cats,mean_of_item_price_groupby_item_id,mean_of_item_cnt_month_groupby_item_id,mean_of_item_price_groupby_shop_id,mean_of_item_cnt_month_groupby_shop_id,...,item_cnt_month_12_month_ago,item_price_12_month_ago,mean_of_item_price_groupby_item_id_12_month_ago,mean_of_item_cnt_month_groupby_item_id_12_month_ago,mean_of_item_price_groupby_shop_id_12_month_ago,mean_of_item_cnt_month_groupby_shop_id_12_month_ago,mean_of_item_price_groupby_cats_12_month_ago,mean_of_item_cnt_month_groupby_cats_12_month_ago,mean_of_item_price_groupby_item_id_shop_id_12_month_ago,mean_of_item_cnt_month_groupby_item_id_shop_id_12_month_ago
0,54,10297,12,4.0,749.0,Movie,709.478496,1.210526,655.04886,2.636404,...,,,,,,,,,,
1,54,10296,12,3.0,1599.0,Movie,1464.972764,1.130081,655.04886,2.636404,...,,,,,,,,,,
2,54,10298,12,14.0,399.0,Movie,223.781333,4.82138,655.04886,2.636404,...,,,,,,,,,,
3,54,10300,12,3.0,699.0,Movie,519.571884,2.520833,655.04886,2.636404,...,,,,,,,,,,
4,54,10284,12,1.0,299.0,Music,284.902913,1.174757,655.04886,2.636404,...,,,,,,,,,,


In [57]:
for df in train_df, test_df:
    for feat in train_df.columns[6:]:
        if 'item_cnt' in feat:
            df[feat]=df[feat].fillna(0)
        elif 'item_price' in feat:
            df[feat]=df[feat].fillna(df[feat].median())

In [58]:
# Add pair-difference features
columns = {
    'diff_between_item_shop_and_item': ('mean_of_item_price_groupby_item_id_shop_id', 'mean_of_item_price_groupby_item_id'),
    'diff_between_item_and_category': ('mean_of_item_price_groupby_item_id', 'mean_of_item_price_groupby_cats')
}
for new_feature, (col1, col2) in columns.items():
    for df in (train_df, test_df):
        df[new_feature] = df[col1] - df[col2]

In [61]:
# Clip the values of the target
train_df['item_cnt_month'] = train_df['item_cnt_month'].clip(0, 20)

In [62]:
# Split into training set and validation set
training_set = train_df[train_df['date_block_num']<33]
validation_set = train_df[train_df['date_block_num']==33].reset_index()

In [63]:
validation_set.head()

Unnamed: 0,index,shop_id,item_id,date_block_num,item_cnt_month,item_price,cats,mean_of_item_price_groupby_item_id,mean_of_item_cnt_month_groupby_item_id,mean_of_item_price_groupby_shop_id,...,mean_of_item_price_groupby_item_id_12_month_ago,mean_of_item_cnt_month_groupby_item_id_12_month_ago,mean_of_item_price_groupby_shop_id_12_month_ago,mean_of_item_cnt_month_groupby_shop_id_12_month_ago,mean_of_item_price_groupby_cats_12_month_ago,mean_of_item_cnt_month_groupby_cats_12_month_ago,mean_of_item_price_groupby_item_id_shop_id_12_month_ago,mean_of_item_cnt_month_groupby_item_id_shop_id_12_month_ago,diff_between_item_shop_and_item,diff_between_item_and_category
0,6186922,45,13315,33,1.0,649.0,Books,640.875,1.125,861.189864,...,296.188435,0.0,825.151374,0.0,372.622621,0.0,299.0,0.0,8.125,290.090958
1,6186923,45,13880,33,1.0,229.0,Music,208.925643,2.561828,861.189864,...,208.925643,2.561828,861.189864,1.80925,372.622621,1.379644,214.0,1.8,5.074357,-163.696979
2,6186924,45,13881,33,2.0,659.0,Music,579.715439,2.710271,861.189864,...,579.715439,2.710271,861.189864,1.80925,372.622621,1.379644,588.3,1.96,8.584561,207.092817
3,6186925,45,13923,33,1.0,169.0,Movie,150.236386,1.572183,861.189864,...,150.236386,1.572183,861.189864,1.80925,340.509868,1.745858,157.571429,1.142857,7.335042,-190.273482
4,6186926,45,14227,33,1.0,99.0,CD games,97.129878,3.462963,861.189864,...,97.129878,3.462963,861.189864,1.80925,520.312473,3.636535,99.0,1.75,1.870122,-423.182595


## Only keep the useful columns

In [71]:
features = train_df.columns[6:].tolist()

In [72]:
train_df[features].to_csv('./data/X_train_all.csv', index=False)
train_df['item_cnt_month'].to_csv('./data/y_train_all.csv', index=False)
X_train = training_set[features]
X_train.to_csv('./data/X_train.csv', index=False)
y_train = training_set['item_cnt_month']
y_train.to_csv('./data/y_train.csv', index=False)
X_validation = validation_set[features]
X_validation.to_csv('./data/X_validation.csv', index=False)
y_validation = validation_set['item_cnt_month']
y_validation.to_csv('./data/y_validation.csv', index=False)
test_df = test_df[['ID'] + features]
test_df.to_csv('./data/X_test.csv', index=False)

## xgboost model

In [73]:
import xgboost as xgb



In [74]:
train_df.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,cats,mean_of_item_price_groupby_item_id,mean_of_item_cnt_month_groupby_item_id,mean_of_item_price_groupby_shop_id,mean_of_item_cnt_month_groupby_shop_id,...,mean_of_item_price_groupby_item_id_12_month_ago,mean_of_item_cnt_month_groupby_item_id_12_month_ago,mean_of_item_price_groupby_shop_id_12_month_ago,mean_of_item_cnt_month_groupby_shop_id_12_month_ago,mean_of_item_price_groupby_cats_12_month_ago,mean_of_item_cnt_month_groupby_cats_12_month_ago,mean_of_item_price_groupby_item_id_shop_id_12_month_ago,mean_of_item_cnt_month_groupby_item_id_shop_id_12_month_ago,diff_between_item_shop_and_item,diff_between_item_and_category
0,54,10297,12,4.0,749.0,Movie,709.478496,1.210526,655.04886,2.636404,...,296.188435,0.0,825.151374,0.0,372.622621,0.0,299.0,0.0,39.521504,368.968628
1,54,10296,12,3.0,1599.0,Movie,1464.972764,1.130081,655.04886,2.636404,...,296.188435,0.0,825.151374,0.0,372.622621,0.0,299.0,0.0,62.185807,1124.462896
2,54,10298,12,14.0,399.0,Movie,223.781333,4.82138,655.04886,2.636404,...,296.188435,0.0,825.151374,0.0,372.622621,0.0,299.0,0.0,12.277491,-116.728536
3,54,10300,12,3.0,699.0,Movie,519.571884,2.520833,655.04886,2.636404,...,296.188435,0.0,825.151374,0.0,372.622621,0.0,299.0,0.0,-18.620813,179.062016
4,54,10284,12,1.0,299.0,Music,284.902913,1.174757,655.04886,2.636404,...,296.188435,0.0,825.151374,0.0,372.622621,0.0,299.0,0.0,14.097087,-87.719709


In [75]:
params = {
        'eta': 0.08, #best 0.08
        'max_depth': 7,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'seed': 3,
        'gamma':1,
        'silent': True
    }

In [76]:
X_test = test_df[features]

In [77]:
watchlist = [
    (xgb.DMatrix(X_train, y_train), 'train'),
    (xgb.DMatrix(X_validation, y_validation), 'validation')
]
model = xgb.train(params, xgb.DMatrix(X_train, y_train), 500,  watchlist, maximize=False, verbose_eval=5, early_stopping_rounds=50)

[0]	train-rmse:1.15492	validation-rmse:1.10793
Multiple eval metrics have been passed: 'validation-rmse' will be used for early stopping.

Will train until validation-rmse hasn't improved in 50 rounds.
[5]	train-rmse:0.972058	validation-rmse:0.924231
[10]	train-rmse:0.87805	validation-rmse:0.839715
[15]	train-rmse:0.823896	validation-rmse:0.798597
[20]	train-rmse:0.796545	validation-rmse:0.779291
[25]	train-rmse:0.778307	validation-rmse:0.768394
[30]	train-rmse:0.766018	validation-rmse:0.761541
[35]	train-rmse:0.756882	validation-rmse:0.752172
[40]	train-rmse:0.750681	validation-rmse:0.748152
[45]	train-rmse:0.745364	validation-rmse:0.745806
[50]	train-rmse:0.741689	validation-rmse:0.746092
[55]	train-rmse:0.737687	validation-rmse:0.746165
[60]	train-rmse:0.733751	validation-rmse:0.74752
[65]	train-rmse:0.731123	validation-rmse:0.746142
[70]	train-rmse:0.728324	validation-rmse:0.746596
[75]	train-rmse:0.72592	validation-rmse:0.745586
[80]	train-rmse:0.723336	validation-rmse:0.745008
[8

In [78]:
pred = model.predict(xgb.DMatrix(X_test), ntree_limit=model.best_ntree_limit)
test_df['item_cnt_month'] = pred.clip(0, 40)
test_df[['ID', 'item_cnt_month']].to_csv('xgboost_submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [79]:
test_df.head()

Unnamed: 0,ID,mean_of_item_price_groupby_item_id,mean_of_item_cnt_month_groupby_item_id,mean_of_item_price_groupby_shop_id,mean_of_item_cnt_month_groupby_shop_id,mean_of_item_price_groupby_cats,mean_of_item_cnt_month_groupby_cats,mean_of_item_price_groupby_item_id_shop_id,mean_of_item_cnt_month_groupby_item_id_shop_id,item_cnt_month_1_month_ago,...,mean_of_item_cnt_month_groupby_item_id_12_month_ago,mean_of_item_price_groupby_shop_id_12_month_ago,mean_of_item_cnt_month_groupby_shop_id_12_month_ago,mean_of_item_price_groupby_cats_12_month_ago,mean_of_item_cnt_month_groupby_cats_12_month_ago,mean_of_item_price_groupby_item_id_shop_id_12_month_ago,mean_of_item_cnt_month_groupby_item_id_shop_id_12_month_ago,diff_between_item_shop_and_item,diff_between_item_and_category,item_cnt_month
0,0,1960.580473,2.873303,804.758232,1.773768,1537.78918,2.633343,1693.518519,1.444444,0.0,...,2.873303,804.758232,1.773768,1537.78918,2.633343,1693.518519,1.444444,-267.061955,422.791294,0.671905
1,1,372.622621,1.379644,804.758232,1.773768,372.622621,1.379644,372.622621,1.379644,0.0,...,0.0,830.860711,0.0,372.622621,0.0,399.0,0.0,0.0,0.0,0.007018
2,2,844.516003,2.668421,804.758232,1.773768,1537.78918,2.633343,859.0,2.0,1.0,...,0.0,830.860711,0.0,372.622621,0.0,399.0,0.0,14.483997,-693.273177,1.067947
3,3,792.527697,1.855263,804.758232,1.773768,1537.78918,2.633343,599.0,1.0,0.0,...,0.0,830.860711,0.0,372.622621,0.0,399.0,0.0,-193.527697,-745.261482,0.258787
4,4,1537.78918,2.633343,804.758232,1.773768,1537.78918,2.633343,1537.78918,2.633343,0.0,...,0.0,830.860711,0.0,372.622621,0.0,399.0,0.0,0.0,0.0,0.091715


## Neural Network (not used in final submission)

In [None]:
import keras
from keras.models import Sequential
import keras.layers as ll
from keras import metrics

In [None]:
model = Sequential(name="mlp")

model.add(ll.InputLayer([20]))
# network body
model.add(ll.Dense(32))
model.add(ll.Activation('relu'))
model.add(ll.Dropout(0.5))
model.add(ll.Dense(32))
model.add(ll.Activation('relu'))
model.add(ll.Dense(1))
model.add(ll.Activation('relu'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=[metrics.mse])

In [None]:
model.summary()

In [None]:
model.fit(X_train.values, y_train.values,
          validation_data=(X_validation.values, y_validation.values), epochs=5)

## Stacking (takes a long time to run)

In [None]:
X_train_new = X_train.copy()
X_validation_new = X_validation.copy()
X_test_new = X_test.copy()

Train three xgboost model with different params

In [None]:
params1 = {
        'eta': 0.08, #best 0.08
        'max_depth': 7,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'seed': 3,
        'gamma':1,
        'silent': True
    }

In [None]:
params2 = {
        'eta': 0.08, #best 0.08
        'max_depth': 8,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'seed': 4,
        'gamma':1,
        'silent': True
    }

In [None]:
params3 = {
        'eta': 0.08, #best 0.08
        'max_depth': 6,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'seed': 5,
        'gamma':1,
        'silent': True
    }

In [None]:
watchlist = [
    (xgb.DMatrix(X_train, y_train), 'train'),
    (xgb.DMatrix (X_validation, y_validation), 'validation')
]
for i, params in enumerate([params1, params2, params3]):
    model = xgb.train(params, xgb.DMatrix(X_train, y_train), 500,  watchlist, maximize=False, verbose_eval=50, early_stopping_rounds=50)
    X_train_new['xgboost_item_cnt_month_'+str(i)] = model.predict(xgb.DMatrix(X_train), ntree_limit=model.best_ntree_limit)
    X_validation_new['xgboost_item_cnt_month_'+str(i)] = model.predict(xgb.DMatrix(X_validation), ntree_limit=model.best_ntree_limit)
    X_test_new['xgboost_item_cnt_month_'+str(i)] = model.predict(xgb.DMatrix(X_test), ntree_limit=model.best_ntree_limit)

In [None]:
X_test_new.head(10)

In [None]:
X_train_new.to_csv('./data/X_train_new.csv', index=False)
X_validation_new.to_csv('./data/X_validation_new.csv', index=False)
X_test_new.to_csv('./data/X_test_new.csv', index=False)

In [None]:
X_train_new = pd.read_csv('./data/X_train_new.csv')
X_validation_new = pd.read_csv('./data/X_validation_new.csv')
X_test_new = pd.read_csv('./data/X_test_new.csv')

Train three knn regressors

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
X_train_sample, _, y_train_sample, __ = train_test_split(X_train, y_train, train_size=.05, random_state=10)
scaler = MinMaxScaler()
scaler.fit(X_train_sample)
for k in (2, 3, 4):
    print("Training model "+str(k))
    neigh = KNeighborsRegressor(n_neighbors=k, n_jobs=4, algorithm='kd_tree')
    neigh.fit(scaler.transform(X_train_sample), y_train_sample)
    print("Using "+str(k)+" to predict")
    X_train_new[str(k)+'_neighbors'] = neigh.predict(scaler.transform(X_train))
    X_validation_new[str(k)+'_neighbors'] = neigh.predict(scaler.transform(X_validation))
    X_test_new[str(k)+'_neighbors'] = neigh.predict(scaler.transform(X_test))


Train svm regressors (not used)

In [None]:
from sklearn.svm import SVR
scaler = MinMaxScaler()
X_train_transform = scaler.fit_transform(X_train)
X_validation_transform = scaler.transform(X_validation)
X_test_transform = scaler.transform(X_test)
for kernel in 'poly', 'rbf', 'sigmoid':
    clf = SVR(kernel=kernel, max_iter=500)
    print("Training the "+kernel+" model")
    clf.fit(X_train_transform, y_train)
    print("Using the "+kernel+" model to predict")
    X_train_new['svm_'+kernel] = clf.predict(X_train_transform)
    X_validation_new['svm_'+kernel] = clf.predict(X_validation_transform)
    X_test_new['svm_'+kernel] = clf.predict(X_test_transform)

In [None]:
X_test_new.head(10)

In [None]:
for df in X_train_new, X_validation_new, X_test_new:
    df.drop(['svm_poly', 'svm_rbf', 'svm_sigmoid'], axis=1, inplace=True)

Use LR to ensemble all models

In [None]:
X_train = X_train_new
X_validation = X_validation_new
X_test = X_test_new

In [None]:
from sklearn.linear_model import Ridge
model = Ridge(alpha=1, copy_X=True, normalize=True, max_iter=1000)
model.fit(X_train, y_train)
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_validation, model.predict(X_validation)))

In [None]:
pred = model.predict(X_test)
test_df['item_cnt_month'] = pred.clip(0, 40)
test_df[['ID', 'item_cnt_month']].to_csv('stacking_submission.csv', index=False)