## Load Dependencies

In [1]:
import pandas as pd
import numpy as np 
import pickle
import xgboost as xgb
import time
import matplotlib as plt
import re
import datetime

In [2]:
# Record the version of each versioned dependency so the run can be reproduced.
print('{} version {}'.format('pandas', pd.__version__))
print('{} version {}'.format('numpy', np.__version__))
print('{} version {}'.format('xgboost', xgb.__version__))
# `plt` is bound to the top-level matplotlib package here, which exposes __version__.
print('{} version {}'.format('matplotlib', plt.__version__))
print('{} version {}'.format('re', re.__version__))

pandas version 0.23.4
numpy version 1.15.4
xgboost version 0.80
matplotlib version 3.0.2
re version 2.2.1


## Set Global Variables

In [3]:
# Run mode: 'VAL' validates the model, 'TEST' trains it for the final submission.
PHASE = 'TEST'
# True -> load the serialised model with pickle instead of fitting a new one.
LOAD_MODEL = True
# True -> write the submission files to disk.
WRITE_FILE = True
# Wall-clock time (seconds since the epoch) at which the notebook run started.
START_NOTEBOOK = time.time()

## Load Data

Let's load in the data prepared previously

In [4]:
# Load the four dataset collections written by the data-preparation step.
# The context manager guarantees the file handle is closed once unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open('saved_datasets/19-3-19,pipeline_A_reduced_frommaster,dateback=18/xtrain_xtest_ytrain_ytest.pickle','rb') as dataset_file:
    x_train , x_test , y_train , y_test = pickle.load(dataset_file)

## Preprocessing

Clipping the target and dropping the columns that were only used for processing

In [5]:
def preproc_model3(x_train,x_test,y_train,y_test):
    """Drop the helper columns from every feature frame and clip every target.

    Each argument is an iterable of pandas objects. Every element of
    ``x_train`` / ``x_test`` must contain all of the helper columns listed
    below, otherwise ``drop`` raises ``KeyError``. The inputs are not
    modified; new objects are returned.

    Returns:
        A list ``[x_train, x_test, y_train, y_test]`` in which each entry is
        a list of the processed elements, in their original order.
    """
    helper_columns = ['shop_id','item_id','item_category_id','city','item_type','item_info','month']
    target_min, target_max = 0, 20

    def strip_helpers(frames):
        # Remove the identifier/helper columns from each frame.
        return [frame.drop(columns=helper_columns) for frame in frames]

    def clip_targets(targets):
        # Bound each target to the closed interval [target_min, target_max].
        return [target.clip(lower=target_min, upper=target_max) for target in targets]

    return [
        strip_helpers(x_train),
        strip_helpers(x_test),
        clip_targets(y_train),
        clip_targets(y_test),
    ]

In [6]:
# Replace the loaded collections with their preprocessed counterparts.
# NOTE: not idempotent; a second run fails because the helper columns are already dropped.
x_train, x_test, y_train, y_test = preproc_model3(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

## Other Preprocessing Attempts

Attempts were made to process the features further. These included averaging over a longer period, dropping early data, and non-negative matrix factorisation. Ultimately, none of these proved useful.

### Try averaging some features

A decision was made not to use this.

### NMF

### Try culling early data

Perhaps there is data so old, that we shouldn't use it?

# Feature Selection

After some experimentation it was decided that all generated features would be used. **This was not the case originally**, but the data preparation notebook was edited to produce only the useful features.

In [7]:
# Show the feature names of the first training frame.
list(x_train[0].columns)

['date_block_num',
 'sum_item_sales_back_1',
 'sum_shop_sales_back_1',
 'item_cnt_month_back_1',
 'sum_item_cat_sales_back_1',
 'sum_city_back_1',
 'sum_itemtype_back_1',
 'sum_iteminfo_back_1',
 'sum_item_sales_back_2',
 'sum_shop_sales_back_2',
 'item_cnt_month_back_2',
 'sum_item_cat_sales_back_2',
 'sum_city_back_2',
 'sum_itemtype_back_2',
 'sum_iteminfo_back_2',
 'sum_item_sales_back_3',
 'sum_shop_sales_back_3',
 'item_cnt_month_back_3',
 'sum_item_cat_sales_back_3',
 'sum_city_back_3',
 'sum_itemtype_back_3',
 'sum_iteminfo_back_3',
 'sum_item_sales_back_4',
 'sum_shop_sales_back_4',
 'item_cnt_month_back_4',
 'sum_item_cat_sales_back_4',
 'sum_city_back_4',
 'sum_itemtype_back_4',
 'sum_iteminfo_back_4',
 'sum_item_sales_back_5',
 'sum_shop_sales_back_5',
 'item_cnt_month_back_5',
 'sum_item_cat_sales_back_5',
 'sum_city_back_5',
 'sum_itemtype_back_5',
 'sum_iteminfo_back_5',
 'sum_item_sales_back_6',
 'sum_shop_sales_back_6',
 'item_cnt_month_back_6',
 'sum_item_cat_sales_ba

### Select Validation/Test

In [8]:
# Pick the dataset split, evaluation sets and tree budget for the current PHASE.
# Index 0 of each collection is used for validation, index 1 for the final test run.
if PHASE=='VAL':
    phase_index = 0
    # Large budget; the fit cell passes early_stopping_rounds to cut it short.
    n_estimators = 50000
elif PHASE=='TEST':
    phase_index = 1
    # NOTE(review): presumably the round count found during validation; confirm.
    n_estimators = 1943
else:
    # Fail loudly instead of leaving xtrain/eval_set/n_estimators undefined or stale.
    raise ValueError("PHASE must be 'VAL' or 'TEST', got {!r}".format(PHASE))

xtrain = x_train[phase_index]
ytrain = y_train[phase_index]
xtest = x_test[phase_index]
ytest = y_test[phase_index]

# Validation monitors both the training and held-out sets; the test run only the training set.
if PHASE=='VAL':
    eval_set = [(xtrain,ytrain),(xtest,ytest)]
else:
    eval_set = [(xtrain,ytrain)]

## Fitting

In [9]:
# Gradient-boosted regressor with shallow trees and a small learning rate.
# `random_state` / `n_jobs` are the scikit-learn style names that replace the
# deprecated `seed` / `nthread` aliases of the xgboost sklearn wrapper.
# NOTE(review): 'reg:linear' is the squared-error objective in the xgboost 0.80
# used here; later releases rename it 'reg:squarederror'. Update when upgrading.
model = xgb.XGBRegressor(
    random_state=0,
    max_depth=2,
    learning_rate=0.05,
    n_estimators=n_estimators,
    objective='reg:linear',
    n_jobs=7,
    colsample_bytree=1,
    subsample=1
)


In [10]:
if LOAD_MODEL:
    # Replace the freshly constructed model with the previously serialised one.
    # The context manager closes the file handle once unpickling finishes.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open('model.pickle','rb') as model_file:
        model = pickle.load(model_file)

In [11]:
# Train only when no serialised model was loaded.
if not LOAD_MODEL:
    model.fit(
        X=xtrain,
        y=ytrain,
        # Metrics are tracked on eval_set and drive early stopping.
        eval_set=eval_set,
        early_stopping_rounds=500,
        verbose=True
    )

## Save Model

In [12]:
if not LOAD_MODEL:
    # Persist the newly fitted model. The context manager flushes and closes
    # the file, so the pickle on disk is complete before the notebook moves on.
    with open('model.pickle','wb') as model_file:
        pickle.dump(model, model_file)

## Exploration

A number of plots are used to better understand both model and feature importance

### Accuracy per tree in forest

In [13]:
# Evaluation history only has two tracked sets in the validation phase.
if PHASE=='VAL':
    evals_result = model.evals_result()
    # One RMSE value per boosting round for each entry of eval_set.
    val0, val1 = (
        evals_result[set_name]['rmse']
        for set_name in ('validation_0', 'validation_1')
    )
    deval = pd.DataFrame({'val0':val0,'val1':val1})

Below is a plot of RMSE as a function of the number of trees used; `val0` is the training set and `val1` is the validation set.

In [14]:
# The RMSE curves exist only when the notebook runs in validation mode.
if PHASE=='VAL':
    deval.plot(kind='line')

### Gain

Unlike the frequency metric, which only counts the number of times a tree is split on each variable, gain measures the increase in accuracy contributed by each feature. After a split, the value of the loss function decreases; this decrease is credited as gain to the feature that was split on.

In [15]:
# Per-feature gain from the underlying booster, smallest first so that the
# horizontal bar chart plotted from it ends with the largest bar on top.
booster = model.get_booster()
gain = pd.Series(booster.get_score(importance_type='gain')).sort_values()
type(gain)

pandas.core.series.Series

In [16]:
# Horizontal bar chart of gain per feature, enlarged so every label is readable.
ax = gain.plot(color='b', kind='barh')
fig = ax.figure
fig.set_size_inches(10, 20)

### Gain by time-indexed variable type

Some variables are lagged over time. Here their effects are aggregated to give a better overall picture of time-based variable importance.

In [17]:
# Name prefixes of the lagged feature families; the lag number follows the prefix.
coltypes = [
    'sum_item_sales_back_',
    'sum_shop_sales_back_',
    'item_cnt_month_back_',
    'sum_item_cat_sales_back_',
    'sum_item_cat_shop_sales_back_',
    'sum_city_back_',
    'sum_itemtype_back_',
    'sum_iteminfo_back_',
    'sum_city_item_back_',
    'sum_city_item_cat_back_',
    'sum_itemtype_shop_back_',
    'sum_itemtype_city_back_',
]

# save: the matching gain entries per prefix; d: their summed gain per prefix.
save = []
d = {}
for prefix in coltypes:
    # Each prefix is used as a regex against the feature names in `gain`.
    matched = gain.filter(regex=prefix)
    save.append(matched)
    d[prefix] = np.sum(matched)

In [18]:
# Rank the feature families by aggregated gain, largest first.
sorted(
    d.items(),
    key=lambda family_gain: family_gain[1],
    reverse=True,
)

[('item_cnt_month_back_', 103314.54555532697),
 ('sum_shop_sales_back_', 21380.465809077472),
 ('sum_city_back_', 20044.459942743037),
 ('sum_item_cat_sales_back_', 12652.016951253481),
 ('sum_itemtype_back_', 12535.807207940754),
 ('sum_item_sales_back_', 7747.190809840945),
 ('sum_iteminfo_back_', 5462.923161475649),
 ('sum_item_cat_shop_sales_back_', 0.0),
 ('sum_city_item_back_', 0.0),
 ('sum_city_item_cat_back_', 0.0),
 ('sum_itemtype_shop_back_', 0.0),
 ('sum_itemtype_city_back_', 0.0)]

As expected, `item_cnt_month_back_` is the most important of these features, since it is the lagged target variable.

# Write Submissions

In [19]:
# Predict on the feature frame selected for the current PHASE, using the
# model that was either unpickled or fitted above.
preds = model.predict(xtest)

## Submission 1

In [20]:
if WRITE_FILE:
    # One row per prediction, with consecutive integer IDs starting at 0.
    # The row count comes from `preds` rather than a hard-coded literal, so the
    # ID column always lines up with the predictions, whatever split was used.
    submission = pd.DataFrame({'ID':np.arange(len(preds)),'item_cnt_month':preds})
    # Timestamp the file name; ':' is replaced because it is not allowed in
    # file names on some platforms.
    submission.to_csv('submissions/submission'+str(datetime.datetime.now()).replace(':','.')+'.csv',index=False)

## Submission 2, Adjusting for Leak

In [21]:
if WRITE_FILE:
    # Rescale the predictions so that their mean equals target_mean.
    # NOTE(review): 0.2839365 is presumably the test-set mean target recovered
    # from the leak mentioned in the section title; confirm its source.
    target_mean = 0.2839365
    scale = target_mean/submission['item_cnt_month'].mean()
    submission['item_cnt_month'] = scale * submission['item_cnt_month']
    stamp = str(datetime.datetime.now()).replace(':','.')
    submission.to_csv('submissions/submission_adjusted_'+stamp+'.csv',index=False)