## Load Dependencies

In [1]:
import pandas as pd
import numpy as np 
import pickle
import xgboost as xgb
import time
import graphviz
import matplotlib as plt
import re

## Set Global Variables

In [None]:
# Wall-clock timestamp taken when this cell runs.
START_NOTEBOOK = time.time()

# Switches: SUB guards the prediction step near the end of the notebook,
# AVE turns on the 3-month feature averaging (ave_feats) during preprocessing.
SUB, AVE = False, False

# NOTE(review): NMF_DIM is not referenced by any other cell visible here — confirm it is still needed.
NMF_DIM = 40

## Load Data

In [None]:
# Load the pickled (x_train, x_test, y_train, y_test) tuple. Each of the four
# objects is indexed with [0] and [1] further down in the notebook.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that were produced by a trusted pipeline.
# The context manager guarantees the file handle is closed once loading finishes.
with open('../saved_datasets/19-3-19,pipeline_A_reduced_frommaster,dateback=18/xtrain_xtest_ytrain_ytest.pickle','rb') as pickle_file:
    x_train , x_test , y_train , y_test = pickle.load(pickle_file)

## Preprocessing

### Try averaging some features

In [None]:
def ave_feats(df):
    """Collapse 18 numbered columns per prefix into six 3-column averages.

    For each prefix listed below, the columns ``<prefix>1`` .. ``<prefix>18``
    are averaged in consecutive groups of three (1-3, 4-6, ..., 16-18). Each
    average is stored as ``ave_<prefix><first index of the group>``, and every
    column whose name starts with the prefix is then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``<prefix>1`` .. ``<prefix>18`` for every prefix.

    Returns
    -------
    pandas.DataFrame
        A new frame; the work is done on a copy of ``df``.
    """
    lag_prefixes = [
        'sum_item_sales_back_',
        'sum_shop_sales_back_',
        'item_cnt_month_back_',
        'sum_item_cat_sales_back_',
        'sum_city_back_',
        'sum_itemtype_back_',
        'sum_iteminfo_back_',
    ]
    out = df.copy()
    for prefix in lag_prefixes:
        print('averaging on '+prefix+'...')
        # Group starts are 1, 4, 7, 10, 13, 16; each group spans three columns.
        for first in range(1, 18, 3):
            col_a, col_b, col_c = (out[prefix+str(first+offset)] for offset in range(3))
            out['ave_'+prefix+str(first)] = (col_a+col_b+col_c)/3
        # The new 'ave_' columns do not start with the prefix, so they survive this filter.
        survivors = [name for name in out.columns.tolist() if re.match('^'+prefix, name) is None]
        out = out[survivors]
    return out

In [None]:
# Optional experiment: run ave_feats over every element of the train and test lists.
if AVE:
    x_train = [ave_feats(frame) for frame in x_train]
    x_test = [ave_feats(frame) for frame in x_test]

### Try culling early data

In [None]:
# Keep references to the incoming training data under *_saved, then rebuild
# x_train / y_train with only the rows whose date_block_num is above CUTOFF.
# Note: re-running this cell rebinds *_saved to the already-filtered lists.
CUTOFF = 25
x_train_saved, y_train_saved = x_train, y_train
x_train, y_train = [], []
for features, targets in zip(x_train_saved, y_train_saved):
    # One boolean mask per element, reused for the features and the matching targets.
    is_recent = features.date_block_num > CUTOFF
    x_train.append(features[is_recent])
    y_train.append(targets[is_recent])

### Other Preprocessing

In [None]:
def preproc_model3(x_train,x_test,y_train,y_test):
    """Drop a fixed set of columns from the features and clip the targets.

    Parameters
    ----------
    x_train, x_test : iterable of pandas.DataFrame
        Every frame must contain all columns named in ``dropped_columns``.
    y_train, y_test : iterable of objects with a ``clip`` method
        Each element is clipped to the closed range [0, 20].

    Returns
    -------
    list
        ``[x_train, x_test, y_train, y_test]``, each a new list; the inputs
        themselves are not modified.
    """
    dropped_columns = ['shop_id','item_id','item_category_id','city','item_type','item_info','month']

    def strip_columns(frames):
        return [frame.drop(dropped_columns, axis=1) for frame in frames]

    def clip_targets(targets):
        return [target.clip(0, 20) for target in targets]

    return [
        strip_columns(x_train),
        strip_columns(x_test),
        clip_targets(y_train),
        clip_targets(y_test),
    ]

In [None]:
# Apply the column drop and target clipping to every train/test element.
# Note: the results are bound to the same names, so running this cell a second
# time fails — DataFrame.drop raises KeyError once the columns are already gone.
x_train , x_test , y_train , y_test = preproc_model3(x_train,x_test,y_train,y_test)

# Feature Selection

In [None]:
#x_train[0].columns.tolist()

In [None]:
# Regex fragments selecting feature columns to discard: quickdrop() removes
# every column whose name matches '.*' + fragment. An empty list keeps everything.
# Commented-out alternatives:
#   feat_list = ['_'+str(x)+'$' for x in range(7,19)] + []
#   feat_list = ['iteminfo','item_cat','city','item_sales']
feat_list = list()
feat_list

In [None]:
def quickdrop(df):
    """Return ``df`` restricted to the columns not matched by ``feat_list``.

    A column is removed when ``re.match('.*' + pattern, column_name)`` succeeds
    for at least one pattern in the module-level ``feat_list``. With an empty
    ``feat_list`` every column is kept, in its original order.
    """
    def is_dropped(column):
        return any(re.match('.*'+pattern, column) for pattern in feat_list)

    return df[[column for column in df.columns.tolist() if not is_dropped(column)]]

In [None]:
# Remove the columns selected by feat_list from every train and test frame.
x_train = [quickdrop(frame) for frame in x_train]
x_test = [quickdrop(frame) for frame in x_test]

In [None]:
#x_train[0].columns.tolist()

# Fitting

In [None]:
# Exploratory regressor: n_estimators is only an upper bound here because the
# fit call below uses early stopping.
model = xgb.XGBRegressor(
    objective='reg:linear',
    # tree shape and boosting schedule
    max_depth=2,
    learning_rate=0.5,
    n_estimators=1000,
    # min_child_weight=100,
    # per-tree row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # reproducibility and parallelism
    seed=0,
    nthread=7,
)

# Data sets scored after every boosting round: the first train element and the
# first test element, in that order.
eval_set = [
    (x_train[0], y_train[0]),
    (x_test[0], y_test[0]),
]

In [None]:
# Fit on the first train element. With several eval_set entries, xgboost uses
# the last one (x_test[0] / y_test[0]) for early stopping: training halts once
# its metric has not improved for 50 consecutive rounds.
model.fit(
    X=x_train[0],
    y=y_train[0],
    eval_set=eval_set,
    early_stopping_rounds=50,
    verbose=True,
)

In [None]:
# Record which date_block_num cutoff the training data above was filtered with.
print('CUTOFF='+str(CUTOFF))

In [None]:
# Per-round RMSE recorded during fit: 'validation_0' is the first eval_set
# entry and 'validation_1' the second.
history = model.evals_result()
val0, val1 = (history[key]['rmse'] for key in ('validation_0', 'validation_1'))
deval = pd.DataFrame({'val0': val0, 'val1': val1})

In [None]:
# Short pause before plotting — presumably lets the training log finish
# flushing so it does not interleave with the figure; confirm.
time.sleep(0.2)
# Learning curves: RMSE per boosting round for both evaluation sets.
deval.plot()

# Feature-importance chart. `ax` has to be assigned in this cell: on a fresh
# kernel no earlier cell defines it, so resizing the figure would otherwise
# raise NameError.
ax = xgb.plot_importance(model, height=0.8)
ax.figure.set_size_inches(5, 10)

In [None]:
#%env PATH=C:\Program Files (x86)\Graphviz2.38\bin
#ax = xgb.plot_tree(model)
#ax.figure.set_size_inches(20,20)

In [None]:
# Gain-based importance score per feature, as reported by the underlying
# booster, ordered from smallest to largest.
booster_scores = model.get_booster().get_score(importance_type='gain')
gain = pd.Series(booster_scores).sort_values(ascending=True)
type(gain)

In [None]:
# Horizontal bar chart of the gain scores; the figure is made tall so that
# every feature label gets its own row.
ax = gain.plot(kind='barh')
ax.get_figure().set_size_inches(10, 20)

In [None]:
# Mean target per value of 'new_item' on the first training element.
# assign() works on a copy, so x_train[0] itself is left untouched.
df = x_train[0].assign(target=y_train[0])
df.groupby('new_item')['target'].mean().plot(kind='bar')

In [None]:
# Same breakdown on the first test element: mean target per value of 'new_item'.
# assign() works on a copy, so x_test[0] itself is left untouched.
dft = x_test[0].assign(target=y_test[0])
dft.groupby('new_item')['target'].mean().plot(kind='bar')

In [None]:
# Aggregate the gain scores per column family. Each entry of coltypes is used
# as a regex against the feature names held in the index of `gain`.
coltypes = [
    'sum_item_sales_back_',
    'sum_shop_sales_back_',
    'item_cnt_month_back_',
    'sum_item_cat_sales_back_',
    'sum_item_cat_shop_sales_back_',
    'sum_city_back_',
    'sum_itemtype_back_',
    'sum_iteminfo_back_',
    'sum_city_item_back_',
    'sum_city_item_cat_back_',
    'sum_itemtype_shop_back_',
    'sum_itemtype_city_back_',
]
# save[i] holds the individual gain scores matched by coltypes[i] ...
save = [gain.filter(regex=pattern) for pattern in coltypes]
# ... and d maps each pattern to the sum of those scores.
d = {pattern: np.sum(matched) for pattern, matched in zip(coltypes, save)}

In [None]:
# Column families ranked by summed gain, largest first.
sorted(
    d.items(),
    key=lambda family_total: family_total[1],
    reverse=True,
)

In [None]:
# Summed gain of the first column family in coltypes ('sum_item_sales_back_').
np.sum(save[0])

In [None]:
# Number of features that have a gain score.
len(gain)

In [None]:
# Display every gain score (gain is already a Series sorted in ascending order).
pd.Series(gain)

# Retrain model with all data

# Write Submission

In [None]:
# Enable the prediction step below (`if SUB:`), which produces `preds` for the submission file.
SUB=True

In [None]:
# Discard the exploratory model before a new one is created for the final fit.
del model

In [None]:
# Final regressor: same hyper-parameters as the exploratory model except for a
# fixed number of trees.
# NOTE(review): n_estimators=194 is presumably the best round found by the
# early-stopped run above — confirm and keep the two in sync.
model = xgb.XGBRegressor(
    objective='reg:linear',
    # tree shape and boosting schedule
    max_depth=2,
    learning_rate=0.5,
    n_estimators=194,
    # min_child_weight=100,
    # per-tree row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # reproducibility and parallelism
    seed=0,
    nthread=7,
)

In [None]:
# Retrain on all data: fit on the second train element (index 1).
# eval_set still refers to x_train[0] / x_test[0]; without early stopping it is
# only used to report the metric after each round.
model.fit(
    X=x_train[1],
    y=y_train[1],
    eval_set=eval_set,
    verbose=True,
)

In [None]:
# Predict on the second test element (index 1). `preds` is only defined when
# SUB is true, and the submission cell below requires it.
if SUB:
    print('predicting...')
    preds = model.predict(x_test[1])

In [None]:
# NOTE(review): imports conventionally live in the first cell; this one stays
# here because the cells below rely on it.
import datetime
# Preview of the timestamp used in the submission file names, with ':' replaced
# by '.' — presumably because ':' is not valid in Windows file names; confirm.
str(datetime.datetime.now()).replace(':','.')

In [None]:
# Build the submission table: one row per ID in 0..214199 with its predicted
# item_cnt_month. The DataFrame constructor raises if `preds` has a different length.
submission = pd.DataFrame({
    'ID': np.arange(214200),
    'item_cnt_month': preds,
})
# Time-stamped file name so that successive runs do not overwrite each other.
submission_stamp = str(datetime.datetime.now()).replace(':','.')
submission.to_csv('../gen_data/submission'+submission_stamp+'.csv', index=False)

## Adjusting

In [None]:
# Rescale the predictions so that their mean equals 0.2839365.
# NOTE(review): the origin of this target mean is not documented in the
# notebook — confirm where it comes from.
mean_scale = 0.2839365/submission.item_cnt_month.mean()
submission['item_cnt_month'] = mean_scale * submission.item_cnt_month
adjusted_stamp = str(datetime.datetime.now()).replace(':','.')
submission.to_csv('../gen_data/submission_adjusted_'+adjusted_stamp+'.csv', index=False)