# Part 2, xgboost

In [None]:
import numpy as np
import pandas as pd
import gc
import pickle
import time
import os
print(os.listdir("../input"))
from xgboost import XGBRegressor

In [None]:
# Load the pickled feature table into `data`.
# NOTE(review): presumably the output of the companion feature-engineering
# notebook named in the path — confirm. Unpickling is only safe for trusted files.
data = pd.read_pickle('../input/predict-future-sales-feature-engineering-xgb-15/feature.pickle')

In [None]:
# List every column name (one per line) and the (rows, columns) shape of the
# freshly loaded table.
print('\n'.join(data.columns))
print(data.shape)

#### Removing some irrelevant features that led to overfitting in earlier training runs

In [None]:
# Keep only the chosen columns. Apart from 'date_block_num', 'item_cnt_month'
# and 'month', every kept column is "<stem>_lag_<k>" for the five lags below,
# so the list is generated from the stems instead of being spelled out.
# Suffix order (1, 12, 2, 3, 6) and stem order are deliberate: together they
# reproduce the column order of the hand-written list.
_lag_suffixes = ('1', '12', '2', '3', '6')

# Stems whose lag columns come before the 'item_cnt_month' column.
_stems_before_target = (
    'date_cat_avg_item_cnt',
    'date_cat_avg_item_price',
    'date_cat_sum_orders',
    'date_item_avg_item_cnt',
    'date_item_avg_item_price',
    'date_item_sum_orders',
    'date_shop_avg_item_cnt',
    'date_shop_avg_item_price',
    'date_shop_sum_orders',
)

# Stems whose lag columns come after the 'item_cnt_month' column.
_stems_after_target = (
    'item_cnt_month',
    'item_monthly_mean',
    'orders',
    'shop_item_avg_item_cnt',
    'shop_item_avg_item_price',
    'shop_item_monthly_mean',
    'shop_item_sum_orders',
    'shop_monthly_mean',
)

_kept_columns = ['date_block_num']
_kept_columns += ['{}_lag_{}'.format(stem, lag)
                  for stem in _stems_before_target
                  for lag in _lag_suffixes]
_kept_columns.append('item_cnt_month')
_kept_columns += ['{}_lag_{}'.format(stem, lag)
                  for stem in _stems_after_target
                  for lag in _lag_suffixes]
_kept_columns.append('month')

data = data[_kept_columns]

# Drop the scaffolding so the notebook namespace only gains what it had before.
del _lag_suffixes, _stems_before_target, _stems_after_target, _kept_columns

In [None]:
# Print the column names and shape again, now that `data` has been narrowed
# to the selected columns, to confirm what was kept.
print('\n'.join(data.columns))
print(data.shape)

#### Dividing data into train, validation and test sets based on date_block_num

In [None]:
# Split by month index: blocks below 33 train the model, block 33 validates
# it, and block 34 is the set to predict (no target is extracted for it).
# Each mask is built once and reused for both the features and the target.
_is_train = data['date_block_num'] < 33
_is_valid = data['date_block_num'] == 33
_is_test = data['date_block_num'] == 34

# Every column except the target is a feature.
_feature_columns = data.columns.drop('item_cnt_month')

X_train = data.loc[_is_train, _feature_columns]
Y_train = data.loc[_is_train, 'item_cnt_month']
X_valid = data.loc[_is_valid, _feature_columns]
Y_valid = data.loc[_is_valid, 'item_cnt_month']
X_test = data.loc[_is_test, _feature_columns]

# Remove the temporaries so nothing derived from `data` lingers after it is
# deleted later in the notebook.
del _is_train, _is_valid, _is_test, _feature_columns

In [None]:
# Print the column labels of `data` as an array, preceded by a blank line.
print('\n',data.columns.values)

In [None]:
# The full table is no longer needed once the splits exist: drop the name and
# force a garbage-collection pass. The trailing semicolon keeps the notebook
# from echoing gc.collect()'s return value.
del data
gc.collect();

#### Model declaration and training

In [None]:
# Time the whole declare-and-fit step.
ts = time.time()

# Booster hyper-parameters, gathered in one place.
xgb_params = dict(
    max_depth=8,
    tree_method='exact',
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42,
)
model = XGBRegressor(**xgb_params)

# Monitor RMSE on the validation month and stop once it has not improved for
# 10 consecutive rounds.
# NOTE(review): newer xgboost releases expect `eval_metric` and
# `early_stopping_rounds` on the constructor rather than on fit() — confirm
# against the installed version before upgrading.
fit_options = dict(
    eval_set=[(X_valid, Y_valid)],
    eval_metric="rmse",
    early_stopping_rounds=10,
    verbose=True,
)
model.fit(X_train, Y_train, **fit_options)

# Last expression of the cell: elapsed seconds, shown as the cell output.
time.time() - ts

In [None]:
# Predict the validation month and the test month, bounding every prediction
# to the interval [0, 20].
# NOTE(review): presumably [0, 20] is the target range the competition
# scores on — confirm.
Y_pred = np.clip(model.predict(X_valid), 0, 20)
Y_test = np.clip(model.predict(X_test), 0, 20)

In [None]:
# Persist the fitted model to disk, then read it straight back so that
# `model` is the round-tripped object.
# Each file is opened in a `with` block so the handle is closed (and the
# write flushed) deterministically; bare open() calls relied on the
# interpreter's reference counting to do that.
with open('xgb_model.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)

# Only load pickles this notebook wrote itself: unpickling untrusted data can
# execute arbitrary code.
with open('xgb_model.pickle', 'rb') as model_file:
    model = pickle.load(model_file)

#### Plot feature importance

In [None]:
from xgboost import plot_importance

import matplotlib.pyplot as plt
%matplotlib inline

def plot_features(booster, figsize):
    """Plot the feature importances of `booster` on a new figure.

    A figure with a single axes of size `figsize` is created, and the
    result of xgboost's `plot_importance` drawn on that axes is returned.
    """
    _, axes = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    return plot_importance(booster=booster, ax=axes)

# Draw the importance chart for the trained model on a 12 x 42 figure
# (presumably this tall so that one bar per feature stays readable).
plot_features(model, (12,42))

#### Create submission

In [None]:
# Read the competition test file and index it by its ID column.
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')
test = test.set_index('ID')

# Pair each test ID with its clipped prediction.
# NOTE(review): assumes the rows of Y_test are in the same order as the rows
# of test.csv — confirm against the feature-engineering step.
submission = pd.DataFrame({"ID": test.index, "item_cnt_month": Y_test})

# Write the two-column file without the DataFrame's own index.
submission.to_csv('submission.csv', index=False)

# Note: This takes a considerable amount of time to run. I ran it on a Kaggle kernel.