## DM2 DMC | XGBoost

Credits: Building on datamining2/neuralnetworks/mlp_baseline.ipynb

Install XGBoost with, e.g., `conda install -c rdonnelly py-xgboost`.

For an introductory example, see: https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/

#### Working Directory

In [1]:
# Directory that holds the input files (X_flat.pkl, Y_flat.pkl, data_v0.1_sales.csv).
# Set the DM2_DMC_DATA_DIR environment variable to run on another machine;
# the original local path remains the default.
import os  # imported here because this cell runs before the imports cell

working_directory = os.environ.get('DM2_DMC_DATA_DIR',
                                   'C:/Users/JulianWeller/Desktop/DM2_DMC_Data/')
# Later cells build paths via string concatenation, so a trailing separator is required
if not working_directory.endswith(('/', '\\')):
    working_directory += '/'

#### Imports

In [2]:
import pandas as pd
import numpy as np
import pickle
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

#### Loading the Data

As provided by Philipp:

In [3]:
# Load the flat feature/target tables and keep only the rows whose key belongs
# to the 'big' cluster.
# NOTE(review): pickle.load can execute arbitrary code contained in the file --
# only load pickles that come from a trusted source.
with open(working_directory + 'X_flat.pkl', 'rb') as x_file:  # context manager closes the handle
    X_full = pickle.load(x_file)
with open(working_directory + 'Y_flat.pkl', 'rb') as y_file:
    Y_full = pickle.load(y_file)

# Attach the cluster label of every key so both tables can be filtered on it
clusters = pd.read_csv(working_directory + 'data_v0.1_sales.csv')
clusters = clusters[['key', 'cluster']]
X_full = X_full.merge(clusters, on='key')
Y_full = Y_full.merge(clusters, on='key')

# .copy() turns the filtered result into an independent frame, so columns added
# in later cells are not written to a slice of the merged table
X_full = X_full[X_full['cluster'] == 'big'].copy()
Y_full = Y_full[Y_full['cluster'] == 'big'].copy()

# Sanity check: features and targets must have the same number of rows
if len(X_full) == len(Y_full):
    # The cluster label was only needed for filtering
    del X_full['cluster']
    del Y_full['cluster']
    print(X_full.shape)
    print(Y_full.shape)
    print(X_full.columns)
    print(Y_full.columns)
    # Counts distinct keys, not rows
    print('We now have {} training samples.'.format(len(Y_full['key'].unique())))# adjust parameter in NN accordingly
else:
    print('Sth. went wrong.')

(353379, 108)
(353379, 3)
Index(['key', 'pid_x', 'size_x', 'color', 'brand', 'rrp', 'mainCategory',
       'category', 'subCategory', 'releaseDate',
       ...
       'cat_7', 'cat_10', 'cat_16', 'cat_18', 'cat_24', 'cat_30', 'cat_33',
       'cat_36', 'cat_37', 'marketing_activity'],
      dtype='object', length=108)
Index(['key', 'date', 'sales'], dtype='object')
We now have 2873 training samples.


#### Train/Test Split

In [4]:
# Calendar month (1-12) of every feature row; used below to split train/test
X_full['month'] = pd.to_datetime(X_full['date']).dt.month

In [5]:
# Calendar month (1-12) of every target row; used below to split train/test
Y_full['month'] = pd.to_datetime(Y_full['date']).dt.month

In [6]:
# Training features: every row that does not fall in month 1 (January)
X_full_train = X_full[X_full['month'].ne(1)]

In [7]:
# Training targets: the 'sales' column of every row outside month 1 (January)
Y_full_train = Y_full.loc[Y_full['month'].ne(1), 'sales']

In [8]:
# Held-out features: only the rows that fall in month 1 (January)
X_full_test = X_full[X_full['month'].eq(1)]

In [9]:
# Held-out targets: the 'sales' column of the rows in month 1 (January)
Y_full_test = Y_full.loc[Y_full['month'].eq(1), 'sales']

#### Additional Preparations

In [10]:
# Keep an independent (key, date) lookup table; both columns are dropped from the model input below
keys_dates = X_full[['key', 'date']].copy()

In [11]:
# Columns removed from the feature frames before they are handed to the model.
# Each name appears exactly once (the original list contained 'rrp' twice).
drop_x_cols = ['key', 'pid_x', 'size_x', 'color', 'brand', 'rrp', 'date', 'day_of_week',
               'mainCategory', 'category', 'subCategory', 'releaseDate',
               'price', 'month']

In [12]:
# Training features without the columns listed in drop_x_cols
X_train = X_full_train.drop(columns=drop_x_cols)

In [13]:
# Held-out features without the columns listed in drop_x_cols
X_test = X_full_test.drop(columns=drop_x_cols)

In [14]:
# Training targets under the short name used by the following cells (same object, no copy)
Y_train = Y_full_train

In [15]:
# Held-out targets under the short name used by the following cells (same object, no copy)
Y_test = Y_full_test

In [16]:
# Convert the pandas objects to plain numpy arrays for model input.
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0.
X_train = X_train.values  # 2-D: one row per sample, one column per feature
Y_train = Y_train.values  # 1-D: one target value per sample
X_test = X_test.values
Y_test = Y_test.values

In [17]:
# Display the dimensions of the training feature array
X_train.shape

(264316, 95)

In [18]:
# Display the dimensions of the training target array
Y_train.shape

(264316,)

#### Training the Model

In [19]:
# Gradient-boosted regressor with the library's default hyperparameters,
# fitted on the training arrays; the fitted estimator is displayed as cell output
model = XGBRegressor()
model.fit(X=X_train, y=Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

#### Predictions

In [20]:
# Raw (unrounded) model output for every row of the held-out feature array
y_pred = model.predict(X_test)

In [21]:
# Round every prediction to the nearest whole number in one vectorised call.
# np.round rounds halves to even, like the built-in round(); .tolist() yields a
# plain list of Python floats, so the result does not depend on how the
# installed numpy version implements round() on numpy scalars.
predictions = np.round(y_pred).tolist()

#### Evaluation

In [22]:
# Root-mean-squared error of `predictions` against the held-out targets
mse = mean_squared_error(y_true=Y_test, y_pred=predictions)
rmse = sqrt(mse)
print("RMSE: " + str(rmse))

RMSE: 2.2764513076054653


#### Example

For the item with key '10447L', the model predicted sales of 12.0 on '2018-01-15'; the true value was 15.0.

In [23]:
# Key of the held-out row at position 3610 (arbitrary example row)
X_full_test['key'].iloc[3610]

'10447L'

In [24]:
# Date of the held-out row at position 3610 (arbitrary example row)
X_full_test['date'].iloc[3610]

'2018-01-15'

In [25]:
# Rounded prediction at position 3610 (same position as the key/date lookups above)
predictions[3610]

12.0

In [26]:
# True sales value at position 3610
# NOTE(review): assumes the feature and target tables share the same row order -- confirm
Y_test[3610]

15.0