## DM2 DMC | XGBoost

Credits: Building on datamining2/neuralnetworks/mlp_baseline.ipynb

Install XGBoost using e.g.: conda install -c rdonnelly py-xgboost

For an introductory example, see: https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/

#### Working Directory

In [1]:
# Directory that holds the data files read by this notebook.
# Defaults to the original author's local folder; set the DM2_DMC_DATA_DIR
# environment variable to use a different location without editing the code.
import os

working_directory = (os.environ.get('DM2_DMC_DATA_DIR')
                     or 'C:/Users/JulianWeller/Desktop/DM2_DMC_Data/')
# File paths are built by plain string concatenation (working_directory + name),
# so the value must end with a path separator.
if not working_directory.endswith(('/', '\\')):
    working_directory += '/'

#### Imports

In [2]:
import pandas as pd
import numpy as np
import pickle
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

#### Loading the Data

As provided by Chung (and modified to also filter Y_full):

In [3]:
# Import cluster identifier: collect the keys of all rows labelled cluster "big"
sales = pd.read_csv(working_directory + 'data_v0.1_sales.csv')
big_key = sales['key'][sales['cluster'] == "big"]
print(len(big_key.unique())) # Should only have 2907 keys remaining

# Import datasets.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load these pickles if they come from a trusted source.
# The files are opened in `with` blocks so the handles are closed as soon as
# loading finishes instead of being left open.
with open(working_directory + 'X_flat.pkl', 'rb') as x_file:
    X_full = pickle.load(x_file)
with open(working_directory + 'Y_flat.pkl', 'rb') as y_file:
    Y_full = pickle.load(y_file)

# Keys are compared as strings on both sides; convert the cluster keys once
# and reuse the result for both filters below.
big_key_str = big_key.astype(str)

# Keep only rows which belong to cluster 'big'; should be 2,907*123 = 357,561 rows
X_full['key'] = X_full['key'].astype(str)
X_big = X_full[X_full['key'].isin(big_key_str)]
X_big = X_big.reset_index(drop=True)
print(X_big.shape) # Check the number of rows = 357,561

# Keep only rows which belong to cluster 'big'; should be 2,907*123 = 357,561 rows
Y_full['key'] = Y_full['key'].astype(str)
Y_big = Y_full[Y_full['key'].isin(big_key_str)]
Y_big = Y_big.reset_index(drop=True)
print(Y_big.shape) # Check the number of rows = 357,561

2907
(357561, 108)
(357561, 3)


In [4]:
# Rebind X_full to the cluster-filtered frame; both names now refer to the
# same object (no copy is made).
X_full = X_big

In [5]:
# Rebind Y_full to the cluster-filtered frame; both names now refer to the
# same object (no copy is made).
Y_full = Y_big

In [6]:
# Sanity check: display the (rows, columns) dimensions of X_full.
X_full.shape

(357561, 108)

In [7]:
# Sanity check: display the (rows, columns) dimensions of Y_full.
Y_full.shape

(357561, 3)

#### Train/Test Split

In [8]:
# Derive the calendar month (1-12) of every row from its 'date' column;
# the train/test split below is based on this helper column.
X_full['month'] = pd.to_datetime(X_full['date']).dt.month

In [9]:
# Derive the calendar month (1-12) of every target row from its 'date' column.
Y_full['month'] = pd.to_datetime(Y_full['date']).dt.month

In [10]:
# Training features: every row whose month is not 1.
X_full_train = X_full[X_full['month'].ne(1)]

In [11]:
# Training targets: the 'sales' values of every row whose month is not 1.
Y_full_train = Y_full.loc[Y_full['month'] != 1, 'sales']

In [12]:
# Test features: only the rows whose month is 1.
X_full_test = X_full[X_full['month'].eq(1)]

In [13]:
# Test targets: the 'sales' values of the rows whose month is 1.
Y_full_test = Y_full.loc[Y_full['month'] == 1, 'sales']

#### Additional Preparations

In [14]:
# Store for future lookups: a separate two-column frame with each row's key and date
keys_dates = X_full[['key', 'date']].copy()

In [15]:
# Columns removed from the feature frames before they are converted to numpy
# arrays: identifiers, descriptive attributes, date fields and the helper
# 'month' column. Each name is listed exactly once (the original list
# contained 'rrp' twice).
drop_x_cols = ['key', 'pid_x', 'size_x', 'color', 'brand', 'rrp', 'date', 'day_of_week',
               'mainCategory', 'category', 'subCategory', 'releaseDate',
               'price', 'month']

In [16]:
# Training features without the columns listed in drop_x_cols.
X_train = X_full_train.drop(columns=drop_x_cols)

In [17]:
# Test features without the columns listed in drop_x_cols.
X_test = X_full_test.drop(columns=drop_x_cols)

In [18]:
# Alias for the training targets (the 'sales' Series); no copy is made.
Y_train = Y_full_train

In [19]:
# Alias for the test targets (the 'sales' Series); no copy is made.
Y_test = Y_full_test

In [20]:
# Convert to numpy to reshape for input.
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0; `.values` returns the same array and is available in
# both old and new pandas versions.
X_train = X_train.values # Each row has shape (num_vars,)
Y_train = Y_train.values # Each row has shape (1,)
X_test = X_test.values
Y_test = Y_test.values

In [21]:
# Sanity check: display (number of training rows, number of features).
X_train.shape

(267444, 95)

In [22]:
# Sanity check: display the shape of the training target array.
Y_train.shape

(267444,)

#### Training the Model

In [23]:
# Create an XGBoost regressor; no arguments are passed, so the library's
# default hyperparameters are used.
model = XGBRegressor()
# Fit on the training arrays. As the last expression of the cell, the call's
# return value (the fitted estimator) is displayed by the notebook.
model.fit(X_train, Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

#### Predictions

In [24]:
# Predict the target for every row of the test feature array.
y_pred = model.predict(X_test)

In [25]:
# Round every raw model output to the nearest whole number.
predictions = list(map(round, y_pred))

#### Evaluation

In [26]:
# Mean squared error between the true test targets and the rounded predictions;
# its square root (RMSE) is what gets reported.
mse = mean_squared_error(Y_test, predictions)
rmse_text = "RMSE: {}".format(sqrt(mse))
print(rmse_text)

RMSE: 2.264356922356634


#### Example

For the item with key '12985L' on '2018-01-12', the model predicted 24.0, while the true value was 29.0.

In [27]:
# Key of the test row at position 23230.
X_full_test['key'].iloc[23230]

'12985L'

In [28]:
# Date of the test row at position 23230.
X_full_test['date'].iloc[23230]

'2018-01-12'

In [29]:
# Rounded prediction at test-set position 23230.
predictions[23230]

24.0

In [30]:
# True target value at test-set position 23230.
Y_test[23230]

29.0

#### The same pipeline can easily be used for other models as well

In [31]:
from sklearn.linear_model import LinearRegression

In [32]:
# Rebind `model` to a linear regression created with default settings; this
# replaces the estimator previously stored under the same name.
model = LinearRegression()
# Fit on the same training arrays. As the last expression of the cell, the
# fitted estimator is displayed by the notebook.
model.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

#### Predictions

In [33]:
# Predict the target for every test row with the estimator currently bound to `model`.
y_pred = model.predict(X_test)

In [34]:
# Round every raw model output to the nearest whole number.
predictions = list(map(round, y_pred))

#### Evaluation

In [35]:
# Mean squared error between the true test targets and the rounded predictions;
# its square root (RMSE) is what gets reported.
mse = mean_squared_error(Y_test, predictions)
rmse_text = "RMSE: {}".format(sqrt(mse))
print(rmse_text)

RMSE: 2.3180267709388747
