# XGBoost -- Prediction in Feb

Editor: Lu Lifei

Model part: based on @Julian '2018_05_09_b_DM2_DMC_XGBoost.ipynb'

## Model
### 1. Import and load

In [1]:
import pandas as pd
import numpy as np
import pickle
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Import datasets
# Context managers close the file handles as soon as each pickle is read.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.

with open('/Users/effylu/Downloads/XY_flat_final/X_flat_final.pkl', 'rb') as x_file:
    X_full = pickle.load(x_file)
with open('/Users/effylu/Downloads/XY_flat_final/Y_flat_final.pkl', 'rb') as y_file:
    Y_full = pickle.load(y_file)

### 2. Prepare

In [3]:
# Derive a numeric 'month' column from the 'date' column of both frames

for frame in (X_full, Y_full):
    frame['month'] = pd.DatetimeIndex(frame['date']).month

In [4]:
# Split X_full and Y_full into training and test sets:
# rows whose month is 2 (February) form the test set, all other rows the training set

x_is_feb = X_full['month'] == 2
y_is_feb = Y_full['month'] == 2

X_train = X_full.loc[~x_is_feb]
Y_train = Y_full.loc[~y_is_feb]['sales']
X_test = X_full.loc[x_is_feb]
Y_test = Y_full.loc[y_is_feb]['sales']

# Keep the distinct 'date' and 'key' values of the test set for later use
column_name = X_test['date'].unique().astype(str)
row_name = X_test['key'].unique().astype(str)

In [5]:
# Prepare the model input: drop the columns that are not used as features.
# ('rrp' used to appear twice in this list; one entry is enough.)

drop_x_cols = ['key', 'pid_x', 'size_x', 'color', 'brand', 'rrp', 'date', 'day_of_week',
               'mainCategory', 'category', 'subCategory', 'releaseDate',
               'price', 'month']
X_train = X_train.drop(drop_x_cols, axis=1)
X_test = X_test.drop(drop_x_cols, axis=1)

In [6]:
# Convert all data to NumPy arrays.
# `as_matrix()` was deprecated in pandas 0.23 and removed in pandas 1.0;
# `.values` is the documented replacement and works on old and new versions.

X_train = X_train.values
# Remove feature columns 14..27 (by position).
# NOTE(review): which features these positions hold is not visible here -- confirm upstream.
X_train = np.delete(X_train, np.s_[14:28], axis=1)
Y_train = Y_train.values
X_test = X_test.values
X_test = np.delete(X_test, np.s_[14:28], axis=1)
Y_test = Y_test.values

In [7]:
# Inspect the dimensions of the test matrix: (rows, feature columns)
X_test.shape

(359072, 81)

### 3. Train model

In [8]:
# Fit the gradient-boosted regressor on the training data.
# The hyper-parameters are gathered in one dict so they can be read in one place.

xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=40,
    n_jobs=8,
    nthread=None,
    objective='reg:linear',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=True,
    subsample=1.0,
)
model = XGBRegressor(**xgb_params)
model.fit(X_train, Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=40,
       n_jobs=8, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1.0)

## Prediction

In [9]:
# Build the model input for the first predicted day.
# Only the first row of each item in X_test (its Feb 1st row) carries correct
# sales history, so exactly that row is kept for every item.
# NOTE(review): assumes X_test is ordered item by item with DAYS_PER_ITEM
# consecutive rows each -- confirm against the upstream flattening step.

DAYS_PER_ITEM = 28  # rows per item in X_test (one per February day)

# Same row count as the old loop: at least the first row, then one row per full block
n_items = max(1, len(X_test) // DAYS_PER_ITEM)

# A strided slice picks rows 0, 28, 56, ... in one step; the previous
# np.vstack loop re-copied the growing array on every iteration.
X_Feb1 = X_test[:n_items * DAYS_PER_ITEM:DAYS_PER_ITEM].copy()
print(X_Feb1)
print(X_Feb1.shape)

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 1.  1.  0. ...,  0.  0.  0.]]
(12824, 81)


In [10]:
# Predict the sales units on Feb 1st for every item,
# round them to whole units and store them as a column vector (one row per item)

Y_Feb1 = model.predict(X_Feb1)
prediction_1 = np.asarray(list(map(round, Y_Feb1))).reshape(len(Y_Feb1), 1)
print(prediction_1)

[[ 0.]
 [ 0.]
 [ 0.]
 ..., 
 [ 0.]
 [ 0.]
 [ 0.]]


In [11]:
# Shift the sales-history features forward by one day:
# drop column 13 (the 'last_14_day_sales' feature) and put the fresh
# prediction in front as the new 'last_1_day_sales'

history_shifted = np.delete(X_Feb1, slice(13, 14), axis=1)
X_Feb = np.concatenate((prediction_1, history_shifted), axis=1)
print(X_Feb)

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  1.  1. ...,  0.  0.  0.]]


In [12]:
# Roll the one-step forecast forward over the remaining 27 days:
# predict a day, record it, then feed it back in as the newest history feature.
# The result has one column per predicted day for each item.

daily_columns = [prediction_1]
for day in range(27):
    Y_Feb = model.predict(X_Feb)
    prediction = np.asarray(list(map(round, Y_Feb))).reshape(len(Y_Feb), 1)
    daily_columns.append(prediction)
    history_shifted = np.delete(X_Feb, slice(13, 14), axis=1)
    X_Feb = np.concatenate((prediction, history_shifted), axis=1)
predictions = np.concatenate(daily_columns, axis=1)
print(predictions)

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [13]:
# Spot-check: daily predictions of the item in row 161

print(predictions[161])

[ 6.  5.  4.  3.  3.  5.  5.  5.  5.  6.  5.  6.  6.  6.  6.  6.  6.  6.
  6.  6.  6.  6.  7.  6.  7.  7.  7.  7.]


## Find Sold-Out-Date in February

### 1. Restructure the predictions

In [14]:
# Cumulative predicted sales per item: column j holds the total of days 0..j.
# np.cumsum returns a new array, so `predictions` keeps its per-day values.
# (The earlier `pred_agg = predictions` only aliased the same array: the loop
# overwrote `predictions` in place, and re-running the cell accumulated twice.)

pred_agg = np.cumsum(predictions, axis=1)
print(pred_agg)
print(pred_agg.shape)

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
(12824, 28)


In [15]:
# Spot-check: cumulative predicted sales of the item in row 161

print(pred_agg[161])

[   6.   11.   15.   18.   21.   26.   31.   36.   41.   47.   52.   58.
   64.   70.   76.   82.   88.   94.  100.  106.  112.  118.  125.  131.
  138.  145.  152.  159.]


### 2. Find the Sold-Out-Date in February

In [16]:
# Read the item list (pipe-separated, ASCII-encoded)

items = pd.read_csv(
    '/Users/effylu/Downloads/DMC_2018_task/items.csv',
    sep='|',
    encoding='ascii',
)

# One-column array with one row per item, initialised from 'releaseDate';
# kept for later use as the container for the predicted dates

pred_date = np.asarray(items[['releaseDate']])

In [17]:
# Build the item 'key' by joining the integer 'pid' (as text) with 'size',
# then keep only the attributes needed below as a plain array

pid_as_text = items["pid"].map(int).map(str)
items["key"] = pid_as_text + items["size"]
subItems = np.asarray(items[['key','pid','size','stock']])
print(subItems.shape)

(12824, 4)


#### 2.1.  Match the 'items' with the 'pred_agg'

In [18]:
# For every item, find the first predicted day on which the cumulative
# predicted sales reach its stock, and store that day number and its date.
# NOTE(review): rows of `subItems` and `pred_agg` are paired purely by position;
# the saved `row_name` keys are never compared -- confirm both share one item order.

FALLBACK_DAY = 14  # day reported when the forecast never uses up the stock

stock = subItems[:, 3].astype(float)  # column 3 of subItems is 'stock'
dates = np.asarray(column_name)

# reached[i, j] is True when the cumulative sales up to day j cover item i's stock
reached = pred_agg >= stock[:, np.newaxis]
first_hit = reached.argmax(axis=1)  # index of the first True per row (0 if none)

sold_on_first_day = stock < pred_agg[:, 0]
outlasts_period = stock > pred_agg[:, -1]  # last column instead of a hard-coded 27
never_reached = ~reached.any(axis=1)       # e.g. missing stock; previously left unassigned

# Same precedence as before: a first-day sell-out wins over the fallback
use_fallback = (outlasts_period & ~sold_on_first_day) | never_reached
day_index = np.where(use_fallback, FALLBACK_DAY - 1, first_hit)

pred_day = (day_index + 1).reshape(-1, 1)  # 1-based day number, one row per item
pred_date[:, 0] = dates[day_index]

print(pred_date[161,0])

2018-02-02


#### 2.2. Visualize result

In [19]:
# Assemble the result table: item attributes plus the predicted sold-out date,
# then keep only the columns shown and exported

result_column = ['key','pid','size','stock','soldOutDate']
result = np.concatenate((subItems, pred_date), axis=1)
prediction_result = pd.DataFrame(result, columns=result_column)[['pid','size','soldOutDate']]
prediction_result

Unnamed: 0,pid,size,soldOutDate
0,10000,XL ( 158-170 ),2018-02-14
1,10001,L,2018-02-14
2,10003,3 (35-38 ),2018-02-14
3,10003,4 ( 39-42 ),2018-02-14
4,10003,5 ( 43-46 ),2018-02-14
5,10006,XL,2018-02-14
6,10008,XL,2018-02-14
7,10013,L,2018-02-14
8,10013,M,2018-02-14
9,10013,S,2018-02-14


In [20]:
# Write the result table to disk: pipe-separated, ASCII-encoded, without the index column
prediction_result.to_csv(
    "/Users/effylu/Downloads/Team_1_solution.csv",
    sep='|',
    index=False,
    encoding='ASCII',
)