In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [366]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [367]:
train_data = train_data.sample(frac=1).reset_index(drop=True)

In [368]:
rid = test_data.record_ID

In [369]:
train_data.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold
0,173669,29/01/13,8398,245338,498.0375,498.0375,0,0,15
1,125798,03/07/12,9909,217390,170.2875,170.2875,0,0,18
2,158801,27/11/12,8222,222087,160.3125,227.2875,1,1,500
3,111731,08/05/12,9164,300021,106.1625,106.1625,0,0,83
4,159553,27/11/12,9432,223245,213.0375,213.0375,0,0,36


In [370]:
(train_data.total_price>train_data.base_price).shape

(150150,)

In [371]:
train_data.total_price[train_data.total_price.isna()]

119485   NaN
Name: total_price, dtype: float64

In [372]:
# only one data point is nan so we drop it without hesitating
train_data.dropna(inplace=True)

In [373]:
# The day of the week with Monday=0, Sunday=6
train_data.week = pd.to_datetime(train_data.week)
train_data['dow'] = train_data['week'].dt.dayofweek

test_data.week = pd.to_datetime(test_data.week)
test_data['dow'] = test_data['week'].dt.dayofweek

In [374]:
train_data['is_bp_eq_tp'] = train_data.base_price==train_data.total_price
test_data['is_bp_eq_tp'] = test_data.base_price==test_data.total_price

In [375]:
train_data.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold,dow,is_bp_eq_tp
0,173669,2013-01-29,8398,245338,498.0375,498.0375,0,0,15,1,True
1,125798,2012-03-07,9909,217390,170.2875,170.2875,0,0,18,2,True
2,158801,2012-11-27,8222,222087,160.3125,227.2875,1,1,500,1,False
3,111731,2012-08-05,9164,300021,106.1625,106.1625,0,0,83,6,True
4,159553,2012-11-27,9432,223245,213.0375,213.0375,0,0,36,1,True


In [376]:
cols_to_drop = ['record_ID', 'week']
train_data = train_data.drop(cols_to_drop, axis=1)
test_data = test_data.drop(cols_to_drop, axis=1)

In [377]:
train_data.columns

Index(['store_id', 'sku_id', 'total_price', 'base_price', 'is_featured_sku',
       'is_display_sku', 'units_sold', 'dow', 'is_bp_eq_tp'],
      dtype='object')

### start training model

In [350]:
y = train_data.units_sold

In [351]:
X = train_data.drop(['units_sold'], axis=1)

In [352]:
X.head()

Unnamed: 0,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,dow,is_bp_eq_tp
0,9043,217390,168.15,168.15,0,0,0,True
1,9436,222087,115.425,241.5375,0,0,1,False
2,8319,222087,158.8875,227.2875,1,0,0,False
3,8222,222087,160.3125,216.6,0,1,3,False
4,8023,300021,104.7375,104.7375,0,1,4,True


In [353]:
from sklearn.model_selection import train_test_split

# split data into training and validation data, for both features and target
train_X, val_X, train_y, val_y = train_test_split(X, y)

### Applying Decision Tree

In [229]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_log_error

In [243]:
def get_msle(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=5100, min_samples_split=each, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    msle = mean_squared_log_error(val_y, preds_val)
    return msle

In [247]:
# compare MAE with differing values of max_leaf_nodes
for each in [28, 30, 32]:
    my_msle = get_msle(each, train_X, val_X, train_y, val_y)
    print("%d  \t\t Mean Squared Log Error:  %f" %(each, my_msle))

28  		 Mean Squared Log Error:  0.205341
30  		 Mean Squared Log Error:  0.205148
32  		 Mean Squared Log Error:  0.205325


In [248]:
dt_model = DecisionTreeRegressor(max_leaf_nodes=5100, min_samples_split=30, random_state=0)
dt_model.fit(train_X, train_y)

DecisionTreeRegressor(max_leaf_nodes=5100, min_samples_split=30, random_state=0)

### Applying Random Forest

In [265]:
from sklearn.ensemble import RandomForestRegressor

for each in [270, 300]:
    forest_model = RandomForestRegressor(n_estimators=each, max_leaf_nodes = 16000, random_state=1)
    forest_model.fit(train_X, train_y)
    forest_preds = forest_model.predict(val_X)
    print(mean_squared_log_error(val_y, forest_preds))

0.1805990626238905
0.18060114448395062


In [354]:
forest_model = RandomForestRegressor(n_estimators=250, max_leaf_nodes = 16000, random_state=1)
forest_model.fit(X, y)

RandomForestRegressor(max_leaf_nodes=16000, n_estimators=250, random_state=1)

### Applying XGB

In [384]:
ohe_sku = pd.get_dummies(X.sku_id)
ohe_store = pd.get_dummies(X.store_id)

cols_store = list(ohe_store.columns)
for each in cols_store:
    X[each] = ohe_store[each]

cols_sku = list(ohe_sku.columns)
for each in cols_sku:
    X[each] = ohe_sku[each]

X.drop(['sku_id', 'store_id'], axis=1)

# split data into training and validation data, for both features and target
train_X, val_X, train_y, val_y = train_test_split(X, y)

In [398]:
from xgboost import XGBRegressor

xg_model = XGBRegressor(n_estimators=250, learning_rate=0.01, max_depth=30)
xg_model.fit(train_X, train_y, early_stopping_rounds=10, eval_set=[(val_X, val_y)], verbose=False)
xg_pred = xg_model.predict(val_X)

In [399]:
print(mean_squared_log_error(val_y, xg_pred))

ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

### lets predict on test data

In [385]:
# this test data preprocessing required only for xgboost
ohe_sku = pd.get_dummies(test_data.sku_id)
ohe_store = pd.get_dummies(test_data.store_id)

cols_store = list(ohe_store.columns)
for each in cols_store:
    test_data[each] = ohe_store[each]

cols_sku = list(ohe_sku.columns)
for each in cols_sku:
    test_data[each] = ohe_sku[each]

test_data = test_data.drop(['sku_id', 'store_id'], axis=1)

In [386]:
test_col = list(test_data.columns)
train_col = list(X.columns)

for each in train_col:
    if each not in test_col:
        test_data[each] = [0]*test_data.shape[0]

test_data = test_data[train_col]

test_data.shape

### predict and save submission file

In [249]:
# predict using decision tree
f = 0
preds = dt_model.predict(test_data)

In [395]:
# predict using random forest
f = 1
preds_f = forest_model.predict(test_data)

ValueError: Number of features of the model must match the input. Model n_features is 8 and input n_features is 112 

In [396]:
# predict using xgboost
f = 2
preds = xg_model.predict(test_data)

In [397]:
subm = pd.DataFrame()
subm['record_ID'] = rid
subm['units_sold'] = preds

if f==0:
    subm.to_csv("dt/submission_dt.csv", index=None)
elif f==1:
    subm.to_csv("rf/submission_rf.csv", index=None)
elif f==2:
    subm.to_csv("xgb/submission_xgb.csv", index=None)