In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [271]:
import tensorflow as tf
import keras
import random

In [272]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


### Load and Preprocess data

In [341]:
# Read the raw train and test splits from the local `data/` directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [342]:
# Shuffle the training rows and renumber the index from 0. The fixed
# random_state makes the row order repeatable on Restart & Run All.
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [343]:
# Keep the test-set record ids: the column is dropped from the features
# later, but it is needed again when the submission file is written.
rid = test_data['record_ID']

In [344]:
# Remove every training row that contains a missing value. Per the original
# author's note only a single row is affected, so nothing meaningful is lost.
train_data.dropna(axis=0, how='any', inplace=True)

In [345]:
# Parse `week` as datetimes in both frames and derive `dow`, the day of the
# week (Monday=0 ... Sunday=6).
# NOTE(review): the textual format of `week` is not visible here. If it is
# ambiguous (e.g. day-first), pass an explicit `format=` / `dayfirst=` to
# pd.to_datetime — confirm against the raw CSV.
for frame in (train_data, test_data):
    frame['week'] = pd.to_datetime(frame['week'])
    frame['dow'] = frame['week'].dt.dayofweek

In [346]:
# Price-gap feature: base_price minus total_price. Despite the `is_..._eq_...`
# name this is a signed difference, not a boolean flag (0 means the two
# prices are equal).
for frame in (train_data, test_data):
    frame['is_bp_eq_tp'] = frame['base_price'] - frame['total_price']

In [347]:
# The record identifier and the raw date column are not used as model inputs.
cols_to_drop = ['record_ID', 'week']
train_data, test_data = (
    frame.drop(columns=cols_to_drop) for frame in (train_data, test_data)
)

In [348]:
# Separate the regression target (`units_sold`) from the feature matrix.
y = train_data['units_sold']
X = train_data.drop(columns=['units_sold'])

In [280]:
# One-hot encode the two categorical id columns of the training features.
ohe_store = pd.get_dummies(X['store_id'])
ohe_sku = pd.get_dummies(X['sku_id'])

cols_store = ohe_store.columns.tolist()
cols_sku = ohe_sku.columns.tolist()

# Append the indicator columns one at a time — all stores first, then all
# SKUs — so the resulting column order is fixed.
for dummies, names in ((ohe_store, cols_store), (ohe_sku, cols_sku)):
    for name in names:
        X[name] = dummies[name]

# The raw id columns are now redundant.
X = X.drop(columns=['sku_id', 'store_id'])

In [281]:
# One-hot encode sku_id / store_id for the test set, mirroring the encoding
# applied to the training features X, so that the frame passed to
# model.predict() has the same indicator columns the model was trained on.
#
# NOTE: drop_first=True is deliberately NOT used. It would remove the first
# sku/store category from the test dummies; the column-alignment step that
# follows would then re-create that column filled with zeros, so every test
# row belonging to that sku/store would silently lose its indicator.
ohe_sku = pd.get_dummies(test_data.sku_id)
ohe_store = pd.get_dummies(test_data.store_id)

# Append the store indicators first, then the SKU indicators (same order as
# for the training features).
cols_store = list(ohe_store.columns)
for each in cols_store:
    test_data[each] = ohe_store[each]

cols_sku = list(ohe_sku.columns)
for each in cols_sku:
    test_data[each] = ohe_sku[each]

# The raw id columns are now redundant.
test_data = test_data.drop(['sku_id', 'store_id'], axis=1)

In [282]:
# Give the test frame exactly the training feature columns, in the same order.
train_col = X.columns.tolist()
test_col = test_data.columns.tolist()

# A training column with no counterpart in the test frame is added as an
# all-zero column.
n_test_rows = len(test_data)
for col in train_col:
    if col in test_col:
        continue
    test_data[col] = [0] * n_test_rows

# Selecting by train_col fixes the column order and discards any column that
# exists only in the test frame.
test_data = test_data[train_col]

test_data.shape

(13860, 109)

In [283]:
# Sanity check: display (rows, columns) of the training feature matrix; the
# column count should equal that of the aligned test frame.
X.shape

(150149, 109)

In [284]:
from sklearn.preprocessing import MinMaxScaler

# Continuous columns that get rescaled.
continuous = ['total_price', 'base_price', 'is_bp_eq_tp']

# Learn each column's min/max from the training features and map them to the
# range [0, 1]. The test set is transformed with those same statistics, so
# its values may fall slightly outside [0, 1].
# NOTE(review): the scaler is fitted on all of X, i.e. before the
# train/validation split, so validation rows contribute to the min/max.
scaler = MinMaxScaler()
X[continuous] = scaler.fit_transform(X[continuous])
test_data[continuous] = scaler.transform(test_data[continuous])

In [285]:
# Preview the first rows of the encoded and scaled training features.
X.head()

Unnamed: 0,total_price,base_price,is_featured_sku,is_display_sku,is_bp_eq_tp,8023,8058,8063,8091,8094,...,320485,327492,378934,398721,545621,546789,547934,600934,673209,679023
0,0.280438,0.251778,0,0,0.298805,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.822161,0.815078,0,0,0.298805,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.548564,0.530583,0,0,0.298805,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.357045,0.331437,0,0,0.298805,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.357045,0.331437,0,0,0.298805,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [333]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (features and target together) for validation.
# The fixed random_state makes the split — and therefore the validation
# score reported below — reproducible across kernel restarts.
train_X, val_X, train_y, val_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Train the deep learning model

In [334]:
# Keras building blocks for a plain feed-forward network.
from keras.layers import Dense
from keras.models import Sequential

# Fully connected regression network: three ReLU hidden layers
# (128 -> 64 -> 32 units) followed by a single output unit with no
# activation (linear output).
model = Sequential()
model.add(Dense(units=128, activation='relu', input_dim=train_X.shape[1]))
model.add(Dense(units=64, activation='relu', input_dim=128))
model.add(Dense(units=32, activation='relu', input_dim=64))
model.add(Dense(units=1, input_dim=32))

In [335]:
# Configure training: Adam optimizer with its default settings, minimizing
# mean squared error. No additional metrics are tracked.
model.compile(
    loss=keras.losses.MeanSquaredError(),
    optimizer=keras.optimizers.Adam(),
)

In [336]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop training once the validation loss has failed to improve for 10
# consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=10, mode='min')

# Keep only the best model (lowest validation loss) on disk. The checkpoint
# is evaluated every epoch, which is the callback's default; the deprecated
# `period=1` argument was dropped because newer Keras releases warn about it
# or reject it outright.
checkpoint = ModelCheckpoint(
    'model_best_weight.h5',
    monitor='val_loss',
    save_best_only=True,
    mode='min',
)

In [337]:
print("Fit model on training data")

# Train for at most 25 epochs in mini-batches of 32. The validation split is
# evaluated at the end of every epoch; its loss drives both early stopping
# and best-model checkpointing.
history = model.fit(
    x=train_X,
    y=train_y,
    validation_data=(val_X, val_y),
    callbacks=[early_stop, checkpoint],
    epochs=25,
    batch_size=32,
)

Fit model on training data
Train on 120119 samples, validate on 30030 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25


In [338]:
# Restore into the current model the weights from the checkpoint file written
# during training (the epoch with the lowest validation loss).
model.load_weights(filepath="model_best_weight.h5")

In [339]:
# Run the network on the held-out validation features; `predict` returns the
# output of the final layer.
predictions = model.predict(x=val_X)
print("predictions shape:", predictions.shape)

predictions shape: (30030, 1)


In [340]:
from sklearn.metrics import mean_squared_log_error, mean_squared_error

# Validation score: mean squared logarithmic error between the true unit
# counts and the network's predictions.
# NOTE(review): this metric is rejected for negative values, and the model's
# linear output layer does not guarantee non-negative predictions — consider
# clipping at 0 if this call ever raises.
msle = mean_squared_log_error(y_true=val_y, y_pred=predictions)
print(msle)

0.18084016248031187


In [332]:
# Predict unit sales for the aligned, scaled test features.
preds = model.predict(x=test_data)

# Pair each prediction with its record id (saved before the id column was
# dropped from the features).
subm = pd.DataFrame({'record_ID': rid})
subm['units_sold'] = preds

# Write the submission without the DataFrame index column.
subm.to_csv("submission_deep1.csv", index=None)