In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding,LSTM,GRU,TimeDistributed,RepeatVector,Merge,BatchNormalization

from keras.optimizers import SGD, RMSprop, Adam

from sklearn.preprocessing import MinMaxScaler
import math

import cPickle as pickle

Using Theano backend.
Using gpu device 0: GeForce GTX 1080 (CNMeM is disabled, cuDNN 5110)


In [4]:
base_path = "/home/docker/fastai-courses/deeplearning1/nbs/persistent/dmc/deep-learning-time-series-analysis"
resources_folder = "resources/"

In [5]:
data = pd.read_csv('../data/train.csv', sep='|')
data['itemsPurchased'] = (data['revenue'] / data['price']).astype(int)
data['purchase'] = data['itemsPurchased'].apply(lambda x: 1 if x >= 1 else int(x))
data

IOError: File data/train.csv does not exist

In [31]:
nr_days = data['day'].max() +1 

In [32]:
nr_train_days = nr_days
nr_test_days = nr_days 

# nr_train_days = 70
# nr_test_days = nr_days - 71

# train_data = data[data['day'] <= nr_train_days]
# test_data = data[data['day'] > nr_train_days]

In [34]:
# print("Train day interval: %d -> %d"%(train_data['day'].min(),train_data['day'].max()))
# print("Test day interval: %d -> %d"%(test_data['day'].min(),test_data['day'].max()))
print("Train days: %d"%nr_train_days)
print("Test days: %d"%nr_test_days)

Train days: 93
Test days: 93


In [17]:
def get_dict_for_pid(df, pid):
    filtered = df.loc[df['pid'] == pid].set_index('day')
    return filtered.to_dict(orient='index')

def save_obj(obj, path ):
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    with open(path, 'rb') as f:
        return pickle.load(f)
    
def get_train_test_timeseries_for_data(data, nr_train, nr_test):

    uniquePids = sorted(data['pid'].unique())

    train_pids = uniquePids[:nr_train]
    test_pids = uniquePids[nr_train:nr_train+nr_test]
    
    train_timeseries = [get_dict_for_pid(data, pid) for pid in tqdm(train_pids)]
    test_timeseries = [get_dict_for_pid(data, pid) for pid in tqdm(test_pids)]
    
    return (train_timeseries,test_timeseries)

In [20]:
(train_timeseries,test_timeseries) = get_train_test_timeseries_for_data(data,nr_train = 1000,nr_test = 500)

100%|██████████| 1000/1000 [00:10<00:00, 96.33it/s]
100%|██████████| 500/500 [00:07<00:00, 70.24it/s] 


In [None]:
# save_obj(train_timeseries,base_path+resources_folder+"train_timeseries")
# save_obj(test_timeseries,base_path+resources_folder+"test_timeseries")

In [None]:
# train_timeseries  = load_obj(base_path+resources_folder+"train_timeseries")
# test_timeseries = load_obj(base_path+resources_folder+"train_timeseries")

In [21]:
train_timeseries[0]

{10: {'adFlag': 0.0,
  'availability': 3.0,
  'basket': 0.0,
  'click': 1.0,
  'competitorPrice': 8.2200000000000006,
  'itemsPurchased': 0.0,
  'lineID': 198133.0,
  'order': 0.0,
  'pid': 1.0,
  'price': 10.08,
  'purchase': 0.0,
  'revenue': 0.0},
 21: {'adFlag': 0.0,
  'availability': 3.0,
  'basket': 0.0,
  'click': 1.0,
  'competitorPrice': 8.7100000000000009,
  'itemsPurchased': 0.0,
  'lineID': 430014.0,
  'order': 0.0,
  'pid': 1.0,
  'price': 10.08,
  'purchase': 0.0,
  'revenue': 0.0},
 33: {'adFlag': 0.0,
  'availability': 3.0,
  'basket': 0.0,
  'click': 0.0,
  'competitorPrice': 8.2200000000000006,
  'itemsPurchased': 1.0,
  'lineID': 769328.0,
  'order': 1.0,
  'pid': 1.0,
  'price': 7.2800000000000002,
  'purchase': 1.0,
  'revenue': 7.2800000000000002},
 40: {'adFlag': 0.0,
  'availability': 3.0,
  'basket': 0.0,
  'click': 1.0,
  'competitorPrice': 8.2200000000000006,
  'itemsPurchased': 0.0,
  'lineID': 1011505.0,
  'order': 0.0,
  'pid': 1.0,
  'price': 8.4199999999

In [35]:
nr_train_examples = len(train_timeseries)
nr_test_examples = len(test_timeseries)
nr_features = 4

print("Nr Train Examples %d"%nr_train_examples)
print("Nr Test Examples %d"%nr_test_examples)

Nr Train Examples 1000
Nr Test Examples 500


In [36]:
def computeFeatureMatrix(timeseries,number_examples):
    
    data_matrix = np.zeros((number_examples,nr_days,nr_features))
    data_matrix.shape

    last_known_day = [0,0,0,0]
    for index in range(number_examples):
        current_dict = timeseries[index]
        for day in range(nr_days):
            if day in current_dict:
                day_dict = current_dict[day]
                adFlag = day_dict['adFlag']
                availability = day_dict['availability']
                price = day_dict['price']

                if(math.isnan(day_dict['competitorPrice'])):
                    competitiorPrice = price
                else:
                    competitorPrice = day_dict['competitorPrice']

                last_known_day = [price,competitorPrice,availability,adFlag]


            data_matrix[index,day] = last_known_day
            
    return data_matrix

In [37]:
train_matrix = computeFeatureMatrix(train_timeseries,nr_train_examples)
test_matrix = computeFeatureMatrix(test_timeseries,nr_test_examples)

print(train_matrix.shape)
print(test_matrix.shape)

(1000, 93, 4)
(500, 93, 4)


In [38]:
def computePurchaseMatrix(timeseries,number_examples):
    purchase_matrix = np.zeros((number_examples,nr_days,1))
    
    for index in range(number_examples):
        current_dict = timeseries[index]
        for day in current_dict.keys():
            day_dict = current_dict[day]
            purchase = day_dict['purchase']
            purchase_matrix[index,day,0] = purchase
        
    return purchase_matrix

In [39]:
purchase_matrix = computePurchaseMatrix(train_timeseries,nr_train_examples)
purchase_matrix.shape

(1000, 93, 1)

In [40]:
def computeGroudTruthMatrix(timeseries,number_examples,number_days):
    groud_truth_matrix = np.zeros((number_examples,number_days,1))
    groud_truth_matrix.fill(-1)

    for index in range(number_examples):
        current_dict = timeseries[index]
        for day in current_dict.keys():
            day_dict = current_dict[day]
            items_purchase = day_dict['itemsPurchased']
            groud_truth_matrix[index,day,0] = items_purchase   
            
    
    return groud_truth_matrix

In [50]:
train_groud_truth_matrix = computeGroudTruthMatrix(train_timeseries,nr_train_examples,nr_train_days)
test_groud_truth_matrix = computeGroudTruthMatrix(test_timeseries,nr_test_examples,nr_test_days)
print(train_groud_truth_matrix.shape)
print(test_groud_truth_matrix.shape)

(1000, 93, 1)
(500, 93, 1)


In [42]:
print(train_matrix.shape)
print(purchase_matrix.shape)
print(test_groud_truth_matrix.shape)

(1000, 93, 4)
(1000, 93, 1)
(500, 93, 1)


In [44]:
model = Sequential()
model.add(BatchNormalization(input_shape=(nr_days,nr_features)))
model.add(LSTM(1024, return_sequences=True))
model.add(LSTM(1024, return_sequences=True))
model.add(TimeDistributed(Dense(1)))
model.compile(loss='mean_squared_error', optimizer = Adam(0.001))

In [45]:
model.fit(train_matrix, purchase_matrix, nb_epoch=10, batch_size = 256, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa754ae19d0>

# Compute error 

In [51]:
train_predictions = model.predict(train_matrix)
test_predictions = model.predict(test_matrix)

print(train_predictions.shape)
print(test_predictions.shape)

(1000, 93, 1)
(500, 93, 1)


In [58]:
train_predictions_arr = train_predictions.flatten()
test_predictions_arr = test_predictions.flatten()

train_groud_truth_arr = train_groud_truth_matrix.flatten().astype(int)
test_groud_truth_arr = test_groud_truth_matrix.flatten().astype(int)

train_days_mask = np.asarray([x != -1 for x in train_groud_truth_arr])
test_days_mask = np.asarray([x != -1 for x in test_groud_truth_arr])

print("%s %s %s"%(train_predictions_arr.shape,train_groud_truth_arr.shape,train_days_mask.shape ))
print("%s %s %s"%(test_predictions_arr.shape,test_groud_truth_arr.shape,test_days_mask.shape ))


(93000,) (93000,) (93000,)
(46500,) (46500,) (46500,)


In [59]:
train_predictions_filtered = train_predictions_arr[train_days_mask]
train_groud_truth_filtered = train_groud_truth_arr[train_days_mask]

test_predictions_filtered = test_predictions_arr[test_days_mask]
test_groud_truth_filtered = test_groud_truth_arr[test_days_mask]

print("%s %s"%(train_predictions_filtered.shape,train_groud_truth_filtered.shape))
print("%s %s"%(test_predictions_filtered.shape,test_groud_truth_filtered.shape))


(33034,) (33034,)
(16578,) (16578,)


In [60]:
train_error = math.sqrt(((train_predictions_filtered - train_groud_truth_filtered)**2).sum())
test_error = math.sqrt(((test_predictions_filtered - test_groud_truth_filtered)**2).sum())

In [63]:
print("Train error: %f"%train_error)
print("Test error: %f"%test_error)


Train error: 157.683326
Test error: 179.366725
