In [1]:
import os
import math

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)

In [2]:
BATCH_SIZE = 128
TRAIN_EPOCHS = 10

PATH_DATA_TRAIN = 'data/processed/input_train.csv'
PATH_DATA_VALID = 'data/processed/input_valid.csv'
PATH_DATA_TEST = 'data/processed/input_test.csv'

In [3]:
df_train = pd.read_csv('data/processed/train.csv')
df_test = pd.read_csv('data/processed/test.csv')

df_train.loc[:, 'Sales'] = df_train['Sales'].apply(lambda x: math.log(x+1))

df_train.loc[:, 'CompetitionDistance'] = df_train['CompetitionDistance'].apply(lambda x: math.log(x+1))

train, valid = train_test_split(df_train, test_size=0.05, random_state=42)
train.to_csv(PATH_DATA_TRAIN, index=False)
valid.to_csv(PATH_DATA_VALID, index=False)

df_test.loc[:, 'CompetitionDistance'] = df_test['CompetitionDistance'].apply(lambda x: math.log(x+1))
df_test.drop(['Id'], axis=1, inplace=True)
df_test.insert(loc=2, column='Sales', value=0)
df_test.to_csv(PATH_DATA_TEST, index=False)

We build our input functions, which are responsible to feed our models with some inputs. One is dedicated to feed the training and validation phase, the other is used for the predictions.

In [4]:
feature_names = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'Promo2', 'Day', 'Month', 'Year', 'isCompetition']
csv_defaults = [[1],[1],[0.],[0],["0"],[0],['a'],['a'],[0.],[0],[1],[1],[2013],[0]]

def input_fn_train(file_path, repeat_count=1):
    def decode_csv(line):
        """Convert a CSV row to a dictonary of features and a prediction"""
        parsed_line = tf.decode_csv(line, csv_defaults)
        prediction = parsed_line[2]  # Second element is the Sales
        del parsed_line[2]  # Delete it
        features = parsed_line
        return dict(zip(feature_names, features)), prediction

    dataset = (tf.data.TextLineDataset(file_path)
               .skip(1)  # Skip header row
               .map(decode_csv))
    dataset = dataset.repeat(repeat_count)  # Repeats dataset this # times
    dataset = dataset.shuffle(buffer_size=2048)
    dataset = dataset.batch(BATCH_SIZE)
    
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def input_fn_predict(file_path):
    def decode_csv(line):
        """Convert a CSV row to a dictonary of features and a label"""
        parsed_line = tf.decode_csv(line, csv_defaults)
        prediction = parsed_line[2]  # Second element is the Sales
        del parsed_line[2]  # Delete it
        features = parsed_line
        return dict(zip(feature_names, features))

    dataset = (tf.data.TextLineDataset(file_path)
               .skip(1)  # Skip header row
               .map(decode_csv))
    dataset = dataset.batch(BATCH_SIZE)
    
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

Let's now create our features columns as described in the data analysis
Since a DNN only accept Dense columns, we will wrap our Categorical columns into indicator or embedding columns. The embeddings enables us to reduce the dimensinality of our inputs by learning progressively correlation among them. This is especially useful for Stores.

In [5]:
stores = df_train['Store'].unique().tolist()
dows = df_train['DayOfWeek'].unique().tolist()
promos = df_train['Promo'].unique().tolist()
shs1 = df_train['StateHoliday'].unique().tolist()
shs2 = df_train['SchoolHoliday'].unique().tolist()
sts = df_train['StoreType'].unique().tolist()
assos = df_train['Assortment'].unique().tolist()
promo2s = df_train['Promo2'].unique().tolist()
days = df_train['Day'].unique().tolist()
months = df_train['Month'].unique().tolist()
years = df_train['Year'].unique().tolist()
iCs = df_train['isCompetition'].unique().tolist()


feature_columns = [
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="Store", 
            vocabulary_list=stores),
        22
    ),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="DayOfWeek", 
            vocabulary_list=dows)
    ),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="Promo", 
            vocabulary_list=promos)
    ),
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="StateHoliday", 
            vocabulary_list=shs1),
        2
    ),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="SchoolHoliday", 
            vocabulary_list=shs2)
    ),
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="StoreType", 
            vocabulary_list=sts),
        2
    ),
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="Assortment", 
            vocabulary_list=assos),
        2
    ),
    tf.feature_column.numeric_column('CompetitionDistance'),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="Promo2", 
            vocabulary_list=promo2s)
    ),
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="Day", 
            vocabulary_list=days),
        7
    ),
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="Month", 
            vocabulary_list=months),
        4
    ),
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="Year", 
            vocabulary_list=years),
        2
    ),
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key="isCompetition", 
            vocabulary_list=iCs)
    ),
]

In [6]:
predictor = tf.estimator.DNNRegressor(
    feature_columns=feature_columns,
    hidden_units=[2048, 256],
    dropout=0.25,
    optimizer=tf.train.FtrlOptimizer(
        learning_rate=0.001,
#         l1_regularization_strength=0.01,
#         l2_regularization_strength=0.001
    ),
    model_dir='models'
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_session_config': None, '_keep_checkpoint_every_n_hours': 10000, '_global_id_in_cluster': 0, '_task_type': 'worker', '_log_step_count_steps': 100, '_num_worker_replicas': 1, '_task_id': 0, '_save_checkpoints_secs': 600, '_evaluation_master': '', '_master': '', '_save_checkpoints_steps': None, '_tf_random_seed': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001AC224375F8>, '_service': None, '_keep_checkpoint_max': 5, '_is_chief': True, '_save_summary_steps': 100, '_num_ps_replicas': 0, '_model_dir': 'models'}


In [34]:
predictor.train(input_fn=lambda: input_fn_train(PATH_DATA_TRAIN, repeat_count=TRAIN_EPOCHS))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-188010
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 188011 into models\model.ckpt.
INFO:tensorflow:step = 188011, loss = 24.928185
INFO:tensorflow:global_step/sec: 36.7224
INFO:tensorflow:step = 188111, loss = 33.156773 (2.722 sec)
INFO:tensorflow:global_step/sec: 41.7107
INFO:tensorflow:step = 188211, loss = 33.887165 (2.397 sec)
INFO:tensorflow:global_step/sec: 41.6468
INFO:tensorflow:step = 188311, loss = 27.326492 (2.401 sec)
INFO:tensorflow:global_step/sec: 41.0723
INFO:tensorflow:step = 188411, loss = 29.516582 (2.435 sec)
INFO:tensorflow:global_step/sec: 41.5598
INFO:tensorflow:step = 188511, loss = 28.491512 (2.406 sec)
INFO:tensorflow:global_step/sec: 42.0076
INFO:tensorflow:step = 188611, l

INFO:tensorflow:step = 195811, loss = 23.75888 (2.440 sec)
INFO:tensorflow:global_step/sec: 40.8556
INFO:tensorflow:step = 195911, loss = 31.156734 (2.448 sec)
INFO:tensorflow:global_step/sec: 41.8321
INFO:tensorflow:step = 196011, loss = 23.085077 (2.391 sec)
INFO:tensorflow:global_step/sec: 41.5319
INFO:tensorflow:step = 196111, loss = 34.23162 (2.407 sec)
INFO:tensorflow:global_step/sec: 41.3788
INFO:tensorflow:step = 196211, loss = 36.087868 (2.420 sec)
INFO:tensorflow:global_step/sec: 41.3714
INFO:tensorflow:step = 196311, loss = 32.616066 (2.415 sec)
INFO:tensorflow:global_step/sec: 41.2659
INFO:tensorflow:step = 196411, loss = 32.83697 (2.423 sec)
INFO:tensorflow:global_step/sec: 40.6047
INFO:tensorflow:step = 196511, loss = 32.38109 (2.465 sec)
INFO:tensorflow:global_step/sec: 41.5911
INFO:tensorflow:step = 196611, loss = 26.036007 (2.402 sec)
INFO:tensorflow:global_step/sec: 40.7279
INFO:tensorflow:step = 196711, loss = 26.328724 (2.455 sec)
INFO:tensorflow:global_step/sec: 41

INFO:tensorflow:global_step/sec: 40.8867
INFO:tensorflow:step = 204011, loss = 25.186235 (2.447 sec)
INFO:tensorflow:global_step/sec: 40.6641
INFO:tensorflow:step = 204111, loss = 29.912231 (2.459 sec)
INFO:tensorflow:global_step/sec: 40.0611
INFO:tensorflow:step = 204211, loss = 23.429577 (2.497 sec)
INFO:tensorflow:global_step/sec: 40.7194
INFO:tensorflow:step = 204311, loss = 31.649889 (2.454 sec)
INFO:tensorflow:global_step/sec: 40.8135
INFO:tensorflow:step = 204411, loss = 26.581375 (2.450 sec)
INFO:tensorflow:global_step/sec: 41.0082
INFO:tensorflow:step = 204511, loss = 29.005531 (2.441 sec)
INFO:tensorflow:global_step/sec: 41.8986
INFO:tensorflow:step = 204611, loss = 27.451626 (2.384 sec)
INFO:tensorflow:global_step/sec: 41.4416
INFO:tensorflow:step = 204711, loss = 31.526722 (2.414 sec)
INFO:tensorflow:global_step/sec: 41.4921
INFO:tensorflow:step = 204811, loss = 30.764849 (2.412 sec)
INFO:tensorflow:global_step/sec: 41.4547
INFO:tensorflow:step = 204911, loss = 30.281635 (2

INFO:tensorflow:global_step/sec: 40.887
INFO:tensorflow:step = 212211, loss = 30.719334 (2.447 sec)
INFO:tensorflow:global_step/sec: 40.3243
INFO:tensorflow:step = 212311, loss = 35.25879 (2.480 sec)
INFO:tensorflow:global_step/sec: 40.974
INFO:tensorflow:step = 212411, loss = 33.35012 (2.441 sec)
INFO:tensorflow:global_step/sec: 41.1939
INFO:tensorflow:step = 212511, loss = 29.60277 (2.426 sec)
INFO:tensorflow:global_step/sec: 40.8914
INFO:tensorflow:step = 212611, loss = 37.975334 (2.448 sec)
INFO:tensorflow:Saving checkpoints for 212708 into models\model.ckpt.
INFO:tensorflow:global_step/sec: 33.5742
INFO:tensorflow:step = 212711, loss = 26.00983 (2.977 sec)
INFO:tensorflow:global_step/sec: 41.5369
INFO:tensorflow:step = 212811, loss = 27.972145 (2.408 sec)
INFO:tensorflow:global_step/sec: 40.6487
INFO:tensorflow:step = 212911, loss = 27.521336 (2.548 sec)
INFO:tensorflow:global_step/sec: 39.1067
INFO:tensorflow:step = 213011, loss = 30.164097 (2.468 sec)
INFO:tensorflow:global_step

INFO:tensorflow:global_step/sec: 41.7023
INFO:tensorflow:step = 220311, loss = 35.755005 (2.397 sec)
INFO:tensorflow:global_step/sec: 41.4933
INFO:tensorflow:step = 220411, loss = 28.740828 (2.411 sec)
INFO:tensorflow:global_step/sec: 41.7907
INFO:tensorflow:step = 220511, loss = 28.826284 (2.393 sec)
INFO:tensorflow:global_step/sec: 41.7717
INFO:tensorflow:step = 220611, loss = 28.52361 (2.395 sec)
INFO:tensorflow:global_step/sec: 41.5727
INFO:tensorflow:step = 220711, loss = 25.15634 (2.406 sec)
INFO:tensorflow:global_step/sec: 41.4705
INFO:tensorflow:step = 220811, loss = 33.47421 (2.409 sec)
INFO:tensorflow:global_step/sec: 42.0031
INFO:tensorflow:step = 220911, loss = 40.089558 (2.381 sec)
INFO:tensorflow:global_step/sec: 41.4197
INFO:tensorflow:step = 221011, loss = 27.237278 (2.417 sec)
INFO:tensorflow:global_step/sec: 40.7592
INFO:tensorflow:step = 221111, loss = 36.27529 (2.451 sec)
INFO:tensorflow:global_step/sec: 41.4006
INFO:tensorflow:step = 221211, loss = 30.470835 (2.415

INFO:tensorflow:global_step/sec: 40.1492
INFO:tensorflow:step = 228511, loss = 32.337166 (2.492 sec)
INFO:tensorflow:global_step/sec: 40.9863
INFO:tensorflow:step = 228611, loss = 35.072174 (2.438 sec)
INFO:tensorflow:global_step/sec: 41.233
INFO:tensorflow:step = 228711, loss = 29.028366 (2.425 sec)
INFO:tensorflow:global_step/sec: 40.6398
INFO:tensorflow:step = 228811, loss = 34.50083 (2.461 sec)
INFO:tensorflow:global_step/sec: 41.8537
INFO:tensorflow:step = 228911, loss = 24.935993 (2.389 sec)
INFO:tensorflow:global_step/sec: 41.9042
INFO:tensorflow:step = 229011, loss = 34.07903 (2.386 sec)
INFO:tensorflow:global_step/sec: 42.0931
INFO:tensorflow:step = 229111, loss = 33.850666 (2.378 sec)
INFO:tensorflow:global_step/sec: 41.0405
INFO:tensorflow:step = 229211, loss = 33.681023 (2.437 sec)
INFO:tensorflow:global_step/sec: 40.9307
INFO:tensorflow:step = 229311, loss = 33.10956 (2.441 sec)
INFO:tensorflow:global_step/sec: 41.4004
INFO:tensorflow:step = 229411, loss = 30.189846 (2.416

INFO:tensorflow:global_step/sec: 41.5236
INFO:tensorflow:step = 236711, loss = 28.181194 (2.408 sec)
INFO:tensorflow:global_step/sec: 42.1736
INFO:tensorflow:step = 236811, loss = 28.879974 (2.371 sec)
INFO:tensorflow:global_step/sec: 41.392
INFO:tensorflow:step = 236911, loss = 28.726126 (2.416 sec)
INFO:tensorflow:global_step/sec: 41.3174
INFO:tensorflow:step = 237011, loss = 27.262608 (2.421 sec)
INFO:tensorflow:global_step/sec: 41.3791
INFO:tensorflow:step = 237111, loss = 30.685776 (2.419 sec)
INFO:tensorflow:global_step/sec: 41.2637
INFO:tensorflow:step = 237211, loss = 33.49853 (2.422 sec)
INFO:tensorflow:Saving checkpoints for 237280 into models\model.ckpt.
INFO:tensorflow:global_step/sec: 34.7377
INFO:tensorflow:step = 237311, loss = 33.879414 (2.877 sec)
INFO:tensorflow:global_step/sec: 41.2853
INFO:tensorflow:step = 237411, loss = 32.59134 (2.422 sec)
INFO:tensorflow:global_step/sec: 41.3527
INFO:tensorflow:step = 237511, loss = 29.826496 (2.417 sec)
INFO:tensorflow:global_s

INFO:tensorflow:global_step/sec: 41.6588
INFO:tensorflow:step = 244811, loss = 27.214281 (2.402 sec)
INFO:tensorflow:global_step/sec: 41.5837
INFO:tensorflow:step = 244911, loss = 25.483452 (2.404 sec)
INFO:tensorflow:global_step/sec: 41.7049
INFO:tensorflow:step = 245011, loss = 36.247337 (2.398 sec)
INFO:tensorflow:global_step/sec: 41.7845
INFO:tensorflow:step = 245111, loss = 26.953962 (2.393 sec)
INFO:tensorflow:global_step/sec: 41.5403
INFO:tensorflow:step = 245211, loss = 31.983694 (2.406 sec)
INFO:tensorflow:global_step/sec: 41.7976
INFO:tensorflow:step = 245311, loss = 25.492477 (2.392 sec)
INFO:tensorflow:global_step/sec: 41.6495
INFO:tensorflow:step = 245411, loss = 31.49031 (2.402 sec)
INFO:tensorflow:global_step/sec: 41.9532
INFO:tensorflow:step = 245511, loss = 22.354897 (2.385 sec)
INFO:tensorflow:global_step/sec: 41.4391
INFO:tensorflow:step = 245611, loss = 27.413294 (2.412 sec)
INFO:tensorflow:global_step/sec: 41.5377
INFO:tensorflow:step = 245711, loss = 37.713726 (2.

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x1ac22437668>

In [35]:
training_results = predictor.evaluate(input_fn=lambda: input_fn_train(PATH_DATA_TRAIN))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-03-19-00:09:30
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-250680
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-03-19-00:11:02
INFO:tensorflow:Saving dict for global step 250680: average_loss = 0.031031687, global_step = 250680, loss = 3.9720361


In [36]:
valid_results = predictor.evaluate(input_fn=lambda: input_fn_train(PATH_DATA_VALID))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-03-19-00:11:03
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-250680
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-03-19-00:11:08
INFO:tensorflow:Saving dict for global step 250680: average_loss = 0.029388107, global_step = 250680, loss = 3.7598965


In [37]:
valid_results = predictor.predict(input_fn=lambda: input_fn_predict(PATH_DATA_VALID))
rmspe_valid = 0
n = 0
for idx, prediction in enumerate(valid_results):
    if valid.iloc[idx]['Sales'] > 0:
        rmspe_valid += ((math.exp(valid.iloc[idx]['Sales']) - math.exp(prediction['predictions'][0])) / (math.exp(valid.iloc[idx]['Sales']) - 1)) ** 2
        n += 1
rmspe_valid /= n
print("Validation RMSPE: ", math.sqrt(rmspe_valid))    

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-250680
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Validation RMSPE:  0.16927277037537805


In [38]:
test_results = predictor.predict(input_fn=lambda: input_fn_predict(PATH_DATA_TEST))
sales = []
for idx, prediction in enumerate(test_results):
    sales.append(int(math.exp(prediction['predictions'][0])-1))
df_test = pd.read_csv('data/processed/test.csv')
df_test['Sales'] = sales
df_test = df_test[['Id', 'Sales']]
df_test.set_index('Id', inplace=True)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-250680
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [39]:
raw_test = pd.read_csv('data/raw/test.csv')
raw_test.head()
sub_test = raw_test[['Id', 'Store']]

In [40]:
df_test.head()

Unnamed: 0_level_0,Sales
Id,Unnamed: 1_level_1
1,4643
2,7840
3,9341
4,5706
5,6522


In [41]:
sub_test = sub_test.join(df_test, on='Id')
sub_test.loc[np.isnan(sub_test['Sales']), 'Sales'] = 0
sub_test.drop(['Store'], axis=1, inplace=True)

In [42]:
sub_test.to_csv('data/processed/sub.csv', index=False)