In [1]:
import os
import math

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import tensorflow as tf

# Emit TensorFlow logs at INFO level so the estimator's progress lines
# (step, loss, checkpoint saves/restores) appear in the cell outputs.
tf.logging.set_verbosity(tf.logging.INFO)

In [2]:
# Rows per mini-batch, and number of passes over the training file per train() call.
BATCH_SIZE, TRAIN_EPOCHS = 128, 10

# CSV files written by the preparation cell and read back by the input functions,
# one per data split.
PATH_DATA_TRAIN, PATH_DATA_VALID, PATH_DATA_TEST = (
    'data/processed/input_{}.csv'.format(split)
    for split in ('train', 'valid', 'test')
)

In [3]:
# Load the processed train/test tables.
df_train = pd.read_csv('data/processed/train.csv')
df_test = pd.read_csv('data/processed/test.csv')

# The model is trained on log(1 + x) of the target and of the competition
# distance. np.log1p is vectorized (no per-row Python lambda), and plain column
# assignment replaces the column outright instead of writing floats into an
# existing, possibly integer-typed, column.
df_train['Sales'] = np.log1p(df_train['Sales'])
df_train['CompetitionDistance'] = np.log1p(df_train['CompetitionDistance'])

# Hold out 5% of the rows for validation; the fixed seed keeps the split reproducible.
train, valid = train_test_split(df_train, test_size=0.05, random_state=42)
train.to_csv(PATH_DATA_TRAIN, index=False)
valid.to_csv(PATH_DATA_VALID, index=False)

# Give the test table the same transform and column layout as the training CSVs:
# drop 'Id' and insert a placeholder 'Sales' column at index 2, which is where
# the input functions expect the label column to be.
df_test['CompetitionDistance'] = np.log1p(df_test['CompetitionDistance'])
df_test = df_test.drop(['Id'], axis=1)
df_test.insert(loc=2, column='Sales', value=0)
df_test.to_csv(PATH_DATA_TEST, index=False)

We build our input functions, which are responsible for feeding data to the model. One feeds the training and validation phases; the other is used for predictions.

In [4]:
# Keys of the feature dict, in CSV column order with the 'Sales' label column
# (index 2 of each row) left out.
feature_names = [
    'Store',
    'DayOfWeek',
    'Promo',
    'StateHoliday',
    'SchoolHoliday',
    'StoreType',
    'Assortment',
    'CompetitionDistance',
    'Promo2',
    'Day',
    'Month',
    'Year',
    'isCompetition',
]

# One default per CSV column, 'Sales' included; passed to tf.decode_csv as the
# per-column record defaults.
csv_defaults = [
    [1],     # Store
    [1],     # DayOfWeek
    [0.],    # Sales (label)
    [0],     # Promo
    ["0"],   # StateHoliday
    [0],     # SchoolHoliday
    ['a'],   # StoreType
    ['a'],   # Assortment
    [0.],    # CompetitionDistance
    [0],     # Promo2
    [1],     # Day
    [1],     # Month
    [2013],  # Year
    [0],     # isCompetition
]

def input_fn_train(file_path, repeat_count=1):
    """Build the (features, label) input tensors used for training and evaluation.

    Args:
        file_path: Path of a CSV file with a header row and columns laid out as
            in `csv_defaults` (the `Sales` label sits at index 2).
        repeat_count: Number of passes over the file.

    Returns:
        The `(features, label)` pair produced by a one-shot iterator, where
        `features` is a dict keyed by `feature_names` and both elements are
        batched by `BATCH_SIZE`.
    """
    def decode_csv(line):
        """Convert a CSV row to a dictionary of features and its label."""
        columns = tf.decode_csv(line, csv_defaults)
        label = columns.pop(2)  # Sales is the third column (index 2)
        return dict(zip(feature_names, columns)), label

    dataset = (tf.data.TextLineDataset(file_path)
               .skip(1)  # Skip header row
               .map(decode_csv))
    # Shuffle before repeating: repeating first lets rows from consecutive
    # epochs mix inside the shuffle buffer, so a row can be seen again before
    # the current pass over the file has finished.
    dataset = dataset.shuffle(buffer_size=2048)
    dataset = dataset.repeat(repeat_count)  # Repeats dataset this # times
    dataset = dataset.batch(BATCH_SIZE)

    return dataset.make_one_shot_iterator().get_next()

def input_fn_predict(file_path):
    """Build the features-only input tensors used for prediction.

    Rows are read in file order (no shuffling, no repetition), so the i-th
    prediction corresponds to the i-th data row of the CSV.

    Args:
        file_path: Path of a CSV file with a header row and columns laid out as
            in `csv_defaults`; the `Sales` column at index 2 is parsed and
            discarded.

    Returns:
        A dict keyed by `feature_names`, batched by `BATCH_SIZE`, produced by a
        one-shot iterator.
    """
    def decode_csv(line):
        """Convert a CSV row to a dictionary of features, dropping the label."""
        columns = tf.decode_csv(line, csv_defaults)
        del columns[2]  # Sales is the third column (index 2); not a feature
        return dict(zip(feature_names, columns))

    dataset = (tf.data.TextLineDataset(file_path)
               .skip(1)  # Skip header row
               .map(decode_csv))
    dataset = dataset.batch(BATCH_SIZE)

    return dataset.make_one_shot_iterator().get_next()

Let's now create our feature columns as described in the data analysis.
Since a DNN only accepts dense columns, we wrap our categorical columns in indicator or embedding columns. Embeddings let us reduce the dimensionality of our inputs by progressively learning the correlations among their values. This is especially useful for Stores.

In [5]:
def _vocabulary(column):
    """Return the distinct values of a training-frame column as a list."""
    return df_train[column].unique().tolist()


def _categorical(key, vocabulary):
    """Categorical column for `key` restricted to the given vocabulary."""
    return tf.feature_column.categorical_column_with_vocabulary_list(
        key=key,
        vocabulary_list=vocabulary)


def _embedded(key, vocabulary, dimension):
    """Categorical column wrapped in an embedding of the given dimension."""
    return tf.feature_column.embedding_column(_categorical(key, vocabulary), dimension)


def _one_hot(key, vocabulary):
    """Categorical column wrapped in an indicator column."""
    return tf.feature_column.indicator_column(_categorical(key, vocabulary))


# Vocabularies: every distinct value each categorical feature takes in df_train.
stores = _vocabulary('Store')
dows = _vocabulary('DayOfWeek')
promos = _vocabulary('Promo')
shs1 = _vocabulary('StateHoliday')
shs2 = _vocabulary('SchoolHoliday')
sts = _vocabulary('StoreType')
assos = _vocabulary('Assortment')
promo2s = _vocabulary('Promo2')
days = _vocabulary('Day')
months = _vocabulary('Month')
years = _vocabulary('Year')
iCs = _vocabulary('isCompetition')

# Dense inputs of the network. Each categorical feature is wrapped in either an
# embedding (with the listed dimension) or an indicator column;
# CompetitionDistance is fed as a plain numeric column.
feature_columns = [
    _embedded("Store", stores, 22),
    _one_hot("DayOfWeek", dows),
    _one_hot("Promo", promos),
    _embedded("StateHoliday", shs1, 2),
    _one_hot("SchoolHoliday", shs2),
    _embedded("StoreType", sts, 2),
    _embedded("Assortment", assos, 2),
    tf.feature_column.numeric_column('CompetitionDistance'),
    _one_hot("Promo2", promo2s),
    _embedded("Day", days, 7),
    _embedded("Month", months, 4),
    _embedded("Year", years, 2),
    _one_hot("isCompetition", iCs),
]

In [6]:
# FTRL optimizer with a small learning rate. The L1/L2 regularization strengths
# are left at the optimizer defaults (l1=0.01 / l2=0.001 are not enabled).
ftrl_optimizer = tf.train.FtrlOptimizer(learning_rate=0.001)

# Two-hidden-layer DNN regressor over the feature columns. Checkpoints are
# written to, and restored from, the 'models' directory.
predictor = tf.estimator.DNNRegressor(
    model_dir='models',
    feature_columns=feature_columns,
    hidden_units=[2048, 256],
    dropout=0.25,
    optimizer=ftrl_optimizer,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_global_id_in_cluster': 0, '_service': None, '_evaluation_master': '', '_save_checkpoints_steps': None, '_save_summary_steps': 100, '_log_step_count_steps': 100, '_task_type': 'worker', '_save_checkpoints_secs': 600, '_num_worker_replicas': 1, '_tf_random_seed': None, '_master': '', '_task_id': 0, '_keep_checkpoint_max': 5, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001BAFE2975F8>, '_session_config': None, '_model_dir': 'models', '_num_ps_replicas': 0, '_keep_checkpoint_every_n_hours': 10000, '_is_chief': True}


In [16]:
# Train for TRAIN_EPOCHS passes over the training CSV.
# NOTE(review): the captured log shows "Restoring parameters from
# models\model.ckpt-62670", i.e. this call continued from a checkpoint already
# present in model_dir rather than starting from scratch. Empty the 'models'
# directory first if a from-scratch, reproducible run is intended.
predictor.train(input_fn=lambda: input_fn_train(PATH_DATA_TRAIN, repeat_count=TRAIN_EPOCHS))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-62670
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 62671 into models\model.ckpt.
INFO:tensorflow:step = 62671, loss = 107.14925
INFO:tensorflow:global_step/sec: 54.1186
INFO:tensorflow:step = 62771, loss = 123.50198 (1.849 sec)
INFO:tensorflow:global_step/sec: 63.5966
INFO:tensorflow:step = 62871, loss = 84.476105 (1.571 sec)
INFO:tensorflow:global_step/sec: 61.512
INFO:tensorflow:step = 62971, loss = 115.090515 (1.627 sec)
INFO:tensorflow:global_step/sec: 61.3831
INFO:tensorflow:step = 63071, loss = 179.60149 (1.629 sec)
INFO:tensorflow:global_step/sec: 62.3971
INFO:tensorflow:step = 63171, loss = 121.52215 (1.604 sec)
INFO:tensorflow:global_step/sec: 62.9424
INFO:tensorflow:step = 63271, loss = 117

INFO:tensorflow:global_step/sec: 62.5583
INFO:tensorflow:step = 70571, loss = 111.006424 (1.599 sec)
INFO:tensorflow:global_step/sec: 63.2113
INFO:tensorflow:step = 70671, loss = 89.167145 (1.582 sec)
INFO:tensorflow:global_step/sec: 63.7558
INFO:tensorflow:step = 70771, loss = 105.2519 (1.568 sec)
INFO:tensorflow:global_step/sec: 63.6124
INFO:tensorflow:step = 70871, loss = 109.84526 (1.572 sec)
INFO:tensorflow:global_step/sec: 62.585
INFO:tensorflow:step = 70971, loss = 114.12868 (1.598 sec)
INFO:tensorflow:global_step/sec: 62.7312
INFO:tensorflow:step = 71071, loss = 127.7356 (1.594 sec)
INFO:tensorflow:global_step/sec: 63.5667
INFO:tensorflow:step = 71171, loss = 114.29347 (1.575 sec)
INFO:tensorflow:global_step/sec: 62.7365
INFO:tensorflow:step = 71271, loss = 120.87192 (1.593 sec)
INFO:tensorflow:global_step/sec: 62.4174
INFO:tensorflow:step = 71371, loss = 94.04311 (1.601 sec)
INFO:tensorflow:global_step/sec: 62.0384
INFO:tensorflow:step = 71471, loss = 97.66802 (1.612 sec)
INFO

INFO:tensorflow:step = 78771, loss = 142.2842 (1.558 sec)
INFO:tensorflow:global_step/sec: 64.0092
INFO:tensorflow:step = 78871, loss = 112.7637 (1.561 sec)
INFO:tensorflow:global_step/sec: 64.5947
INFO:tensorflow:step = 78971, loss = 92.0212 (1.549 sec)
INFO:tensorflow:global_step/sec: 62.6838
INFO:tensorflow:step = 79071, loss = 104.7882 (1.594 sec)
INFO:tensorflow:global_step/sec: 63.289
INFO:tensorflow:step = 79171, loss = 104.07747 (1.581 sec)
INFO:tensorflow:global_step/sec: 63.6266
INFO:tensorflow:step = 79271, loss = 123.10328 (1.572 sec)
INFO:tensorflow:global_step/sec: 62.7761
INFO:tensorflow:step = 79371, loss = 92.27021 (1.593 sec)
INFO:tensorflow:global_step/sec: 62.8897
INFO:tensorflow:step = 79471, loss = 102.71976 (1.592 sec)
INFO:tensorflow:global_step/sec: 63.4887
INFO:tensorflow:step = 79571, loss = 139.60608 (1.573 sec)
INFO:tensorflow:global_step/sec: 63.7928
INFO:tensorflow:step = 79671, loss = 127.46947 (1.568 sec)
INFO:tensorflow:global_step/sec: 64.0326
INFO:te

INFO:tensorflow:global_step/sec: 64.6267
INFO:tensorflow:step = 87071, loss = 103.41435 (1.548 sec)
INFO:tensorflow:global_step/sec: 62.597
INFO:tensorflow:step = 87171, loss = 89.99614 (1.598 sec)
INFO:tensorflow:global_step/sec: 61.5663
INFO:tensorflow:step = 87271, loss = 133.09224 (1.624 sec)
INFO:tensorflow:global_step/sec: 61.9108
INFO:tensorflow:step = 87371, loss = 118.3649 (1.615 sec)
INFO:tensorflow:global_step/sec: 63.6612
INFO:tensorflow:step = 87471, loss = 100.446846 (1.571 sec)
INFO:tensorflow:global_step/sec: 63.6544
INFO:tensorflow:step = 87571, loss = 124.60663 (1.570 sec)
INFO:tensorflow:global_step/sec: 63.7799
INFO:tensorflow:step = 87671, loss = 126.37146 (1.570 sec)
INFO:tensorflow:global_step/sec: 62.5812
INFO:tensorflow:step = 87771, loss = 106.07463 (1.597 sec)
INFO:tensorflow:global_step/sec: 61.819
INFO:tensorflow:step = 87871, loss = 107.2383 (1.618 sec)
INFO:tensorflow:global_step/sec: 64.4837
INFO:tensorflow:step = 87971, loss = 116.551025 (1.551 sec)
INF

INFO:tensorflow:step = 95271, loss = 89.20017 (1.613 sec)
INFO:tensorflow:global_step/sec: 61.2141
INFO:tensorflow:step = 95371, loss = 89.10434 (1.634 sec)
INFO:tensorflow:global_step/sec: 63.5507
INFO:tensorflow:step = 95471, loss = 90.92029 (1.573 sec)
INFO:tensorflow:global_step/sec: 63.6158
INFO:tensorflow:step = 95571, loss = 108.706406 (1.572 sec)
INFO:tensorflow:global_step/sec: 64.0608
INFO:tensorflow:step = 95671, loss = 127.26102 (1.561 sec)
INFO:tensorflow:global_step/sec: 64.51
INFO:tensorflow:step = 95771, loss = 126.64249 (1.550 sec)
INFO:tensorflow:global_step/sec: 64.2831
INFO:tensorflow:step = 95871, loss = 156.97028 (1.558 sec)
INFO:tensorflow:global_step/sec: 63.4315
INFO:tensorflow:step = 95971, loss = 88.76896 (1.574 sec)
INFO:tensorflow:global_step/sec: 64.0785
INFO:tensorflow:step = 96071, loss = 106.369896 (1.561 sec)
INFO:tensorflow:global_step/sec: 64.0293
INFO:tensorflow:step = 96171, loss = 90.43747 (1.563 sec)
INFO:tensorflow:global_step/sec: 63.6173
INFO:

INFO:tensorflow:global_step/sec: 63.3344
INFO:tensorflow:step = 103471, loss = 106.9368 (1.579 sec)
INFO:tensorflow:global_step/sec: 64.7618
INFO:tensorflow:step = 103571, loss = 121.56258 (1.544 sec)
INFO:tensorflow:global_step/sec: 63.7019
INFO:tensorflow:step = 103671, loss = 104.61163 (1.570 sec)
INFO:tensorflow:global_step/sec: 64.0943
INFO:tensorflow:step = 103771, loss = 157.34973 (1.560 sec)
INFO:tensorflow:global_step/sec: 63.199
INFO:tensorflow:step = 103871, loss = 107.52329 (1.581 sec)
INFO:tensorflow:global_step/sec: 61.6282
INFO:tensorflow:step = 103971, loss = 98.81199 (1.624 sec)
INFO:tensorflow:global_step/sec: 64.2144
INFO:tensorflow:step = 104071, loss = 101.9642 (1.557 sec)
INFO:tensorflow:global_step/sec: 63.4794
INFO:tensorflow:step = 104171, loss = 118.94961 (1.575 sec)
INFO:tensorflow:global_step/sec: 60.3843
INFO:tensorflow:step = 104271, loss = 112.31298 (1.655 sec)
INFO:tensorflow:global_step/sec: 63.5968
INFO:tensorflow:step = 104371, loss = 106.42224 (1.572

INFO:tensorflow:global_step/sec: 61.8942
INFO:tensorflow:step = 111671, loss = 141.89183 (1.616 sec)
INFO:tensorflow:global_step/sec: 64.0223
INFO:tensorflow:step = 111771, loss = 94.5073 (1.562 sec)
INFO:tensorflow:global_step/sec: 63.9261
INFO:tensorflow:step = 111871, loss = 110.463715 (1.564 sec)
INFO:tensorflow:global_step/sec: 62.5297
INFO:tensorflow:step = 111971, loss = 110.44968 (1.598 sec)
INFO:tensorflow:global_step/sec: 62.2217
INFO:tensorflow:step = 112071, loss = 114.606964 (1.608 sec)
INFO:tensorflow:global_step/sec: 61.5493
INFO:tensorflow:step = 112171, loss = 106.00574 (1.624 sec)
INFO:tensorflow:global_step/sec: 60.0261
INFO:tensorflow:step = 112271, loss = 133.28279 (1.667 sec)
INFO:tensorflow:global_step/sec: 51.9052
INFO:tensorflow:step = 112371, loss = 91.78593 (1.927 sec)
INFO:tensorflow:global_step/sec: 59.4318
INFO:tensorflow:step = 112471, loss = 94.96036 (1.683 sec)
INFO:tensorflow:global_step/sec: 62.1948
INFO:tensorflow:step = 112571, loss = 96.00877 (1.60

INFO:tensorflow:step = 119771, loss = 96.48244 (1.563 sec)
INFO:tensorflow:global_step/sec: 63.812
INFO:tensorflow:step = 119871, loss = 95.22226 (1.565 sec)
INFO:tensorflow:global_step/sec: 64.7495
INFO:tensorflow:step = 119971, loss = 113.54261 (1.545 sec)
INFO:tensorflow:global_step/sec: 63.9474
INFO:tensorflow:step = 120071, loss = 129.64267 (1.565 sec)
INFO:tensorflow:global_step/sec: 63.8344
INFO:tensorflow:step = 120171, loss = 89.68358 (1.568 sec)
INFO:tensorflow:global_step/sec: 64.1531
INFO:tensorflow:step = 120271, loss = 113.46835 (1.556 sec)
INFO:tensorflow:global_step/sec: 64.4191
INFO:tensorflow:step = 120371, loss = 100.96797 (1.553 sec)
INFO:tensorflow:global_step/sec: 63.4081
INFO:tensorflow:step = 120471, loss = 115.25705 (1.578 sec)
INFO:tensorflow:global_step/sec: 64.6856
INFO:tensorflow:step = 120571, loss = 109.12264 (1.546 sec)
INFO:tensorflow:global_step/sec: 64.034
INFO:tensorflow:step = 120671, loss = 114.38852 (1.561 sec)
INFO:tensorflow:global_step/sec: 62.

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x1bafe297f60>

In [17]:
# Evaluate the trained model on the training split (single pass). The logged
# result dict holds average_loss, loss and global_step; the label is the
# log-transformed Sales, so the losses are in log space.
training_results = predictor.evaluate(input_fn=lambda: input_fn_train(PATH_DATA_TRAIN))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-03-19-01:09:30
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-125340
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-03-19-01:10:49
INFO:tensorflow:Saving dict for global step 125340: average_loss = 0.049209233, global_step = 125340, loss = 6.2987504


In [18]:
# Same evaluation on the held-out validation split, for comparison with the
# training-split metrics (losses are again on the log-transformed Sales).
valid_results = predictor.evaluate(input_fn=lambda: input_fn_train(PATH_DATA_VALID))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-03-19-01:10:49
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-125340
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-03-19-01:10:54
INFO:tensorflow:Saving dict for global step 125340: average_loss = 0.047682326, global_step = 125340, loss = 6.100448


In [19]:
# Root Mean Squared Percentage Error on the validation split, computed on the
# original sales scale. Both the stored 'Sales' values and the predictions are
# log(1 + sales), so each is mapped back with expm1 before comparing.
valid_predictions = predictor.predict(input_fn=lambda: input_fn_predict(PATH_DATA_VALID))

# input_fn_predict reads the CSV in file order, which is the row order of `valid`.
predicted_log = np.array(
    [prediction['predictions'][0] for prediction in valid_predictions],
    dtype=np.float64)
actual_log = valid['Sales'].values.astype(np.float64)

if len(predicted_log) != len(actual_log):
    raise ValueError(
        "got {} predictions for {} validation rows".format(len(predicted_log), len(actual_log)))

# Rows with zero sales are excluded: the percentage error is undefined for them.
has_sales = actual_log > 0
n = int(has_sales.sum())
if n == 0:
    raise ValueError("no validation row has positive Sales; RMSPE is undefined")

actual_sales = np.expm1(actual_log[has_sales])
predicted_sales = np.expm1(predicted_log[has_sales])
rmspe_valid = float(np.mean(((actual_sales - predicted_sales) / actual_sales) ** 2))
print("Validation RMSPE: ", math.sqrt(rmspe_valid))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-125340
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Validation RMSPE:  0.1946632733263381


In [20]:
# Predict on the test CSV and map each log-space prediction back to a sales
# figure: exp(p) - 1, truncated to an integer.
test_results = predictor.predict(input_fn=lambda: input_fn_predict(PATH_DATA_TEST))
sales = [int(math.exp(result['predictions'][0]) - 1) for result in test_results]

# Re-read the processed test table (which still has its 'Id' column), attach
# the predictions and keep only an Id-indexed 'Sales' column.
df_test = (
    pd.read_csv('data/processed/test.csv')
    .assign(Sales=sales)[['Id', 'Sales']]
    .set_index('Id')
)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-125340
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [21]:
# Load the raw test table and keep the columns that seed the submission frame.
# (A stray mid-cell `raw_test.head()` was removed: only the last expression of a
# cell is displayed, so its result was discarded.)
raw_test = pd.read_csv('data/raw/test.csv')
sub_test = raw_test[['Id', 'Store']]

In [22]:
# Preview the first predictions (Id-indexed, one 'Sales' column).
df_test.head()

Unnamed: 0_level_0,Sales
Id,Unnamed: 1_level_1
1,4113
2,6503
3,8048
4,4996
5,5894


In [23]:
# Build the submission frame from every Id in the raw test table. Ids that have
# no prediction in df_test get Sales = 0.
# The frame is derived from raw_test rather than from the previous value of
# sub_test, so the cell can be re-run without failing on an already-joined
# 'Sales' column.
sub_test = raw_test[['Id']].join(df_test, on='Id')
sub_test['Sales'] = sub_test['Sales'].fillna(0)

In [24]:
# Write the submission file (Id, Sales) without the DataFrame index.
sub_test.to_csv('data/processed/sub.csv', index=False)