In [1]:
import os
import math
import ast

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import tensorflow as tf

# Emit INFO-level TensorFlow logs so the estimator's training/evaluation
# progress (loss, global step, checkpoints) is printed in the cell outputs.
tf.logging.set_verbosity(tf.logging.INFO)

In [2]:
# Locations of the pre-processed CSV files.
PATH_DATA_TRAIN = 'data/processed/train.csv'
PATH_DATA_TEST = 'data/processed/test.csv'

# Hyper-parameters shared by the input functions and the training call.
TRAIN_EPOCHS = 200   # number of passes over the training file
BATCH_SIZE = 2 * 96  # rows per batch

In [3]:
# Every column is read as a categorical: the values are used as vocabulary
# entries for the feature columns, not as numbers.
dtypes = {
    'Day': 'category',
    'DayOfWeek': 'category',
    'Hour': 'category',
    'device': 'category',
    'activated': 'category',
}

# Use the path constants from the configuration cell so the DataFrames and the
# TensorFlow input functions are guaranteed to read the same files.
df_train = pd.read_csv(PATH_DATA_TRAIN, dtype=dtypes)
df_test = pd.read_csv(PATH_DATA_TEST, dtype=dtypes)

We build our input functions, which are responsible for feeding the model with inputs. One is dedicated to the training and evaluation phases; the other is used for predictions.

In [4]:
# Names given to the feature tensors, in CSV column order (label excluded).
feature_names = [
    'Day',
    'DayOfWeek',
    'Hour',
    'device',
]
# One default per CSV column (the four features, then the label). String
# defaults make tf.decode_csv parse every column as a string.
csv_defaults = [
    ['1'],
    ['1'],
    ['0'],
    ['device_1'],
    ['False'],
]

def input_fn_train(file_path, repeat_count=1, shuffle=True):
    """Build the (features, labels) input tensors for training or evaluation.

    Args:
        file_path: Path of a CSV file with a header row. Its columns must follow
            the order of `csv_defaults`: the features listed in
            `feature_names`, then the label as the last column.
        repeat_count: Number of passes over the file (epochs).
        shuffle: Whether to shuffle the rows (buffer of 256 elements). Defaults
            to True, which matches the previous behaviour; pass False when the
            row order does not matter, e.g. for evaluation.

    Returns:
        A `(features, labels)` tuple for one batch of `BATCH_SIZE` rows:
        `features` maps each name in `feature_names` to a string tensor and
        `labels` is the string tensor holding the last CSV column.
    """
    def decode_csv(line):
        """Convert a CSV row to a dictionary of features and its label."""
        parsed_line = tf.decode_csv(line, csv_defaults)
        features = parsed_line[:-1]
        label = parsed_line[-1]  # The last column is the label
        return dict(zip(feature_names, features)), label

    dataset = (tf.data.TextLineDataset(file_path)
               .skip(1)  # Skip header row
               .map(decode_csv))
    if shuffle:
        # Shuffle *before* repeating so that every row is seen once per epoch;
        # repeating first would let rows of consecutive epochs mix together.
        dataset = dataset.shuffle(buffer_size=256)
    dataset = dataset.repeat(repeat_count)  # Repeats dataset this # times
    dataset = dataset.batch(BATCH_SIZE)

    iterator = dataset.make_one_shot_iterator()

    return iterator.get_next()

def input_fn_predict(file_path):
    """Build the features-only input tensors used for prediction.

    The CSV at `file_path` is read in file order (no shuffling, single pass),
    its header row is skipped and the last column is dropped, so only the
    features named in `feature_names` are returned, batched by `BATCH_SIZE`.
    """
    def decode_csv(line):
        """Convert a CSV row to a dictionary of features (last column dropped)."""
        columns = tf.decode_csv(line, csv_defaults)
        return dict(zip(feature_names, columns[:-1]))

    rows = tf.data.TextLineDataset(file_path).skip(1)  # skip(1): header row
    batches = rows.map(decode_csv).batch(BATCH_SIZE)

    return batches.make_one_shot_iterator().get_next()

Let's now create our feature columns as described in the data analysis.
Since a DNN only accepts dense columns, we wrap our categorical columns in indicator or embedding columns. Embeddings enable us to reduce the dimensionality of our inputs by progressively learning the correlations among them. This is especially useful for categorical features with many distinct values.

In [5]:
# Vocabulary (distinct values seen in the training set) of each column.
days, dows, hours, devices, actived = (
    df_train[column].unique().tolist()
    for column in ('Day', 'DayOfWeek', 'Hour', 'device', 'activated')
)

# Each categorical feature is wrapped in an embedding column so that it can be
# fed to the DNN. Every entry below is (feature key, vocabulary, embedding size).
feature_columns = [
    tf.feature_column.embedding_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=key,
            vocabulary_list=vocabulary),
        dimension=dimension)
    for key, vocabulary, dimension in (
        ("Day", days, 7),
        ("DayOfWeek", dows, 5),
        ("Hour", hours, 4),
        ("device", devices, 3),
    )
]

In [6]:
# DNN classifier with two hidden layers (128 and 32 units) on top of the
# embedded features. Checkpoints are written to / restored from `model_dir`.
# Optional FTRL regularization, currently disabled:
#     l1_regularization_strength=0.01, l2_regularization_strength=0.001
predictor = tf.estimator.DNNClassifier(
    hidden_units=[128, 32],
    feature_columns=feature_columns,
    # String labels as they appear in the CSV; their order fixes the class ids.
    label_vocabulary=['True', 'False'],
    optimizer=tf.train.FtrlOptimizer(learning_rate=0.001),
    dropout=0.05,
    model_dir='models',
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_evaluation_master': '', '_is_chief': True, '_model_dir': 'models', '_master': '', '_num_worker_replicas': 1, '_save_checkpoints_steps': None, '_log_step_count_steps': 100, '_save_checkpoints_secs': 600, '_save_summary_steps': 100, '_service': None, '_session_config': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000023DBAA43CF8>, '_keep_checkpoint_max': 5, '_task_id': 0, '_task_type': 'worker', '_global_id_in_cluster': 0, '_keep_checkpoint_every_n_hours': 10000, '_tf_random_seed': None, '_num_ps_replicas': 0}


In [13]:
# Train for TRAIN_EPOCHS passes over the training file. The input function is
# wrapped in a lambda because the estimator must build it inside its own graph.
predictor.train(
    input_fn=lambda: input_fn_train(
        file_path=PATH_DATA_TRAIN,
        repeat_count=TRAIN_EPOCHS,
    )
)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-20476
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 20477 into models\model.ckpt.
INFO:tensorflow:loss = 41.712067, step = 20477
INFO:tensorflow:global_step/sec: 165.67
INFO:tensorflow:loss = 42.929314, step = 20577 (0.606 sec)
INFO:tensorflow:global_step/sec: 226.156
INFO:tensorflow:loss = 53.10225, step = 20677 (0.442 sec)
INFO:tensorflow:global_step/sec: 233.023
INFO:tensorflow:loss = 48.03665, step = 20777 (0.429 sec)
INFO:tensorflow:global_step/sec: 229.802
INFO:tensorflow:loss = 37.464706, step = 20877 (0.435 sec)
INFO:tensorflow:global_step/sec: 231.401
INFO:tensorflow:loss = 51.721718, step = 20977 (0.431 sec)
INFO:tensorflow:global_step/sec: 230.866
INFO:tensorflow:loss = 44.93939, step = 210

INFO:tensorflow:loss = 38.085976, step = 28377 (0.435 sec)
INFO:tensorflow:global_step/sec: 228.748
INFO:tensorflow:loss = 33.23069, step = 28477 (0.436 sec)
INFO:tensorflow:global_step/sec: 227.185
INFO:tensorflow:loss = 32.874474, step = 28577 (0.441 sec)
INFO:tensorflow:global_step/sec: 225.13
INFO:tensorflow:loss = 47.2688, step = 28677 (0.443 sec)
INFO:tensorflow:global_step/sec: 233.572
INFO:tensorflow:loss = 37.33864, step = 28777 (0.428 sec)
INFO:tensorflow:global_step/sec: 231.401
INFO:tensorflow:loss = 32.833275, step = 28877 (0.432 sec)
INFO:tensorflow:global_step/sec: 227.703
INFO:tensorflow:loss = 48.29333, step = 28977 (0.439 sec)
INFO:tensorflow:global_step/sec: 231.401
INFO:tensorflow:loss = 43.370262, step = 29077 (0.433 sec)
INFO:tensorflow:global_step/sec: 230.865
INFO:tensorflow:loss = 33.48403, step = 29177 (0.433 sec)
INFO:tensorflow:global_step/sec: 224.626
INFO:tensorflow:loss = 27.618963, step = 29277 (0.445 sec)
INFO:tensorflow:global_step/sec: 228.747
INFO:te

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x23dbaa43ba8>

In [14]:
# Metrics of the trained model on the training set (single pass over the file).
training_results = predictor.evaluate(
    input_fn=lambda: input_fn_train(file_path=PATH_DATA_TRAIN)
)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-03-20-16:37:28
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-30714
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-03-20-16:37:29
INFO:tensorflow:Saving dict for global step 30714: accuracy = 0.90771264, accuracy_baseline = 0.80982906, auc = 0.9585533, auc_precision_recall = 0.9900411, average_loss = 0.1987712, global_step = 30714, label/mean = 0.80982906, loss = 37.567757, prediction/mean = 0.8107699


In [15]:
# Metrics of the trained model on the held-out test set (single pass).
test_results = predictor.evaluate(
    input_fn=lambda: input_fn_train(file_path=PATH_DATA_TEST)
)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-03-20-16:37:30
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-30714
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-03-20-16:37:30
INFO:tensorflow:Saving dict for global step 30714: accuracy = 0.9131274, accuracy_baseline = 0.8108108, auc = 0.9580418, auc_precision_recall = 0.98986435, average_loss = 0.20340101, global_step = 30714, label/mean = 0.8108108, loss = 35.120575, prediction/mean = 0.80899066


In [16]:
# Predict the class of every test row. The predictions get their own name so
# that `test_results` keeps holding the evaluation metrics instead of being
# replaced by a generator that is exhausted as soon as it has been iterated.
test_predictions = predictor.predict(input_fn=lambda: input_fn_predict(PATH_DATA_TEST))

# Each prediction exposes the predicted label as a one-element array of bytes;
# decode it to text so it can be compared with the `activated` column.
activateds = [
    prediction['classes'][0].decode('utf-8')
    for prediction in test_predictions
]
df_test['predicted'] = activateds

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models\model.ckpt-30714
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [17]:
# Preview the first rows to compare `activated` with `predicted`.
df_test.head(n=5)

Unnamed: 0,Day,DayOfWeek,Hour,device,activated,predicted
0,17,6,7,device_6,False,False
1,30,5,12,device_1,False,False
2,7,3,11,device_4,True,True
3,24,6,8,device_3,False,False
4,9,1,19,device_5,False,False


In [18]:
# Persist the test rows together with their predictions (row index omitted).
df_test.to_csv(
    'data/processed/pred.csv',
    index=False,
)

AttributeError: 'DatetimeIndex' object has no attribute 'head'