In [None]:
import numpy as np
import tensorflow as tf
import os
import math
import sys
import importlib
from datetime import datetime
from datetime import timedelta

# Project modules are reloaded *before* their names are star-imported, so that
# edits made to the .py files while the kernel is running are picked up when
# this cell is re-executed. Reloading after `from X import *` would leave the
# notebook namespace bound to objects of the previously loaded module version
# (and, if a module exports a class with the module's own name, would hand
# that class to reload() instead of the module).

# Data preprocessing methods
import DataHandler
if sys.version_info > (3, 0):
    importlib.reload(DataHandler)
from DataHandler import *

# Input object classes
import SessionData
if sys.version_info > (3, 0):
    importlib.reload(SessionData)
from SessionData import *

import ItemData
if sys.version_info > (3, 0):
    importlib.reload(ItemData)
from ItemData import *

In [None]:
# Set to True to load the small sample files instead of the 100k files
use_sample = False
# Set to True to keep only the first `subset_size` training rows
use_subset = False
subset_size = 10000

# All data files live in ../data relative to the notebook's working directory.
# (Named `data_dir` rather than `dir` so the builtin dir() is not shadowed.)
data_dir = os.path.join(os.getcwd(), '..', 'data')

filename_sample_clicks = os.path.join(data_dir, 'yoochoose-clicks-sample.dat')
filename_sample_buys = os.path.join(data_dir, 'yoochoose-buys-sample.dat')

filename_clicks = os.path.join(data_dir, 'yoochoose-clicks_100k.dat')
filename_buys = os.path.join(data_dir, 'yoochoose-buys_100k.dat')

filename_test = os.path.join(data_dir, 'yoochoose-test.dat')

if use_sample:
    print ('Working with test dataset')

    # Unpacked the same way as in the real-dataset branch below: the later
    # cells need `all_buys` regardless of which files were loaded.
    # NOTE(review): assumes create_dataset_buys always returns four values,
    # as the real-dataset call does -- confirm against DataHandler.
    all_buys, train_buys, valid_buys, test_buys = create_dataset_buys(filename_sample_buys)
    train_clicks, valid_clicks, test_clicks = create_dataset_clicks(filename_sample_clicks)

else:
    print ('Working with real dataset')

    all_buys, train_buys, valid_buys, test_buys = create_dataset_buys(filename_buys)
    train_clicks, valid_clicks, test_clicks = create_dataset_clicks(filename_clicks)

if use_subset:
    print ('Working with subset')

    # Only the training rows are truncated; validation/test rows and
    # `all_buys` are left at full size.
    train_buys = train_buys[:subset_size]
    train_clicks = train_clicks[:subset_size]

In [None]:
print("-------  Creating input  ------- \n")

# Disabled: object-based construction of the training input
# (clicks -> Session -> SessionObject -> vectors/labels). It is kept for
# reference only; train_vectors and train_labels are built from
# DataHandler.structure_raw_data in the following cell instead.
# train_sessions = Session.create_sessions_from_clicks(train_clicks)
# print('Created Sessions')
# train_session_objects = SessionObject.create_session_object_list(train_sessions)
# print('Created Session Objects')
# train_vectors = SessionObject.create_input_vectors(train_session_objects, info=False)
# print('Created Input Vectors')
# train_labels = create_labels(all_buys, train_session_objects, info=False)
# train_labels = reformat(train_labels)

In [None]:
# Group the raw click/buy rows into per-session and per-item dictionaries.
session_dict, item_dict = DataHandler.structure_raw_data(train_clicks, all_buys)
print('Created [SessionData dictionary, ItemData dictionary] from dataset rows.')

# Third return value is not needed here.
input_vectors, output_vectors, _ = SessionData.SessionData.create_input_output_vectors(session_dict, info=True)
print('Finished creating [input vectors, output vectors].')

# Balancing dataset because of incredible ratio of buy sessions and non-buy sessions
# Final ratio - 1:1
train_vectors, train_labels = oversample_dataset(input_vectors, output_vectors, info=True)

batch_size = 128

# Number of *full* batches. Floor division keeps this an int on Python 3
# (plain `/` yields a float there, which range() rejects). Any trailing
# samples that do not fill a whole batch are left out, which matches the
# fixed (batch_size, ...) shape of the training placeholders.
num_batches = len(train_vectors) // batch_size

# Create training batches of vectors and labels
train_batches_vectors = [
    train_vectors[batch_size * index:batch_size * (index + 1)]
    for index in range(num_batches)
]
train_batches_labels = [
    train_labels[batch_size * index:batch_size * (index + 1)]
    for index in range(num_batches)
]

print('Created train dataset\n')

In [None]:
# Validation input: click rows -> sessions -> session objects -> input vectors.
valid_sessions = Session.create_sessions_from_clicks(valid_clicks)
valid_session_objects = SessionObject.create_session_object_list(valid_sessions)
valid_vectors = SessionObject.create_input_vectors(
    valid_session_objects,
    info=False,
)

# Labels are created from the buy rows for the same session objects and
# immediately passed through reformat().
valid_labels = reformat(
    create_labels(all_buys, valid_session_objects, info=False)
)

print("Created validation dataset\n")

In [None]:
# Test input: click rows -> sessions -> session objects -> input vectors.
test_sessions = Session.create_sessions_from_clicks(test_clicks)
test_session_objects = SessionObject.create_session_object_list(test_sessions)
test_vectors = SessionObject.create_input_vectors(
    test_session_objects,
    info=False,
)

# Labels are created from the buy rows for the same session objects and
# immediately passed through reformat().
test_labels = reformat(
    create_labels(all_buys, test_session_objects, info=False)
)

print("Created test dataset\n")

print("--------  Dimensions  -------- \n")

# One report line per dataset: number of rows followed by the length of the
# first row. Label lines end with a newline so each vectors/labels pair is
# separated by a blank line.
for line_format, rows in (
    (' Train dataset:        %d %d   ', train_vectors),
    (' Train labels:         %d %d \n', train_labels),
    (' Validation dataset:   %d %d   ', valid_vectors),
    (' Validation labels:    %d %d \n', valid_labels),
    (' Test dataset:         %d %d   ', test_vectors),
    (' Test labels:          %d %d \n', test_labels),
):
    print(line_format % (len(rows), len(rows[0])))

In [None]:
# Computation graph: a network with one ReLU hidden layer and a softmax output,
# trained with Adam on mini-batches fed through placeholders.
graph = tf.Graph()
with graph.as_default():
    # Hyper-parameters.
    num_of_labels = 2
    learning_rate = 0.001
    hidden_size = 128
    num_of_features = len(train_vectors[0])

    # Input data. For the training data, use a placeholder that will be fed at run time with a training minibatch
    tf_train_vectors = tf.placeholder(tf.float32, shape=(batch_size, num_of_features))
    # Labels are declared float32 so they have the same dtype as the logits,
    # which softmax_cross_entropy_with_logits expects; integer label arrays
    # fed at run time are converted to the placeholder dtype.
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, len(train_labels[0])))
    # Validation and test inputs are embedded in the graph as constants.
    tf_valid_vectors = tf.constant(valid_vectors)
    tf_test_vectors = tf.constant(test_vectors)

    # Hidden layer parameters.
    weights_h1 = tf.Variable(tf.truncated_normal([num_of_features, hidden_size]))
    biases_h1 = tf.Variable(tf.zeros([hidden_size]))

    # Output layer parameters.
    weights = tf.Variable(tf.truncated_normal([hidden_size, num_of_labels]))
    biases = tf.Variable(tf.zeros([num_of_labels]))

    def hidden_layer(vectors):
        """Apply the ReLU hidden layer to a batch of input vectors."""
        return tf.nn.relu(tf.matmul(vectors, weights_h1) + biases_h1)

    def output_layer(hidden):
        """Map hidden activations to unnormalised class scores (logits)."""
        return tf.matmul(hidden, weights) + biases

    # Training computation
    h1 = hidden_layer(tf_train_vectors)
    logits = output_layer(h1)
    # Keyword arguments are used because TensorFlow >= 1.0 rejects positional
    # arguments for this op; the keyword form is also accepted by the older
    # (logits, labels) signature.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

    # Optimizer.
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    # Predictions for the training, validation, and test data. The validation
    # and test predictions share the trained variables with the training path.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(output_layer(hidden_layer(tf_valid_vectors)))
    test_prediction = tf.nn.softmax(output_layer(hidden_layer(tf_test_vectors)))

Execute the session and get the results for the train and test datasets

Each batch should (in theory):
	Increase the accuracy
	Decrease the loss

After all batches have finished, the test dataset is run through the network

In [None]:
# Early-stopping settings: training ends once the validation accuracy has
# failed to improve by at least `min_improvement` for `patience` consecutive
# passes over the training batches.
patience = 3
# Compared against the value returned by accuracy().
# NOTE(review): presumably percentage points, since results are printed with
# a trailing '%' -- confirm against DataHandler.accuracy.
min_improvement = 1.0

with tf.Session(graph=graph) as session:
    # tf.initialize_all_variables is the deprecated name of
    # tf.global_variables_initializer; prefer the newer one when this
    # TensorFlow version provides it.
    if hasattr(tf, 'global_variables_initializer'):
        tf.global_variables_initializer().run()
    else:
        tf.initialize_all_variables().run()
    print("Initialized")

    last_validation_accuracy = 0.0

    # Number of consecutive passes without sufficient improvement.
    stop_training = 0
    print(len(train_batches_vectors))

    while stop_training < patience:
        for batch_vectors, batch_labels in zip(train_batches_vectors, train_batches_labels):
            # Feed the dictionary and start training
            feed_dict = {tf_train_vectors: batch_vectors, tf_train_labels: batch_labels}

            # Only the training op is run: the batch loss and predictions were
            # fetched but never used. Add `loss` / `train_prediction` to the
            # fetch list again if per-batch diagnostics are needed.
            session.run(optimizer, feed_dict=feed_dict)

            # Validation accuracy is reported after every batch; this
            # re-evaluates the whole validation set each time.
            print("Validate accuracy: %.1f%%\n" %
                  accuracy(valid_prediction.eval(), valid_labels, precision_print=False))

        # Validation accuracy after a full pass over the training batches.
        validation_accuracy = accuracy(
            valid_prediction.eval(), valid_labels, precision_print=True)

        # Count the difference of validation set accuracy
        diff = validation_accuracy - last_validation_accuracy
        if diff < min_improvement:
            stop_training += 1
            print('Accuracy improvement only %.3f, stop training increased to %d' % (diff, stop_training))
        else:
            print('Stop training reset')
            stop_training = 0

        last_validation_accuracy = validation_accuracy

    # Final evaluation on the test set with the weights from the last pass.
    print("Test accuracy: %.1f%%" %
          accuracy(test_prediction.eval(), test_labels, precision_print=True))