In [0]:
import pandas as pd
import itertools
import collections
import numpy as np
import pandas as pd
from typing import List
import sys

from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
from keras import Sequential
import tensorflow as tf

In [0]:
def loaddata(path):
    sessions_df = pd.read_csv(path, sep=",", header=None)
    print(sessions_df.columns)
    sessions_df.columns = ["session_id", "timestamp", "item_id", "category"]
    sessions_df["timestamp"] = pd.to_datetime(sessions_df["timestamp"])
    return sessions_df

In [4]:
url = "https://www.dropbox.com/s/urf0v28umc7afg2/yoochoose-clicks-sample.dat?dl=1"
sessions_df = loaddata(url)

Int64Index([0, 1, 2, 3], dtype='int64')


In [5]:
print("Shape of the dataframe ",sessions_df.shape,"\n",sessions_df.head())

Shape of the dataframe  (5000000, 4) 
    session_id                        timestamp    item_id  category
0           1 2014-04-07 10:51:09.277000+00:00  214536502         0
1           1 2014-04-07 10:54:09.868000+00:00  214536500         0
2           1 2014-04-07 10:54:46.998000+00:00  214536506         0
3           1 2014-04-07 10:57:00.306000+00:00  214577561         0
4           2 2014-04-07 13:56:37.614000+00:00  214662742         0


In [0]:
def build_sessions(sessions_df,max_products=1000,min_session_size=3):
    print("Session Dataframe length before grouping", len(sessions_df))
    
    all_items = sessions_df["item_id"].values
    items_counter = collections.Counter(all_items)
    most_common_items = dict(items_counter.most_common(max_products))
    ids_to_indices = dict((item_id, i+1) for i, item_id in enumerate(most_common_items.keys()))
    
    session_dictionary = sessions_df.to_dict(orient='records')
    grouped_sessions = itertools.groupby(session_dictionary, lambda d: d["session_id"])
    sessions = []
    
    for _, session in grouped_sessions:
        item_list = [d["item_id"] for d in sorted(list(session), key=lambda x: x["timestamp"])]
        item_list = [ids_to_indices[item] for item in item_list if item in ids_to_indices]
        if len(item_list) >= min_session_size:
            sessions.append(item_list)
    
    print("Sessions count ", len(sessions))
    
    return sessions, most_common_items

In [17]:
sessions, most_common_items = build_sessions(sessions_df)

Session Dataframe length before grouping 5000000
Sessions count  447161


In [41]:
max_session_length = 100
padded_sessions = pad_sequences(sessions,maxlen=max_session_length, padding='post', truncating='pre', value=0)
print("Shape of padded session ",padded_sessions.shape)
padded_sessions = np.array(padded_sessions)

Shape of padded session  (447161, 100)


In [43]:
vocab_size = len(most_common_items) + 1
embedding_size = 20
input_length = max_session_length - 1

model = Sequential()
model.add(Masking(mask_value=0, input_shape=(input_length, )))
model.add(Embedding(vocab_size, embedding_size, input_length=input_length, mask_zero=True))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(200, return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_3 (Masking)          (None, 99)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 99, 20)            20020     
_________________________________________________________________
lstm_3 (LSTM)                (None, 99, 100)           48400     
_________________________________________________________________
lstm_4 (LSTM)                (None, 99, 200)           240800    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 99, 1001)          201201    
Total params: 510,421
Trainable params: 510,421
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
def categorical_accuracy_sequential(y_true, y_pred):
    y_true = tf.squeeze(y_true)
    padding_mask = tf.greater(y_true, 0)
    
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, tf.float32)

    match = tf.cast(tf.equal(y_true, y_pred), tf.float32)

    match_masked = match * tf.cast(padding_mask, tf.float32)
    return tf.reduce_sum(match_masked) / tf.reduce_sum(tf.cast(padding_mask, tf.float32))

In [0]:
X = padded_sessions[:, :-1]
y = np.expand_dims(padded_sessions[:, 1:], -1)

In [46]:
model.compile("adam", loss="sparse_categorical_crossentropy", metrics=[categorical_accuracy_sequential])
model.fit(x=X, y=y, validation_split=0.1, batch_size=32)

Train on 402444 samples, validate on 44717 samples
Epoch 1/1


<keras.callbacks.History at 0x7f755abd0b38>