In [2]:
import sys
sys.path.append("../src")
from dataLoad import dataLoad, fe_trainLoad

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
import gc
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split


In [3]:
path= "../dataset/"
item_names_table, train_user_seq_log, test_user_label, test_user_seq_log = dataLoad(path)
fe_train_user_seq_log = fe_trainLoad(path)

In [4]:
COLUMNS = [
    "user",
    "item_id",
    "day_of_week",
    "days",
    "hour",
    "weeks",
    # "cumcount",
    # "click_count_normalized",
    # "user_click_count_normalized"
]
N_SAMPLES = 500000

transactions = fe_train_user_seq_log.iloc[:N_SAMPLES]#.merge(item_names_table, on = ("item_id"))[COLUMNS]

In [5]:
transactions.item_id.nunique()

37959

In [6]:
ids = np.arange(transactions.item_id.nunique())
# shuffling to prevent any correlation between new labels and outcome
np.random.shuffle(ids)

In [7]:
# mapping article id to integer encoding 
map_item_id = {i:j for i, j in zip(transactions.item_id.unique(), ids)}

In [8]:
transactions["item_map"] = transactions["item_id"].map(map_item_id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions['item_map'] = transactions['item_id'].map(map_item_id)


In [10]:
transactions.head()

Unnamed: 0,user,item_id,timestamp,day_of_week,days,hour,weeks,cumcount,click_count,click_count_normalized,user_click_count,user_click_count_normalized,item_map
0,0,805696,2021-02-11 13:03:42,3,0,13,1,1,3435,0.09966,1714,0.053422,4702
1,0,386903,2021-02-11 13:03:52,3,0,13,1,1,561,0.016252,1714,0.053422,22602
2,0,386903,2021-02-12 13:41:36,4,1,13,1,2,561,0.016252,1714,0.053422,22602
3,0,3832,2021-02-11 13:04:07,3,0,13,1,1,15941,0.462606,1714,0.053422,637
4,0,3832,2021-02-12 08:33:29,4,1,8,1,2,15941,0.462606,1714,0.053422,637


In [11]:
transactions_train = transactions.groupby("user").agg({"item_map":lambda x: list(x)})

In [12]:
transactions_train.head()

Unnamed: 0_level_0,item_map
user,Unnamed: 1_level_1
0,"[4702, 22602, 22602, 637, 637, 637, 637, 637, ..."
25,"[4702, 637, 33418, 16416, 16416, 23023, 23023,..."
29,"[4702, 22602, 16416, 33706, 26961, 26961, 3427..."
43,"[4702, 15908, 33706, 33706, 26961, 32769, 7111..."
54,"[4702, 637, 637, 637, 33418, 15908, 15908, 159..."


In [13]:
# Padding to make all lists of same size 
length = max(map(len, transactions_train.item_map))
X = np.asarray([[0]*(length-len(xi)) + xi for xi in transactions_train.item_map]).astype("int32")
X

array([[    0,     0,     0, ..., 21345, 35679, 13956],
       [    0,     0,     0, ...,  2992, 29781, 37039],
       [    0,     0,     0, ..., 20547, 20547, 20547],
       ...,
       [    0,     0,     0, ...,  6717, 24520, 15676],
       [    0,     0,     0, ..., 27775, 21775, 29049],
       [    0,     0,     0, ..., 10178, 33259, 18530]])

In [17]:
X_train, X_val, y_label_train, y_val_train = train_test_split(X[:,:-1], X[:, -1], test_size=0.2, shuffle=True, random_state=42)
y_train = tf.keras.utils.to_categorical(y_label_train, num_classes=transactions.item_id.nunique())
# onehot encoding on labels corresponding to num_classes
y_val = tf.keras.utils.to_categorical(y_val_train, num_classes=transactions.item_id.nunique())
X_train.shape

(265, 19833)

In [18]:
gc.collect()

164

In [19]:
# BASIC Bidirectional-LSTM Model Architecture

model = tf.keras.Sequential()
# Adding an Embbeding Layer
model.add(layers.Embedding(transactions.item_id.nunique(), 20, input_length=X_train.shape[1]))
model.add(layers.Bidirectional(layers.LSTM(64, dropout=0.2)))
model.add(layers.Dense(transactions.item_id.nunique(), activation="softmax"))

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 19833, 20)         759180    
                                                                 
 bidirectional (Bidirectiona  (None, 128)              43520     
 l)                                                              
                                                                 
 dense (Dense)               (None, 37959)             4896711   
                                                                 
Total params: 5,699,411
Trainable params: 5,699,411
Non-trainable params: 0
_________________________________________________________________


In [21]:
from tensorflow.keras.optimizers import RMSprop, Adam

optimizer = Adam(learning_rate=0.01)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [22]:
history = model.fit(X_train, y_train, epochs=3, validation_data=(X_val, y_val), verbose=2, batch_size=64)
model.save("./model1.h5")

Epoch 1/3


In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))
ax1.plot(history.history["accuracy"])
ax2.plot(history.history["loss"], color="Orange")
# plt.plot(history.history["val_acc"])
# plt.title("model accuracy")
# plt.ylabel("accuracy")
ax2.set_xlabel("epoch")
ax1.set_ylabel("accuracy")
ax2.set_ylabel("loss")
ax1.set_title("Train Accuracy/Epoch")
ax2.set_title("Train Loss/Epoch")


plt.show()