In [None]:
# Mount Google Drive (Colab only) so the '/content/drive/...' paths used by the cells below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preprocessing

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the menu log. The first CSV column is read as the index and then discarded
# by reset_index(drop=True), leaving a fresh 0..n-1 positional index.
data = pd.read_csv('/content/drive/Shareddrives/DSL_Modeling_B/data/menu_final.csv', index_col=0).reset_index(drop=True)
data.head()

Unnamed: 0,SessionID,Menu,MenuID,timestamp
0,0,찐빵,0.0,1
1,0,오징어찌개,1.0,2
2,0,육개장,2.0,3
3,0,단호박샌드,3.0,4
4,0,김치찌개,4.0,5


In [None]:
# Drop the MenuID column in place; none of the cells below read it.
# NOTE: this cell is not idempotent - running it twice raises KeyError.
del data['MenuID']
data.head()

Unnamed: 0,SessionID,Menu,timestamp
0,0,찐빵,1
1,0,오징어찌개,2
2,0,육개장,3
3,0,단호박샌드,4
4,0,김치찌개,5


In [None]:
# Handle values that cause errors: overwrite column 1 (the menu name) at two fixed row positions.
# NOTE(review): the row positions are hard-coded for this particular CSV - confirm they still
# point at the intended rows if the input file changes.
data.iloc[135569, 1] = '수제비'
data.iloc[135809, 1] = '계란국'

In [None]:
# Rename columns to 'session' / 'item', the names the SessionDataset class reads.
data.rename(columns={'SessionID':'session', 'Menu':'item'}, inplace=True)
data.head()

Unnamed: 0,session,item,timestamp
0,0,찐빵,1
1,0,오징어찌개,2
2,0,육개장,3
3,0,단호박샌드,4
4,0,김치찌개,5


In [None]:
# Per-session history: for every session, the list of its items in row order.
history = data.groupby('session').item.apply(list)
history.head()

session
0    [찐빵, 오징어찌개, 육개장, 단호박샌드, 김치찌개, 어묵국, 베이글, 팽이장국, ...
1    [인절미토스트, 유부장국, 순두부찌개, 씨크립샌드, 대구찌개, 설렁탕, 팬케익, 쇠...
2    [사과파이, 부대찌개, 오징어국, 브라우니, 콩나물국, 대구찌개, 고구마샌드, 차돌...
3    [크로와상, 시금치국, 닭곰탕, 마늘빵, 꽃게탕, 수제비국, 씨크립샌드, 쇠고기샤브...
4    [야채샌드, 버섯들깨탕, 쇠고기미역국, 팬케익, 순두부찌개, 된장찌개, 단호박샌드,...
Name: item, dtype: object

# Model

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd

## Dataset

In [None]:
class SessionDataset:
    """Session-ordered view of an interaction log.

    The input frame must provide the columns ``session``, ``timestamp`` and ``item``.
    Rows are sorted by (session, timestamp) so that every session occupies one
    contiguous range of rows; ``offsets`` stores the boundaries of those ranges.

    Attributes:
        df:         the sorted frame with a fresh 0..n-1 index.
        offsets:    array of length ``n_sessions + 1``; session ``i`` spans rows
                    ``offsets[i]:offsets[i + 1]`` of ``df``.
        n_sessions: number of distinct sessions.
        item_to_id: mapping item -> integer id, in order of first appearance in ``df``.
        n_items:    number of distinct items (= one-hot depth).
    """

    def __init__(self, df):
        self.df = df.sort_values(by = ['session', 'timestamp']).reset_index(drop = True)

        # Cumulative session sizes give the row index at which each following session starts.
        session_ends = self.df.groupby('session').size().cumsum().values
        self.offsets = np.concatenate((np.zeros(1, dtype = np.int32), session_ends))
        self.n_sessions = len(self.offsets) - 1

        self.item_to_id = {item : i for i, item in enumerate(self.df['item'].unique())}
        self.n_items = len(self.item_to_id)

    def item_to_one_hot(self, item):
        """Return the one-hot tensor (depth ``n_items``) of ``item``.

        Raises:
            KeyError: if ``item`` does not occur in the frame this dataset was built from.
        """
        return tf.one_hot(self.item_to_id[item], depth = self.n_items)

    def extract_session(self, i, one_hot_encoded = True):
        """Return the items of the ``i``-th session in timestamp order as a new list.

        Args:
            i: positional session index in ``[0, n_sessions)``.
            one_hot_encoded: if True every item is converted with ``item_to_one_hot``;
                otherwise the raw item values are returned.

        Raises:
            IndexError: if ``i`` is outside ``[0, n_sessions)``. Negative indices are
                rejected explicitly because the offset lookup would otherwise wrap
                around and silently produce an empty or wrong slice.
        """
        if not 0 <= i < self.n_sessions:
            raise IndexError("session index {} out of range [0, {})".format(i, self.n_sessions))

        # Read the item column directly: no DataFrame copy, no tensors stored in a frame.
        items = self.df['item'].values[self.offsets[i]:self.offsets[i + 1]].tolist()
        if one_hot_encoded:
            return [self.item_to_one_hot(item) for item in items]
        return items

## Loss functions: TOP1 and BPR

In [None]:
# y_true = (BATCH_SIZE, n_classes)   one-hot representations of the target items (ground truths)
# y_pred = (BATCH_SIZE, n_classes)   model output = next item scores (logits) for each item in the batch

sampling = True

if sampling: # = the negative items considered in the loss computation are those within the same batch

    def BPR(y_true, y_pred):
        """Bayesian Personalized Ranking loss with in-batch negatives.

        ``scores[a, b]`` is the score that sample ``b`` gives to the target item of
        sample ``a``; the positive scores therefore sit on the diagonal. Returns the
        mean of ``-log(sigmoid(positive - score))`` over the whole (batch, batch) matrix.
        """
        to_lookup = tf.argmax(y_true, axis = 1)   # = indices of the target items
        scores = tf.nn.embedding_lookup(tf.transpose(y_pred), to_lookup)  # row a = scores of target item a for every sample
        # log_sigmoid is the numerically stable form of log(sigmoid(x)): it does not
        # produce -inf when the score difference is strongly negative.
        return tf.reduce_mean(-tf.math.log_sigmoid(tf.linalg.diag_part(scores) - scores))

    def TOP1(y_true, y_pred):
        """TOP1 ranking loss with in-batch negatives.

        Per sample: mean over the batch items of ``sigmoid(score - positive)`` plus the
        regulariser ``sigmoid(score^2)``. The regulariser term of the positive item
        itself is subtracted again. Returns the mean over the samples.
        """
        to_lookup = tf.argmax(y_true, axis = 1)
        scores = tf.nn.embedding_lookup(tf.transpose(y_pred), to_lookup)
        diag_scores = tf.linalg.diag_part(scores)
        loss_by_sample  = tf.reduce_mean(tf.nn.sigmoid(scores - diag_scores) + tf.nn.sigmoid(tf.square(scores)), axis = 0)
        # only sigmoids of squares of negative items had to be added: remove those of positive items
        # (reduce_sum of ones = batch size, matching the mean taken above)
        loss_by_sample -= tf.nn.sigmoid(tf.square(diag_scores)) / tf.reduce_sum(tf.ones_like(diag_scores))
        return tf.reduce_mean(loss_by_sample)

else: # = consider all negative items in the loss computation (only makes sense if the number of items is small, like the same order as the batch size)

    def BPR(y_true, y_pred):  # both inputs have shape (BATCH_SIZE, n_classes)
        """Bayesian Personalized Ranking loss against every item of the catalogue.

        Returns the sum over samples and items of ``-log(sigmoid(positive - score))``.
        """
        # Score of the target item of each sample, kept as a column for broadcasting.
        # Same value the former (BATCH_SIZE, n_classes, n_classes) matmul produced, in O(n_classes) memory.
        positive = tf.reduce_sum(y_true * y_pred, axis = -1, keepdims = True)  # (BATCH_SIZE, 1)
        score_diffs = positive - y_pred                                        # (BATCH_SIZE, n_classes)
        return -tf.reduce_sum(tf.math.log_sigmoid(score_diffs))

    def TOP1(y_true, y_pred):
        """TOP1 ranking loss against every item of the catalogue.

        Per sample: sum over all items of ``sigmoid(score^2)`` and of
        ``sigmoid(score - positive)``, minus ``sigmoid(positive^2)``. Returns the sum
        over the samples.
        """
        positive = tf.reduce_sum(y_true * y_pred, axis = -1, keepdims = True)  # (BATCH_SIZE, 1)
        score_diffs = positive - y_pred                                        # (BATCH_SIZE, n_classes)
        loss_by_sample = tf.reduce_sum(tf.nn.sigmoid(tf.square(y_pred)), axis = -1) + \
                         tf.reduce_sum(tf.nn.sigmoid(-score_diffs), axis = -1) - \
                         tf.nn.sigmoid(tf.square(tf.squeeze(positive, -1)))
        return tf.reduce_sum(loss_by_sample)

## Model: GRU4Rec

In [None]:
class Gru4Rec:
    """Session-based next-item recommender: stacked stateful GRU layers followed by a
    dense layer that outputs one logit per item.

    The network is fed one one-hot encoded item per batch slot and step. Because the GRU
    layers are stateful, each batch slot carries the hidden state of "its" session from
    one step to the next; the state of a slot is zeroed whenever a new session is
    assigned to it.
    """

    def __init__(self, n_classes, n_layers = 1, n_hidden = 64, loss = TOP1, batch_size = 10):
        """
        Args:
            n_classes:  number of items (input one-hot depth and number of output logits).
            n_layers:   number of stacked GRU layers.
            n_hidden:   dimension of each GRU layer's hidden state.
            loss:       callable ``loss(y_true, y_pred)`` used to compile the model.
            batch_size: fixed batch size of the stateful model.
        """
        self.n_classes  = n_classes   # = number of items

        self.n_layers = n_layers  # number of stacked GRU layers
        self.n_hidden = n_hidden  # dimension of GRU cell's hidden state
        self.loss     = loss
        self.batch_size = batch_size

        self.model = self.build_model()

    def build_model(self):
        """Build, compile and return the Keras model; prints its summary."""
        model = tf.keras.models.Sequential()
        for i in range(self.n_layers):
            model.add(tf.keras.layers.GRU(name = 'GRU_{}'.format(i+1),
                                          units      = self.n_hidden,
                                          activation = 'relu',
                                          stateful   = True,
                                          return_sequences = (i < self.n_layers - 1)))  # only the last GRU collapses the time axis
        model.add(tf.keras.layers.Dense(units = self.n_classes, activation = 'linear'))   # class logits

        # track top 3 accuracy (= how often the true item is among the top 3 recommended)
        def top3accuracy(y_true, y_pred):
            return tf.keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k = 3)

        model.compile(loss = self.loss, optimizer = 'adam', metrics = ['accuracy', top3accuracy])

        model.build(input_shape = (self.batch_size, 1, self.n_classes))
        model.summary()   # summary() prints by itself; wrapping it in print() only adds a stray "None"

        return model

    def _reset_hidden(self, i):
        """Zero the hidden state of batch slot ``i`` in every GRU layer.

        Layers whose state variable does not exist yet are skipped.
        """
        for layer in self.model.layers:   # session has changed: reset corresponding hidden state
            if self._is_GRU_layer(layer) and layer.states[0] is not None:
                hidden_updated = layer.states[0].numpy()
                hidden_updated[i, :] = 0.
                layer.reset_states(hidden_updated)

    def _is_GRU_layer(self, layer):
        """Return True if ``layer`` is one of the GRU layers created by ``build_model`` (name starts with 'GRU_')."""
        return layer.name.startswith('GRU_')

    def train_batch_generator(self, dataset):  # session | item | timestamp
        """Endlessly yield training batches ``(X, y)`` = (session item, next session item).

        ``X`` has shape (batch_size, 1, n_classes) and ``y`` has shape
        (batch_size, n_classes), both float32 tensors. Every batch slot walks through
        one session at a time. When a slot runs out of items it is given the next
        session and its hidden state is reset. Once all sessions have been handed out
        the session order is shuffled and iteration restarts.

        Args:
            dataset: object exposing ``n_sessions``, ``offsets`` and
                ``extract_session(i)`` (e.g. a SessionDataset).
        """
        assert dataset.n_sessions > self.batch_size, "Training set is too small. Reduce batch size or collect more training data"
        # Without at least one session of length >= 2 the refill loop below could never terminate.
        assert np.any(np.diff(dataset.offsets) >= 2), "No session has at least two items: nothing to train on"
        ixs = np.arange(dataset.n_sessions)

        # One independent list per slot ([[]] * n would alias a single list object).
        # Each holds a REVERSED (piece of a) session and is emptied progressively.
        stacks = [[] for _ in range(self.batch_size)]
        next_session_id = 0

        X = np.empty(shape = (self.batch_size, 1, self.n_classes))
        y = np.empty(shape = (self.batch_size, self.n_classes))
        while True:
            X[:], y[:] = None, None   # NaN-fill; every row is overwritten below
            for i in range(self.batch_size): # fill in X, y (current batch)
                # 1. If stack i is empty (only happens at first round) or has only one element: fill it with a new session
                if len(stacks[i]) <= 1:
                    while len(stacks[i]) < 2:   # ignore sessions with only one element (cannot contribute to the training)
                        # The wrap-around check must run before EVERY lookup: skipping short
                        # sessions can push next_session_id past the end inside this loop.
                        if next_session_id >= dataset.n_sessions: # no more sessions available: shuffle sessions and restart
                            np.random.shuffle(ixs)
                            next_session_id = 0
                        stacks[i] = dataset.extract_session(ixs[next_session_id])[::-1]  # the data does not have to be all in memory at the same time: we could e.g. load a session at once
                        next_session_id += 1
                    self._reset_hidden(i)   # if session changes, the corresponding hidden state must be reset
                # 2. Stack i is now valid: set input + target variables
                X[i, 0] = stacks[i].pop()
                y[i]    = stacks[i][-1]

            yield tf.constant(X, dtype = tf.float32), tf.constant(y, dtype = tf.float32)

    def fit(self, dataset, steps_per_epoch = 10000, epochs = 50):
        """Train the model on batches from ``train_batch_generator``.

        A checkpoint file ``gru-chkpt-<epoch>.hdf5`` is written after every epoch.

        Args:
            dataset:         training data, see ``train_batch_generator``.
            steps_per_epoch: number of generated batches that make up one epoch.
            epochs:          number of epochs.
        """
        checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath = "gru-chkpt-{epoch:02d}.hdf5")
        # Model.fit accepts generators directly; fit_generator is deprecated.
        self.model.fit(self.train_batch_generator(dataset),
                       steps_per_epoch = steps_per_epoch,
                       epochs          = epochs,
                       callbacks       = [checkpoint],
                       shuffle         = False)

## Test

In [None]:
df = data.sort_values(by = ['session', 'timestamp']).reset_index(drop = True)

# A session needs at least two items to provide both a test input (penultimate item) and a
# test target (last item). Shorter sessions are dropped: otherwise `offsets[1:] - 2` would
# point into the PREVIOUS session and X_test / y_test would no longer line up with the
# sessions of the training set.
items_in_session = df.groupby('session')['item'].transform('size')
df = df[items_in_session >= 2].reset_index(drop = True)

# offsets[k] = row at which session k starts; offsets[k + 1] - 1 = row of its last item
offsets = np.concatenate((np.zeros(1, dtype = np.int32), df.groupby('session').size().cumsum().values))
last_positions = offsets[1:] - 1

dataset_train = SessionDataset(df.iloc[~df.index.isin(last_positions)])  # training set: remove last element from each session

# Test set: x = penultimate item in each session, y = last item in each session
X_test = df.iloc[last_positions - 1][['session', 'item']].sort_values(by = ['session']).reset_index(drop = True)
y_test = df.iloc[last_positions][['session', 'item']].sort_values(by = ['session']).reset_index(drop = True)

print("X_test")
print(X_test.head())
print('')
print("y_test")
print(y_test.head())

X_test
   session    item
0        0   냉이된장국
1        1   근대된장국
2        2  사골우거지국
3        3  매운콩나물국
4        4  냉이된장찌개

y_test
   session   item
0        0    맑은국
1        1     우동
2        2  양송이스프
3        3   잔치국수
4        4  들깨미역국


In [None]:
# Build the network with one output logit per training item, then train it with the
# defaults of Gru4Rec.fit (10000 steps per epoch, 50 epochs).
g4r = Gru4Rec(n_classes = dataset_train.n_items)
g4r.fit(dataset_train)



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 GRU_1 (GRU)                 (10, 64)                  472896    
                                                                 
 dense_1 (Dense)             (10, 2397)                155805    
                                                                 
Total params: 628,701
Trainable params: 628,701
Non-trainable params: 0
_________________________________________________________________




None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## Model save

In [None]:
# Persist the trained weights only; the architecture is rebuilt from code before loading.
g4r.model.save_weights("/content/drive/Shareddrives/DSL_Modeling_B/model/Session-based/GRU4Rec_save/GRU4Rec_top1_weights.h5")

## Model load

In [None]:
# Re-create the architecture with the same number of classes so the saved weights fit into it.
g4r = Gru4Rec(n_classes = dataset_train.n_items)

In [None]:
# Restore the trained weights from the file written in the "Model save" section.
g4r.model.load_weights("/content/drive/Shareddrives/DSL_Modeling_B/model/Session-based/GRU4Rec_save/GRU4Rec_top1_weights.h5")

## Evaluation

In [None]:
# Replay every training session through the stateful model and record, per session, the GRU
# hidden state reached once all but the last item of the session have been fed. The last
# remaining item of a session is never fed in this cell. The result is saved to disk as one
# (n_layers, n_hidden) block per session.
final_states = np.empty(shape = (dataset_train.n_sessions, g4r.n_layers, g4r.n_hidden)) # final states will be stored here
final_states[:] = None   # NaN-fill so sessions that were never written are detectable
done = [False] * dataset_train.n_sessions   # keep track of the sessions for which the last state has already been calculated

# Batch slot i starts with session i; stacks are reversed so that pop() yields items in time order.
stacks = [dataset_train.extract_session(i)[::-1] for i in range(g4r.batch_size)]
next_session_id = g4r.batch_size
batch_idx_to_session = np.arange(g4r.batch_size)   # keep track of which session is in each batch element
X = np.empty(shape = (g4r.batch_size, 1, g4r.n_classes))

g4r.model.reset_states()    # all hidden states set to 0 (starting point)

n_done = 0
while n_done < dataset_train.n_sessions:
    for i in range(g4r.batch_size):
        # A `while` (not `if`): a freshly loaded session may itself consist of a single item,
        # in which case its final state is the just-reset (all-zero) state.
        while len(stacks[i]) == 1:  # stack i is at the end
            if not done[batch_idx_to_session[i]]:
                # save final hidden state
                final_states[batch_idx_to_session[i], :] = np.array([layer.states[0][i, :] for layer in g4r.model.layers if g4r._is_GRU_layer(layer)])
                done[batch_idx_to_session[i]] = True
                n_done += 1
                if n_done % 100 == 0:
                    print("Progress: {} / {}".format(n_done, dataset_train.n_sessions))
            if next_session_id >= dataset_train.n_sessions: # restart from the beginning (just to reach required batch size)
                next_session_id = 0
            stacks[i] = dataset_train.extract_session(next_session_id)[::-1]
            batch_idx_to_session[i] = next_session_id
            next_session_id += 1
            g4r._reset_hidden(i)   # session has changed --> reset corresponding hidden state
        X[i, 0] = stacks[i].pop()

    _ = g4r.model.predict(X)   # hidden states get updated when "predict" is called

print("All final hidden states calculated")
np.save('/content/drive/Shareddrives/DSL_Modeling_B/model/Session-based/GRU4Rec_save/final_states_top1.npy', final_states, allow_pickle = False)

Progress: 100 / 4583
Progress: 200 / 4583
Progress: 300 / 4583
Progress: 400 / 4583
Progress: 500 / 4583
Progress: 600 / 4583
Progress: 700 / 4583
Progress: 800 / 4583
Progress: 900 / 4583
Progress: 1000 / 4583
Progress: 1100 / 4583
Progress: 1200 / 4583
Progress: 1300 / 4583
Progress: 1400 / 4583
Progress: 1500 / 4583
Progress: 1600 / 4583
Progress: 1700 / 4583
Progress: 1800 / 4583
Progress: 1900 / 4583
Progress: 2000 / 4583
Progress: 2100 / 4583
Progress: 2200 / 4583
Progress: 2300 / 4583
Progress: 2400 / 4583
Progress: 2500 / 4583
Progress: 2600 / 4583
Progress: 2700 / 4583
Progress: 2800 / 4583
Progress: 2900 / 4583
Progress: 3000 / 4583
Progress: 3100 / 4583
Progress: 3200 / 4583
Progress: 3300 / 4583
Progress: 3400 / 4583
Progress: 3500 / 4583
Progress: 3600 / 4583
Progress: 3700 / 4583
Progress: 3800 / 4583
Progress: 3900 / 4583
Progress: 4000 / 4583
Progress: 4100 / 4583
Progress: 4200 / 4583
Progress: 4300 / 4583
Progress: 4400 / 4583
Progress: 4500 / 4583
All final hidden st

In [None]:
final_states = np.load('/content/drive/Shareddrives/DSL_Modeling_B/model/Session-based/GRU4Rec_save/final_states_top1.npy')

g4r.model.reset_states()

# The stateful model only accepts full batches. Use ceiling division so the last, partial
# batch is processed too, and fill its unused slots by wrapping around to the first
# sessions. (With floor division the trailing `n_sessions % batch_size` sessions were never
# predicted and their y_pred rows stayed NaN.)
n_eval_sessions = dataset_train.n_sessions
n_batches = -(-n_eval_sessions // g4r.batch_size)
padded_ix = np.arange(n_batches * g4r.batch_size) % n_eval_sessions   # session index behind every batch slot

# Read the inputs without modifying X_test, so re-running this cell gives the same result.
# NOTE(review): assumes row k of X_test belongs to training session k - confirm for new data.
eval_items = X_test['item'].values

# Calculate next item predictions for all sessions
y_pred = np.empty(shape = (n_eval_sessions, g4r.n_classes))
y_pred[:] = None
X = np.empty(shape = (g4r.batch_size, 1, g4r.n_classes))
for batch_id in range(n_batches):
    start = batch_id * g4r.batch_size
    batch_ix = padded_ix[start : start + g4r.batch_size]
    # X contains the penultimate item in the session (= last item in the training set)
    X[:] = None
    for i, item in enumerate(eval_items[batch_ix]):
        X[i, :] = dataset_train.item_to_one_hot(item)
    # set hidden states equal to final hidden states for sessions in the batch
    nlg = 0   # index of the GRU layer inside final_states (non-GRU layers are skipped)
    for layer in g4r.model.layers:
        if g4r._is_GRU_layer(layer):
            layer.reset_states(final_states[batch_ix, nlg, :])
            nlg += 1
    # objective: predict last element in the session; keep only the rows of real (non-padding) sessions
    n_valid = min(start + g4r.batch_size, n_eval_sessions) - start
    y_pred[start : start + n_valid, :] = g4r.model.predict(X)[:n_valid]

y_pred = tf.constant(y_pred[:n_eval_sessions], dtype = tf.float32)

In [None]:
# Retrieve ground truths: one one-hot row per training session, encoding the session's
# held-out last item (y_test) with the item ids of the training set.
# item_to_one_hot raises KeyError for an item that never occurs in the training set.
y_true = np.empty(shape = (dataset_train.n_sessions, dataset_train.n_items))
for i in range(y_true.shape[0]):
    y_true[i, :] = dataset_train.item_to_one_hot(y_test.item.values[i])
y_true = tf.constant(y_true, dtype = tf.float32)

In [None]:
# Share of sessions whose true last item is the best-scored item / among the three best-scored items.
n_evaluated = y_true.shape[0]

hits_at_1 = tf.keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k = 1)
hits_at_3 = tf.keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k = 3)

# Each hit vector holds one 0/1 entry per session, so sum / count is the hit rate.
acc       = (tf.reduce_sum(hits_at_1) / n_evaluated).numpy()
top_3_acc = (tf.reduce_sum(hits_at_3) / n_evaluated).numpy()

print("Accuracy = {}".format(acc))
print("Top-3 accuracy = {}".format(top_3_acc))

Accuracy = 0.1047348901629448
Top-3 accuracy = 0.23019856214523315


## Predict value save

In [None]:
# Convert to NumPy and keep only the first 4580 rows.
# NOTE(review): 4580 looks like the number of sessions covered by complete batches in the
# prediction loop (4583 sessions, batch size 10 -> 458 full batches) - confirm before reusing
# this cell with other data or another batch size.
y_true_np = np.array(y_true)[:4580, :]
y_pred_np = np.array(y_pred)[:4580, :]

In [None]:
y_true_np

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
y_pred_np

array([[-0.18234196,  0.13206947,  0.4707626 , ..., -0.30531332,
         0.46278262, -0.6237427 ],
       [-0.17019618, -0.5519112 ,  0.15971166, ..., -0.25626686,
        -0.12692827, -0.7925914 ],
       [ 0.15443835, -0.3544121 , -0.11244911, ...,  0.08300754,
        -0.10244731, -0.40732247],
       ...,
       [ 0.40965822, -0.0440309 , -0.13181376, ...,  0.4221342 ,
         1.2867311 ,  0.16569565],
       [-0.15870696,  0.6066526 ,  2.7666235 , ...,  1.1260334 ,
        -0.19086054, -0.5577077 ],
       [-0.28598043, -1.2953919 ,  1.126379  , ...,  1.4687626 ,
        -0.13356   ,  0.6174801 ]], dtype=float32)

In [None]:
y_true_np.shape

(4580, 2397)

In [None]:
y_pred_np.shape

(4580, 2397)

In [None]:
# Store ground truths and predicted scores as plain .npy arrays (pickling disabled).
np.save('/content/drive/Shareddrives/DSL_Modeling_B/model/Session-based/GRU4Rec_save/y_true_top1.npy', y_true_np, allow_pickle = False)
np.save('/content/drive/Shareddrives/DSL_Modeling_B/model/Session-based/GRU4Rec_save/y_pred_top1.npy', y_pred_np, allow_pickle = False)

## Test predict value save

In [None]:
# Reload the stored ground truths and predicted scores.
y_true_test = np.load('/content/drive/Shareddrives/DSL_Modeling_B/model/Session-based/GRU4Rec_save/y_true_top1.npy')
y_pred_test = np.load('/content/drive/Shareddrives/DSL_Modeling_B/model/Session-based/GRU4Rec_save/y_pred_top1.npy')

In [None]:
# Survey data starts at session 4532: keep only the rows from that position onwards.
# NOTE(review): the boundary is hard-coded for this dataset - confirm it if the data changes.
y_true_test = y_true_test[4532:, :]
y_pred_test = y_pred_test[4532:, :]

In [None]:
# Store the survey-only slice of ground truths and predicted scores (pickling disabled).
np.save('/content/drive/Shareddrives/DSL_Modeling_B/model/Session-based/GRU4Rec_save/y_true_test_top1.npy', y_true_test, allow_pickle = False)
np.save('/content/drive/Shareddrives/DSL_Modeling_B/model/Session-based/GRU4Rec_save/y_pred_test_top1.npy', y_pred_test, allow_pickle = False)