# 1. Bi-LSTM, DAE

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Input, Embedding, Flatten, Dropout, concatenate, Activation, RepeatVector, BatchNormalization
from keras.models import Model
from keras.regularizers import l2
from tensorflow import keras
from keras.utils.np_utils import to_categorical

from tensorflow.keras import layers
import tensorflow as tf

from collections import defaultdict
import random
import gc

# Load the (itemset_id, item_id) training pairs. Column names are supplied
# explicitly, so the first CSV row is read as data rather than as a header.
itemset_item_training = pd.read_csv(
    './Dataset/itemset_item_training.csv',
    delimiter=',',
    names=['itemset_id', 'item_id'],
)

# itemset -> sorted list of its items, plus the size of that list.
row_connect_itemset_to_item_dataframe = (
    itemset_item_training
    .groupby('itemset_id', as_index=False)['item_id']
    .agg(lambda ids: sorted(ids))
)
row_connect_itemset_to_item_dataframe['item_count'] = (
    row_connect_itemset_to_item_dataframe['item_id'].apply(len)
)

# item -> sorted list of the itemsets containing it, plus the size of that list.
row_connect_item_to_itemset_dataframe = (
    itemset_item_training
    .groupby('item_id', as_index=False)['itemset_id']
    .agg(lambda ids: sorted(ids))
)
row_connect_item_to_itemset_dataframe['itemset_count'] = (
    row_connect_item_to_itemset_dataframe['itemset_id'].apply(len)
)

# Index the itemset frame by a copy of itemset_id (named 'iset_id') while
# keeping itemset_id itself as a regular column.
row_connect_itemset_to_item_dataframe.insert(
    1, 'iset_id', row_connect_itemset_to_item_dataframe['itemset_id']
)
row_connect_itemset_to_item_dataframe = (
    row_connect_itemset_to_item_dataframe.set_index('iset_id')
)

# Insert an empty row for every itemset index that never appears in the
# training data (27694 is presumably the total number of itemsets -- confirm).
known_itemsets = set(row_connect_itemset_to_item_dataframe['itemset_id'])
for missing_id in sorted(set(range(27694)) - known_itemsets):
    row_connect_itemset_to_item_dataframe.loc[missing_id] = [missing_id, [], 0]
row_connect_itemset_to_item_dataframe = row_connect_itemset_to_item_dataframe.sort_index()

# Plain lists of row tuples, ordered by their first field (the id column).
ii_itemset_sim_check = sorted(
    row_connect_itemset_to_item_dataframe.itertuples(index=False),
    key=lambda record: record[0],
)
ii_item_sim_check = sorted(
    row_connect_item_to_itemset_dataframe.itertuples(index=False),
    key=lambda record: record[0],
)

# Collect the item id (second CSV column) of every row in the validation
# query file.
check_appearance = []
with open('./Dataset/itemset_item_valid_query.csv', 'r') as f:
    for row in f:
        _, item = row.strip().split(',')
        check_appearance.append(int(item))

# Build `re_index`, a mapping from original item id to a new index:
#   * ids that occur in the validation query file come first, in ascending
#     id order;
#   * every remaining id in range(42563) follows, also in ascending order.
# Unknown keys still default to -1 when looked up.
#
# An item id can occur on many query rows. Each distinct id must consume
# exactly one index; otherwise the mapping has gaps and its largest value
# grows past the 42563-item vocabulary.
check_appearance.sort(reverse=True)  # pop() then yields ids in ascending order
re_index = defaultdict(lambda: -1)
index = 0
while check_appearance:
    item_id = check_appearance.pop()
    # `in` does not trigger the defaultdict factory, so this is a pure test.
    if item_id not in re_index:
        re_index[item_id] = index
        index += 1

for i in range(42563):
    if i not in re_index:
        re_index[i] = index
        index += 1

# Turn every non-empty itemset into a randomly ordered item sequence.
# The shuffle is in place, so the list object held by the source tuple is
# reordered as well.
itemset_sequence = defaultdict(list)
for record in ii_itemset_sim_check:
    itemset_id, members = record[0], record[1]
    if not members:
        continue
    random.shuffle(members)
    itemset_sequence[itemset_id] = members

# Training sequences, in the insertion order of `itemset_sequence`.
train_data = list(itemset_sequence.values())

def _read_grouped_pairs(path):
    """Read a two-column CSV and group the second column by the first.

    Both fields are parsed as floats and then truncated to int. Returns a
    ``defaultdict(list)`` mapping each first-column value to the list of
    second-column values, in file order.
    """
    grouped = defaultdict(list)
    with open(path) as handle:
        for line in handle:
            left, right = map(float, line.strip().split(','))
            grouped[int(left)].append(int(right))
    return grouped


# Validation queries: key -> list of values, then ordered by key.
validset_dict = _read_grouped_pairs('./Dataset/itemset_item_valid_query.csv')
validset_list = sorted(validset_dict.items(), key=lambda pair: pair[0])
valid_data = [values for _, values in validset_list]

# Validation answers, grouped and ordered the same way. The two lists line
# up position by position only if both files contain the same set of keys.
validset_answer_dict = _read_grouped_pairs('./Dataset/itemset_item_valid_answer.csv')
validset_answer_list = sorted(validset_answer_dict.items(), key=lambda pair: pair[0])
valid_answer_data = [values for _, values in validset_answer_list]

n_steps = 4     # length of every (padded) input sequence
n_features = 1  # one scalar value per time step

# Each sample's input is the sequence without its last element, right-padded
# with -1 up to n_steps. The pad count is derived from n_steps (it used to be
# a hard-coded 5 == n_steps + 1), so changing n_steps keeps the rows aligned.
# NOTE(review): a sequence longer than n_steps + 1 gets no padding and yields
# a row longer than n_steps, which makes the array ragged -- confirm the data
# never contains such sequences.

# Training target: the last element of each sequence.
X_train = np.array(
    [seq[:-1] + [-1] * (n_steps + 1 - len(seq)) for seq in train_data]
)
Y_train = np.array([seq[-1] for seq in train_data])

# NOTE(review): the last element of each validation query sequence is dropped
# as well, exactly as for training sequences, even though the target comes
# from the separate answer list -- confirm this is intended.
X_valid = np.array(
    [seq[:-1] + [-1] * (n_steps + 1 - len(seq)) for seq in valid_data]
)
# Validation target: the first entry of each answer list.
Y_valid = np.array([seq[0] for seq in valid_answer_data])

# Add the trailing feature axis: (samples, steps) -> (samples, steps, n_features).
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], n_features))
X_valid = X_valid.reshape((X_valid.shape[0], X_valid.shape[1], n_features))

## 1-1. Validation of the trained model

In [None]:
# Load a saved checkpoint and report its top-100 hit rate and average rank
# on the validation arrays.
model = tf.keras.models.load_model('Bi-LSTM-DAE-64-0.2-64-tanh-3-0.1127-93.5052.h5')
model.summary()

top_k = 100            # size of the ranked candidate list
eval_batch_size = 256  # samples scored per predict() call

correct = 0
total = len(X_valid)
score = []
# Score the validation set in chunks. `Model.predict` is designed for batch
# processing and is not meant to be called once per sample inside a loop.
for start in tqdm(range(0, total, eval_batch_size)):
    stop = start + eval_batch_size
    prediction = model.predict(X_valid[start:stop], batch_size=eval_batch_size, verbose=0)
    # Row i holds the class indices of sample i, best score first.
    result = tf.math.top_k(prediction, k=top_k).indices.numpy()
    if start == 0:
        print(result.shape)
    for ranked, gt in zip(result, Y_valid[start:stop]):
        hits = np.argwhere(ranked == gt)
        if hits.size:
            correct += 1
            score.append(int(hits[0][0]) + 1)  # 1-based rank of the target
        else:
            score.append(top_k + 1)  # a miss counts as rank top_k + 1
print(f'Top 100 Acc : {correct/total:.4f}, Avg Rank : {sum(score)/len(score):.4f}')

## 1-2. Single Bi-LSTM

In [None]:
# Single bidirectional LSTM classifier over the padded input sequences.
model = Sequential([
    # The input shape is declared once, here. The Masking layer used to carry
    # its own `input_shape=(n_features,)`, which contradicted this declaration.
    tf.keras.layers.Input(shape=(n_steps, n_features)),
    # Time steps whose features all equal -1 (the padding value) are skipped.
    tf.keras.layers.Masking(mask_value=-1),
    Bidirectional(LSTM(64, activation='tanh')),
    # One softmax output per class; 42563 matches the range of ids used when
    # building the index mapping earlier in the file.
    Dense(42563, activation='softmax'),
])
model.summary()

# Is Adadelta useful in this task? ...
# Sparse categorical cross-entropy: targets are integer class ids, not one-hot.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

class PredictionCallback(tf.keras.callbacks.Callback):
    """Evaluate on the validation arrays and save a checkpoint after each epoch.

    Reads the module-level ``X_valid`` / ``Y_valid`` arrays. The model being
    trained is taken from ``self.model``, which Keras sets on the callback,
    so the callback does not depend on what the global name ``model`` refers
    to when an epoch ends.
    """

    TOP_K = 100            # size of the ranked candidate list
    EVAL_BATCH_SIZE = 256  # samples scored per predict() call

    def on_epoch_end(self, epoch, logs=None):
        """Print top-100 accuracy and average rank, then save the model.

        A target found in the top-k list contributes its 1-based rank; a
        miss contributes ``TOP_K + 1``. Both metrics are embedded in the
        checkpoint file name.
        """
        correct = 0
        total = len(X_valid)
        score = []
        # Score the validation set in chunks. `Model.predict` is designed for
        # batch processing, not for one call per sample inside a loop.
        for start in tqdm(range(0, total, self.EVAL_BATCH_SIZE)):
            stop = start + self.EVAL_BATCH_SIZE
            prediction = self.model.predict(
                X_valid[start:stop], batch_size=self.EVAL_BATCH_SIZE, verbose=0
            )
            # Row i holds the class indices of sample i, best score first.
            result = tf.math.top_k(prediction, k=self.TOP_K).indices.numpy()
            if start == 0:
                print(result.shape)
            for ranked, gt in zip(result, Y_valid[start:stop]):
                hits = np.argwhere(ranked == gt)
                if hits.size:
                    correct += 1
                    score.append(int(hits[0][0]) + 1)
                else:
                    score.append(self.TOP_K + 1)
        accuracy = correct / total
        avg_rank = sum(score) / len(score)
        print(f'Top 100 Acc : {accuracy:.4f}, Avg Rank : {avg_rank:.4f}')
        self.model.save(f'./Checkpoints/Bi-LSTM-64-tanh-{epoch}-{accuracy:.4f}-{avg_rank:.4f}.h5')
        gc.collect()
        return super().on_epoch_end(epoch, logs)

# Train for 20 epochs with shuffling; the callback evaluates on the
# validation arrays and saves a checkpoint after every epoch.
history = model.fit(
    X_train,
    Y_train,
    epochs=20,
    shuffle=True,
    callbacks=[PredictionCallback()],
)

## 1-3. DAE based on the LSTM Autoencoder

In [None]:
model = Sequential()

# The input shape is declared once, with an explicit Input layer (the same
# way as the single Bi-LSTM model), rather than being repeated as
# `input_shape=` on both the Masking layer and the first Bidirectional layer.
model.add(tf.keras.layers.Input(shape=(n_steps, n_features)))

# Encoder
# Time steps whose features all equal -1 (the padding value) are skipped.
model.add(tf.keras.layers.Masking(mask_value=-1.))
# NOTE: per the Keras LSTM documentation, a non-zero recurrent_dropout rules
# out the cuDNN kernel, so this layer runs on the generic implementation.
model.add(Bidirectional(LSTM(64, activation='tanh', return_sequences=True, recurrent_dropout=0.2)))
model.add(Dropout(0.2))
# model.add(Bidirectional(LSTM(32, activation='tanh', return_sequences=True)))

# Decoder
# model.add(Bidirectional(LSTM(32, activation='tanh', return_sequences=True)))
# return_sequences is left at its default here, so only the final state
# reaches the classifier.
model.add(Bidirectional(LSTM(64, activation='tanh')))

# Classification
# One softmax output per class (42563 classes).
model.add(Dense(42563, activation='softmax'))
model.summary()

# Is Adadelta useful in this task? ...
# Sparse categorical cross-entropy: targets are integer class ids, not one-hot.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

class PredictionCallback(tf.keras.callbacks.Callback):
    """Evaluate on the validation arrays and save a checkpoint after each epoch.

    Reads the module-level ``X_valid`` / ``Y_valid`` arrays. The model being
    trained is taken from ``self.model``, which Keras sets on the callback,
    so the callback does not depend on what the global name ``model`` refers
    to when an epoch ends.
    """

    TOP_K = 100            # size of the ranked candidate list
    EVAL_BATCH_SIZE = 256  # samples scored per predict() call

    def on_epoch_end(self, epoch, logs=None):
        """Print top-100 accuracy and average rank, then save the model.

        A target found in the top-k list contributes its 1-based rank; a
        miss contributes ``TOP_K + 1``. Both metrics are embedded in the
        checkpoint file name.
        """
        correct = 0
        total = len(X_valid)
        score = []
        # Score the validation set in chunks. `Model.predict` is designed for
        # batch processing, not for one call per sample inside a loop.
        for start in tqdm(range(0, total, self.EVAL_BATCH_SIZE)):
            stop = start + self.EVAL_BATCH_SIZE
            prediction = self.model.predict(
                X_valid[start:stop], batch_size=self.EVAL_BATCH_SIZE, verbose=0
            )
            # Row i holds the class indices of sample i, best score first.
            result = tf.math.top_k(prediction, k=self.TOP_K).indices.numpy()
            if start == 0:
                print(result.shape)
            for ranked, gt in zip(result, Y_valid[start:stop]):
                hits = np.argwhere(ranked == gt)
                if hits.size:
                    correct += 1
                    score.append(int(hits[0][0]) + 1)
                else:
                    score.append(self.TOP_K + 1)
        accuracy = correct / total
        avg_rank = sum(score) / len(score)
        print(f'Top 100 Acc : {accuracy:.4f}, Avg Rank : {avg_rank:.4f}')
        self.model.save(f'./Checkpoints/Bi-LSTM-DAE-64-0.2-64-tanh-{epoch}-{accuracy:.4f}-{avg_rank:.4f}.h5')
        gc.collect()
        return super().on_epoch_end(epoch, logs)

# Train for 40 epochs with shuffling; the callback evaluates on the
# validation arrays and saves a checkpoint after every epoch.
history = model.fit(
    X_train,
    Y_train,
    epochs=40,
    shuffle=True,
    callbacks=[PredictionCallback()],
)