In [None]:
import pandas as pd
import numpy as np
from transformers import set_seed
from tqdm import tqdm
import os
import random
import json

# Seed the random number generators through transformers' `set_seed` helper
# so that repeated runs of the notebook behave the same way.
set_seed(42)

# Dataset key: it selects both the folder that is read and the folder that is written.
dataset = "az-books"

# Both trees keep their processed files in a sub-folder with the same name.
proc_subdir = "proc_data"
source_dir = os.path.join(f"Datasets/{dataset}", proc_subdir)
target_dir = os.path.join(f"data/{dataset}", proc_subdir)

# Only the output folder is created; the input folder must already exist.
os.makedirs(target_dir, exist_ok=True)

In [None]:
# Input files, all located in `source_dir`.
save_data_file = source_dir + '/sequential_data.json'  # interaction sequence between user and item
item2attributes_file = source_dir + '/item2attributes.json'  # item and corresponding attributes
datamaps_file = source_dir + '/datamaps.json'  # datamap
split_file = source_dir + '/train_test_split.json'  # train/test splitting (path only; not read in this cell)


def _load_json(path):
    """Return the decoded content of the UTF-8 JSON file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes, instead of being left to the garbage collector.
    """
    with open(path, "r", encoding="utf-8") as fh:
        return json.load(fh)


user_items = _load_json(save_data_file)
item2attributes = _load_json(item2attributes_file)
datamaps = _load_json(datamaps_file)
itemid2title = datamaps["itemid2title"]

# One sample per (user, position >= 1): the item at that position is the
# target and every earlier item of the same sequence is the history.
# `his_items[0]` is the sequence that gets expanded; position 0 is skipped
# because it would have an empty history.
records = [
    {'User ID': user, 'Item ID': itemid, 'user history ID': his_items[0][:idx]}
    for user, his_items in user_items.items()
    for idx, itemid in enumerate(his_items[0])
    if idx > 0
]

# Naming the columns explicitly keeps the frame well-formed (same three
# columns) even when no sample was produced.
df_data = pd.DataFrame(records, columns=['User ID', 'Item ID', 'user history ID'])
print(len(df_data))
df_data.head(5)

In [None]:
# Names of the categorical fields on the user side and on the item side.
user_fields = ["User ID"]
item_fields = ["Item ID", "Brand 1", "Brand 2"]

# Number of distinct ids per field. The id maps are enlarged by one extra
# slot (presumably for the padding id 0 -- TODO confirm).
# NOTE(review): 2062 + 1 and 35 are hard-coded sizes for the two brand
# fields; they are not derived from the loaded data here -- verify them
# whenever the dataset changes.
num_item_ids = len(datamaps["item2id"]) + 1
user_feat_count = [len(datamaps["user2id"]) + 1]
item_feat_count = [num_item_ids, 2062 + 1, 35]

# Offset of each field.
# NOTE(review): "Brand 1" and "Brand 2" receive the same offset. That is
# only right if both fields index the same id range; if they were meant to
# be disjoint, the third offset would have to include item_feat_count[1] as
# well -- confirm against the consumer of this metadata.
user_feat_offset = [0]
item_feat_offset = [0, num_item_ids, num_item_ids]


# Print "<field> <count> <offset>" for every field, user side first.
separator = "---------------------------------------------------------------"
print(separator)
for field_name, field_count, field_offset in zip(user_fields, user_feat_count, user_feat_offset):
    print(field_name, field_count, field_offset)

print(separator)

for field_name, field_count, field_offset in zip(item_fields, item_feat_count, item_feat_offset):
    print(field_name, field_count, field_offset)
print(separator)


# Row i holds [item id, *attributes] of item i; row 0 is an all-zero
# placeholder. Item ids are looked up as the strings "1" .. "N".
item_feats_table = [[0, 0, 0]]
item_feats_table.extend(
    [item_id] + item2attributes[str(item_id)]
    for item_id in range(1, len(item2attributes) + 1)
)


In [None]:
# Keep only the samples whose history contains at least 5 interactions.
df_data = df_data[df_data["user history ID"].map(len) >= 5].reset_index(drop=True)

# Item-side lookup table: one row per item with its two attribute ids.
df_item = pd.DataFrame({
    "Item ID": [int(item_key) for item_key in item2attributes],
    "Brand 1": [attrs[0] for attrs in item2attributes.values()],
    "Brand 2": [attrs[1] for attrs in item2attributes.values()],
})

# Attach the item attributes to every sample.
# The row order of df_data matters afterwards: the train/valid/test split
# picks the *last* rows of every user. Depending on the pandas version, an
# inner merge may return the rows grouped by key instead of in the order of
# the left frame, so the original order is tagged before the merge and
# restored after it. `validate` guards against duplicated item ids in
# df_item, which would silently duplicate samples.
df_data = (
    df_data
    .assign(_row_order=np.arange(len(df_data)))
    .merge(df_item, on=["Item ID"], how="inner", validate="many_to_one")
    .sort_values("_row_order", kind="stable")
    .drop(columns="_row_order")
    .reset_index(drop=True)
)

print(len(df_data))
df_data.head()

In [None]:
# Convert every id column to integers, value by value.
for column in user_fields + item_fields:
    df_data[column] = df_data[column].map(int)

# The history column holds one list per row; convert each of its entries too.
df_data["user history ID"] = df_data["user history ID"].map(
    lambda history: list(map(int, history))
)


In [None]:
# Leave-last-two-out split per user:
#   last row of a user           -> test
#   second-to-last row of a user -> valid
#   every earlier row            -> train
# Assumes the rows of each user appear in chronological order in df_data.
#
# `rows_from_end` counts, inside each user's group, how many rows follow the
# current one (0 for the last row, 1 for the one before it, ...).
rows_from_end = (
    df_data.groupby("User ID", sort=False).cumcount(ascending=False).to_numpy()
)

# Positional row indices, already in ascending order.
# A user with a single row contributes that row to the test split only, so
# the same sample can never end up in both the validation and the test set.
test_idx = np.flatnonzero(rows_from_end == 0).tolist()
valid_idx = np.flatnonzero(rows_from_end == 1).tolist()
train_idx = np.flatnonzero(rows_from_end >= 2).tolist()

df_train = df_data.iloc[train_idx].reset_index(drop=True)
df_valid = df_data.iloc[valid_idx].reset_index(drop=True)
df_test = df_data.iloc[test_idx].reset_index(drop=True)

In [None]:
# Split sizes; they are reused later to slice the concatenated arrays.
train_num, valid_num, test_num = len(df_train), len(df_valid), len(df_test)
print("Num train/valid/test:", train_num, valid_num, test_num)

# Write each split as a gzip-compressed parquet file into `target_dir`.
parquet_outputs = {
    "train.parquet.gz": df_train,
    "valid.parquet.gz": df_valid,
    "test.parquet.gz": df_test,
}
for file_name, split_frame in parquet_outputs.items():
    split_frame.to_parquet(os.path.join(target_dir, file_name), compression="gzip")

df_train.head()

In [None]:
# For every item id, collect the users that have it as a target in the
# training split. Entry 0 is seeded with user 0.
num_item_ids = item_feat_count[0]
users_by_item = dict((item_id, []) for item_id in range(num_item_ids))
users_by_item[0] = [0]

# Walk the two id columns in row order.
train_pairs = zip(df_train["Item ID"].tolist(), df_train["User ID"].tolist())
for item_id, user_id in tqdm(train_pairs):
    users_by_item[item_id].append(user_id)

# Final form: a list indexed by item id, with duplicate users removed.
movie_to_users = [list(set(users_by_item[item_id])) for item_id in range(num_item_ids)]

In [None]:
# Everything needed to interpret the exported arrays: field names, their
# sizes and offsets, the item feature table, the raw id maps and the
# item -> users table.
meta_data = {
    "user_fields": user_fields,
    "item_fields": item_fields,
    "user_feat_count": user_feat_count,
    "item_feat_count": item_feat_count,
    "user_feat_offset": user_feat_offset,
    "item_feat_offset": item_feat_offset,
    "item_feats_table": item_feats_table,
    "feature_dict": datamaps,
    "item_to_users": movie_to_users
}

# `ensure_ascii=False` writes non-ASCII characters as-is, so the file is
# opened explicitly as UTF-8 rather than with the platform default encoding.
# The `with` block guarantees the file is flushed and closed.
with open(os.path.join(target_dir, "match-meta.json"), "w", encoding="utf-8") as meta_file:
    json.dump(meta_data, meta_file, ensure_ascii=False)

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Only the most recent MAX_HIST_LEN interactions of a history are kept.
MAX_HIST_LEN = 30

# Stack the splits back together in train / valid / test order; the export
# step slices this concatenation again using the split sizes.
df_data = pd.concat([df_train, df_valid, df_test]).reset_index(drop=True)

# One row per sample, one column per field. Selecting the columns directly
# avoids a Python-level loop over all rows and keeps the result
# two-dimensional even when the frame is empty.
user_X = df_data[user_fields].to_numpy()
item_X = df_data[item_fields].to_numpy()

hist_ID = df_data["user history ID"].tolist()
hist_length = [len(history) for history in hist_ID]

# Truncate every history to its last MAX_HIST_LEN entries, then pad with
# zeros on the right up to the longest truncated history.
hist_ID = pad_sequence(
    [torch.tensor(history[-MAX_HIST_LEN:]) for history in hist_ID],
    batch_first=True,
)

# Matching mask: 1 for real history entries, 0 for the padded positions.
hist_mask = pad_sequence(
    [torch.ones(min(length, MAX_HIST_LEN)) for length in hist_length],
    batch_first=True,
)

print("user_X", user_X.shape)
print("item_X", item_X.shape)
print("hist_ID", hist_ID.shape)
print("hist_mask", hist_mask.shape)

In [None]:
import h5py

# Row ranges of the three splits inside the concatenated arrays
# (train first, then valid, then test).
split_rows = (
    ("train", slice(None, train_num)),
    ("valid", slice(train_num, train_num + valid_num)),
    ("test", slice(train_num + valid_num, None)),
)

# Arrays to export, in the order their datasets are created.
exported_arrays = (
    ("user data", user_X),
    ("item data", item_X),
    ("history ID", hist_ID),
    ("history mask", hist_mask),
)

# One dataset per (array, split), named "<split> <array name>".
with h5py.File(os.path.join(target_dir, "match.h5"), "w") as hf:
    for array_name, values in exported_arrays:
        for split_name, rows in split_rows:
            hf.create_dataset(f"{split_name} {array_name}", data=values[rows, :])
