In [None]:
import pandas as pd
import numpy as np
from transformers import set_seed
from tqdm import tqdm
import os
import random
import json

# Seed via the transformers helper so reruns are repeatable
# (presumably seeds random / numpy / torch -- confirm against the installed version).
set_seed(42)
# Dataset identifier; it is interpolated into both directory paths below.
dataset = "ml-1m"
# Raw .dat files are read from source_dir; processed artifacts are written to target_dir.
source_dir = os.path.join(f"Datasets/{dataset}", "raw_data")
target_dir = os.path.join(f"data/{dataset}", "proc_data")
# Create the output directory up front; exist_ok keeps this cell safe to re-run.
os.makedirs(target_dir, exist_ok=True)

In [None]:
# Age code (as stored in the raw users file) -> readable age-range label.
age_dict = {
    1: "under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "above 56",
}

# Occupation code -> label. The codes are the consecutive integers 0..20, so the
# mapping is produced by enumerating the labels in code order.
job_dict = dict(enumerate((
    "other or not specified",  # 0
    "academic/educator",       # 1
    "artist",                  # 2
    "clerical/admin",          # 3
    "college/grad student",    # 4
    "customer service",        # 5
    "doctor/health care",      # 6
    "executive/managerial",    # 7
    "farmer",                  # 8
    "homemaker",               # 9
    "K-12 student",            # 10
    "lawyer",                  # 11
    "programmer",              # 12
    "retired",                 # 13
    "sales/marketing",         # 14
    "scientist",               # 15
    "self-employed",           # 16
    "technician/engineer",     # 17
    "tradesman/craftsman",     # 18
    "unemployed",              # 19
    "writer",                  # 20
)))

In [None]:
# User data
#
# Parse users.dat ("::"-separated: user id, gender, age code, job code, zipcode)
# and replace the coded gender / age / job values with readable labels.

user_data = []
user_fields = ["User ID", "Gender", "Age", "Job", "Zipcode"]
# The context manager closes the file handle deterministically; iterating the
# handle streams the file instead of materialising every line with readlines().
with open(os.path.join(source_dir, "users.dat"), "r") as user_file:
    for line in user_file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file),
            # which would otherwise break the 5-way unpacking below.
            continue
        ele = line.split("::")
        user_id, gender, age, job, zipcode = [x.strip() for x in ele]
        assert gender in ["M", "F"], ele
        gender = "male" if gender == "M" else "female"
        # Unknown codes raise KeyError here, surfacing malformed input early.
        age = age_dict[int(age)]
        job = job_dict[int(job)]
        user_data.append([user_id, gender, age, job, zipcode])

df_user = pd.DataFrame(user_data, columns=user_fields)
print(f"Total number of users: {len(df_user)}")
# Every user id must appear exactly once; later merges rely on this.
assert len(df_user["User ID"]) == len(set(df_user["User ID"]))

In [None]:
# Movie data
#
# Parse movies.dat ("::"-separated: movie id, title, "|"-separated genres).
# Only the first listed genre of each movie is kept.

movie_data = []
movie_fields = ["Movie ID", "Movie title", "Movie genre"]
# The context manager closes the file handle deterministically; the file is
# decoded as ISO-8859-1, exactly as before.
with open(os.path.join(source_dir, "movies.dat"), "r", encoding="ISO-8859-1") as movie_file:
    for line in movie_file:
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing with an IndexError below.
            continue
        ele = line.split("::")
        movie_id = ele[0].strip()
        movie_title = ele[1].strip()
        # A movie may list several genres; keep the first one only.
        movie_genre = ele[2].strip().split("|")[0]
        movie_data.append([movie_id, movie_title, movie_genre])

df_movie = pd.DataFrame(movie_data, columns=movie_fields)
print(f"Total number of movies: {len(df_movie)}")
# Every movie id must appear exactly once; later merges rely on this.
assert len(df_movie["Movie ID"]) == len(set(df_movie["Movie ID"]))

In [None]:
# Rating data
#
# Parse ratings.dat ("::"-separated: user id, movie id, rating, timestamp) and
# keep only the ratings whose user id is present in df_user.

rating_data = []
rating_fields = ["User ID", "Movie ID", "rating", "timestamp"]
user_list = list(df_user["User ID"])
# Membership is tested once per rating line; a set makes each test O(1) instead
# of a linear scan over the whole user list.
known_users = set(user_list)
# The context manager closes the file handle deterministically; iterating the
# handle streams the file instead of loading every line at once.
with open(os.path.join(source_dir, "ratings.dat"), "r") as rating_file:
    for line in rating_file:
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing with an IndexError below.
            continue
        ele = [x.strip() for x in line.split("::")]
        # Ids stay strings (they are merge keys); rating and timestamp become ints.
        user, movie, rating, timestamp = ele[0], ele[1], int(ele[2]), int(ele[3])
        if user in known_users:
            rating_data.append([user, movie, rating, timestamp])

df_ratings = pd.DataFrame(rating_data, columns=rating_fields)
print(f"Total number of ratings: {len(df_ratings)}")


In [None]:
# Attach user and movie attributes to every rating. Inner joins drop any rating
# whose user or movie is missing from the corresponding table.
ratings_with_users = df_ratings.merge(df_user, on=["User ID"], how="inner")
ratings_enriched = ratings_with_users.merge(df_movie, on=["Movie ID"], how="inner")

# Only ratings strictly above 3 are kept.
kept_ratings = ratings_enriched[ratings_enriched["rating"] > 3]

# Order interactions by time, with user and movie ids as tie-breakers.
kept_ratings_sorted = kept_ratings.sort_values(
    by=["timestamp", "User ID", "Movie ID"],
    kind="stable",
)

# Column order of the final frame: user attributes first, then movie attributes.
field_names = user_fields + movie_fields

# The rating and timestamp columns are dropped here; the row order carries the chronology.
df_data = kept_ratings_sorted[field_names].reset_index(drop=True)
print("Total number after filtering:", len(df_data))
df_data.head()

In [None]:
# Encode the feature dict for CTR data

def add_to_dict(dict, feature):
    """Give `feature` the next free integer index unless it already has one.

    Args:
        dict: Mapping from feature value to integer index; mutated in place.
            (The name shadows the builtin; it is kept so existing callers work.)
        feature: Hashable feature value to register.

    Returns:
        None. An unseen feature is stored with index ``len(dict)`` as measured
        before insertion; a known feature leaves the mapping untouched.
    """
    # setdefault stores the value only when the key is absent, and len(dict) is
    # evaluated before the call, so it is the size prior to insertion.
    dict.setdefault(feature, len(dict))

# One vocabulary (value -> integer index) per field.
feature_dict = {field : {} for field in field_names}

# A value's index only depends on the order in which the distinct values of its
# own column first appear. Series.unique() returns values in order of appearance,
# so walking the distinct values per column builds the same vocabularies as a
# row-by-row scan, without a Python-level pass over every interaction row.
for field in tqdm(field_names):
    for value in df_data[field].unique():
        add_to_dict(feature_dict[field], value)

# Vocabulary size of each user / movie field, in field order.
user_feat_count = [len(feature_dict[field]) for field in user_fields]
item_feat_count = [len(feature_dict[field]) for field in movie_fields]

# Treat user and movie features differently
# Each side gets its own running offsets: the offset of a field is the summed
# vocabulary size of the fields before it on the same side (first offset is 0).
user_feat_offset, item_feat_offset = [0], [0]
for c in user_feat_count[:-1]:
    user_feat_offset.append(user_feat_offset[-1] + c)

for c in item_feat_count[:-1]:
    item_feat_offset.append(item_feat_offset[-1] + c)

print("---------------------------------------------------------------")
for f, fc, fo in zip(user_fields, user_feat_count, user_feat_offset):
    print(f, fc, fo)

print("---------------------------------------------------------------")

for f, fc, fo in zip(movie_fields, item_feat_count, item_feat_offset):
    print(f, fc, fo)
print("---------------------------------------------------------------")


# Replace every raw value with its vocabulary index. Every value is present in
# its vocabulary (it was built from this very column), so the lookup via map
# never produces a missing entry.
for field in field_names:
    df_data[field] = df_data[field].map(feature_dict[field])

df_data.head()


In [None]:
# Encoded movie id -> [encoded id, encoded title, encoded genre], taken from the
# first row in which each movie appears.
movie_feat_dict = {}
movie_columns = ("Movie ID", "Movie title", "Movie genre")

for _, record in tqdm(df_data.iterrows()):
    movie_key = record["Movie ID"]
    if movie_key in movie_feat_dict:
        # Already captured from an earlier row.
        continue
    movie_feat_dict[movie_key] = [int(record[col]) for col in movie_columns]

# Dense table indexed by encoded movie id: position i holds the features of movie i.
movie_feats_table = []
for movie_idx in range(item_feat_count[0]):
    movie_feats_table.append(movie_feat_dict[movie_idx])
print(len(movie_feats_table))

In [None]:
# Running list of movies seen so far, per user.
history_by_user = {uid: [] for uid in set(df_data["User ID"])}
user_history_dict = {
    "ID": history_by_user,
}

# One history snapshot per row of df_data, in row order.
history_snapshots = []
user_history_column = {
    "ID": history_snapshots,
}


for _, record in tqdm(df_data.iterrows()):
    seen_so_far = history_by_user[record["User ID"]]
    # Snapshot first, then extend: a row's history never contains its own movie.
    history_snapshots.append(list(seen_so_far))
    seen_so_far.append(record["Movie ID"])

df_data["user history ID"] = history_snapshots

# Keep only the rows that have at least 5 earlier interactions as history.
has_enough_history = df_data["user history ID"].apply(len) >= 5
df_data = df_data[has_enough_history].reset_index(drop=True)

df_data.head()

In [None]:
# Leave-one-out split per user: the user's last row goes to test, the row before
# it to validation, and everything else to train.
valid_idx = []
test_idx = []

# Group on the bare column name (not a one-element list) so the group keys are
# plain scalars.
for uid, df_u in tqdm(df_data.groupby("User ID")):
    test_idx.append(df_u.index[-1])
    # A user with a single row has no "second to last" row. Taking tail(2)[0]
    # there would return that same row again and put one sample into both the
    # validation and the test set, so such users contribute to test only.
    if len(df_u) >= 2:
        valid_idx.append(df_u.index[-2])

valid_idx = sorted(valid_idx)
test_idx = sorted(test_idx)
# The two held-out sets must never share a sample.
assert not set(valid_idx) & set(test_idx), "validation and test sets overlap"
train_idx = sorted(list(set(range(len(df_data))) - set(valid_idx + test_idx)))

# df_data carries a fresh 0..n-1 index at this point, so index labels coincide
# with the positions expected by iloc.
df_train = df_data.iloc[train_idx].reset_index(drop=True)
df_valid = df_data.iloc[valid_idx].reset_index(drop=True)
df_test = df_data.iloc[test_idx].reset_index(drop=True)

In [None]:
# Split sizes; they are reused later to slice the concatenated arrays.
train_num, valid_num, test_num = len(df_train), len(df_valid), len(df_test)
print("Num train/valid/test:", train_num, valid_num, test_num)

# Write each split to <target_dir>/<split>.parquet.gz with gzip compression.
for split_name, split_frame in (("train", df_train), ("valid", df_valid), ("test", df_test)):
    split_path = os.path.join(target_dir, f"{split_name}.parquet.gz")
    split_frame.to_parquet(split_path, compression="gzip")

df_train.head()

In [None]:
# For every encoded movie id, collect the users that appear with it in df_data.
# NOTE(review): df_data still holds all splits here, so held-out interactions
# are included -- confirm this is intended.
num_movies = item_feat_count[0]
movie_to_users = {movie_idx: [] for movie_idx in range(num_movies)}

for _, record in tqdm(df_data.iterrows()):
    movie_to_users[record["Movie ID"]].append(record["User ID"])


# Convert to a list indexed by movie id, de-duplicating each movie's users.
movie_to_users = [list(set(movie_to_users[movie_idx])) for movie_idx in range(num_movies)]

In [None]:
# Everything a consumer needs to interpret the encoded data: field names,
# vocabulary sizes and offsets, the per-movie feature table, the raw-value ->
# index vocabularies and the movie -> users lists.
meta_data = {
    "user_fields": user_fields,
    "item_fields": movie_fields,
    "user_feat_count": user_feat_count,
    "item_feat_count": item_feat_count,
    "user_feat_offset": user_feat_offset,
    "item_feat_offset": item_feat_offset,
    "movie_feats_table": movie_feats_table,
    "feature_dict": feature_dict,
    "item_to_users": movie_to_users
}

# The context manager flushes and closes the file deterministically. Since
# ensure_ascii=False writes non-ASCII characters verbatim, the encoding is
# pinned to UTF-8 rather than left to the platform default.
with open(os.path.join(target_dir, "match-meta.json"), "w", encoding="utf-8") as meta_file:
    json.dump(meta_data, meta_file, ensure_ascii=False)

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Stack the splits back together in train / valid / test order so that the
# arrays below can be cut apart again by split size.
df_data = pd.concat([df_train, df_valid, df_test]).reset_index(drop=True)

# Only the most recent `hist_window` history entries are kept per row.
hist_window = 30

user_rows, item_rows = [], []
for _, record in tqdm(df_data.iterrows()):
    user_rows.append([record[name] for name in user_fields])
    item_rows.append([record[name] for name in movie_fields])

user_X = np.array(user_rows)
item_X = np.array(item_rows)

histories = df_data["user history ID"].tolist()
hist_length = [len(history) for history in histories]

# Truncate each history to its last `hist_window` items, then pad to a common
# length (pad_sequence pads with zeros by default).
hist_ID = pad_sequence(
    [torch.tensor(history[-hist_window:]) for history in histories],
    batch_first=True,
)

# Matching mask: ones over the real (truncated) history positions, zeros over padding.
hist_mask = pad_sequence(
    [torch.ones(min(length, hist_window)) for length in hist_length],
    batch_first=True,
)

for label, values in (
    ("user_X", user_X),
    ("item_X", item_X),
    ("hist_ID", hist_ID),
    ("hist_mask", hist_mask),
):
    print(label, values.shape)

In [None]:
import h5py

# Row ranges of each split inside the concatenated (train, valid, test) arrays.
split_rows = {
    "train": slice(None, train_num),
    "valid": slice(train_num, train_num + valid_num),
    "test": slice(train_num + valid_num, None),
}

# Dataset-name suffix -> array to export; datasets are named "<split> <suffix>".
exported_arrays = (
    ("user data", user_X),
    ("item data", item_X),
    ("history ID", hist_ID),
    ("history mask", hist_mask),
)

with h5py.File(os.path.join(target_dir, "match.h5"), "w") as hf:
    for suffix, values in exported_arrays:
        for split_name, rows in split_rows.items():
            hf.create_dataset(f"{split_name} {suffix}", data=values[rows, :])
