In [None]:
import pandas as pd
import numpy as np
import json
import os
import re
import random
import copy
from transformers import set_seed
import hashlib
import json
import pickle as pkl
import h5py
from tqdm import tqdm, trange

# Fix the random seed via transformers' `set_seed` helper (which RNGs it
# covers is defined by that library).
set_seed(42)

# Dataset name; it is interpolated into both the input and output paths.
dataset = "BookCrossing"
# Directory the raw BX-*.csv files are read from.
source_dir = os.path.join(f"Datasets/{dataset}", "raw_data")
# Directory every processed artifact is written to; created if missing.
target_dir = os.path.join(f"data/{dataset}", "proc_data")
os.makedirs(target_dir, exist_ok=True)

In [None]:
from string import ascii_letters, digits

def character_check(item, special_letters=""):
    """Flag values that contain characters outside an allowed set.

    The allowed set is the ASCII letters, the digits and every character in
    `special_letters`. `item` is converted with `str()` before it is checked.

    Returns:
        1 if at least one character is not allowed, otherwise 0.
    """
    allowed = ascii_letters + digits + special_letters
    return 1 if any(ch not in allowed for ch in str(item)) else 0

In [None]:
# Read user info

user_fields = ["User ID", "Location", "Age"]
# A field is either the bare token NULL or a double-quoted string. The
# lookbehind keeps a backslash-escaped quote from closing the field, and
# re.S lets a quoted field span line breaks.
pattern = re.compile(r'NULL|".*?(?<!\\)"', re.S)
with open(os.path.join(source_dir, "BX-Users.csv"), 'r', encoding='cp1252') as f:
    raw_tokens = pattern.findall(f.read())

# Strip the surrounding quotes; NULL becomes None.
content = [None if token == 'NULL' else token[1:-1] for token in raw_tokens]
# Regroup the flat token list into rows of three fields, then drop the
# first row (the CSV header).
processed_list = list(np.array(content).reshape((-1, 3)))
del processed_list[0]
df_users = pd.DataFrame(processed_list, columns=user_fields)

# The `Location` field contains messy or garbled text (or is completely empty), so we keep only the country part.
# E.g., ['&#37073;&#24030;&#26159;, &#20013;&#22269;&#27827;&#21335;&#30465;&#37073;&#24030;&#24066;, china', 
#        'philippine science high school - cmc, mcc main stadium, sagadan, tubod, lanao del norte, philippines', 
#        '6.a.4.a.6.a`4.a, 6.a.4.a.6.a`4.a.6.a`4.a.6.a.4.a.6.a`4.aoe6.a`4.a -- 6.a.4.a.6.a`4.aoe6.a`4.a ã??, ä¸\xadå?½']
def convert_location_to_country(x):
    """Reduce a raw `Location` string to a normalized country name.

    The text after the last ", " is taken as the country. It is title-cased,
    stripped of "!" and of leading/trailing "," and ".", and then mapped
    through hand-written aliases (regional names and mis-decoded spellings
    to a country). The checks run in sequence and each one sees the result
    of the previous ones, so their order matters.

    Args:
        x: Raw location string, or None for a missing (NULL) field.

    Returns:
        The normalized country name, or "unknown" when the value is None,
        empty, a single character, one of a few known junk values, or
        contains any of the markers "N/A", "&#", "?", "@", "*".
    """
    # A NULL field is loaded as None; treat it like an empty location
    # instead of failing on `.split`.
    if x is None:
        return "unknown"
    x = x.split(', ')[-1].strip().title().replace("!", "").strip()
    if x.lower() in ["usa", "us", "u s", "u s a"]:
        x = "USA"
    if x.lower() in ["uk", "u k"]:
        x = "UK"
    # Trim stray punctuation on either side.
    while len(x) > 0 and x[-1] in [",", "."]:
        x = x[:-1]
    while len(x) > 0 and x[0] in [",", "."]:
        x = x[1:]
    if "U.S" in x.upper() and x != "U.S. Virgin Islands":
        x = "USA"
    if x in ["San José", "San Josï¿½"]:
        x = "USA"
    if x in ["España", "Castilla-León", "Espaã±A", "Cataluña", "Mérida", "Álava", "Málaga", "A Coruña", "Barcelonès", "Berguedà",
              "Espaï¿½A", "Castilla-Leï¿½N", "A Coruï¿½A", "Cataluï¿½A", "Barcelonï¿½S", "Ï¿½Lava", "Mï¿½Rida", "Berguedï¿½", "Mï¿½Laga"] or "spain" in x.lower():
        x = "Spain"
    if x in ["L`Italia"]:
        x = "Italy"
    if x in ["Baden-Württemberg", "Bademn Würtemberg", "Baden-Wï¿½Rttemberg", "Bademn Wï¿½Rtemberg"]:
        # Was "German", which created a second category next to "Germany".
        x = "Germany"
    if x in ["Cote D`Ivoire", "Côte D", "Cï¿½Te D"]:
        x = "Ivory Coast"
    # Kärnten (Carinthia) is an Austrian state; it used to be listed under
    # Algeria.
    if x in ["Oberösterreich", "Oberï¿½Sterreich", "Kärnten", "Kï¿½Rnten"]:
        x = "Austria"
    if x in ["México", "Mï¿½Xico"]:
        x = "Mexico"
    if x in ["Türkiye", "Içel", "Tï¿½Rkiye"]:
        x = "Turkey"
    if x in ["L`Algérie", "Algérie", "L`Algï¿½Rie", "Algï¿½Rie"]:
        x = "Algeria"
    if "Brasil" in x:
        x = "Brazil"
    if x in ["Rhône-Alpes", "Rhône Alpes", "Rhï¿½Ne-Alpes", "Rhï¿½Ne Alpes"]:
        x = "France"
    if "Greece" in x:
        x = "Greece"
    if x in ["Santarém", "Santarï¿½M"]:
        x = "Portugal"
    if x in ["Länsi-Suomen Lääni", "Lï¿½Nsi-Suomen Lï¿½Ï¿½Ni"]:
        x = "Finland"
    if x in ["V.Götaland", "Nyhamnsläge", "V.Gï¿½Taland", "Nyhamnslï¿½Ge"]:
        x = "Sweden"
    if x in ["Moçambique", "Moï¿½Ambique"]:
        x = "Mozambique"
    if x in ["Ix Región", "Ix Regiï¿½N"]:
        x = "Chile"
    if x in ["Maï¿½Opolskie", "Ma³Opolskie"]:
        x = "Poland"
    if x in ["Perï¿½", "Perãº"]:
        x = "Peru"
    if x != "China" and ("china" in x.lower() or x == "La Chine Éternelle" or x == "La Chine Ï¿½Ternelle"):
        x = "China"
    if x == "Ï¿½Ï¿½Ï¿½":
        x = "China"
    # Anything empty, one character long, or carrying a junk marker is
    # collapsed into a single "unknown" category.
    if (x == "" or \
        x in ["Öð¹Ú", "ºþäï", "We`Re Global", "Ï¿½Ï¿½Ï¿½Ï¿½", "Iï¿½El"] or \
        len(x) == 1 or \
        "N/A" in x or \
        "&#" in x or \
        "?" in x or \
        "@" in x or \
        "*" in x):
        x = "unknown"
    return x
# Replace each raw location with its normalized country name.
df_users["Location"] = df_users["Location"].apply(convert_location_to_country)
# Flag (1) any country that still has a character outside ASCII letters,
# digits and the listed punctuation.
df_users["location_check"] = df_users["Location"].apply(
    character_check, special_letters="- .&/()"
)

# No country may be flagged after normalization.
assert not (df_users["location_check"] == 1).any()

# Nearly half of the values in the `Age` field are missing.
def convert_age_to_bucket(x):
    """Map a raw age value to a coarse age-bucket label.

    Args:
        x: None for a missing age, otherwise a value accepted by `int()`.

    Returns:
        "unknown" for a missing age or one outside [5, 100]; otherwise one
        of "under 18", "18-24", "25-29", ..., "55-59", "60+".
    """
    if x is None:
        return "unknown"

    age = int(x)
    # Ages below 5 or above 100 are treated as invalid.
    if not (5 <= age <= 100):
        return "unknown"

    # (exclusive upper bound, label), scanned in ascending order.
    buckets = (
        (18, "under 18"),
        (25, "18-24"),
        (30, "25-29"),
        (35, "30-34"),
        (40, "35-39"),
        (45, "40-44"),
        (50, "45-49"),
        (55, "50-54"),
        (60, "55-59"),
    )
    for upper_bound, label in buckets:
        if age < upper_bound:
            return label
    return "60+"
# Replace each raw age with its bucket label.
df_users["Age"] = df_users["Age"].apply(convert_age_to_bucket)

# Sanity checks on the cleaned user table, one column at a time.
allowed_age_buckets = ["unknown", "under 18" ,"18-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60+"]
assert all(1 <= int(user_id) <= 278858 for user_id in list(df_users["User ID"]))
assert all(2 <= len(country) <= 45 for country in list(df_users["Location"]))
assert all(bucket in allowed_age_buckets for bucket in list(df_users["Age"]))

# Keep only the declared user columns (this drops the `location_check` helper).
df_users = df_users[user_fields]
print(df_users.head())
print('---------------------------------------------------------------')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line.
df_users.info()
print('---------------------------------------------------------------')
print(df_users.describe())
print('---------------------------------------------------------------')

In [None]:
# Read book info

book_fields = ["ISBN", "Book title", "Author", "Publication year", "Publisher"]
# Split only on a ";" that sits between two double quotes, so semicolons
# inside a quoted field are left alone.
pattern = re.compile(r'(?<=");(?=")')
with open(os.path.join(source_dir, "BX-Books.csv"), 'r', encoding='cp1252') as f:
    processed_list = [
        # Drop the surrounding quotes and any tab padding, then discard the
        # last three fields (the image URLs are not needed).
        [cell[1:-1].strip('\t') for cell in pattern.split(raw_line.strip())][:-3]
        for raw_line in f
    ]

# The first row is the CSV header.
del processed_list[0]
df_books = pd.DataFrame(processed_list, columns=book_fields)

# ISBN should only contain letters and digits; character_check returns 1
# for any value that contains something else.
df_books['ISBN_check'] = df_books['ISBN'].apply(character_check)
# Take an explicit copy: the filtered frame gets new column values assigned
# later, which on a bare slice triggers pandas' SettingWithCopyWarning.
df_books = df_books[df_books['ISBN_check'] == 0].copy()

# There are invalid publication years, i.e., "0"
def convert_publication_year(x):
    """Return `x` when it is exactly four characters long, else "unknown"."""
    if len(x) != 4:
        return "unknown"
    return x
df_books["Publication year"] = df_books["Publication year"].apply(convert_publication_year)

# A case-insensitive "n/a" placeholder becomes the shared "unknown" label.
for column in ("Publisher", "Author"):
    df_books[column] = df_books[column].apply(
        lambda value: "unknown" if value.lower() == "n/a" else value
    )

# Sanity checks on the cleaned book table, one column at a time.
assert all(len(isbn) == 10 for isbn in list(df_books["ISBN"]))
assert all(1 <= len(title) <= 256 for title in list(df_books["Book title"]))
assert all(1 <= len(author) <= 143 for author in list(df_books["Author"]))
assert all(
    year == "unknown" or len(year) == 4
    for year in list(df_books["Publication year"])
)
assert all(1 <= len(publisher) <= 134 for publisher in list(df_books["Publisher"]))

# Keep only the declared book columns (this drops the `ISBN_check` helper).
df_books = df_books[book_fields]
print(df_books.head())
print('---------------------------------------------------------------')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line.
df_books.info()
print('---------------------------------------------------------------')
print(df_books.describe())
print('---------------------------------------------------------------')

In [None]:
# Read ratings

with open(os.path.join(source_dir, "BX-Book-Ratings.csv"), 'r', encoding='cp1252') as f:
    # Each line is split on ";" and every field loses its first and last
    # character (the surrounding quotes).
    processed_list = [
        [cell[1:-1] for cell in raw_line.strip().split(';')]
        for raw_line in f
    ]

# The first row is the CSV header.
del processed_list[0]

df_ratings = pd.DataFrame(processed_list, columns=["User ID", "ISBN", "rating"])
print("Total number of ratings:", len(df_ratings))

In [None]:
# Attach user and book attributes to every rating. The inner joins drop
# ratings whose user or book is absent from the cleaned tables.
df_data = (
    df_ratings
    .merge(df_users, on=["User ID"], how="inner")
    .merge(df_books, on=["ISBN"], how="inner")
)

# Ratings were read as strings; keep only rows rated above 5.
df_data["rating"] = df_data["rating"].apply(int)
field_names = user_fields + book_fields
df_data = df_data.loc[df_data["rating"] > 5, field_names].reset_index(drop=True)

print("Total number after filtering:", len(df_data))
df_data.head()

In [None]:
# Encode the feature dict

def add_to_dict(dict, feature):
    """Assign `feature` the next free integer index unless it already has one.

    The index is the size of the mapping before insertion, so indices are
    handed out as 0, 1, 2, ... in first-seen order. The mapping is modified
    in place and nothing is returned.
    """
    dict.setdefault(feature, len(dict))

field_names = user_fields + book_fields

# One value -> index mapping per field; indices follow first appearance in
# df_data.
feature_dict = {}
for field in field_names:
    feature_dict[field] = {}
    for value in df_data[field]:
        add_to_dict(feature_dict[field], value)

user_feat_count = [len(feature_dict[name]) for name in user_fields]
item_feat_count = [len(feature_dict[name]) for name in book_fields]


# Treat user and book features differently: each group has its own offsets,
# where a field's offset is the total count of the fields before it.
user_feat_offset = [sum(user_feat_count[:pos]) for pos in range(len(user_feat_count))]
item_feat_offset = [sum(item_feat_count[:pos]) for pos in range(len(item_feat_count))]

separator = "---------------------------------------------------------------"
print(separator)
for name, count, offset in zip(user_fields, user_feat_count, user_feat_offset):
    print(name, count, offset)

print(separator)

for name, count, offset in zip(book_fields, item_feat_count, item_feat_offset):
    print(name, count, offset)
print(separator)


# Replace every raw value with its integer index.
for field in field_names:
    lookup = feature_dict[field]
    df_data[field] = df_data[field].apply(lambda value, lookup=lookup: lookup[value])

df_data.head()

In [None]:
# For every encoded ISBN, remember the encoded feature row of its first
# occurrence.
book_feat_dict = {}
for _, row in tqdm(df_data.iterrows()):
    isbn = row["ISBN"]
    if isbn in book_feat_dict:
        continue
    book_feat_dict[isbn] = [
        int(row["ISBN"]),
        int(row["Book title"]),
        int(row["Author"]),
        int(row["Publication year"]),
        int(row["Publisher"]),
    ]

# Lay the rows out as a list indexed by encoded ISBN.
book_feat_table = []
for book_idx in range(len(book_feat_dict)):
    book_feat_table.append(book_feat_dict[book_idx])
print(len(book_feat_table))

In [None]:
# Running list of books seen so far, per user.
user_history_dict = {"ID": {}}
for user in set(df_data["User ID"]):
    user_history_dict["ID"][user] = []

# Per-row snapshot of the user's history taken *before* the row's own book.
user_history_column = {"ID": []}

for _, row in tqdm(df_data.iterrows()):
    user_id = row["User ID"]
    book_id = row["ISBN"]
    seen_so_far = user_history_dict["ID"][user_id]
    user_history_column["ID"].append(list(seen_so_far))
    seen_so_far.append(book_id)

df_data["user history ID"] = user_history_column["ID"]

# Keep only rows whose history holds at least 5 books.
has_enough_history = df_data["user history ID"].apply(len) >= 5
df_data = df_data[has_enough_history].reset_index(drop=True)

# Labelled "7-core filtering" in the original notebook: keep only users that
# still have at least 3 rows left.
user_counter = df_data["User ID"].value_counts()
user_counter = user_counter[user_counter >= 3]
keep_user = df_data["User ID"].isin(user_counter.index)
df_data = df_data[keep_user].reset_index(drop=True)


df_data.head()

In [None]:
# Per user: the second-to-last row goes to validation, the last row to test,
# and every other row to training.
valid_idx, test_idx = [], []

for _, df_u in tqdm(df_data.groupby(["User ID"])):
    last_two = df_u.index[-2:]
    valid_idx.append(last_two[0])
    test_idx.append(last_two[-1])

valid_idx.sort()
test_idx.sort()
held_out = set(valid_idx) | set(test_idx)
train_idx = [pos for pos in range(len(df_data)) if pos not in held_out]

df_train, df_valid, df_test = (
    df_data.iloc[positions].reset_index(drop=True)
    for positions in (train_idx, valid_idx, test_idx)
)

In [None]:
train_num, valid_num, test_num = len(df_train), len(df_valid), len(df_test)
print("Num train/valid/test:", train_num, valid_num, test_num)

# Write each split as a gzip-compressed parquet file under target_dir.
for file_name, split_frame in (
    ("train.parquet.gz", df_train),
    ("valid.parquet.gz", df_valid),
    ("test.parquet.gz", df_test),
):
    split_frame.to_parquet(os.path.join(target_dir, file_name), compression="gzip")

df_train.head()

In [None]:
# For every encoded book, collect the users whose training rows mention it,
# either in the row's history or as the row's own book.
book_to_users = {book_idx: [] for book_idx in range(len(book_feat_dict))}

for _, row in tqdm(df_train.iterrows()):
    reader = row["User ID"]
    for isbn in [*row["user history ID"], row["ISBN"]]:
        book_to_users[isbn].append(reader)

# Deduplicate each user list and store the result as a list indexed by book.
book_to_users = [list(set(readers)) for readers in book_to_users.values()]

In [None]:
# Everything besides the tensors themselves: field names, per-field
# vocabulary sizes and offsets, the per-book feature rows, the raw-value ->
# index mappings, and the book -> users lists.
meta_data = {
    "user_fields": user_fields,
    "item_fields": book_fields,
    "user_feat_count": user_feat_count,
    "item_feat_count": item_feat_count,
    "user_feat_offset": user_feat_offset,
    "item_feat_offset": item_feat_offset,
    "book_feats_table": book_feat_table,
    "feature_dict": feature_dict,
    "item_to_users": book_to_users
}

# Use a context manager so the file is flushed and closed even if
# serialization fails; the bare open() previously leaked the handle.
with open(os.path.join(target_dir, "match-meta.json"), "w") as meta_file:
    json.dump(meta_data, meta_file, ensure_ascii=False)

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Stack the splits back in train -> valid -> test order so they can later be
# recovered by row ranges.
df_data = pd.concat([df_train, df_valid, df_test]).reset_index(drop=True)

user_X, item_X = [], []
for _, row in tqdm(df_data.iterrows()):
    user_X.append([row[name] for name in user_fields])
    item_X.append([row[name] for name in book_fields])
user_X = np.array(user_X)
item_X = np.array(item_X)

hist_ID = df_data["user history ID"].tolist()
hist_length = [len(history) for history in hist_ID]

# Only the most recent `max_hist_len` books of each history are kept; both
# tensors are padded to a common length by pad_sequence.
max_hist_len = 30
hist_ID = pad_sequence(
    [torch.tensor(history[-max_hist_len:]) for history in hist_ID],
    batch_first=True,
)
# One entry of 1 per kept history position.
hist_mask = pad_sequence(
    [torch.ones(min(length, max_hist_len)) for length in hist_length],
    batch_first=True,
)

for label, array in (
    ("user_X", user_X),
    ("item_X", item_X),
    ("hist_ID", hist_ID),
    ("hist_mask", hist_mask),
):
    print(label, array.shape)

In [None]:
import h5py

# Rows are laid out train -> valid -> test, so each split is a contiguous
# row range.
valid_start = train_num
test_start = train_num + valid_num

h5_datasets = (
    ("train user data", user_X[:valid_start, :]),
    ("valid user data", user_X[valid_start:test_start, :]),
    ("test user data", user_X[test_start:, :]),
    ("train item data", item_X[:valid_start, :]),
    ("valid item data", item_X[valid_start:test_start, :]),
    ("test item data", item_X[test_start:, :]),
    ("train history ID", hist_ID[:valid_start, :]),
    ("valid history ID", hist_ID[valid_start:test_start, :]),
    ("test history ID", hist_ID[test_start:, :]),
    ("train history mask", hist_mask[:valid_start, :]),
    ("valid history mask", hist_mask[valid_start:test_start, :]),
    ("test history mask", hist_mask[test_start:, :]),
)

with h5py.File(os.path.join(target_dir, "match.h5"), "w") as hf:
    for dataset_name, rows in h5_datasets:
        hf.create_dataset(dataset_name, data=rows)