In [3]:
import pandas as pd
import numpy as np
import json
import os
import re
import random
import copy
from transformers import set_seed
import hashlib
import json
import pickle as pkl
import h5py

# Fix the random seed up front (set_seed comes from `transformers`) so the
# shuffles performed later in the notebook are reproducible.
set_seed(42)

# Dataset locations: raw CSV dumps are read from `source_dir`, every processed
# artifact is written to `target_dir`.
dataset_name = "BookCrossing"
root = f"../data/{dataset_name}"
source_dir = os.path.join(root, "raw_data")
target_dir = os.path.join(root, "proc_data")
# Later cells open files inside `target_dir` for writing; create the directory
# here so a fresh checkout does not fail on the first dump.
os.makedirs(target_dir, exist_ok=True)

In [2]:
from string import ascii_letters, digits

def character_check(item, special_letters=""):
    """Flag whether ``str(item)`` contains a character outside the allowed set.

    The allowed set is the ASCII letters, the digits and every character of
    ``special_letters``.

    Args:
        item: Value to inspect; it is converted with ``str`` first.
        special_letters: Extra characters that are also accepted.

    Returns:
        1 if at least one character is not allowed, otherwise 0 (including
        for an empty string).
    """
    allowed = ascii_letters + digits + special_letters
    has_disallowed = any(ch not in allowed for ch in str(item))
    return int(has_disallowed)

In [3]:
# Read user info

user_fields = ["User ID", "Location", "Age"]
# A field is either the bare token NULL or a double-quoted string; the lazy
# match ends at the first quote that is not preceded by a backslash. re.S lets
# a quoted field span line breaks.
pattern = re.compile(r'NULL|".*?(?<!\\)"', re.S)
users_csv_path = os.path.join(source_dir, "BX-Users.csv")
with open(users_csv_path, 'r', encoding='cp1252') as f:
    raw_text = f.read()

# Unwrap the quotes of every field; NULL becomes None.
content = [None if token == 'NULL' else token[1:-1] for token in pattern.findall(raw_text)]
# Three consecutive fields form one record; the first record (the CSV header)
# is dropped.
processed_list = list(np.array(content).reshape((-1, 3)))[1:]
df_users = pd.DataFrame(processed_list, columns=user_fields)

# The `Location` field contains messy text/encoded characters (or is completely empty), so we keep only the country.
# E.g., ['&#37073;&#24030;&#26159;, &#20013;&#22269;&#27827;&#21335;&#30465;&#37073;&#24030;&#24066;, china', 
#        'philippine science high school - cmc, mcc main stadium, sagadan, tubod, lanao del norte, philippines', 
#        '6.a.4.a.6.a`4.a, 6.a.4.a.6.a`4.a.6.a`4.a.6.a.4.a.6.a`4.aoe6.a`4.a -- 6.a.4.a.6.a`4.aoe6.a`4.a ã??, ä¸\xadå?½']
def convert_location_to_country(x):
    """Reduce a raw ``Location`` string to a normalized country name.

    Only the text after the last ``", "`` is kept. It is title-cased, stripped
    of ``!`` and of leading/trailing commas and periods, and then run through
    an ordered list of alias rules (covering accented and mis-decoded
    spellings). Values that are empty, a single character long, or that
    contain one of a few marker substrings become ``"unknown"``.

    The alias rules are evaluated strictly in order and every rule is tried,
    so a later rule may overwrite the result of an earlier one.

    NOTE(review): some targets look questionable -- the Baden-Württemberg
    aliases map to "German" (not "Germany") and "Kärnten" maps to "Algeria".
    They are kept as-is because the md5 check on ``df_users`` in this notebook
    pins the current output; confirm before changing them.

    Args:
        x: Raw location string.

    Returns:
        The normalized country name, or ``"unknown"``.
    """
    x = x.split(', ')[-1].strip().title().replace("!", "").strip()

    # Common spellings of the two most frequent abbreviations.
    lowered = x.lower()
    if lowered in ("usa", "us", "u s", "u s a"):
        x = "USA"
    elif lowered in ("uk", "u k"):
        x = "UK"

    # Remove stray commas/periods from both ends.
    x = x.strip(",.")

    if "U.S" in x.upper() and x != "U.S. Virgin Islands":
        x = "USA"

    # (exact aliases, optional extra predicate, resulting country)
    alias_rules = (
        (("San José", "San Josï¿½"), None, "USA"),
        (("España", "Castilla-León", "Espaã±A", "Cataluña", "Mérida", "Álava", "Málaga", "A Coruña", "Barcelonès", "Berguedà",
          "Espaï¿½A", "Castilla-Leï¿½N", "A Coruï¿½A", "Cataluï¿½A", "Barcelonï¿½S", "Ï¿½Lava", "Mï¿½Rida", "Berguedï¿½", "Mï¿½Laga"),
         lambda s: "spain" in s.lower(), "Spain"),
        (("L`Italia",), None, "Italy"),
        (("Baden-Württemberg", "Bademn Würtemberg", "Baden-Wï¿½Rttemberg", "Bademn Wï¿½Rtemberg"), None, "German"),
        (("Cote D`Ivoire", "Côte D", "Cï¿½Te D"), None, "Ivory Coast"),
        (("Oberösterreich", "Oberï¿½Sterreich"), None, "Austria"),
        (("México", "Mï¿½Xico"), None, "Mexico"),
        (("Türkiye", "Içel", "Tï¿½Rkiye"), None, "Turkey"),
        (("L`Algérie", "Algérie", "Kärnten", "Kï¿½Rnten", "L`Algï¿½Rie", "Algï¿½Rie"), None, "Algeria"),
        ((), lambda s: "Brasil" in s, "Brazil"),
        (("Rhône-Alpes", "Rhône Alpes", "Rhï¿½Ne-Alpes", "Rhï¿½Ne Alpes"), None, "France"),
        ((), lambda s: "Greece" in s, "Greece"),
        (("Santarém", "Santarï¿½M"), None, "Portugal"),
        (("Länsi-Suomen Lääni", "Lï¿½Nsi-Suomen Lï¿½Ï¿½Ni"), None, "Finland"),
        (("V.Götaland", "Nyhamnsläge", "V.Gï¿½Taland", "Nyhamnslï¿½Ge"), None, "Sweden"),
        (("Moçambique", "Moï¿½Ambique"), None, "Mozambique"),
        (("Ix Región", "Ix Regiï¿½N"), None, "Chile"),
        (("Maï¿½Opolskie", "Ma³Opolskie"), None, "Poland"),
        (("Perï¿½", "Perãº"), None, "Peru"),
        # Re-assigning "China" to a value that already is "China" is a no-op,
        # so no separate guard for that case is needed.
        (("La Chine Éternelle", "La Chine Ï¿½Ternelle"), lambda s: "china" in s.lower(), "China"),
        (("Ï¿½Ï¿½Ï¿½",), None, "China"),
    )
    for exact_names, extra_match, country in alias_rules:
        if x in exact_names or (extra_match is not None and extra_match(x)):
            x = country

    # Whatever still looks like garbage is collapsed into a single bucket.
    unusable_names = ("Öð¹Ú", "ºþäï", "We`Re Global", "Ï¿½Ï¿½Ï¿½Ï¿½", "Iï¿½El")
    garbage_markers = ("N/A", "&#", "?", "@", "*")
    if (not x
            or x in unusable_names
            or len(x) == 1
            or any(marker in x for marker in garbage_markers)):
        x = "unknown"
    return x
# Normalize every location, then flag any value that still contains a
# character other than ASCII letters, digits or one of "- .&/()".
df_users["Location"] = df_users["Location"].apply(convert_location_to_country)
allowed_punctuation = "- .&/()"
df_users["location_check"] = df_users["Location"].apply(
    lambda country: character_check(country, special_letters=allowed_punctuation)
)

# No normalized location may be flagged.
offending_locations = df_users.loc[df_users["location_check"] == 1, "Location"]
assert len(offending_locations) == 0

# Nearly half of the values in the `Age` field are missing.
def convert_age_to_bucket(x):
    """Map a raw age value to a coarse age-bucket label.

    Args:
        x: ``None`` for a missing age, otherwise anything ``int()`` accepts.

    Returns:
        ``"unknown"`` for a missing age or one outside 5..100 (inclusive),
        ``"under 18"`` for 5..17, a five-year label such as ``"25-29"`` for
        18..59 (``"18-24"`` spans seven years), and ``"60+"`` for 60..100.
    """
    if x is None:
        return "unknown"

    age = int(x)
    # Ages below 5 or above 100 are treated as invalid.
    if not 5 <= age <= 100:
        return "unknown"

    # (exclusive upper bound, label), scanned in ascending order.
    buckets = (
        (18, "under 18"),
        (25, "18-24"),
        (30, "25-29"),
        (35, "30-34"),
        (40, "35-39"),
        (45, "40-44"),
        (50, "45-49"),
        (55, "50-54"),
        (60, "55-59"),
    )
    for upper_bound, label in buckets:
        if age < upper_bound:
            return label
    return "60+"
df_users["Age"] = df_users["Age"].apply(convert_age_to_bucket)

# Per-field sanity checks on the cleaned user table, in field order.
valid_age_buckets = ["unknown", "under 18" ,"18-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60+"]
for user_id in list(df_users["User ID"]):
    assert 1 <= int(user_id) <= 278858
for location in list(df_users["Location"]):
    assert 2 <= len(location) <= 45
for age_bucket in list(df_users["Age"]):
    assert age_bucket in valid_age_buckets

# Keep only the declared fields (this drops the helper `location_check` column).
df_users = df_users[user_fields]

# Fingerprint the cleaned table so any change in the preprocessing is detected.
users_payload = json.dumps(df_users.values.tolist(), sort_keys=True)
md5_hash = hashlib.md5(users_payload.encode('utf-8')).hexdigest()
print("df_users", md5_hash)
df_users.head(20)
assert md5_hash == "7cc0a4c37b494183233b02fcead5ea1b"

df_users 7cc0a4c37b494183233b02fcead5ea1b


Unnamed: 0,User ID,Location,Age
0,1,USA,unknown
1,2,USA,18-24
2,3,Russia,unknown
3,4,Portugal,under 18
4,5,United Kingdom,unknown
5,6,USA,60+
6,7,USA,unknown
7,8,Canada,unknown
8,9,USA,unknown
9,10,Spain,25-29


In [4]:
# Read book info

book_fields = ["ISBN", "Book title", "Author", "Publication year", "Publisher"]
# Split only on a ';' that sits directly between a closing and an opening
# double quote, so semicolons inside a quoted value are left alone.
pattern = re.compile(r'(?<=");(?=")')
processed_list = []
with open(os.path.join(source_dir, "BX-Books.csv"), 'r', encoding='cp1252') as f:
    for line in f:
        quoted_cells = pattern.split(line.strip())
        # The last three cells are image URLs and are not needed; the rest are
        # unwrapped from their quotes and trimmed of tab characters.
        split_line = [cell[1:-1].strip('\t') for cell in quoted_cells[:-3]]
        processed_list.append(split_line)
# The first parsed line is the header row.
del processed_list[0]
df_books = pd.DataFrame(processed_list, columns=book_fields)

# ISBN should only contain letters and digits.
df_books['ISBN_check'] = df_books['ISBN'].apply(character_check)
# Take an explicit copy of the filtered rows: the code below assigns columns on
# `df_books`, and doing that on a bare boolean-mask selection triggers pandas'
# SettingWithCopyWarning (the write may land on a temporary object).
df_books = df_books[df_books['ISBN_check'] == 0].copy()

# There are invalid publication years, i.e., "0"
def convert_publication_year(x):
    """Return ``x`` unchanged when it is exactly four characters long, else ``"unknown"``."""
    if len(x) != 4:
        return "unknown"
    return x
df_books["Publication year"] = df_books["Publication year"].apply(convert_publication_year)

# A literal "n/a" (in any casing) in Publisher/Author is treated as missing.
for na_field in ("Publisher", "Author"):
    df_books[na_field] = df_books[na_field].apply(lambda x: "unknown" if x.lower() == "n/a" else x)

# Per-field validity rules, checked in the order of `book_fields`.
field_validators = {
    "ISBN": lambda s: len(s) == 10,
    "Book title": lambda s: 1 <= len(s) <= 256,
    "Author": lambda s: 1 <= len(s) <= 143,
    "Publication year": lambda s: s == "unknown" or len(s) == 4,
    "Publisher": lambda s: 1 <= len(s) <= 134,
}
for field in book_fields:
    is_valid = field_validators[field]
    for s in list(df_books[field]):
        assert is_valid(s)

# Keep only the declared fields (this drops the helper `ISBN_check` column).
df_books = df_books[book_fields]
separator = '---------------------------------------------------------------'
print(df_books.head())
print(separator)
print(df_books.info())
print(separator)
print(df_books.describe())
print(separator)
# Fingerprint the cleaned table so any change in the preprocessing is detected.
books_payload = json.dumps(df_books.values.tolist(), sort_keys=True)
md5_hash = hashlib.md5(books_payload.encode('utf-8')).hexdigest()
print("df_books", md5_hash)
assert md5_hash == "bd070d039ad5d48228c0f256e61f381e"

df_books bd070d039ad5d48228c0f256e61f381e


In [5]:
# Encode features

from tqdm import tqdm

def add_to_dict(dict, feature):
    """Give ``feature`` the next free index in ``dict`` unless it already has one.

    A new feature receives ``len(dict)`` as measured before the insertion;
    an existing entry is left untouched. The mapping is modified in place
    and nothing is returned.
    """
    dict.setdefault(feature, len(dict))

all_fields = user_fields + book_fields
# One vocabulary (value -> index in order of first appearance) per field.
feature_dict = {name: {} for name in all_fields}

# User ID -> [Location, Age]; the first row seen for an ID wins.
user_dict = {}
for user_id, location, age in zip(df_users["User ID"], df_users["Location"], df_users["Age"]):
    user_dict.setdefault(user_id, [location, age])

# ISBN -> [Book title, Author, Publication year, Publisher]; first row wins.
book_dict = {}
for isbn_key, *book_attrs in zip(*(df_books[name] for name in book_fields)):
    book_dict.setdefault(isbn_key, book_attrs)

# Every field has its own vocabulary, so filling them column by column yields
# the same indices as walking the tables row by row.
for frame, frame_fields in ((df_users, user_fields), (df_books, book_fields)):
    for name in frame_fields:
        vocabulary = feature_dict[name]
        for value in frame[name]:
            add_to_dict(vocabulary, value)

feature_count = [len(feature_dict[name]) for name in all_fields]

# Each vocabulary must contain exactly the distinct values of its column.
for frame, frame_fields in ((df_users, user_fields), (df_books, book_fields)):
    for field in frame_fields:
        print(field, len(feature_dict[field]))
        assert len(feature_dict[field]) == len(set(list(frame[field])))


User ID 278858
Location 922
Age 11
ISBN 271375
Book title 242152
Author 102027
Publication year 116
Publisher 16807


In [None]:
import json
# Persist the ISBN -> book-attribute mapping. The context manager closes (and
# therefore flushes) the file deterministically instead of leaving the handle
# to the garbage collector.
with open(os.path.join(target_dir, "book_dict.json"), "w") as book_dict_file:
    json.dump(book_dict, book_dict_file, indent=4)

In [6]:
# Read ratings

processed_list = []
with open(os.path.join(source_dir, "BX-Book-Ratings.csv"), 'r', encoding='cp1252') as f:
    for line in f:
        # Fields are ';'-separated; the first and last character of each
        # field (the surrounding quotes) are cut off.
        processed_list.append([cell[1:-1] for cell in line.strip().split(';')])
# The first row holds the column names.
column_list = processed_list.pop(0)


# Per-user parallel lists: rated ISBNs, raw ratings, and binary labels
# (1 when the rating is at least 5).
user_hist, hist_rating, labels = {}, {}, {}
known_users, known_isbns = feature_dict["User ID"], feature_dict["ISBN"]
for user, isbn, rating in processed_list:
    # Skip ratings whose user or book did not survive the cleaning above.
    if user not in known_users or isbn not in known_isbns:
        continue
    score = int(rating)
    user_hist.setdefault(user, []).append(isbn)
    hist_rating.setdefault(user, []).append(score)
    labels.setdefault(user, []).append(int(score >= 5))

In [7]:
# Generate and shuffle data samples in DataFrame format

data_list = []

# Drop every user who rated 5 or fewer books.
user_del = [user for user, hist in user_hist.items() if len(hist) <= 5]

print("Number of users deleted:", len(user_del))

# The three per-user stores share the same keys; remove the user from each.
for user in user_del:
    for per_user_store in (user_hist, hist_rating, labels):
        del per_user_store[user]


Number of users deleted: 74392


In [8]:
for user in user_hist.keys():
    # Shuffle the three parallel lists together; the seed is reset before
    # every shuffle.
    interactions = list(zip(user_hist[user], hist_rating[user], labels[user]))
    set_seed(42)
    random.shuffle(interactions)
    shuffled_isbns, shuffled_ratings, shuffled_labels = (list(column) for column in zip(*interactions))
    user_hist[user], hist_rating[user], labels[user] = shuffled_isbns, shuffled_ratings, shuffled_labels

    # The last shuffled interaction is the prediction target; everything
    # before it forms the user's history.
    isbn = shuffled_isbns[-1]
    data_sample = copy.deepcopy(
        [user] + user_dict[user]
        + [isbn] + book_dict[isbn]
        + [shuffled_isbns[:-1], shuffled_ratings[:-1], shuffled_labels[-1], shuffled_ratings[-1]]
    )
    data_list.append(data_sample)

print(len(data_list))


17714


In [None]:
# History-length statistics. One interaction per user is held out as the
# target, hence the `- 1`.
cnt = 0
count = {}
for user, hist in user_hist.items():
    hist_len = len(hist) - 1
    cnt += hist_len
    count[hist_len] = count.get(hist_len, 0) + 1
# Average over the users that are actually present rather than a hard-coded
# user count; guard against an empty table.
print("avg", cnt / len(user_hist) if user_hist else 0.0)

print("*"*50)
print("Hist lens / Number of users")
for hist_len in sorted(count):
    print(hist_len, count[hist_len])

In [9]:
# Shuffle the samples with a fixed seed, then wrap them in a DataFrame whose
# columns follow the layout of each sample list.
set_seed(42)
random.shuffle(data_list)
sample_columns = user_fields + book_fields + ["user_hist", "hist_rating" , "labels", "rating"]
df_data = pd.DataFrame(data=data_list, columns=sample_columns)
print(f"Total number of samples: {len(df_data)}")

df_data.head(20)

Total number of samples: 17714


Unnamed: 0,User ID,Location,Age,ISBN,Book title,Author,Publication year,Publisher,user_hist,hist_rating,labels,rating
0,226267,USA,unknown,0743417682,From a Buick 8,Stephen King,2003,Pocket Books,"[051513287X, 0515131229, 0312982518, 067103818...","[6, 0, 9, 6, 10, 6, 7, 9, 8, 7, 10]",1,7
1,57863,USA,40-44,0812558626,Briar Rose,Jane Yolen,1993,Tor Books,"[0316955124, 0380727501, 0609809547, 038071089...","[0, 0, 0, 0, 4, 8]",0,0
2,95636,USA,30-34,1551669005,Parting Gifts,Charlotte Allen,2002,Mira,"[0515125490, 0385264267, 0440240913, 067102136...","[8, 0, 0, 8, 7]",0,0
3,51481,Germany,unknown,0373034709,To Marry A Stranger (Enchanted Brides) (Harle...,Renee Roszel,1997,Harlequin,"[3404131215, 0373029373, 037311513X, 037316171...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
4,190147,Italy,unknown,8807700948,L'amico del pazzo a altri racconti (I Canguri/...,Marco Drago,1998,Feltrinelli,"[0375700498, 8408002511, 8807102005, 089471796...","[0, 0, 0, 0, 0, 0]",0,0
5,229041,USA,unknown,0609803697,Freedom from Asthma: The Revolutionary 5-Day T...,Alexander Stalmatski,1999,Three Rivers Press (CA),"[0451513355, 0449202917, 0393321096, 055321041...","[9, 10, 9, 5, 6, 7, 0, 9, 8, 9, 10, 0]",0,4
6,272961,USA,50-54,0312978901,Born Evil (Claremont Studies in the Philosophy...,Adrian Havill,2001,St. Martin's True Crime Classics,"[0671027387, 0345455207, 0312995423, 081251528...","[7, 9, 6, 0, 0, 6, 0, 6, 9]",0,0
7,140997,Germany,40-44,3596220742,Die wunderbaren Jahre.,Reiner Kunze,1978,"Fischer (Tb.), Frankfurt","[3499134705, 3442420024, 3492226965, 351836987...","[6, 0, 7, 0, 7]",1,9
8,202260,Spain,unknown,8479534850,Sabiduria Interior,Louise L. Hay,2002,Ediciones Urano,"[840208348X, 2253012726, 0805029516, 058253701...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
9,39147,USA,unknown,0821768859,Champion of the Heart (Zebra Historical Romanc...,Laurel O'Donnell,2001,Kensington Publishing Corporation,"[0425185222, 0373710992, 037371162X, 055325800...","[0, 6, 8, 0, 0]",0,0


In [10]:
# Save train/test in parquet format

# First 90% of the (already shuffled) samples -> train, remainder -> test.
split_at = int(0.9 * len(df_data))
df_train = df_data.iloc[:split_at].reset_index(drop=True)
df_test = df_data.iloc[split_at:].reset_index(drop=True)

print(f"Train num: {len(df_train)}")
print(f"Test num: {len(df_test)}")

for split_frame, file_name in ((df_train, "train.parquet.gz"), (df_test, "test.parquet.gz")):
    split_frame.to_parquet(os.path.join(target_dir, file_name), compression="gzip")

Train num: 15942
Test num: 1772


In [11]:
# Re-read for sanity check

train_dataset = pd.read_parquet(os.path.join(target_dir, "train.parquet.gz"))
test_dataset = pd.read_parquet(os.path.join(target_dir, "test.parquet.gz"))


def _assert_round_trip(expected, reloaded, fields):
    """Assert that ``reloaded`` matches ``expected`` row by row on ``fields``.

    Also asserts that no string value of ``expected`` contains a tab.

    Args:
        expected: The in-memory DataFrame that was written.
        reloaded: The DataFrame read back from disk.
        fields: Column names to compare.

    Raises:
        AssertionError: On a length mismatch, a tab inside a string value, or
            any differing value.
    """
    # zip() stops at the shorter input, so a truncated file would otherwise
    # pass unnoticed; compare the lengths first.
    assert len(expected) == len(reloaded), (len(expected), len(reloaded))
    for (_, a1), (_, a2) in zip(expected.iterrows(), reloaded.iterrows()):
        for field in fields:
            assert not isinstance(a1[field], str) or "\t" not in a1[field]
            assert a1[field] == a2[field], (field, a1[field], a2[field])


checked_fields = user_fields + book_fields + ["labels"]
_assert_round_trip(df_train, train_dataset, checked_fields)
_assert_round_trip(df_test, test_dataset, checked_fields)

In [12]:
# Save the meta data for CTR

field_names = user_fields + book_fields

# Vocabulary size of every field, in `field_names` order.
feature_count = [len(feature_dict[field]) for field in field_names]

# feature_offset[i] is the summed vocabulary size of all fields before field i.
feature_offset = [0]
for c in feature_count[:-1]:
    feature_offset.append(feature_offset[-1] + c)

for field, n_features in zip(field_names, feature_count):
    print(field, n_features)

meta_data = {
    'field_names': field_names,
    'feature_count': feature_count,
    'feature_dict': feature_dict,
    'feature_offset': feature_offset,
    # presumably the 11 rating levels 0..10 -- TODO confirm against the consumer
    'num_ratings': 11
}

# Use a context manager so the file is closed (and flushed) before later
# cells read it back.
with open(os.path.join(target_dir, 'ctr-meta.json'), 'w') as meta_file:
    json.dump(meta_data, meta_file)

User ID 278858
Location 922
Age 11
ISBN 271375
Book title 242152
Author 102027
Publication year 116
Publisher 16807


In [None]:
# Reload the two artifacts written earlier; every file is opened through a
# context manager so no handle is left open.
with open(os.path.join(target_dir, 'book_dict.json')) as book_dict_file:
    book_dict = json.load(book_dict_file)
with open(os.path.join(target_dir, 'ctr-meta.json')) as meta_file:
    meta_data = json.load(meta_file)

# ISBN -> integer id, and the inverse id -> [ISBN, *book attributes].
isbn2id = meta_data['feature_dict']['ISBN']
id2book = {book_id: [isbn] + book_dict[isbn] for isbn, book_id in isbn2id.items()}

# json.dump writes the integer keys of `id2book` as strings.
with open(os.path.join(target_dir, 'id2book.json'), "w") as id2book_file:
    json.dump(id2book, id2book_file, indent=4)
with open(os.path.join(target_dir, 'isbn2id.json'), "w") as isbn2id_file:
    json.dump(isbn2id, isbn2id_file, indent=4)

In [13]:
# Convert df_data to CTR data via feature_dict

# Encode every sample: one vocabulary index per field (ctr_X) plus the binary
# label (ctr_Y).
ctr_X, ctr_Y = [], []
for idx, row in df_data.iterrows():
    ctr_X.append([feature_dict[field][row[field]] for field in field_names])
    ctr_Y.append(int(row["labels"]))


ctr_X = np.array(ctr_X)
ctr_Y = np.array(ctr_Y)
print("ctr_X", ctr_X.shape)
print("ctr_Y", ctr_Y.shape)
feature_count_np = np.array(feature_count).reshape(1, -1)
# Every index must lie in [0, vocabulary size of its field). The upper bound
# is strict: an index equal to the vocabulary size is already out of range.
assert ((ctr_X >= 0) & (ctr_X < feature_count_np)).all()
# Labels are strictly binary.
assert np.isin(ctr_Y, (0, 1)).all()

ctr_X (17714, 8)
ctr_Y (17714,)


In [15]:
# Per-sample history: rated ISBNs, their ratings, and the history length.
history_column = {}

history_column["ID"] = df_data['user_hist'].tolist()
history_column["rating"] = df_data['hist_rating'].tolist()
history_column["hist length"] = [len(x) for x in history_column["rating"]]

# Same 90/10 boundary as the train/test parquet split.
train_num = int(0.9 * len(ctr_X))

user_seq = {
    "history ID": {
        "train": history_column["ID"][:train_num],
        "test": history_column["ID"][train_num:],
    },
    "history rating": {
        "train": history_column["rating"][:train_num],
        "test": history_column["rating"][train_num:],
    },
    "history length": {
        "train": history_column["hist length"][:train_num],
        "test": history_column["hist length"][train_num:],
    },
}

# Close the file deterministically. With ensure_ascii=False the text is written
# verbatim, so pin the encoding instead of relying on the platform default.
with open(os.path.join(target_dir, "user_seq.json"), "w", encoding="utf-8") as user_seq_file:
    json.dump(user_seq, user_seq_file, ensure_ascii=False)



In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Histories are cut to their last MAX_HIST_LEN entries before padding.
MAX_HIST_LEN = 60

# `user_seq["history ID"]` holds raw ISBN strings, which torch.tensor cannot
# encode. Translate them to the integer ids of the ISBN vocabulary, i.e. the
# same ids used for the ISBN column of ctr_X.
isbn_to_index = feature_dict["ISBN"]

user_seq_trunc = {
    "history ID": {},
    "history rating": {},
    "history mask": {},
}

for hist_name in user_seq:
    for split in user_seq[hist_name]:
        sequences = user_seq[hist_name][split]
        if hist_name == "history length":
            # One 1.0 per kept history entry; padded positions stay 0.
            mask_rows = [torch.ones(min(x, MAX_HIST_LEN)) for x in sequences]
            user_seq_trunc["history mask"][split] = pad_sequence(mask_rows, batch_first=True)
            continue
        if hist_name == "history ID":
            rows = [
                torch.tensor([isbn_to_index[isbn_str] for isbn_str in x[-MAX_HIST_LEN:]], dtype=torch.long)
                for x in sequences
            ]
        else:
            rows = [torch.tensor(x[-MAX_HIST_LEN:]) for x in sequences]
        # Sequences are padded with 0 at the end; 0 is also a valid id/rating,
        # so the "history mask" is what tells real entries from padding.
        user_seq_trunc[hist_name][split] = pad_sequence(rows, batch_first=True)

# Plain-list copies of the padded tensors, plus a shape report.
md5_user_seq_trunc = {}
for hist_name in user_seq_trunc:
    md5_user_seq_trunc[hist_name] = {}
    for split in user_seq_trunc[hist_name]:
        md5_user_seq_trunc[hist_name][split] = user_seq_trunc[hist_name][split].tolist()
        print(hist_name, split, user_seq_trunc[hist_name][split].shape)

In [None]:
# Save CTR data

ctr_h5_path = os.path.join(target_dir, 'ctr.h5')
# Same 90/10 boundary as the train/test parquet split.
n_train = int(0.9 * len(ctr_X))

with h5py.File(ctr_h5_path, 'w') as hf:
    hf.create_dataset('train data', data=ctr_X[:n_train, :])
    hf.create_dataset('test data', data=ctr_X[n_train:, :])
    hf.create_dataset('train label', data=ctr_Y[:n_train])
    hf.create_dataset('test label', data=ctr_Y[n_train:])
    for hist_name in user_seq_trunc:
        for split in user_seq_trunc[hist_name]:
            # Hand h5py a NumPy array explicitly rather than a torch tensor.
            hf.create_dataset(f"{split} {hist_name}", data=user_seq_trunc[hist_name][split].numpy())


# Read everything back and compare element-wise. A check of the form
# `(a - b).sum() == 0` can pass when positive and negative differences cancel,
# so exact array equality is used instead.
with h5py.File(ctr_h5_path, 'r') as hf:
    assert np.array_equal(ctr_X, np.concatenate([hf['train data'][:], hf['test data'][:]], axis=0))
    assert np.array_equal(ctr_Y, np.concatenate([hf['train label'][:], hf['test label'][:]], axis=0))
    for hist_name in user_seq_trunc:
        for split in user_seq_trunc[hist_name]:
            assert np.array_equal(user_seq_trunc[hist_name][split].numpy(), hf[f"{split} {hist_name}"][:])

    expected_by_name = (
        ('train data', ctr_X[:n_train, :]),
        ('test data', ctr_X[n_train:, :]),
        ('train label', ctr_Y[:n_train]),
        ('test label', ctr_Y[n_train:]),
    )
    for dataset_name_h5, expected in expected_by_name:
        x = hf[dataset_name_h5][:]
        assert np.array_equal(x, expected)
        print(f'{dataset_name_h5}: {x.shape}')



In [None]:
# Final check: ensure each row of the parquet files matches the corresponding row in ctr.h5

train_dataset = pd.read_parquet(os.path.join(target_dir, 'train.parquet.gz'))
test_dataset = pd.read_parquet(os.path.join(target_dir, 'test.parquet.gz')).reset_index(drop=True)


with h5py.File(os.path.join(target_dir, 'ctr.h5'), 'r') as hf:
    train_x, train_y = hf['train data'][:], hf['train label'][:]
    test_x, test_y = hf['test data'][:], hf['test label'][:]

# For both splits, re-encode every parquet row with `feature_dict` and compare
# it with the row stored at the same position in ctr.h5.
for dataset, encoded_x, encoded_y in ((train_dataset, train_x, train_y), (test_dataset, test_x, test_y)):
    for idx, row in dataset.iterrows():
        for fi, field in enumerate(field_names):
            assert feature_dict[field][row[field]] == encoded_x[idx, fi]
        assert int(row["labels"]) == encoded_y[idx]

print("Pass final check.")

print("Pass final check.")