In [1]:
import os 
import gzip 
import re

import pandas as pd
from tqdm import tqdm
import numpy as np
import ujson

In [2]:
def _remove_char(value):
    l_temp = []
    
    value = re.sub("</?span[^>]*>", "", str(value)) # remove special tag token </span>
    value = re.sub(
        '[\[\],!.;#$^*\_——<>/=%&?@"&\'-:]', ' ', str(value))
    l_temp = [i for i in value.split()]
    return l_temp


def _remove_dup(l):
    """ Remove duplicated words, first remove front ones. """
    l_temp = []

    i = len(l) - 1
    while i >= 0:
        l[i] = l[i].lower()
        if l[i] not in l_temp:
            l_temp.append(l[i])
        i = i - 1

    l_temp.reverse()
    return l_temp

def get_df(path, keep_columns=None):
    """ Apply raw data to pandas DataFrame. """
    i = 0
    df = {}
    g = gzip.open(path, 'rb')
    for line in g:
        df[i] = ujson.loads(line)
        i += 1
    if keep_columns != None:
        assert type(keep_columns) == list
        return pd.DataFrame.from_dict(df, orient='index')[keep_columns]
    else:
        return pd.DataFrame.from_dict(df, orient='index')

In [3]:
in_dir = "/home/jupyter/unity_jointly_rec_and_search/datasets/amazon_review_datasets/"
dataset_name = "Cell_Phones_and_Accessories"
meta_path = os.path.join(in_dir, f"meta_{dataset_name}.json.gz")
review_path = os.path.join(in_dir, f"{dataset_name}.json.gz")

# preprocess meta_df
meta_df = get_df(meta_path)

meta_df.fillna('unknown', inplace=True)
replace_pattern = re.escape(";+/\.?")
replace_pattern = f"[{replace_pattern}\n]+"
meta_df['title'] = meta_df['title'] \
    .str.replace("<\w+>", " ", regex=True) \
    .str.replace("</\w+>", " ", regex=True) \
    .str.strip() \
    .str.replace("\t", " ") \
    .str.replace("\n", " ") 
meta_df['description'] = meta_df['description'].apply(lambda x: " ".join(x))
meta_df['description'] = meta_df['description'] \
    .str.replace("<\w+>", " ", regex=True) \
    .str.replace("</\w+>", " ", regex=True) \
    .str.strip() \
    .str.replace("\t", " ") \
    .str.replace("\n", " ") 

meta_df = meta_df.drop_duplicates(subset="asin")
meta_df = meta_df[meta_df["category"].apply(lambda x: len(x) > 1)]

asin_to_pid = {asin: pid for pid, asin in enumerate(meta_df["asin"].unique())}
asin_to_categories = {asin: categories for asin, categories in zip(meta_df.asin, meta_df.category)}
valid_asins = set(asin_to_categories.keys())
asin_to_query = {}
for asin, categories in asin_to_categories.items():
    qs = map(_remove_char, categories)
    qs = [x for q in qs for x in q]
    query = " ".join(_remove_dup(qs))
    asin_to_query[asin] = query

pid_to_title = {}
pid_to_description = {}
aid_to_viewpids = {}
aid_to_buypids = {}
for idx, row in meta_df.iterrows():
    pid_to_title[asin_to_pid[row.asin]] = row.title
    pid_to_description[asin_to_pid[row.asin]] = row.description
    
    valid_view = valid_asins.intersection(set(row.also_view))
    if len(valid_view) > 0:
        aid_to_viewpids[asin_to_pid[row.asin]] = set(asin_to_pid[asin] for asin in list(valid_view))
        
    valid_buy = valid_asins.intersection(set(row.also_buy))
    if len(valid_buy) > 0:
        aid_to_buypids[asin_to_pid[row.asin]] = set(asin_to_pid[asin] for asin in list(valid_buy))
        
assert len(pid_to_title) == len(pid_to_description) == len(meta_df)

aid_to_only_buypids = {}
for aid, buypids in tqdm(aid_to_buypids.items(), total=len(aid_to_buypids)):
    if aid in aid_to_viewpids:
        only_buypids = buypids.difference(aid_to_viewpids[aid])
        aid_to_only_buypids[aid] = only_buypids
    else:
        aid_to_only_buypids[aid] = buypids

print("number of aids has viewpids, buypids = {:,}, {:,}".format(len(aid_to_viewpids), len(aid_to_buypids)))
print("viewpids, buypids, only_buypids' average lengths are = {:.3f}, {:.3f}, {:.3f}".format(
    np.mean([len(xs) for xs in aid_to_viewpids.values()]), np.mean([len(xs) for xs in aid_to_buypids.values()]),
    np.mean([len(xs) for xs in aid_to_only_buypids.values()])
))

# preprocess review_df
review_df = get_df(review_path, keep_columns=["reviewerID", "asin", "unixReviewTime", "vote", "reviewText"])
review_df = review_df[review_df["asin"].isin(valid_asins)]
review_df["query"] = review_df["asin"].apply(lambda x: asin_to_query[x])
review_df = review_df[review_df.groupby('query')['query'].transform('size') >= 3] 

review_df["pid"] = review_df["asin"].apply(lambda x: asin_to_pid[x])
review_df["vote"] = review_df["vote"].fillna(value="0")
review_df["vote"] = review_df["vote"].apply(lambda x: int(x.replace(',','')))
query_to_qid = {query: qid + len(asin_to_pid) for qid, query in enumerate(review_df["query"].unique())}
review_df["qid"] = review_df["query"].apply(lambda x: query_to_qid[x])

user_review_df = review_df[review_df.groupby('reviewerID')['reviewerID'].transform('size') >= 5] \
                .reset_index(drop=True).sort_values(by=["reviewerID", "unixReviewTime"])
user_to_uid = {user:uid for uid, user in enumerate(user_review_df.reviewerID.unique())}
user_review_df["uid"] = user_review_df["reviewerID"].apply(lambda x: user_to_uid[x])



100%|██████████| 27030/27030 [00:00<00:00, 783811.32it/s]


number of aids has viewpids, buypids = 30,029, 27,030
viewpids, buypids, only_buypids' average lengths are = 5.063, 4.727, 4.227


In [4]:
train_user_review_df = []
test_user_review_df = []
for i, group in tqdm(user_review_df.groupby("uid")):
    assert len(group) >= 5
    train_user_review_df.append(group.iloc[:-1])
    test_user_review_df.append(group.iloc[[-1]])

train_user_review_df = pd.concat(train_user_review_df).reset_index(drop=True)
test_user_review_df = pd.concat(test_user_review_df).reset_index(drop=True)
assert len(train_user_review_df) + len(test_user_review_df) == len(user_review_df)
assert len(train_user_review_df.uid.unique()) == len(test_user_review_df.uid.unique())

pid_to_review = {}
for pid, group in train_user_review_df.groupby("pid"):
    assert pid == group.pid.iloc[0]
    if all(group.vote == 0):
        review = np.random.choice(group.reviewText, size=1)[0]
    else:
        review = group.reviewText.iloc[np.argmax(group.vote)]
    pid_to_review[pid] = review

print("number of train_user_review, test_user_review = {:,}, {:,}".format(len(train_user_review_df), len(test_user_review_df)))
print("number of unique users = {:,}".format(len(user_review_df.uid.unique())))
print("total number of pids = {:,}, number of pid has review = {:,}".format(len(pid_to_title), len(pid_to_review)))
print("number of interacted (query, item) pairs in review_df = {:,}, in user_review_df = {:,}".format(
    len(review_df.pid.unique()), len(user_review_df.pid.unique())
))

100%|██████████| 215338/215338 [00:46<00:00, 4597.05it/s]


number of train_user_review, test_user_review = 1,368,611, 215,338
number of unique users = 215,338
total number of pids = 533,871, number of pid has review = 217,162
number of interacted (query, item) pairs in review_df = 532,969, in user_review_df = 239,464


In [5]:
# for unified_kgc train and test
from collections import defaultdict
import random
random.seed(4680)

aid_to_simpids = aid_to_viewpids
aid_to_complpids = aid_to_only_buypids

# simpids
val_test_aids = random.sample(aid_to_simpids.keys(), int(0.2*len(aid_to_simpids)))
val_aids = val_test_aids[:int(0.5*len(val_test_aids))]
test_aids = val_test_aids[int(0.5*len(val_test_aids)):]
train_aid_to_simpids, val_aid_to_simpids, test_aid_to_simpids = {}, {}, {}
for aid, simpids in tqdm(aid_to_simpids.items(), total=len(aid_to_simpids)):
    if aid in val_aids:
        val_aid_to_simpids[aid] = simpids
    elif aid in test_aids:
        test_aid_to_simpids[aid] = simpids
    else:
        train_aid_to_simpids[aid] = simpids
        
# complpids
val_test_aids = random.sample(aid_to_complpids.keys(), int(0.2*len(aid_to_complpids)))
val_aids = val_test_aids[:int(0.5*len(val_test_aids))]
test_aids = val_test_aids[int(0.5*len(val_test_aids)):]
train_aid_to_complpids, val_aid_to_complpids, test_aid_to_complpids = {}, {}, {}
for aid, complpids in tqdm(aid_to_complpids.items(), total=len(aid_to_complpids)):
    if aid in val_aids:
        val_aid_to_complpids[aid] = complpids
    elif aid in test_aids:
        test_aid_to_complpids[aid] = complpids
    else:
        train_aid_to_complpids[aid] = complpids
        
# relpids
qid_to_relpids = defaultdict(set)
for qid, pid in zip(review_df.qid, review_df.pid):
    qid_to_relpids[qid].add(pid)
    
qid_pids_pairs = list(qid_to_relpids.items())
random.shuffle(qid_pids_pairs)
train_qid_to_relpids = {qid: pids for qid, pids in qid_pids_pairs[:int(0.8*len(qid_pids_pairs))]}
val_qid_to_relpids = {qid: pids for qid, pids in qid_pids_pairs[int(0.8*len(qid_pids_pairs)): int(0.9*len(qid_pids_pairs))]}
test_qid_to_relpids = {qid: pids for qid, pids in qid_pids_pairs[int(0.9*len(qid_pids_pairs)):]}

# check
assert len( set(train_aid_to_simpids.keys()) & set(val_aid_to_simpids.keys()) & set(test_aid_to_simpids.keys()) ) == 0
assert len( set(train_aid_to_complpids.keys()) & set(val_aid_to_complpids.keys()) & set(test_aid_to_complpids.keys())) == 0
assert len( set(train_qid_to_relpids.keys()) & set(val_qid_to_relpids.keys()) & set(test_qid_to_relpids.keys())) == 0


print("unique qids, pids for search, average length = {:,}, {:,}, {:.3f}".format(
    len(train_qid_to_relpids), 
    len(set([x for xs in train_qid_to_relpids.values() for x in xs])),
    np.mean([len(xs) for xs in qid_to_relpids.values()])))
print("unique aids, pids for sim_rec, average length = {:,}, {:,}, {:.3f}".format(
    len(train_aid_to_simpids), 
    len(set([x for xs in aid_to_simpids.values() for x in xs])),
    np.mean([len(xs) for xs in aid_to_simpids.values()])))
print("unique aids, pids for compl_rec, average length = {:,}, {:,}, {:.3f}".format(
    len(train_aid_to_complpids), 
    len(set([x for xs in aid_to_complpids.values() for x in xs])),
    np.mean([len(xs) for xs in aid_to_complpids.values()])))

unique_relpids = set([x for xs in train_qid_to_relpids.values() for x in xs])
unique_simpids = set([x for xs in aid_to_simpids.values() for x in xs]).union(set(train_aid_to_simpids.keys()))
unique_complpids = set([x for xs in aid_to_complpids.values() for x in xs]).union(set(train_aid_to_complpids.keys()))
print("relpids and simpids intersect rate = {:.3f}".format(
    len(unique_relpids.intersection(unique_simpids)) / len(unique_relpids.union(unique_simpids))
))
print("relpids and complpids intersect rate = {:.3f}".format(
    len(unique_relpids.intersection(unique_complpids)) / len(unique_relpids.union(unique_complpids))
))
print("simpids and complpids intersect rate = {:.3f}".format(
   len( unique_simpids.intersection(unique_complpids)) / len(unique_simpids.union(unique_complpids))
))

# for user train and test
test_user_qid_to_relpids = defaultdict(set)
excluded_train_qid_to_relpids = {}
for i, row in test_user_review_df.iterrows():
    qid, pid = int(row.qid), int(row.pid)
    test_user_qid_to_relpids[qid].add(pid)
for qid, relpids in train_qid_to_relpids.items():
    if qid in test_user_qid_to_relpids:
        excluded_train_qid_to_relpids[qid] = relpids.difference(test_user_qid_to_relpids[qid])
    else:
        excluded_train_qid_to_relpids[qid] = relpids
        
print("total number of gt pairs for train_qid_to_relpids and excluded_qid_to_relpids = {:,}, {:,}".format(
    sum([len(xs) for xs in train_qid_to_relpids.values()]), sum([len(xs) for xs in excluded_train_qid_to_relpids.values()])
))
print("unique qids for for train_qid_to_relpids and excluded_qid_to_relpids = {:,}, {:,}".format(
    len(train_qid_to_relpids.keys()), len(excluded_train_qid_to_relpids.keys())
))
train_qid_to_relpids = excluded_train_qid_to_relpids
excluded_train_qid_to_relpids = None

100%|██████████| 30029/30029 [00:05<00:00, 5839.56it/s]
100%|██████████| 27030/27030 [00:03<00:00, 7168.70it/s]


unique qids, pids for search, average length = 593, 524,434, 718.287
unique aids, pids for sim_rec, average length = 24,024, 19,252, 5.063
unique aids, pids for compl_rec, average length = 21,624, 14,195, 4.227
relpids and simpids intersect rate = 0.062
relpids and complpids intersect rate = 0.050
simpids and complpids intersect rate = 0.347
total number of gt pairs for train_qid_to_relpids and excluded_qid_to_relpids = 524,434, 446,338
unique qids for for train_qid_to_relpids and excluded_qid_to_relpids = 593, 593


In [7]:
# write to disk
import os 
import copy
import pickle as pkl

import re
space_pattern = re.compile(r'\s+')

pid_to_title = {pid: title.strip() for pid, title in pid_to_title.items()}
query_to_qid = {query.strip(): qid for query, qid in query_to_qid.items()}
clean_pid_to_review = {}
for pid, review in pid_to_review.items():
    if type(review) == float:
        continue
    clean_pid_to_review[pid] = re.sub(space_pattern, " ", review)
    clean_pid_to_review[pid] = re.sub('<[^<]+?>', " ", clean_pid_to_review[pid])[:1000]
pid_to_review = clean_pid_to_review
clean_pid_to_review = None

def add_query_prefix(text):
    return "query: " + text

def add_product_prefix(text):
    return "product: " + text

out_dir = os.path.join(in_dir, dataset_name)
if not os.path.exists(out_dir):
    os.mkdir(out_dir)
    
with open(os.path.join(out_dir, "collection_title.tsv"), "w") as fout:
    for pid, title in pid_to_title.items():
        title = add_product_prefix(title)
        fout.write(f"{pid}\t{title}\n")

with open(os.path.join(out_dir, "colletion_title_review.tsv"), "w") as fout:
    for pid, title in pid_to_title.items():
        text = title
        if pid in pid_to_review:
            review = pid_to_review[pid]
            text = text + " ; " + review 
        text = add_product_prefix(text)
        fout.write(f"{pid}\t{text}\n")
        
        
with open(os.path.join(out_dir, "product.jsonl"), "w") as fout:
        for pid, title in pid_to_title.items():
            text = title
            if pid in pid_to_review:
                review = pid_to_review[pid]
                text = text + " ; " + review
            text = add_product_prefix(text)
            example = {"id": pid, "contents": text}
            fout.write(ujson.dumps(example) + "\n")
            
with open(os.path.join(out_dir, "all_queries.tsv"), "w") as fout:
    for query, qid in query_to_qid.items():
        query = add_query_prefix(query)
        fout.write(f"{qid}\t{query}\n")
            
with open(os.path.join(out_dir, "all_entities.tsv"), "w") as fout:
    for pid, title in pid_to_title.items():
        text = title
        if pid in pid_to_review:
            review = pid_to_review[pid]
            text = text + " ; " + review
        text = add_product_prefix(text)
        fout.write(f"{pid}\t{text}\n")
    for query, qid in query_to_qid.items():
        query = add_query_prefix(query)
        fout.write(f"{qid}\t{query}\n")
        
with open(os.path.join(out_dir, "asin_to_pid.pkl"), "wb") as fout:
    pkl.dump(asin_to_pid, fout)

with open(os.path.join(out_dir, "query_to_qid.pkl"), "wb") as fout:
    pkl.dump(query_to_qid, fout)
    
fn_to_data = {
    "train_aid_to_simpids.pkl": train_aid_to_simpids,
    "val_aid_to_simpids.pkl": val_aid_to_simpids,
    "test_aid_to_simpids.pkl": test_aid_to_simpids,
    
    "train_aid_to_complpids.pkl": train_aid_to_complpids,
    "val_aid_to_complpids.pkl": val_aid_to_complpids,
    "test_aid_to_complpids.pkl": test_aid_to_complpids,
    
    "train_qid_to_relpids.pkl": train_qid_to_relpids,
    "val_qid_to_relpids.pkl": val_qid_to_relpids,
    "test_qid_to_relpids.pkl": test_qid_to_relpids,
    
    "train_user_review_df.pkl": train_user_review_df,
    "test_user_review_df.pkl": test_user_review_df,
}

for fn, data in fn_to_data.items():
    fn = os.path.join(out_dir, fn)
    with open(fn, "wb") as fout:
        pkl.dump(data, fout)

In [None]:
for fn in os.listdir(out_dir):
    if not fn.endswith("tsv"):
        continue
    
    fn = os.path.join(out_dir, fn)
    ! wc -l $fn
    ! head -n 3 $fn
    ! tail -n 3 $fn 
    print("="*75)

In [None]:
pid_to_review[16741]