In [1]:
import os
import json
from tqdm import tqdm
import pandas as pd
import random
import numpy as np

In [2]:
import random
def neg_sample(all_items, pos_items, n_sample=99, seed=42):
    """Draw ``n_sample`` distinct negative items for one user.

    Args:
        all_items: Candidate pool to sample from.
        pos_items: Items the user interacted with; they are excluded from the pool.
        n_sample: Number of negatives to return.
        seed: Seed of the private generator used for the draw. The default (42)
            reproduces the historical output of this function. Pass ``None`` to
            draw from the module-level ``random`` generator instead.

    Returns:
        A list of ``n_sample`` entries of ``all_items`` that are not in ``pos_items``.

    Raises:
        ValueError: If fewer than ``n_sample`` candidates remain after filtering.
    """
    pos_set = set(pos_items)
    neg_items = [item for item in all_items if item not in pos_set]
    if n_sample > len(neg_items):
        raise ValueError(
            f"cannot draw {n_sample} negatives from only {len(neg_items)} candidates"
        )
    # A private generator yields the same draw as reseeding the global one with
    # the same seed, but does not reset the random state of the rest of the notebook.
    rng = random if seed is None else random.Random(seed)
    return rng.sample(neg_items, k=n_sample)

# Games

In [3]:
# Load the product metadata and the review dump; both files hold one JSON object per line.
with open('../../datasets/games/meta_Video_Games.json') as f:
    metadata = [json.loads(line) for line in f]
with open('../../datasets/games/Video_Games_5.json') as f:
    reviews = [json.loads(line) for line in f]

# Collect the distinct reviewer ids and product asins that occur in the reviews.
users = set()
items = set()
for review in tqdm(reviews):
    users.add(review['reviewerID'])
    items.add(review['asin'])

# Give every asin a dense integer id. The asins are sorted first because the
# iteration order of a set of strings changes between interpreter runs (string
# hashing is randomized), which made the ids written to the CSVs unreproducible.
item2id = {asin: idx for idx, asin in enumerate(sorted(items))}
count = len(item2id)

# number of users, items, reviews, and the density of the user-item matrix
print(len(users), len(items), len(reviews), len(reviews) / (len(users) * len(items)))

100%|██████████| 497577/497577 [00:00<00:00, 1221352.18it/s]

55223 17408 497577 0.0005175966099616421





In [4]:
id_title = {}
id_item = {}
cnt = 0
# asin -> title lookup; entries whose title is missing or at most one character long are dropped.
for meta in tqdm(metadata):
    if len(meta.get('title', '')) > 1: # remove the item without title
        id_title[meta['asin']] = meta['title']

# Group the usable reviews per reviewer: parallel lists of items, ratings and timestamps.
users = dict()
for review in tqdm(reviews):
    user = review['reviewerID']
    # A review lacking a required field is skipped; it must not abort the whole pass.
    if 'asin' not in review:
        continue
    item = review['asin']
    if item not in id_title:
        continue
    # First-seen numbering of the items that have a usable title.
    if item not in id_item:
        id_item[item] = cnt
        cnt += 1
    if 'overall' not in review:
        continue
    if 'unixReviewTime' not in review:
        continue
    if user not in users:
        users[user] = {
            'items': [],
            'ratings': [],
            'timestamps': [],
            'reviews': [],
        }
    users[user]['items'].append(item)
    users[user]['ratings'].append(review['overall'])
    users[user]['timestamps'].append(review['unixReviewTime'])

100%|██████████| 84819/84819 [00:00<00:00, 1025397.89it/s]
100%|██████████| 497577/497577 [00:01<00:00, 335235.99it/s]


In [5]:
# Candidate pool for negative sampling: the first tab-separated field of every line.
# NOTE(review): this pool is later filtered against item *titles*, so the first
# column is presumably the title — confirm against the code that writes id2name.txt
# (no cell visible in this notebook creates that file).
with open("../../datasets/games/SeqRec/id2name.txt") as f:
    all_items = [row.split("\t")[0] for row in f.readlines()]

In [6]:
user_id = 0
interactions = []
B = []
for key in tqdm(users.keys()):
    per_user = users[key]
    # Order this user's events by timestamp. The triples are kept in `events`
    # so the builtin `all` is not shadowed for the rest of the notebook.
    events = sorted(
        zip(per_user['items'], per_user['ratings'], per_user['timestamps']),
        key=lambda x: int(x[-1]),
    )
    items, ratings, timestamps = (list(column) for column in zip(*events))
    item_ids = [item2id[x] for x in items]
    item_titles = [id_title[x] for x in items]
    per_user['items'] = items
    per_user['item_ids'] = item_ids
    per_user['item_titles'] = item_titles
    per_user['ratings'] = ratings
    per_user['timestamps'] = timestamps
    # One sample per target position i, with up to 10 preceding events as history.
    # Users with at most 10 events contribute a single sample (their last event).
    for i in range(min(10, len(items) - 1), len(items)):
        st = max(i - 10, 0)
        interactions.append([
            key,
            items[st: i], items[i],
            item_ids[st: i], item_ids[i],
            item_titles[st: i], item_titles[i],
            ratings[st: i], ratings[i],
            int(timestamps[i]),
        ])
print(len(interactions))

100%|██████████| 55223/55223 [00:03<00:00, 15970.65it/s]

149796





In [7]:
import csv

# Chronological split: order the samples by the timestamp of their target
# (last field), then cut off the newest 5000 for test and the 5000 before for validation.
interactions = sorted(interactions, key=lambda x: x[-1])
test_size = 5000
valid_size = 5000
train_size = len(interactions) - test_size - valid_size
if train_size < 0:
    # A negative size would silently produce wrong, overlapping slices below.
    raise ValueError(
        f"only {len(interactions)} interactions; need at least {test_size + valid_size}"
    )

csv_columns = ['user_id', 'item_asins', 'item_asin', 'history_item_id', 'item_id', 'history_item_title', 'item_title', 'history_rating', 'rating', 'timestamp']
split_rows = {
    'train': interactions[:train_size],
    'valid': interactions[train_size:(train_size+valid_size)],
    'test': interactions[(train_size+valid_size):],
}
for split_name, rows in split_rows.items():
    # newline='' is what the csv module requires of files handed to csv.writer;
    # utf-8 matches the default encoding pandas.read_csv uses to read them back.
    with open(f'../../datasets/games/SeqRec/{split_name}.csv', 'w', newline='', encoding='utf-8') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(csv_columns)
        csvwriter.writerows(rows)

In [8]:
def csv_to_json(input_path, output_path, sample=False):
    """Convert one split CSV into a JSON list of instruction-tuning records.

    Each CSV row becomes a dict with the keys ``"user id"``, ``"instruction"``,
    ``"input"`` (the quoted history titles) and ``"output"`` (the quoted target title).

    Args:
        input_path: CSV file with at least the columns ``user_id``,
            ``history_item_title`` (a Python list literal of titles) and ``item_title``.
        output_path: Destination JSON file. When ``sample`` is true the sampled
            rows are also saved by replacing the last five characters of this
            path with ``.csv``, so the path is expected to end in ``.json``.
        sample: If true, keep a random subset of at most 5000 rows
            (``random_state=42``) instead of the whole file.
    """
    import ast  # local import so the cell does not depend on a new top-level import

    data = pd.read_csv(input_path)
    if sample:
        # Cap at the number of rows available; asking for more raises in pandas.
        data = data.sample(n=min(5000, len(data)), random_state=42).reset_index(drop=True)
        data.to_csv(output_path[:-5] + ".csv", index=False)
    json_list = []
    for _, row in tqdm(data.iterrows()):
        # The history column holds a Python list literal; literal_eval parses it
        # without executing arbitrary expressions the way eval() would.
        titles = ast.literal_eval(row['history_item_title'])
        quoted_titles = ", ".join("\"" + title + "\"" for title in titles)
        history = "The user has played the following video games before:" + quoted_titles
        json_list.append({
            "user id" : row["user_id"],
            "instruction": "Given a list of video games the user has played before, please recommend a new video game that the user likes to the user.",
            "input": f"{history}\n ",
            "output": "\"" + str(row['item_title']) + "\"",
        })
    with open(output_path, 'w') as f:
        json.dump(json_list, f, indent=4)

In [9]:
# Convert every split CSV of the games dataset into its JSON counterpart.
path = "../../datasets/games/SeqRec"
for split in ('train', 'valid', 'test'):
    csv_to_json(f'{path}/{split}.csv', f'{path}/{split}.json')

139796it [00:23, 5837.35it/s]
5000it [00:00, 5124.16it/s]
5000it [00:00, 5842.95it/s]


In [10]:
# Attach 99 sampled negatives to every test record and rewrite test.json in place.
with open("../../datasets/games/SeqRec/test.json", "r", encoding="utf-8") as f:
    test = json.load(f)
test_ = []
random.seed(42)
for t in tqdm(test):
    # Everything the user ever interacted with (by title) is excluded from the pool.
    t["neg_samples"] = neg_sample(all_items, users[t["user id"]]["item_titles"], n_sample=99)
    test_.append(t)
# ensure_ascii=False emits raw non-ASCII characters, so the file encoding is
# pinned to utf-8 instead of depending on the platform's locale default.
with open("../../datasets/games/SeqRec/test.json", "w", encoding="utf-8") as f:
    json.dump(test_, f, indent=4, ensure_ascii=False)

100%|██████████| 5000/5000 [00:04<00:00, 1034.67it/s]


# Movies

In [11]:
import re

# Patterns that pull single fields out of a raw metadata line without parsing it as JSON.
asin_pattern = re.compile(r'"asin": "(.*?)"')
title_pattern = re.compile(r'"title": "(.*?)"')
brand_pattern = re.compile(r'"brand": "(.*?)"')

# Keep every raw line of the metadata file in memory.
with open('../../datasets/movies/meta_Movies_and_TV.json', 'r') as f:
    results = [raw_line for raw_line in tqdm(f)]

new_datas = []
for data in tqdm(results):
    # Title and brand are matched on a copy in which apostrophes are replaced by underscores.
    sanitized = data.replace("\'", "_")
    asin = asin_pattern.findall(data)
    title = title_pattern.findall(sanitized)
    brand_matches = brand_pattern.findall(sanitized)
    brand = brand_matches[0] if len(brand_matches) > 0 else None

    new_data = {}
    if len(asin) > 0:
        new_data["asin"] = asin[0]
    if len(title) > 0:
        # The brand, when one was found, is appended to the title as " - <brand>".
        new_data["title"] = title[0] if brand is None else title[0] + f" - {brand}"
    new_datas.append(new_data)

with open('../../datasets/movies/SeqRec/meta_movie_process.json', "w") as f:
    json.dump(new_datas, f)

203766it [00:00, 321763.59it/s]
100%|██████████| 203766/203766 [00:02<00:00, 100957.82it/s]


In [12]:
import json
from tqdm import tqdm
# Metadata produced by the extraction cell: a JSON list of dicts.
with open('../../datasets/movies/SeqRec/meta_movie_process.json') as f:
    metadata = json.load(f)

# Reviews are one JSON object per line; only the four fields used downstream are kept.
reviews = []
with open('../../datasets/movies/Movies_and_TV_5.json') as f:
    for line in tqdm(f):
        review = json.loads(line)
        review = {
            'reviewerID' : review['reviewerID'],
            "asin" : review["asin"],
            "overall" : review["overall"],
            "unixReviewTime" : review["unixReviewTime"]
        }
        reviews.append(review)

# Collect the distinct reviewer ids and product asins that occur in the reviews.
users = set()
items = set()
for review in tqdm(reviews):
    users.add(review['reviewerID'])
    items.add(review['asin'])

# Give every asin a dense integer id. The asins are sorted first because the
# iteration order of a set of strings changes between interpreter runs (string
# hashing is randomized), which made the ids written to the CSVs unreproducible.
item2id = {asin: idx for idx, asin in enumerate(sorted(items))}
count = len(item2id)

# number of users, items, reviews, and the density of the user-item matrix
print(len(users), len(items), len(reviews), len(reviews) / (len(users) * len(items)))

3410019it [00:28, 117939.13it/s]
100%|██████████| 3410019/3410019 [00:02<00:00, 1181681.65it/s]

297529 60175 3410019 0.00019046334058915956





In [13]:
id_title = {}
id_item = {}
cnt = 0
# asin -> title lookup. Records written by the extraction cell may lack "asin"
# or "title" when the pattern did not match, so both keys are checked.
for meta in tqdm(metadata):
    if "asin" in meta and "title" in meta and len(meta['title']) > 50: # remove movies with titles that are too short to be distinguished
        id_title[meta['asin']] = meta['title']

# Group the usable reviews per reviewer: parallel lists of items, ratings and timestamps.
users = dict()
for review in tqdm(reviews):
    user = review['reviewerID']
    # A review lacking a required field is skipped; it must not abort the whole pass.
    if 'asin' not in review:
        continue
    item = review['asin']
    if item not in id_title:
        continue
    # First-seen numbering of the items that have a usable title.
    if item not in id_item:
        id_item[item] = cnt
        cnt += 1
    if 'overall' not in review:
        continue
    if 'unixReviewTime' not in review:
        continue
    if user not in users:
        users[user] = {
            'items': [],
            'ratings': [],
            'timestamps': [],
            'reviews': []
        }
    users[user]['items'].append(item)
    users[user]['ratings'].append(review['overall'])
    users[user]['timestamps'].append(review['unixReviewTime'])

100%|██████████| 203766/203766 [00:00<00:00, 1810502.06it/s]
100%|██████████| 3410019/3410019 [00:05<00:00, 635319.76it/s] 


In [14]:
user_id = 0
interactions = []
B = []
for key in tqdm(users.keys()):
    per_user = users[key]
    # Order this user's events by timestamp. The triples are kept in `events`
    # so the builtin `all` is not shadowed for the rest of the notebook.
    events = sorted(
        zip(per_user['items'], per_user['ratings'], per_user['timestamps']),
        key=lambda x: int(x[-1]),
    )
    items, ratings, timestamps = (list(column) for column in zip(*events))
    item_ids = [item2id[x] for x in items]
    item_titles = [id_title[x] for x in items]
    per_user['items'] = items
    per_user['item_ids'] = item_ids
    per_user['item_titles'] = item_titles
    per_user['ratings'] = ratings
    per_user['timestamps'] = timestamps
    # One sample per target position i, with up to 10 preceding events as history.
    # Users with at most 10 events contribute at most one sample (their last event).
    for i in range(min(10, len(items) - 1), len(items)):
        st = max(i - 10, 0)
        # Samples whose history has fewer than 3 events are discarded.
        if i - st < 3:
            continue
        interactions.append([
            key,
            items[st: i], items[i],
            item_ids[st: i], item_ids[i],
            item_titles[st: i], item_titles[i],
            ratings[st: i], ratings[i],
            int(timestamps[i]),
        ])
print(len(interactions))

100%|██████████| 216367/216367 [00:05<00:00, 37205.47it/s]

114594





In [15]:
# Build item id -> title from every sample (history plus target) and report
# any id that shows up with two different titles.
id2name = {}
for data in tqdm(interactions):
    ids = data[3] + [data[4]]
    names = data[5] + [data[6]]
    # `item_id` rather than `id`, so the builtin id() is not shadowed in the kernel.
    for item_id, name in zip(ids, names):
        # setdefault stores the first title seen for an id and returns the stored one.
        if id2name.setdefault(item_id, name) != name:
            print("error")
            break

100%|██████████| 114594/114594 [00:00<00:00, 268371.49it/s]


In [16]:
# Re-key the mapping in alphabetical order of title and expose the titles as the
# candidate pool for negative sampling.
# NOTE(review): distinct ids that share a title leave repeated entries in
# all_items — confirm whether the pool is meant to be de-duplicated.
pairs_by_name = sorted(id2name.items(), key=lambda pair: pair[1])
id2name = dict(pairs_by_name)
all_items = [name for _, name in pairs_by_name]

In [17]:
# Re-number the distinct titles 0..n-1 in their current order and save the mapping.
processed_id2name = {}
processed_id2name_set = set([])
# Iterating the values avoids binding `_`, which IPython uses for the last cell output.
for name in tqdm(id2name.values()):
    if name not in processed_id2name_set:
        processed_id2name[len(processed_id2name)] = name
        processed_id2name_set.add(name)
# ensure_ascii=False emits raw non-ASCII characters, so the file encoding is
# pinned to utf-8 instead of depending on the platform's locale default.
with open("../../datasets/movies/SeqRec/id2name.json", "w", encoding="utf-8") as f:
    json.dump(processed_id2name, f, indent=4, ensure_ascii=False)

100%|██████████| 16233/16233 [00:00<00:00, 408190.27it/s]


In [18]:
import csv

# Chronological split: order the samples by the timestamp of their target
# (last field), then cut off the newest 5000 for test and the 5000 before for validation.
interactions = sorted(interactions, key=lambda x: x[-1])
test_size = 5000
valid_size = 5000
train_size = len(interactions) - test_size - valid_size
if train_size < 0:
    # A negative size would silently produce wrong, overlapping slices below.
    raise ValueError(
        f"only {len(interactions)} interactions; need at least {test_size + valid_size}"
    )

csv_columns = ['user_id', 'item_asins', 'item_asin', 'history_item_id', 'item_id', 'history_item_title', 'item_title', 'history_rating', 'rating', 'timestamp']
split_rows = {
    'train': interactions[:train_size],
    'valid': interactions[train_size:(train_size+valid_size)],
    'test': interactions[(train_size+valid_size):],
}
for split_name, rows in split_rows.items():
    # newline='' is what the csv module requires of files handed to csv.writer;
    # utf-8 matches the default encoding pandas.read_csv uses to read them back.
    with open(f'../../datasets/movies/SeqRec/{split_name}.csv', 'w', newline='', encoding='utf-8') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(csv_columns)
        csvwriter.writerows(rows)

In [19]:
def csv_to_json(input_path, output_path, sample=False):
    """Convert one split CSV into a JSON list of instruction-tuning records.

    Each CSV row becomes a dict with the keys ``"user id"``, ``"instruction"``,
    ``"input"`` (the quoted history titles) and ``"output"`` (the quoted target title).

    Args:
        input_path: CSV file with at least the columns ``user_id``,
            ``history_item_title`` (a Python list literal of titles) and ``item_title``.
        output_path: Destination JSON file. When ``sample`` is true the sampled
            rows are also saved by replacing the last five characters of this
            path with ``.csv``, so the path is expected to end in ``.json``.
        sample: If true, keep a random subset of at most 5000 rows
            (``random_state=42``) instead of the whole file.
    """
    import ast  # local import so the cell does not depend on a new top-level import

    data = pd.read_csv(input_path)
    if sample:
        # Cap at the number of rows available; asking for more raises in pandas.
        data = data.sample(n=min(5000, len(data)), random_state=42).reset_index(drop=True)
        data.to_csv(output_path[:-5] + ".csv", index=False)
    json_list = []
    for _, row in tqdm(data.iterrows()):
        # The history column holds a Python list literal; literal_eval parses it
        # without executing arbitrary expressions the way eval() would.
        titles = ast.literal_eval(row['history_item_title'])
        quoted_titles = ", ".join("\"" + title + "\"" for title in titles)
        history = "The user has watched the following movies and TVs before:" + quoted_titles
        json_list.append({
            "user id" : row["user_id"],
            # NOTE(review): "ad" looks like a typo for "and"; the text is kept verbatim
            # because it is part of the generated dataset — confirm before changing.
            "instruction": "Given a list of movies ad TVs the user has watched before, please recommend the title of a new movie or TV that the user will like to watch in the following time.",
            "input": f"{history}\n ",
            "output": "\"" + str(row['item_title']) + "\"",
        })
    with open(output_path, 'w') as f:
        json.dump(json_list, f, indent=4)

In [20]:
# Convert every split CSV of the movies dataset into its JSON counterpart.
path = "../../datasets/movies/SeqRec"
for split in ('train', 'valid', 'test'):
    csv_to_json(f'{path}/{split}.csv', f'{path}/{split}.json')

104594it [00:15, 6589.42it/s]
5000it [00:00, 6521.96it/s]
5000it [00:00, 7021.61it/s]


In [21]:
# Attach 99 sampled negatives to every test record and rewrite test.json in place.
with open("../../datasets/movies/SeqRec/test.json", "r", encoding="utf-8") as f:
    test = json.load(f)
test_ = []
random.seed(42)
for t in tqdm(test):
    # Everything the user ever interacted with (by title) is excluded from the pool.
    t["neg_samples"] = neg_sample(all_items, users[t["user id"]]["item_titles"], n_sample=99)
    test_.append(t)
# ensure_ascii=False emits raw non-ASCII characters, so the file encoding is
# pinned to utf-8 instead of depending on the platform's locale default.
with open("../../datasets/movies/SeqRec/test.json", "w", encoding="utf-8") as f:
    json.dump(test_, f, indent=4, ensure_ascii=False)

100%|██████████| 5000/5000 [00:05<00:00, 849.55it/s]


# Food

In [22]:
import json  
import re

# Patterns are applied to lines whose apostrophes were replaced by underscores,
# hence the `_key_: _value_` shape.
asin_pattern = re.compile(r"_asin_:\s*_(.*?)_")
title_pattern = re.compile(r"_title_:\s*_(.*?)_")
brand_pattern = re.compile(r"_brand_:\s*_(.*?)_")

# Keep every raw line of the metadata file in memory.
with open('../../datasets/food/meta_Grocery_and_Gourmet_Food.json', 'r') as f:
    results = [raw_line for raw_line in f]

new_datas = []
for data in results:
    sanitized = data.replace("\'", "_")
    asin = asin_pattern.findall(sanitized)
    title = title_pattern.findall(sanitized)
    brand_matches = brand_pattern.findall(sanitized)
    brand = brand_matches[0] if len(brand_matches) > 0 else None

    new_data = {}
    if len(asin) > 0:
        new_data["asin"] = asin[0]
    if len(title) > 0:
        new_data["title"] = title[0]
        # A non-empty brand is appended to the title as " - <brand>".
        if brand is not None and len(brand)>0:
            new_data["title"] += f" - {brand}"
    new_datas.append(new_data)

with open('../../datasets/food/SeqRec/meta_food_process.json', "w") as f:
    json.dump(new_datas, f)

In [23]:
import json
from tqdm import tqdm
# Metadata produced by the extraction cell: a JSON list of dicts.
with open('../../datasets/food/SeqRec/meta_food_process.json') as f:
    metadata = json.load(f)
# Reviews are stored as one JSON object per line.
with open('../../datasets/food/Grocery_and_Gourmet_Food_5.json') as f:
    reviews = [json.loads(line) for line in tqdm(f)]

# Collect the distinct reviewer ids and product asins that occur in the reviews.
users = set()
items = set()
for review in tqdm(reviews):
    users.add(review['reviewerID'])
    items.add(review['asin'])

# Give every asin a dense integer id. The asins are sorted first because the
# iteration order of a set of strings changes between interpreter runs (string
# hashing is randomized), which made the ids written to the CSVs unreproducible.
item2id = dict()
count = 0
for item in tqdm(sorted(items)):
    item2id[item] = count
    count += 1

# number of users, items, reviews, and the density of the user-item matrix
print(len(users), len(items), len(reviews), len(reviews) / (len(users) * len(items)))

151254it [00:02, 58705.39it/s]
100%|██████████| 151254/151254 [00:00<00:00, 1456889.07it/s]
100%|██████████| 8713/8713 [00:00<00:00, 1585671.49it/s]

14681 8713 151254 0.0011824519884614812





In [24]:
id_title = {}
id_item = {}
cnt = 0
# asin -> title lookup. Records written by the extraction cell may lack "asin"
# or "title" when the pattern did not match, so both keys are checked.
for meta in tqdm(metadata):
    if "asin" in meta and "title" in meta and len(meta['title']) > 1: # remove the item without title
        id_title[meta['asin']] = meta['title']

# Group the usable reviews per reviewer: parallel lists of items, ratings and timestamps.
users = dict()
for review in tqdm(reviews):
    user = review['reviewerID']
    # A review lacking a required field is skipped; it must not abort the whole pass.
    if 'asin' not in review:
        continue
    item = review['asin']
    if item not in id_title:
        continue
    # First-seen numbering of the items that have a usable title.
    if item not in id_item:
        id_item[item] = cnt
        cnt += 1
    if 'overall' not in review:
        continue
    if 'unixReviewTime' not in review:
        continue
    if user not in users:
        users[user] = {
            'items': [],
            'ratings': [],
            'timestamps': [],
            'reviews': []
        }
    users[user]['items'].append(item)
    users[user]['ratings'].append(review['overall'])
    users[user]['timestamps'].append(review['unixReviewTime'])

100%|██████████| 171760/171760 [00:00<00:00, 1623503.98it/s]
100%|██████████| 151254/151254 [00:00<00:00, 611163.92it/s]


In [25]:
user_id = 0
interactions = []
B = []
for key in tqdm(users.keys()):
    per_user = users[key]
    # Order this user's events by timestamp. The triples are kept in `events`
    # so the builtin `all` is not shadowed for the rest of the notebook.
    events = sorted(
        zip(per_user['items'], per_user['ratings'], per_user['timestamps']),
        key=lambda x: int(x[-1]),
    )
    items, ratings, timestamps = (list(column) for column in zip(*events))
    item_ids = [item2id[x] for x in items]
    item_titles = [id_title[x] for x in items]
    per_user['items'] = items
    per_user['item_ids'] = item_ids
    per_user['item_titles'] = item_titles
    per_user['ratings'] = ratings
    per_user['timestamps'] = timestamps
    # One sample per target position i, with up to 10 preceding events as history.
    # Users with at most 10 events contribute a single sample (their last event).
    for i in range(min(10, len(items) - 1), len(items)):
        st = max(i - 10, 0)
        interactions.append([
            key,
            items[st: i], items[i],
            item_ids[st: i], item_ids[i],
            item_titles[st: i], item_titles[i],
            ratings[st: i], ratings[i],
            int(timestamps[i]),
        ])
print(len(interactions))

100%|██████████| 14641/14641 [00:00<00:00, 45220.40it/s]

43293





In [26]:
# Build item id -> title from every sample (history plus target) and report
# any id that shows up with two different titles.
id2name = {}
for data in tqdm(interactions):
    ids = data[3] + [data[4]]
    names = data[5] + [data[6]]
    # `item_id` rather than `id`, so the builtin id() is not shadowed in the kernel.
    for item_id, name in zip(ids, names):
        # setdefault stores the first title seen for an id and returns the stored one.
        if id2name.setdefault(item_id, name) != name:
            print("error")
            break

100%|██████████| 43293/43293 [00:00<00:00, 284677.34it/s]


In [27]:
# Re-key the mapping in alphabetical order of title and expose the titles as the
# candidate pool for negative sampling.
# NOTE(review): distinct ids that share a title leave repeated entries in
# all_items — confirm whether the pool is meant to be de-duplicated.
pairs_by_name = sorted(id2name.items(), key=lambda pair: pair[1])
id2name = dict(pairs_by_name)
all_items = [name for _, name in pairs_by_name]

In [28]:
# Re-number the distinct titles 0..n-1 in their current order and save the mapping.
processed_id2name = {}
# Titles already emitted; a set makes the membership test constant-time instead
# of scanning all values of the dict for every name.
seen_names = set()
# Iterating the values avoids binding `_`, which IPython uses for the last cell output.
for name in tqdm(id2name.values()):
    if name not in seen_names:
        processed_id2name[len(processed_id2name)] = name
        seen_names.add(name)
# ensure_ascii=False emits raw non-ASCII characters, so the file encoding is
# pinned to utf-8 instead of depending on the platform's locale default.
with open("../../datasets/food/SeqRec/id2name.json", "w", encoding="utf-8") as f:
    json.dump(processed_id2name, f, indent=4, ensure_ascii=False)

100%|██████████| 7069/7069 [00:00<00:00, 13731.60it/s]


In [29]:
import csv

# Chronological split: order the samples by the timestamp of their target
# (last field), then use the newest 10% for test and the 10% before for validation.
interactions = sorted(interactions, key=lambda x: x[-1])
test_size = round(len(interactions)*0.1)
valid_size = round(len(interactions)*0.1)
train_size = len(interactions) - test_size - valid_size

csv_columns = ['user_id', 'item_asins', 'item_asin', 'history_item_id', 'item_id', 'history_item_title', 'item_title', 'history_rating', 'rating', 'timestamp']
split_rows = {
    'train': interactions[:train_size],
    'valid': interactions[train_size:(train_size+valid_size)],
    'test': interactions[(train_size+valid_size):],
}
for split_name, rows in split_rows.items():
    # newline='' is what the csv module requires of files handed to csv.writer;
    # utf-8 matches the default encoding pandas.read_csv uses to read them back.
    with open(f'../../datasets/food/SeqRec/{split_name}.csv', 'w', newline='', encoding='utf-8') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(csv_columns)
        csvwriter.writerows(rows)

In [30]:
def csv_to_json(input_path, output_path, sample=False):
    """Convert one split CSV into a JSON list of instruction-tuning records.

    Each CSV row becomes a dict with the keys ``"user id"``, ``"instruction"``,
    ``"input"`` (the quoted history titles) and ``"output"`` (the quoted target title).

    Args:
        input_path: CSV file with at least the columns ``user_id``,
            ``history_item_title`` (a Python list literal of titles) and ``item_title``.
        output_path: Destination JSON file. When ``sample`` is true the sampled
            rows are also saved by replacing the last five characters of this
            path with ``.csv``, so the path is expected to end in ``.json``.
        sample: If true, keep a random subset of at most 5000 rows
            (``random_state=42``) instead of the whole file.
    """
    import ast  # local import so the cell does not depend on a new top-level import

    data = pd.read_csv(input_path)
    if sample:
        # Cap at the number of rows available; asking for more raises in pandas.
        data = data.sample(n=min(5000, len(data)), random_state=42).reset_index(drop=True)
        data.to_csv(output_path[:-5] + ".csv", index=False)
    json_list = []
    for _, row in tqdm(data.iterrows()):
        # The history column holds a Python list literal; literal_eval parses it
        # without executing arbitrary expressions the way eval() would.
        titles = ast.literal_eval(row['history_item_title'])
        quoted_titles = ", ".join("\"" + title + "\"" for title in titles)
        history = "The user has purchased the following food before:" + quoted_titles
        json_list.append({
            "user id" : row["user_id"],
            "instruction": "Given a list of food the user has purchased before, please recommend the name of a new food that the user will like to purchase in the following time.",
            "input": f"{history}\n ",
            "output": "\"" + str(row['item_title']) + "\"",
        })
    with open(output_path, 'w') as f:
        json.dump(json_list, f, indent=4)

In [31]:
# Convert every split CSV of the food dataset into its JSON counterpart.
path = "../../datasets/food/SeqRec"
for split in ('train', 'valid', 'test'):
    csv_to_json(f'{path}/{split}.csv', f'{path}/{split}.json')

34635it [00:05, 5995.58it/s]
4329it [00:01, 4140.50it/s]
4329it [00:00, 7250.85it/s]


In [32]:
# Attach 99 sampled negatives to every test record and rewrite test.json in place.
with open("../../datasets/food/SeqRec/test.json", "r", encoding="utf-8") as f:
    test = json.load(f)
test_ = []
random.seed(42)
for t in tqdm(test):
    # Everything the user ever interacted with (by title) is excluded from the pool.
    t["neg_samples"] = neg_sample(all_items, users[t["user id"]]["item_titles"], n_sample=99)
    test_.append(t)
# ensure_ascii=False emits raw non-ASCII characters, so the file encoding is
# pinned to utf-8 instead of depending on the platform's locale default.
with open("../../datasets/food/SeqRec/test.json", "w", encoding="utf-8") as f:
    json.dump(test_, f, indent=4, ensure_ascii=False)

100%|██████████| 4329/4329 [00:02<00:00, 1660.50it/s]
