In [1]:
import json
import os
import pandas as pd

# Games

In [47]:
# Root folder of the Amazon Video Games data; the cells of this section read their
# inputs from it and write their outputs below f'{path}/CTRPre'.
path = "../../datasets/games"

In [2]:
# Create the output folder for the processed files; exist_ok=True makes re-running harmless.
# NOTE(review): the folder is spelled out literally here and not derived from `path`,
# so the two must be kept in sync by hand.
os.makedirs("../../datasets/games/CTRPre", exist_ok=True)

In [48]:
import json
from tqdm import tqdm
# Both inputs are read as JSON Lines: one JSON object per line.
with open(f'{path}/meta_Video_Games.json', encoding='utf-8') as f:
    metadata = [json.loads(line) for line in f]
with open(f'{path}/Video_Games_5.json', encoding='utf-8') as f:
    reviews = [json.loads(line) for line in f]

# Distinct reviewers and distinct products that occur in the review file.
users = set()
items = set()
for review in tqdm(reviews):
    users.add(review['reviewerID'])
    items.add(review['asin'])

# Number the items in sorted ASIN order. Iterating the set directly follows
# string-hash order, which differs from one interpreter process to the next,
# so the integer ids written to the CSV files would change on every run.
item2id = {asin: idx for idx, asin in enumerate(sorted(items))}
count = len(item2id)

# users, items, reviews, and the share of user-item pairs that have a review
print(len(users), len(items), len(reviews), len(reviews) / (len(users) * len(items)))

100%|██████████| 497577/497577 [00:00<00:00, 1487319.73it/s]

55223 17408 497577 0.0005175966099616421





In [49]:
# ASIN -> title for every product whose metadata has a usable title.
id_title = {}
for meta in tqdm(metadata):
    # The membership test tolerates metadata rows that carry no "title" key at all.
    if "title" in meta and len(meta['title']) > 1: # remove the item without title
        id_title[meta['asin']] = meta['title']

# id_item: dense index, handed out in first-seen order, for each reviewed ASIN that has a title.
id_item = {}
cnt = 0
# users: reviewerID -> parallel lists ('items', 'ratings', 'timestamps') of that reviewer's
# interactions; the 'reviews' list is created but never filled in this cell.
users = dict()
for review in tqdm(reviews):
    user = review['reviewerID']
    if 'asin' not in review:
        # Skip only this record: leaving the loop here would silently drop every later review.
        continue
    item = review['asin']
    if item not in id_title:
        continue
    if item not in id_item:
        id_item[item] = cnt
        cnt += 1
    if 'overall' not in review:
        continue
    if 'unixReviewTime' not in review:
        continue
    if user not in users:
        users[user] = {
            'items': [],
            'ratings': [],
            'timestamps': [],
            'reviews': [],
        }
    users[user]['items'].append(item)
    # Binary label: 1 when the integer part of the star rating is above 3, else 0.
    users[user]['ratings'].append(int(int(review['overall']) > 3))
    users[user]['timestamps'].append(review['unixReviewTime'])

100%|██████████| 84819/84819 [00:00<00:00, 1010792.97it/s]
100%|██████████| 497577/497577 [00:01<00:00, 455162.29it/s]


In [50]:
import csv

# Dump the ASIN -> title lookup as a two-column CSV.
# newline='' is what the csv module expects of files handed to csv.writer; without it
# platforms that translate line endings emit a blank line after every row.
with open(f'{path}/CTRPre/item_mapping.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['item_id', 'item_name'])
    # Each (asin, title) pair becomes one row; no loop variable named `id` is left
    # behind to shadow the builtin.
    writer.writerows(id_title.items())

In [51]:
user_id = 0  # not used below; kept so the notebook namespace stays as before
B = []       # not used below; kept so the notebook namespace stays as before
interactions = []
for key in tqdm(users.keys()):
    record = users[key]
    # Put this user's reviews in chronological order. sorted() is stable, so reviews
    # sharing a timestamp keep their original relative order. The result is not bound
    # to the name `all`, which would shadow the builtin for the rest of the session.
    history = sorted(
        zip(record['items'], record['ratings'], record['timestamps']),
        key=lambda entry: int(entry[-1]),
    )
    items, ratings, timestamps = (list(column) for column in zip(*history))
    record['items'] = items
    record['item_ids'] = [item2id[x] for x in items]
    record['item_titles'] = [id_title[x] for x in items]
    record['ratings'] = ratings
    record['timestamps'] = timestamps
    item_ids = record['item_ids']
    item_titles = record['item_titles']
    # Users with at most 10 reviews contribute one sample (their last review as target);
    # longer histories contribute one sample per review from index 10 onwards.
    # The history window is the (up to) 10 reviews right before the target, and it is
    # empty for a user with a single review.
    for i in range(min(10, len(items) - 1), len(items)):
        st = max(i - 10, 0)
        interactions.append([
            key,
            items[st: i], items[i],
            item_ids[st: i], item_ids[i],
            item_titles[st: i], item_titles[i],
            ratings[st: i], ratings[i],
            int(timestamps[i]),
        ])
print(len(interactions))
# Global chronological order by the timestamp of the target review (last field).
sequential_interaction_list = sorted(interactions, key=lambda x: x[-1])

100%|██████████| 55223/55223 [00:03<00:00, 17028.48it/s]

149796





In [52]:
import csv  # repeated here so the cell does not depend on another cell's import

test_size = 5000
valid_size = 5000
train_size = len(sequential_interaction_list) - test_size - valid_size
if train_size < 0:
    # With a negative bound the slices below overlap instead of partitioning the list.
    raise ValueError(
        f"need at least {test_size + valid_size} interactions, got {len(sequential_interaction_list)}"
    )

header = ['user_id', 'item_asins', 'item_asin', 'history_item_id', 'item_id', 'history_item_title', 'item_title', 'history_rating', 'rating', 'timestamp']
# The list is consumed front to back: the first train_size rows are train,
# the next valid_size rows are valid, and everything after that is test.
split_rows = [
    (f'{path}/CTRPre/train.csv', sequential_interaction_list[:train_size]),
    (f'{path}/CTRPre/valid.csv', sequential_interaction_list[train_size:(train_size+valid_size)]),
    (f'{path}/CTRPre/test.csv', sequential_interaction_list[(train_size+valid_size):]),
]
for split_file, rows in split_rows:
    # newline='' is required by the csv module for files passed to csv.writer.
    with open(split_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)

In [53]:
def csv_to_json(input_path, output_path):
    """Turn one split CSV into a JSON list of instruction records (games prompt).

    Every CSV row becomes a dict with the keys "instruction", "input" and
    "output". History titles whose rating equals 1 are listed after
    "User Preference:", all other history titles after "User Unpreference:".
    "output" is "Yes" when the row's ``rating`` equals 1 and "No" otherwise.

    Args:
        input_path: CSV file with the columns ``history_item_title`` and
            ``history_rating`` (Python list literals stored as text),
            ``item_title`` and ``rating``.
        output_path: File that receives the JSON array (indent=4).

    Raises:
        ValueError, SyntaxError: if a history cell is not a plain Python literal.

    NOTE(review): the instruction says "target game" while the input text says
    "target movie"; the wording is left untouched because it is part of the
    generated data. Later sections of this notebook redefine ``csv_to_json``
    with other prompts, so the definition executed last is the one in effect.
    """
    # Local import keeps the function self-contained. literal_eval parses list
    # literals without executing code, unlike eval() on text read from a file.
    import ast

    data = pd.read_csv(input_path)
    json_list = []
    for _, row in data.iterrows():
        history_titles = ast.literal_eval(row['history_item_title'])
        history_ratings = ast.literal_eval(row['history_rating'])

        # Split the history into liked (rating == 1) and not-liked titles, keeping order.
        preference = []
        unpreference = []
        for title, rating in zip(history_titles, history_ratings):
            if int(rating) == 1:
                preference.append(title)
            else:
                unpreference.append(title)

        # Titles are double-quoted and comma-separated; an empty list gives an empty string.
        preference_str = ", ".join("\"" + title + "\"" for title in preference)
        unpreference_str = ", ".join("\"" + title + "\"" for title in unpreference)

        target_movie_str = "\"" + str(row['item_title']) + "\""
        target_preference_str = "Yes" if int(row['rating']) == 1 else "No"
        json_list.append({
            "instruction": "Given the user's preference and unpreference, identify whether the user will like the target game by answering \"Yes\" or \"No\".",
            "input": f"User Preference: {preference_str}\nUser Unpreference: {unpreference_str}\nWhether the user will like the target movie {target_movie_str}?",
            "output": target_preference_str,
        })
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(json_list, f, indent=4)

In [54]:
# Convert the three split CSVs, in train / valid / test order, to their JSON counterparts.
split_files = [
    (f'{path}/CTRPre/train.csv', f'{path}/CTRPre/train.json'),
    (f'{path}/CTRPre/valid.csv', f'{path}/CTRPre/valid.json'),
    (f'{path}/CTRPre/test.csv', f'{path}/CTRPre/test.json'),
]
for csv_file, json_file in split_files:
    csv_to_json(csv_file, json_file)

# Movies

In [55]:
# Root folder of the Amazon Movies & TV data; later cells of this section write below f'{path}/CTRPre'.
# NOTE(review): some cells of this section spell the folder out literally and do not use `path`.
path = "../../datasets/movies"

In [3]:
# Create the output folder for the processed files; exist_ok=True makes re-running harmless.
# NOTE(review): the folder is spelled out literally here and not derived from `path`,
# so the two must be kept in sync by hand.
os.makedirs("../../datasets/movies/CTRPre", exist_ok=True)

In [56]:
import json  
from tqdm import tqdm
import re

# Keep every raw line of the metadata file; fields are pulled out with regexes below
# and the lines are not JSON-decoded.
with open('../../datasets/movies/meta_Movies_and_TV.json', 'r') as f:
    results = [line for line in tqdm(f)]

new_datas = []
for raw_line in tqdm(results):
    # Title and brand are matched on a copy in which every apostrophe is an underscore;
    # the asin is matched on the untouched line.
    unquoted = raw_line.replace("\'", "_")
    asin_hits = re.findall(r'"asin": "(.*?)"', raw_line)
    title_hits = re.findall(r'"title": "(.*?)"' , unquoted)
    brand_hits = re.findall(r'"brand": "(.*?)"' , unquoted)

    # Only the first occurrence of each field is used.
    brand = brand_hits[0] if len(brand_hits) > 0 else None

    entry = {}
    if len(asin_hits) > 0:
        entry["asin"] = asin_hits[0]
    if len(title_hits) > 0:
        entry["title"] = title_hits[0]
        # NOTE(review): an empty brand match ("") is not None, so such titles end in " - ".
        if brand is not None:
            entry["title"] += f" - {brand}"
    # An entry is appended for every line, even when no asin or title was found.
    new_datas.append(entry)

with open('../../datasets/movies/CTRPre/meta_movie_process.json', "w") as f:
    json.dump(new_datas, f)

203766it [00:00, 348825.60it/s]
100%|██████████| 203766/203766 [00:01<00:00, 127314.70it/s]


In [57]:
import json
from tqdm import tqdm
with open('../../datasets/movies/CTRPre/meta_movie_process.json', encoding='utf-8') as f:
    metadata = json.load(f)

# Only the four fields used later are kept from each review.
reviews = []
with open('../../datasets/movies/Movies_and_TV_5.json', encoding='utf-8') as f:
    for line in tqdm(f):
        review = json.loads(line)
        review = {
            'reviewerID' : review['reviewerID'],
            "asin" : review["asin"],
            "overall" : review["overall"],
            "unixReviewTime" : review["unixReviewTime"]
        }
        reviews.append(review)

# Distinct reviewers and distinct products that occur in the review file.
users = set()
items = set()
for review in tqdm(reviews):
    users.add(review['reviewerID'])
    items.add(review['asin'])

# Number the items in sorted ASIN order. Iterating the set directly follows
# string-hash order, which differs from one interpreter process to the next,
# so the integer ids written to the CSV files would change on every run.
item2id = {asin: idx for idx, asin in enumerate(sorted(items))}
count = len(item2id)

# users, items, reviews, and the share of user-item pairs that have a review
print(len(users), len(items), len(reviews), len(reviews) / (len(users) * len(items)))

3410019it [00:24, 138111.56it/s]
100%|██████████| 3410019/3410019 [00:02<00:00, 1271125.88it/s]

297529 60175 3410019 0.00019046334058915956





In [58]:
# NOTE(review): the cell that follows this one rebuilds id_title, id_item and users
# from scratch with the same logic, so one of the two cells is redundant.

# ASIN -> title, restricted to titles longer than 50 characters.
id_title = {}
for meta in tqdm(metadata):
    if "title" in meta and len(meta['title']) > 50:
        id_title[meta['asin']] = meta['title']

# id_item: dense index, handed out in first-seen order, for each reviewed ASIN that has a title.
id_item = {}
cnt = 0
# users: reviewerID -> parallel lists ('items', 'ratings', 'timestamps') of that reviewer's
# interactions; the 'reviews' list is created but never filled in this cell.
users = dict()
for review in tqdm(reviews):
    user = review['reviewerID']
    if 'asin' not in review:
        # Skip only this record: leaving the loop here would silently drop every later review.
        continue
    item = review['asin']
    if item not in id_title:
        continue
    if item not in id_item:
        id_item[item] = cnt
        cnt += 1
    if 'overall' not in review:
        continue
    if 'unixReviewTime' not in review:
        continue
    if user not in users:
        users[user] = {
            'items': [],
            'ratings': [],
            'timestamps': [],
            'reviews': []
        }
    users[user]['items'].append(item)
    # Binary label: 1 when the integer part of the star rating is above 3, else 0.
    users[user]['ratings'].append(int(int(review['overall']) > 3))
    users[user]['timestamps'].append(review['unixReviewTime'])

100%|██████████| 203766/203766 [00:00<00:00, 1940170.05it/s]
100%|██████████| 3410019/3410019 [00:05<00:00, 680258.97it/s] 


In [59]:
# ASIN -> title for the movies kept in the dataset.
id_title = {}
for meta in tqdm(metadata):
    if "title" in meta and len(meta['title']) > 50: # remove movies with titles that are too short to be distinguished
        id_title[meta['asin']] = meta['title']

# id_item: dense index, handed out in first-seen order, for each reviewed ASIN that has a title.
id_item = {}
cnt = 0
# users: reviewerID -> parallel lists ('items', 'ratings', 'timestamps') of that reviewer's
# interactions; the 'reviews' list is created but never filled in this cell.
users = dict()
for review in tqdm(reviews):
    user = review['reviewerID']
    if 'asin' not in review:
        # Skip only this record: leaving the loop here would silently drop every later review.
        continue
    item = review['asin']
    if item not in id_title:
        continue
    if item not in id_item:
        id_item[item] = cnt
        cnt += 1
    if 'overall' not in review:
        continue
    if 'unixReviewTime' not in review:
        continue
    if user not in users:
        users[user] = {
            'items': [],
            'ratings': [],
            'timestamps': [],
            'reviews': []
        }
    users[user]['items'].append(item)
    # Binary label: 1 when the integer part of the star rating is above 3, else 0.
    users[user]['ratings'].append(int(int(review['overall']) > 3))
    users[user]['timestamps'].append(review['unixReviewTime'])

100%|██████████| 203766/203766 [00:00<00:00, 1749384.50it/s]
100%|██████████| 3410019/3410019 [00:06<00:00, 548373.95it/s] 


In [60]:
import csv

# Dump the ASIN -> title lookup as a two-column CSV.
# newline='' is what the csv module expects of files handed to csv.writer; without it
# platforms that translate line endings emit a blank line after every row.
with open(f'{path}/CTRPre/item_mapping.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['item_id', 'item_name'])
    # Each (asin, title) pair becomes one row; no loop variable named `id` is left
    # behind to shadow the builtin.
    writer.writerows(id_title.items())

In [61]:
user_id = 0  # not used below; kept so the notebook namespace stays as before
B = []       # not used below; kept so the notebook namespace stays as before
interactions = []
for key in tqdm(users.keys()):
    record = users[key]
    # Put this user's reviews in chronological order. sorted() is stable, so reviews
    # sharing a timestamp keep their original relative order. The result is not bound
    # to the name `all`, which would shadow the builtin for the rest of the session.
    history = sorted(
        zip(record['items'], record['ratings'], record['timestamps']),
        key=lambda entry: int(entry[-1]),
    )
    items, ratings, timestamps = (list(column) for column in zip(*history))
    record['items'] = items
    record['item_ids'] = [item2id[x] for x in items]
    record['item_titles'] = [id_title[x] for x in items]
    record['ratings'] = ratings
    record['timestamps'] = timestamps
    item_ids = record['item_ids']
    item_titles = record['item_titles']
    # Users with at most 10 reviews contribute at most one sample (their last review as
    # target); longer histories contribute one sample per review from index 10 onwards.
    # The history window is the (up to) 10 reviews right before the target.
    for i in range(min(10, len(items) - 1), len(items)):
        st = max(i - 10, 0)
        # Samples with fewer than 3 history items are dropped.
        if i - st < 3:
            continue
        interactions.append([
            key,
            items[st: i], items[i],
            item_ids[st: i], item_ids[i],
            item_titles[st: i], item_titles[i],
            ratings[st: i], ratings[i],
            int(timestamps[i]),
        ])
print(len(interactions))
# Global chronological order by the timestamp of the target review (last field).
sequential_interaction_list = sorted(interactions, key=lambda x: x[-1])

100%|██████████| 216367/216367 [00:07<00:00, 27109.37it/s]

114594





In [62]:
import csv  # repeated here so the cell does not depend on another cell's import

test_size = 5000
valid_size = 5000
train_size = len(sequential_interaction_list) - test_size - valid_size
if train_size < 0:
    # With a negative bound the slices below overlap instead of partitioning the list.
    raise ValueError(
        f"need at least {test_size + valid_size} interactions, got {len(sequential_interaction_list)}"
    )

header = ['user_id', 'item_asins', 'item_asin', 'history_item_id', 'item_id', 'history_item_title', 'item_title', 'history_rating', 'rating', 'timestamp']
# The list is consumed front to back: the first train_size rows are train,
# the next valid_size rows are valid, and everything after that is test.
split_rows = [
    (f'{path}/CTRPre/train.csv', sequential_interaction_list[:train_size]),
    (f'{path}/CTRPre/valid.csv', sequential_interaction_list[train_size:(train_size+valid_size)]),
    (f'{path}/CTRPre/test.csv', sequential_interaction_list[(train_size+valid_size):]),
]
for split_file, rows in split_rows:
    # newline='' is required by the csv module for files passed to csv.writer.
    with open(split_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)

In [None]:
def csv_to_json(input_path, output_path):
    """Turn one split CSV into a JSON list of instruction records (movies prompt).

    Every CSV row becomes a dict with the keys "instruction", "input" and
    "output". History titles whose rating equals 1 are listed after
    "User Preference:", all other history titles after "User Unpreference:".
    "output" is "Yes" when the row's ``rating`` equals 1 and "No" otherwise.

    Args:
        input_path: CSV file with the columns ``history_item_title`` and
            ``history_rating`` (Python list literals stored as text),
            ``item_title`` and ``rating``.
        output_path: File that receives the JSON array (indent=4).

    Raises:
        ValueError, SyntaxError: if a history cell is not a plain Python literal.

    NOTE(review): other sections of this notebook define ``csv_to_json`` with
    different prompt wording; the definition executed last is the one in effect.
    """
    # Local import keeps the function self-contained. literal_eval parses list
    # literals without executing code, unlike eval() on text read from a file.
    import ast

    data = pd.read_csv(input_path)
    json_list = []
    for _, row in data.iterrows():
        history_titles = ast.literal_eval(row['history_item_title'])
        history_ratings = ast.literal_eval(row['history_rating'])

        # Split the history into liked (rating == 1) and not-liked titles, keeping order.
        preference = []
        unpreference = []
        for title, rating in zip(history_titles, history_ratings):
            if int(rating) == 1:
                preference.append(title)
            else:
                unpreference.append(title)

        # Titles are double-quoted and comma-separated; an empty list gives an empty string.
        preference_str = ", ".join("\"" + title + "\"" for title in preference)
        unpreference_str = ", ".join("\"" + title + "\"" for title in unpreference)

        target_movie_str = "\"" + str(row['item_title']) + "\""
        target_preference_str = "Yes" if int(row['rating']) == 1 else "No"
        json_list.append({
            "instruction": "Given the user's preference and unpreference, identify whether the user will like the target movie by answering \"Yes\" or \"No\".",
            "input": f"User Preference: {preference_str}\nUser Unpreference: {unpreference_str}\nWhether the user will like the target movie titled {target_movie_str}?",
            "output": target_preference_str,
        })
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(json_list, f, indent=4)

In [63]:
# Convert the three split CSVs, in train / valid / test order, to their JSON counterparts.
split_files = [
    (f'{path}/CTRPre/train.csv', f'{path}/CTRPre/train.json'),
    (f'{path}/CTRPre/valid.csv', f'{path}/CTRPre/valid.json'),
    (f'{path}/CTRPre/test.csv', f'{path}/CTRPre/test.json'),
]
for csv_file, json_file in split_files:
    csv_to_json(csv_file, json_file)

# Food

In [64]:
# Root folder of the Amazon Grocery & Gourmet Food data; later cells of this section
# read from it and write below f'{path}/CTRPre'.
path = "../../datasets/food"

In [4]:
# Create the output folder for the processed files; exist_ok=True makes re-running harmless.
# NOTE(review): the folder is spelled out literally here and not derived from `path`,
# so the two must be kept in sync by hand.
os.makedirs("../../datasets/food/CTRPre", exist_ok=True)

In [65]:
import json  
import re

# Keep every raw line of the metadata file; fields are pulled out with regexes below.
with open('../../datasets/food/meta_Grocery_and_Gourmet_Food.json', 'r') as f:
    results = [line for line in f]

new_datas = []
for raw_line in results:
    # All three fields are matched on a copy in which every apostrophe is an underscore.
    # NOTE(review): the lazy capture stops at the first "_" it meets, so a value that
    # itself contains an apostrophe or an underscore is cut short at that character.
    unquoted = raw_line.replace("\'", "_")
    asin_hits = re.findall(r"_asin_:\s*_(.*?)_" , unquoted)
    title_hits = re.findall(r"_title_:\s*_(.*?)_" , unquoted)
    brand_hits = re.findall(r"_brand_:\s*_(.*?)_" , unquoted)

    # Only the first occurrence of each field is used.
    brand = brand_hits[0] if len(brand_hits) > 0 else None

    entry = {}
    if len(asin_hits) > 0:
        entry["asin"] = asin_hits[0]
    if len(title_hits) > 0:
        entry["title"] = title_hits[0]
        # The brand suffix is added only for a non-empty brand.
        if brand is not None and len(brand)>0:
            entry["title"] += f" - {brand}"
    # An entry is appended for every line, even when no asin or title was found.
    new_datas.append(entry)

with open('../../datasets/food/CTRPre/meta_food_process.json', "w") as f:
    json.dump(new_datas, f)

In [66]:
import json
from tqdm import tqdm
with open(f'{path}/CTRPre/meta_food_process.json', encoding='utf-8') as f:
    metadata = json.load(f)
# The review file is read as JSON Lines: one JSON object per line.
with open(f'{path}/Grocery_and_Gourmet_Food_5.json', encoding='utf-8') as f:
    reviews = [json.loads(line) for line in tqdm(f)]

# Distinct reviewers and distinct products that occur in the review file.
users = set()
items = set()
for review in tqdm(reviews):
    users.add(review['reviewerID'])
    items.add(review['asin'])

# Number the items in sorted ASIN order. Iterating the set directly follows
# string-hash order, which differs from one interpreter process to the next,
# so the integer ids written to the CSV files would change on every run.
item2id = {asin: idx for idx, asin in enumerate(sorted(items))}
count = len(item2id)

# users, items, reviews, and the share of user-item pairs that have a review
print(len(users), len(items), len(reviews), len(reviews) / (len(users) * len(items)))

151254it [00:01, 84275.65it/s]
100%|██████████| 151254/151254 [00:00<00:00, 594334.07it/s]
100%|██████████| 8713/8713 [00:00<00:00, 429647.66it/s]

14681 8713 151254 0.0011824519884614812





In [67]:
# ASIN -> title for every product whose metadata has a usable title.
id_title = {}
for meta in tqdm(metadata):
    if "title" in meta and len(meta['title']) > 1: # remove the item without title
        id_title[meta['asin']] = meta['title']

# id_item: dense index, handed out in first-seen order, for each reviewed ASIN that has a title.
id_item = {}
cnt = 0
# users: reviewerID -> parallel lists ('items', 'ratings', 'timestamps') of that reviewer's
# interactions; the 'reviews' list is created but never filled in this cell.
users = dict()
for review in tqdm(reviews):
    user = review['reviewerID']
    if 'asin' not in review:
        # Skip only this record: leaving the loop here would silently drop every later review.
        continue
    item = review['asin']
    if item not in id_title:
        continue
    if item not in id_item:
        id_item[item] = cnt
        cnt += 1
    if 'overall' not in review:
        continue
    if 'unixReviewTime' not in review:
        continue
    if user not in users:
        users[user] = {
            'items': [],
            'ratings': [],
            'timestamps': [],
            'reviews': []
        }
    users[user]['items'].append(item)
    # Binary label: 1 when the integer part of the star rating is above 3, else 0.
    users[user]['ratings'].append(int(int(review['overall']) > 3))
    users[user]['timestamps'].append(review['unixReviewTime'])

100%|██████████| 171760/171760 [00:00<00:00, 650365.99it/s]
100%|██████████| 151254/151254 [00:00<00:00, 222257.85it/s]


In [68]:
import csv

# Dump the ASIN -> title lookup as a two-column CSV.
# newline='' is what the csv module expects of files handed to csv.writer; without it
# platforms that translate line endings emit a blank line after every row.
with open(f'{path}/CTRPre/item_mapping.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['item_id', 'item_name'])
    # Each (asin, title) pair becomes one row; no loop variable named `id` is left
    # behind to shadow the builtin.
    writer.writerows(id_title.items())

In [69]:
user_id = 0  # not used below; kept so the notebook namespace stays as before
B = []       # not used below; kept so the notebook namespace stays as before
interactions = []
for key in tqdm(users.keys()):
    record = users[key]
    # Put this user's reviews in chronological order. sorted() is stable, so reviews
    # sharing a timestamp keep their original relative order. The result is not bound
    # to the name `all`, which would shadow the builtin for the rest of the session.
    history = sorted(
        zip(record['items'], record['ratings'], record['timestamps']),
        key=lambda entry: int(entry[-1]),
    )
    items, ratings, timestamps = (list(column) for column in zip(*history))
    record['items'] = items
    record['item_ids'] = [item2id[x] for x in items]
    record['item_titles'] = [id_title[x] for x in items]
    record['ratings'] = ratings
    record['timestamps'] = timestamps
    item_ids = record['item_ids']
    item_titles = record['item_titles']
    # Users with at most 10 reviews contribute one sample (their last review as target);
    # longer histories contribute one sample per review from index 10 onwards.
    # The history window is the (up to) 10 reviews right before the target, and it is
    # empty for a user with a single review.
    for i in range(min(10, len(items) - 1), len(items)):
        st = max(i - 10, 0)
        interactions.append([
            key,
            items[st: i], items[i],
            item_ids[st: i], item_ids[i],
            item_titles[st: i], item_titles[i],
            ratings[st: i], ratings[i],
            int(timestamps[i]),
        ])
print(len(interactions))
# Global chronological order by the timestamp of the target review (last field).
sequential_interaction_list = sorted(interactions, key=lambda x: x[-1])

100%|██████████| 14641/14641 [00:01<00:00, 8338.59it/s] 

43293





In [70]:
import csv  # repeated here so the cell does not depend on another cell's import

test_size = 4329
valid_size = 4329
train_size = len(sequential_interaction_list) - test_size - valid_size
if train_size < 0:
    # With a negative bound the slices below overlap instead of partitioning the list.
    raise ValueError(
        f"need at least {test_size + valid_size} interactions, got {len(sequential_interaction_list)}"
    )

header = ['user_id', 'item_asins', 'item_asin', 'history_item_id', 'item_id', 'history_item_title', 'item_title', 'history_rating', 'rating', 'timestamp']
# The list is consumed front to back: the first train_size rows are train,
# the next valid_size rows are valid, and everything after that is test.
split_rows = [
    (f'{path}/CTRPre/train.csv', sequential_interaction_list[:train_size]),
    (f'{path}/CTRPre/valid.csv', sequential_interaction_list[train_size:(train_size+valid_size)]),
    (f'{path}/CTRPre/test.csv', sequential_interaction_list[(train_size+valid_size):]),
]
for split_file, rows in split_rows:
    # newline='' is required by the csv module for files passed to csv.writer.
    with open(split_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)

In [71]:
def csv_to_json(input_path, output_path):
    """Turn one split CSV into a JSON list of instruction records (food prompt).

    Every CSV row becomes a dict with the keys "instruction", "input" and
    "output". History titles whose rating equals 1 are listed after
    "User Preference:", all other history titles after "User Unpreference:".
    "output" is "Yes" when the row's ``rating`` equals 1 and "No" otherwise.

    Args:
        input_path: CSV file with the columns ``history_item_title`` and
            ``history_rating`` (Python list literals stored as text),
            ``item_title`` and ``rating``.
        output_path: File that receives the JSON array (indent=4).

    Raises:
        ValueError, SyntaxError: if a history cell is not a plain Python literal.

    NOTE(review): earlier sections of this notebook define ``csv_to_json`` with
    different prompt wording; the definition executed last is the one in effect.
    """
    # Local import keeps the function self-contained. literal_eval parses list
    # literals without executing code, unlike eval() on text read from a file.
    import ast

    data = pd.read_csv(input_path)
    json_list = []
    for _, row in data.iterrows():
        history_titles = ast.literal_eval(row['history_item_title'])
        history_ratings = ast.literal_eval(row['history_rating'])

        # Split the history into liked (rating == 1) and not-liked titles, keeping order.
        preference = []
        unpreference = []
        for title, rating in zip(history_titles, history_ratings):
            if int(rating) == 1:
                preference.append(title)
            else:
                unpreference.append(title)

        # Titles are double-quoted and comma-separated; an empty list gives an empty string.
        preference_str = ", ".join("\"" + title + "\"" for title in preference)
        unpreference_str = ", ".join("\"" + title + "\"" for title in unpreference)

        target_movie_str = "\"" + str(row['item_title']) + "\""
        target_preference_str = "Yes" if int(row['rating']) == 1 else "No"
        json_list.append({
            "instruction": "Given the user's preference and unpreference, identify whether the user will like the target food product by answering \"Yes\" or \"No\".",
            "input": f"User Preference: {preference_str}\nUser Unpreference: {unpreference_str}\nWhether the user will like the target food product named {target_movie_str}?",
            "output": target_preference_str,
        })
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(json_list, f, indent=4)

In [72]:
# Convert the three split CSVs, in train / valid / test order, to their JSON counterparts.
split_files = [
    (f'{path}/CTRPre/train.csv', f'{path}/CTRPre/train.json'),
    (f'{path}/CTRPre/valid.csv', f'{path}/CTRPre/valid.json'),
    (f'{path}/CTRPre/test.csv', f'{path}/CTRPre/test.json'),
]
for csv_file, json_file in split_files:
    csv_to_json(csv_file, json_file)