In [1]:
# Download Amazon Beauty Dataset
!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Beauty_5.json.gz

--2025-05-26 17:45:45--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Beauty_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 44819276 (43M) [application/x-gzip]
Saving to: ‘reviews_Beauty_5.json.gz’


2025-05-26 17:48:58 (228 KB/s) - ‘reviews_Beauty_5.json.gz’ saved [44819276/44819276]



In [2]:
import gzip
import json
import copy
import math
import numpy as np

import torch

import random
# One shared seed for every random number generator the notebook touches,
# so that repeated runs draw the same random numbers.
random_seed = 1
for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    seed_fn(random_seed)

# Request deterministic cuDNN algorithms.
torch.backends.cudnn.deterministic = True

In [10]:
def parse(path):
    """Lazily yield one decoded JSON value per line of a gzip-compressed file.

    Args:
        path: Path of the gzip archive; every line must be a JSON document.

    Yields:
        The object produced by ``json.loads`` for each line, in file order.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    # The context manager closes the archive once the generator is exhausted
    # or closed; previously the handle was never closed explicitly.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield json.loads(l)

reviews_path = 'reviews_Beauty_5.json.gz'

# `reviews` is a one-shot generator: nothing is read until it is iterated,
# and it can be consumed only once.
reviews = parse(reviews_path)

In [12]:
def select_kcore(_interaction_dict, K=10, verbose=False):
    """Iteratively filter users and items until both have at least K interactions.

    Each round first removes every item that occurs fewer than K times, then
    removes every user left with fewer than K items. Rounds repeat until no
    remaining user or item is below K (an empty result also terminates).

    Args:
        _interaction_dict: Mapping ``user_id -> {item_id: value}``. It is
            deep-copied, so the caller's object is never modified.
        K: Minimum number of interactions required per user and per item.
        verbose: If True, print the current minimum user/item counts after
            each filtering step (``None`` when nothing is left).

    Returns:
        Tuple ``(interaction_dict, user_num, item_num)`` where ``user_num`` is
        the number of remaining users and ``item_num`` the number of distinct
        items that still occur in the filtered dictionary.
    """
    interaction_dict = copy.deepcopy(_interaction_dict)

    def _count_items():
        # Occurrences of every item over all remaining users.
        counts = {}
        for items in interaction_dict.values():
            for i_id in items:
                counts[i_id] = counts.get(i_id, 0) + 1
        return counts

    def _is_kcore(item_cnt):
        # Real minima instead of a 9999 sentinel: the sentinel made the loop
        # spin forever for K > 9999 and capped the reported values.
        min_cnt = min(map(len, interaction_dict.values()), default=None)
        min_cnt_item = min(item_cnt.values(), default=None)
        if verbose:
            print('min user interaction:',min_cnt)
            print('min item num:',min_cnt_item)
        users_ok = min_cnt is None or min_cnt >= K
        items_ok = min_cnt_item is None or min_cnt_item >= K
        return users_ok and items_ok

    while True:
        # Step 1: delete items with fewer than K occurrences.
        item_cnt = _count_items()
        rare_items = {i_id for i_id, c in item_cnt.items() if c < K}
        if rare_items:
            for items in interaction_dict.values():
                for i_id in [i for i in items if i in rare_items]:
                    del items[i_id]

        item_cnt = _count_items()
        if _is_kcore(item_cnt):
            return interaction_dict, len(interaction_dict), len(item_cnt)

        # Step 2: delete users with fewer than K interactions.
        for u_id in [u for u, items in interaction_dict.items() if len(items) < K]:
            del interaction_dict[u_id]

        # Recount after the user pass: an item may have vanished together with
        # its users, so the item total must come from the current state (the
        # earlier code returned the count taken before this pass).
        item_cnt = _count_items()
        if _is_kcore(item_cnt):
            return interaction_dict, len(interaction_dict), len(item_cnt)

In [13]:
# Build {reviewerID: {asin: unixReviewTime}} from the review stream.
# Reviews whose integer-truncated rating is below 4 are skipped.
interaction_dict = {}
cnt=0  # number of records that could not be processed
interaction_num = 0
raw_item = set()
for review in reviews:
    try:
        u_id, i_id, rating, time = review['reviewerID'], review['asin'], review['overall'], review['unixReviewTime']
        if int(rating) < 4:
            continue
        if u_id not in interaction_dict:
            interaction_dict[u_id] = {}
        interaction_dict[u_id][i_id] = time
        interaction_num += 1
        raw_item.add(i_id)
    except (KeyError, TypeError, ValueError):
        # Only malformed records (missing field, non-numeric rating) are
        # reported and counted; a bare `except:` would also swallow
        # KeyboardInterrupt and hide unrelated failures.
        print(review)
        cnt+=1
print('raw user num:',len(interaction_dict))
print('raw item num:', len(raw_item))
print('total interaction num:', interaction_num)
print(cnt)

raw user num: 22269
raw item num: 12086
total interaction num: 154272
0


In [14]:
# Copy the raw interactions and order every user's items by ascending timestamp.
interaction_dict_new = {
    user: dict(sorted(history.items(), key=lambda pair: pair[1]))
    for user, history in copy.deepcopy(interaction_dict).items()
}

### 1. k-core selection 

In [15]:
# Joint user/item k-core filtering. With K=0 no count can fall below the
# threshold, so nothing is removed.
interaction_dict_new, user_num, item_num = select_kcore(interaction_dict_new, K=0)
print('after 0 core...')
for label, value in (('user num:', user_num), ('item num:', item_num)):
    print(label, value)

after 0 core...
user num: 22269
item num: 12086


In [16]:
# Sanity check: number of users currently held in interaction_dict_new.
len(interaction_dict_new)

22269

### 2. k-core item selection

In [17]:
def select_kcore_item(_interaction_dict, K=20, verbose=False):
    """Remove every item that occurs in fewer than K user histories.

    Only items are filtered: users are kept even if their history becomes
    empty. Dropping an item never changes the count of another item, so a
    single pass is sufficient.

    Args:
        _interaction_dict: Mapping ``user_id -> {item_id: value}``. It is
            deep-copied, so the caller's object is never modified.
        K: Minimum number of occurrences an item needs to be kept.
        verbose: If True, print the minimum user history length and the
            minimum item count after filtering (``None`` when nothing is left).

    Returns:
        Tuple ``(interaction_dict, user_num, item_num)`` with the filtered
        dictionary, the number of users in it and the number of kept items.
    """
    interaction_dict = copy.deepcopy(_interaction_dict)

    # Count how many user histories contain each item.
    item_cnt_dict = {}
    for items in interaction_dict.values():
        for i_id in items:
            item_cnt_dict[i_id] = item_cnt_dict.get(i_id, 0) + 1

    # Split items into kept and dropped, then purge the dropped ones.
    kept_cnt = {i_id: c for i_id, c in item_cnt_dict.items() if c >= K}
    if len(kept_cnt) != len(item_cnt_dict):
        for items in interaction_dict.values():
            for i_id in [i for i in items if i not in kept_cnt]:
                del items[i_id]

    if verbose:
        # Real minima instead of a 9999 sentinel: the sentinel made the former
        # `while` loop spin forever whenever K exceeded 9999.
        print('min user interaction:', min(map(len, interaction_dict.values()), default=None))
        print('min item num:', min(kept_cnt.values(), default=None))

    return interaction_dict, len(interaction_dict), len(kept_cnt)
        

In [18]:
# Item-side k-core filtering: drop items with fewer than `item_core_k` interactions.
# The threshold is bound to a name so that the log label always reports the
# value actually applied (the label used to say "5 core" while 0 was passed).
item_core_k = 0
interaction_dict_new, user_num, item_num = select_kcore_item(interaction_dict_new, item_core_k, verbose=True)
print(f'after {item_core_k} core item...')
print('user num:',user_num)
print('item num:',item_num)

min user interaction: 1
min item num: 1
after 5 core item...
user num: 22269
item num: 12086


### 3. k-core user selection

In [19]:
def select_kcore_user(_interaction_dict, K=10, verbose=False):
    """Remove every user that has fewer than K interactions.

    Only users are filtered. Removing a user never changes another user's
    history length, so a single pass is sufficient.

    Args:
        _interaction_dict: Mapping ``user_id -> {item_id: value}``. It is
            deep-copied, so the caller's object is never modified.
        K: Minimum number of interactions a user needs to be kept.
        verbose: If True, print the minimum user history length and the
            minimum item count after filtering (``None`` when nothing is left).

    Returns:
        Tuple ``(interaction_dict, user_num, item_num)`` with the filtered
        dictionary, the number of remaining users and the number of distinct
        items that still occur in it.
    """
    interaction_dict = copy.deepcopy(_interaction_dict)

    # Delete users with fewer than K interactions.
    for u_id in [u for u, items in interaction_dict.items() if len(items) < K]:
        del interaction_dict[u_id]

    # Count the occurrences of every item that survived the user filter.
    new_item_cnt = {}
    for items in interaction_dict.values():
        for i_id in items:
            new_item_cnt[i_id] = new_item_cnt.get(i_id, 0) + 1

    if verbose:
        print('min user interaction:', min(map(len, interaction_dict.values()), default=None))
        print('min item num:', min(new_item_cnt.values(), default=None))

    # The item total is derived from the filtered data. The former code
    # returned `item_num`, a name never assigned in this function, so it
    # silently picked up a stale notebook global (or raised NameError).
    return interaction_dict, len(interaction_dict), len(new_item_cnt)

In [20]:
# User-side k-core filtering: drop users with fewer than `user_core_k` interactions.
# The threshold is bound to a name so that the log label always reports the
# value actually applied (the label used to say "5 core" while 0 was passed).
user_core_k = 0
interaction_dict_new, user_num, item_num = select_kcore_user(interaction_dict_new, user_core_k, verbose=True)
print(f'after {user_core_k} core user...')
print('user num:',user_num)
print('item num:',item_num)

min user interaction: 1
min item num: 1
after 5 core user...
user num: 22269
item num: 12086


### 4. Chronological train/validation/test split

In [21]:
# Every interaction timestamp across all users, in ascending order.
time_list = sorted(
    timestamp
    for history in interaction_dict_new.values()
    for timestamp in history.values()
)

In [22]:
# Total number of interaction timestamps collected in time_list.
len(time_list)

154272

In [23]:
# Global time-based split: the timestamps at the 10% and 20% marks counted
# from the end of the sorted list are the test and validation cut-offs.
split_ratio = 0.1
test_num = int(len(time_list)*split_ratio)
split_time1 = time_list[-test_num]                 # test cut-off
split_time2 = time_list[-math.ceil(2*test_num)]    # validation cut-off
print('*-------')
print(test_num, math.ceil(2*test_num))
print(split_time1, split_time2)
print(time_list[0], time_list[-1])

training_old_dict, validation_old_dict, testing_old_dict = {}, {}, {}
for u_id, history in interaction_dict_new.items():
    train_items, valid_items, test_items = [], [], []
    for i_id, time in history.items():
        # split_time2 <= split_time1 because time_list is sorted ascending.
        if time >= split_time1:
            test_items.append(i_id)
        elif time >= split_time2:
            valid_items.append(i_id)
        else:
            train_items.append(i_id)
    training_old_dict[u_id] = train_items
    validation_old_dict[u_id] = valid_items
    testing_old_dict[u_id] = test_items

# Drop users with fewer than 2 training interactions from all three splits.
cut_user = 0
for u_id in interaction_dict_new:
    if len(training_old_dict[u_id]) < 2:
        cut_user += 1
        for split in (training_old_dict, validation_old_dict, testing_old_dict):
            del split[u_id]

print(cut_user)

*-------
15427 30854
1400112000 1394409600
1023840000 1406073600
2886


In [24]:
# use list to store user/item for map generation -> for reproducibility

def get_unique_sorted_elements(lst):
    """Return the distinct elements of ``lst`` as a new list in ascending order."""
    unique_elements = set(lst)
    return sorted(unique_elements)

import os
import random

# Users in training_old_dict iteration order; items gathered from all three splits.
user_set = []
item_set = []
for u_id in training_old_dict:
    user_set.append(u_id)
    for split in (training_old_dict, validation_old_dict, testing_old_dict):
        item_set.extend(split[u_id])

# Sorting the de-duplicated items gives the shuffle below a fixed starting order.
item_set = get_unique_sorted_elements(item_set)

random.seed(2023)
random.shuffle(item_set)

# Map original ids to consecutive integer ids (position in the respective list).
user_map = {old_id:new_id for new_id, old_id in enumerate(user_set)}
item_map = {old_id:new_id for new_id, old_id in enumerate(item_set)}

user_map = dict(sorted(user_map.items(),key=lambda item:item[1]))
item_map = dict(sorted(item_map.items(),key=lambda item:item[1]))

save_path = 'beauty/'
# np.save does not create missing directories; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs(save_path, exist_ok=True)
np.save(save_path + 'user_map.npy',user_map)
np.save(save_path + 'item_map.npy',item_map)

# Inverse maps: new integer id -> original id.
user_map_reverse = {k:v for v,k in user_map.items()}
item_map_reverse = {k:v for v,k in item_map.items()}
np.save(save_path + 'user_map_reverse.npy',user_map_reverse)
np.save(save_path + 'item_map_reverse.npy',item_map_reverse)

print('user num:', len(user_set))
print('item num:', len(item_set))

user num: 19383
item num: 12035


In [25]:
# Warm items are those seen in training; an item of the validation/testing
# split that is absent from training is counted as cold.
warm_item_set = {
    item_map[i_id]
    for items in training_old_dict.values()
    for i_id in items
}
training_user = len(training_old_dict)
training_interaction = sum(len(items) for items in training_old_dict.values())


def _warm_cold_stats(split_dict):
    """Tally warm/cold items, interactions and users of one evaluation split."""
    warm_items, cold_items = set(), set()
    warm_inter, cold_inter = 0, 0
    warm_users, cold_users, overlap_users = 0, 0, 0
    for items in split_dict.values():
        mapped = [item_map[i_id] for i_id in items]
        user_warm = [m for m in mapped if m in warm_item_set]
        user_cold = [m for m in mapped if m not in warm_item_set]
        warm_items.update(user_warm)
        cold_items.update(user_cold)
        warm_inter += len(user_warm)
        cold_inter += len(user_cold)
        if user_warm:
            warm_users += 1
        if user_cold:
            cold_users += 1
        if user_warm and user_cold:
            overlap_users += 1
    return warm_items, cold_items, warm_inter, cold_inter, warm_users, cold_users, overlap_users


(valid_warm_item_set, valid_cold_item_set,
 validation_warm_interaction, validation_cold_interaction,
 validation_warm_user, validation_cold_user,
 validation_overlap_user) = _warm_cold_stats(validation_old_dict)

(test_warm_item_set, test_cold_item_set,
 testing_warm_interaction, testing_cold_interaction,
 testing_warm_user, testing_cold_user,
 testing_overlap_user) = _warm_cold_stats(testing_old_dict)

# Cold items over both evaluation splits.
cold_item_set = valid_cold_item_set | test_cold_item_set

# Users that have at least one testing interaction.
test_user_num = sum(1 for items in testing_old_dict.values() if len(items))

tot_interaction = sum((
    training_interaction,
    validation_warm_interaction,
    validation_cold_interaction,
    testing_warm_interaction,
    testing_cold_interaction,
))

print('warm item num:', len(warm_item_set))
print('cold item num:', len(cold_item_set))
print('valid warm item num:', len(valid_warm_item_set))
print('valid cold item num:', len(valid_cold_item_set))
print('test warm item num:', len(test_warm_item_set))
print('test cold item num:', len(test_cold_item_set))
print('----------------')
print('training interaction num:', training_interaction)
print('validation warm interaction num:', validation_warm_interaction)
print('validation cold interaction num:', validation_cold_interaction)
print('testing warm interaction num:', testing_warm_interaction)
print('testing cold interaction num:', testing_cold_interaction)
print('----------------')
print('training warm user num:', training_user)
print('validation warm user num:', validation_warm_user)
print('validation cold user num:', validation_cold_user)
print('testing warm user num:', testing_warm_user)
print('testing cold user num:', testing_cold_user)
print('----------------')
print('user num:', len(user_set))
print('item num:', len(item_set))
print('interaction:', tot_interaction)
print('density:', tot_interaction/(len(user_set)*len(item_set)))
print('----------------')
print('validation overlap user num:', validation_overlap_user)
print('testing overlap user num:', testing_overlap_user)
print('----------------')
print('test user num:', test_user_num)

warm item num: 11717
cold item num: 318
valid warm item num: 3927
valid cold item num: 176
test warm item num: 3313
test cold item num: 269
----------------
training interaction num: 122241
validation warm interaction num: 8465
validation cold interaction num: 851
testing warm interaction num: 5472
testing cold interaction num: 1841
----------------
training warm user num: 19383
validation warm user num: 4131
validation cold user num: 485
testing warm user num: 2932
testing cold user num: 974
----------------
user num: 19383
item num: 12035
interaction: 138870
density: 0.0005953074877631774
----------------
validation overlap user num: 321
testing overlap user num: 353
----------------
test user num: 3553


In [None]:
def _remap_split(old_split):
    """Translate one split from original ids to the integer ids of user_map/item_map."""
    return {
        user_map[u_id]: [item_map[i_id] for i_id in items]
        for u_id, items in old_split.items()
    }


def _as_pairs(split):
    """Flatten a {user: [items]} split into a list of [user, item] pairs."""
    return [[user, item] for user, items in split.items() for item in items]


def _select_items(split, warm):
    """Keep, per user, only the items whose warm_item_set membership equals `warm`."""
    return {
        user: [item for item in items if (item in warm_item_set) == warm]
        for user, items in split.items()
    }


# Re-indexed splits as {user: [items]} dictionaries ...
training_dict = _remap_split(training_old_dict)
validation_dict = _remap_split(validation_old_dict)
testing_dict = _remap_split(testing_old_dict)

# ... as flat [user, item] pair lists ...
training_list = _as_pairs(training_dict)
validation_list = _as_pairs(validation_dict)
testing_list = _as_pairs(testing_dict)

# ... and the evaluation splits divided into warm (seen in training) and cold items.
validation_warm_dict = _select_items(validation_dict, True)
validation_cold_dict = _select_items(validation_dict, False)
testing_warm_dict = _select_items(testing_dict, True)
testing_cold_dict = _select_items(testing_dict, False)

training sample num: 19383
validation sample num: 19383
testing sample num: 19383


In [None]:
# Persist the three re-indexed splits next to the id maps.
for file_name, split_dict in (
    ('training_dict.npy', training_dict),
    ('validation_dict.npy', validation_dict),
    ('testing_dict.npy', testing_dict),
):
    np.save(save_path + file_name, split_dict)

### overlap check

In [31]:
def overlap(dict0,dict1,dict2):
    """Count per-user item overlap between two interaction dictionaries.

    For every user key of ``dict0`` the items of ``dict1`` and ``dict2`` are
    merged into a set. The overlap is the number of entries lost by that merge,
    i.e. ``len(dict1[u]) + len(dict2[u]) - len(union)``. Users missing from
    ``dict1`` or ``dict2`` contribute an empty item list; users absent from
    ``dict0`` are ignored.

    Args:
        dict0: Mapping whose keys define the users to inspect.
        dict1: Mapping ``user -> iterable of items``.
        dict2: Mapping ``user -> iterable of items``.

    Returns:
        Tuple ``(cnt, interaction_cnt, ratio)``: the overlap count, the total
        number of distinct (user, item) pairs, and ``cnt / interaction_cnt``
        (``0.0`` when there are no interactions at all).
    """
    cnt=0
    interaction_cnt=0
    for u_id in dict0:
        # .get() replaces the former bare `except: pass`, which hid every
        # error and not just the missing-user case.
        items1 = dict1.get(u_id, ())
        items2 = dict2.get(u_id, ())
        merged = set(items1) | set(items2)
        cnt += len(items1) + len(items2) - len(merged)
        interaction_cnt += len(merged)
    # Guard against ZeroDivisionError on empty input.
    ratio = cnt/interaction_cnt if interaction_cnt else 0.0
    return cnt,interaction_cnt,ratio

In [32]:
# Reload the saved splits and verify that no (user, item) pair occurs in two of them.
time_tr, time_val, time_tst = (
    np.load(save_path + file_name, allow_pickle=True).item()
    for file_name in ('training_dict.npy', 'validation_dict.npy', 'testing_dict.npy')
)
print('tr - val:',overlap(time_tr, time_tr, time_val ))
print('tr - tst:',overlap(time_tr, time_tr, time_tst ))
print('val - tst:',overlap(time_tr, time_tst, time_val ))

tr - val: (0, 131557, 0.0)
tr - tst: (0, 129554, 0.0)
val - tst: (0, 16629, 0.0)
