## AB Test: Check how persona has effect on rec sys

In [1]:
# builtin
import importlib
# internal
import read_data
import train_model
import params
from tqdm.contrib.concurrent import process_map
# external
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import os

### Check the item name recommend to each user

In [2]:
res_dir = "experiment_result/Instacart/"
lgcn_dir = res_dir + "LGCN/"
tri_lgcn_dir = res_dir + "LGCN_tri/"
tri_lgcn_approach_dir = res_dir + "LGCN_tri_approach/"
AB_test_dir = res_dir + "AB_test/"

In [3]:
# load Instacart_LGCN_result.json and Instacart_LGCN_tri_result.json
with open(AB_test_dir + 'Instacart_LGCN_result_20620.json', 'r') as f:
    lgcn_res = json.load(f)
with open(AB_test_dir + 'Instacart_LGCN_tri_result_20620.json', 'r') as f:
    lgcn_tri_res = json.load(f)

In [4]:
len(lgcn_res), len(lgcn_tri_res)

(20593, 20593)

In [6]:
# 把json按user_id的int排序，并检查是否一致
lgcn_res_sorted = {int(k): v for k,v in lgcn_res.items()}
lgcn_tri_res_sorted = {int(k): v for k,v in lgcn_tri_res.items()}
assert sorted(lgcn_res.keys()) == sorted(lgcn_tri_res.keys())

In [7]:
list_empty = []
for i in range(1, 20620):
    if i not in lgcn_res_sorted:
        list_empty.append(i)
# 导入tri_graph_uid2tidx_valid.json，检查list_empty里的user_id在这个json里对应的value是否为空
import json
with open('dataset/Instacart/tri_graph_uidx2tidx_valid.json', 'r') as f:
    uid2tidx = json.load(f)
# 检查list_empty里的user_id在uid2tidx里对应的value是否都为空
for i in list_empty:
    assert uid2tidx[str(i)] == []


In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def insta_load_data(ds_path="dataset/Instacart/10%_sampled_insta_df.csv", debug=False):
    # -> [Instacart_df, user_ids, user_num, user_ids_kv, item_names, item_num, items_kv, G_user, G_item]
    
    print(f'Loading instacart dataset from path:{ds_path}')
    
    ## load dataset and basic clean
    insta_df = pd.read_csv(ds_path)
    if debug: print(insta_df.head())
    # clean nan rows
    if insta_df.isna().sum().sum() > 0:
        print('all nan eliminated')
        insta_df = insta_df.dropna()
    # transfer types
    insta_df['product_id'] = insta_df['product_id'].astype('int32')
    insta_df['user_id'] = insta_df['user_id'].astype('int32')
    insta_df['product_name'] = insta_df['product_name'].astype('string')

    ## Identifications
    # for user nodes
    user_ids = insta_df['user_id'].unique()
    user_num = len(user_ids)
    print(f'totally {user_num} unique users')
    user_ids.sort()
    user_ids_kv = {}
    for ui in range(user_num):
        user_ids_kv[user_ids[ui]] = ui
    # for item nodes
    item_names = insta_df['product_name'].unique()
    item_num = len(item_names)
    print(f'totally {item_num} unique items')
    # item_names.sort()
    items_kv = {}
    for ii in range(item_num):
        items_kv[item_names[ii]] = ii

    ## construct the bi-partite graph
    G_user = {} # {uidx: [tidx,]}
    G_item = {} # {tidx: [uidx,]}

    for index,row in tqdm(insta_df.iterrows()):
        user_index = user_ids_kv[row['user_id']]
        item_index = items_kv[row['product_name']]
        
        # update user side
        if G_user.get(user_index) is None:
            G_user[user_index] = {item_index}
        else:
            G_user[user_index].update([item_index])
        
        # update item side
        if G_item.get(item_index) is None:
            G_item[item_index] = {user_index}
        else:
            G_item[item_index].update([user_index])

    assert len(G_item.keys()) == item_num and len(G_user.keys()) == user_num

    return insta_df, user_ids, user_num, user_ids_kv, item_names, item_num, items_kv, G_user, G_item

In [9]:
# load dataset
ds_path = "dataset/Instacart/10%_sampled_insta_df.csv"
[insta_df, user_ids, user_num, user_ids_kv, item_names, item_num, items_kv, G_user, G_item] = insta_load_data(ds_path)

assert len(G_item.keys()) == item_num and len(G_user.keys()) == user_num

Loading instacart dataset from path:dataset/Instacart/10%_sampled_insta_df.csv
totally 20620 unique users
totally 41521 unique items


3259469it [00:42, 77119.43it/s]


In [10]:
item_names[9]

'Organic Roast Beef'

In [11]:
# 把Instacart_LGCN_result_sorted里，第二个和第三个[]里的item_id转换成item_name，也就是test_list和pred_list
lgcn_res_sorted_item_name = {}
for k, v in lgcn_res_sorted.items():
    v[1] = [item_names[i] for i in v[1]]
    v[2] = [item_names[i] for i in v[2]]
    lgcn_res_sorted_item_name[k] = v

# 把Instacart_LGCN_tri_result_sorted里，第二个和第三个[]里的item_id转换成item_name，也就是test_list和pred_list
lgcn_tri_res_sorted_item_name = {}
for k, v in lgcn_tri_res_sorted.items():
    v[1] = [item_names[i] for i in v[1]]
    v[2] = [item_names[i] for i in v[2]]
    lgcn_tri_res_sorted_item_name[k] = v

# # 重新写进json
# with open(AB_test_dir + 'Instacart_LGCN_result_sorted_product_name.json', 'w') as f:
#     json.dump(lgcn_res_sorted_item_name, f)
# with open(AB_test_dir + 'Instacart_LGCN_tri_result_sorted_product_name.json', 'w') as f:
#     json.dump(lgcn_tri_res_sorted_item_name, f)

### Check the difference of LGCN recommend items and LGCN_tri recommend items

In [18]:
# 读入tri_graph_uidx2pidx.json，以便知道每个user被assign了哪些persona
with open('dataset/Instacart/tri_graph_uidx2pidx.json', 'r') as f:
    uid2pidx = json.load(f)
# 按user_id排序
uid2pidx_sorted = {int(k): v for k,v in uid2pidx.items()}

# 再读入tri_graph_tidx2pidx.json，以便知道每个item有哪些persona
with open('dataset/Instacart/tri_graph_tidx2pidx.json', 'r') as f:
    tidx2pidx = json.load(f)
# 按item_id排序
tidx2pidx_sorted = {int(k): v for k,v in tidx2pidx.items()}

In [19]:
len(uid2pidx), len(tidx2pidx), len(uid2pidx_sorted), len(tidx2pidx_sorted)

(18388, 41521, 18388, 41521)

In [20]:
str(uid2pidx_sorted[10268])

'[31, 37, 21, 30]'

In [21]:
# 对于lgcn_res_sorted_item_name和lgcn_tri_res_sorted_item_name，看看他们的pred_list里有哪些不一样的item_name
# 对于每个user，记录item_in_lgcn_not_in_lgcn_tri和item_in_lgcn_tri_not_in_lgcn

item_in_lgcn_not_in_lgcn_tri = {}
item_in_lgcn_tri_not_in_lgcn = {}
for k in lgcn_res_sorted_item_name.keys():
    v1 = lgcn_res_sorted_item_name[k]
    v2 = lgcn_tri_res_sorted_item_name[k]
    item_in_lgcn_not_in_lgcn_tri[k] = [i for i in v1[2] if i not in v2[2]]
    item_in_lgcn_tri_not_in_lgcn[k] = [i for i in v2[2] if i not in v1[2]]

# # 保存到json
# with open(AB_test_dir + 'item_in_lgcn_not_in_lgcn_tri.json', 'w') as f:
#     json.dump(item_in_lgcn_not_in_lgcn_tri, f)
# with open(AB_test_dir + 'item_in_lgcn_tri_not_in_lgcn.json', 'w') as f:
#     json.dump(item_in_lgcn_tri_not_in_lgcn, f)

In [22]:
# 找出在lgcn和lgcn_tri两个方法中被推荐item区别最大的user，也就是说，对于这个user，它在item_in_lgcn_not_in_lgcn_tri和item_in_lgcn_tri_not_in_lgcn里的item数量总数最多
max_diff_user = -1
max_diff_num = 0
for k in item_in_lgcn_not_in_lgcn_tri.keys():
    diff_num = len(item_in_lgcn_not_in_lgcn_tri[k]) + len(item_in_lgcn_tri_not_in_lgcn[k])
    if diff_num > max_diff_num:
        max_diff_num = diff_num
        max_diff_user = k

max_diff_user, max_diff_num


(1008, 200)

In [25]:
# 看看这个user的persona是什么
uid2pidx_sorted[max_diff_user]

[24, 18, 13, 25]

In [18]:
# 把这个用户的item_in_lgcn_not_in_lgcn_tri和item_in_lgcn_tri_not_in_lgcn里的item_name打印出来
# item_in_lgcn_not_in_lgcn_tri[max_diff_user], item_in_lgcn_tri_not_in_lgcn[max_diff_user]
# item_in_lgcn_not_in_lgcn_tri[5]
item_in_lgcn_tri_not_in_lgcn[15302]

['Organic Baby Spinach',
 'Cheez-It Baked Snack Crackers',
 'Organic Marinara Sauce',
 'Hazelnut Spread with Cocoa',
 'Smartwater',
 'Aged White Cheddar Popcorn',
 'Sinfully Sweet Campari Tomatoes',
 'Organic Blueberries',
 '8\\" Flour Tortillas De Harina',
 'Beef Hot Dogs',
 'Original Rice Krispies Treats',
 'Fresh Mozzarella All-Natural Sliced Cheese',
 'Grilled Chicken Strips',
 'Organic Chicken Stock',
 'Organic Baby Carrots',
 'Beef Dinner Franks',
 'Chocolate Chip Cookies',
 'Organic Turkey Breast',
 'Organic Strawberries',
 'Stir Fry Vegetables',
 'Veggie Cheese & Pretzel Snack Pac Prepacked',
 'Classic Hot Dog Buns',
 'Organic Green Beans',
 'Raw Shrimp',
 'Brioche Hamburger Buns',
 'Prosciutto Di Parma',
 'Raisin Bran',
 'Artichoke And Jalapeño Dip & Spread',
 'Organic Spinach And Cheese Ravioli',
 'Organic Spring Mix']

In [19]:
# 看item_in_lgcn_tri_not_in_lgcn[5][0]对应的persona是什么
# position_list = [3, 5, 8, 9, 10, 11, 13, 16, 22, 25, 26, 33, 34, 41]
count_lgcn = 0
count_lgcn_tri = 0
for i in range (0, 99):
    print(f'{i}: {tidx2pidx_sorted[items_kv[item_in_lgcn_not_in_lgcn_tri[15302][i]]]}')
    print(f'{i}: {tidx2pidx_sorted[items_kv[item_in_lgcn_tri_not_in_lgcn[15302][i]]]}')
    if 7 in tidx2pidx_sorted[items_kv[item_in_lgcn_not_in_lgcn_tri[15302][i]]]:
        count_lgcn += 1
    if 7 in tidx2pidx_sorted[items_kv[item_in_lgcn_tri_not_in_lgcn[15302][i]]]:
        count_lgcn_tri += 1
# tidx2pidx_sorted[items_kv[item_in_lgcn_tri_not_in_lgcn[5][3]]]
print(count_lgcn, count_lgcn_tri)



0: [31, 7, 2, 48, 47]
0: [42, 31, 37, 7, 35, 40, 45, 32]
1: [31, 14, 48, 11]
1: [21, 16, 8]
2: [31, 37, 11, 2, 34]
2: [42, 31, 37, 7, 45, 35]
3: [31, 10, 37, 49, 23, 34, 11, 36]
3: [16, 43, 23, 49]
4: [42, 3, 6, 10]
4: [31, 30, 17, 3]
5: [31, 37, 40, 9]
5: [16, 21, 37]
6: [31, 48, 49, 37]
6: [31, 37, 7, 19, 40, 41]
7: [42, 3, 31, 37, 7, 45, 38, 29]
7: [42, 31, 37, 9, 3, 7, 40, 45]
8: [31, 7, 35, 43, 34, 11, 36, 14]
8: [26, 4, 8, 16]
9: [31, 30, 3, 17, 42]
9: [10, 46, 28, 8, 16]
10: [31, 17, 37, 40, 9]
10: [26, 10, 16, 8]
11: [31, 3, 17, 7, 30, 37, 45]
11: [42, 31, 37, 2, 34, 48]
12: [31, 37, 17, 7, 9, 40, 34, 11]
12: [31, 17, 26, 11, 8, 36]
13: [42, 19, 37, 31]
13: [42, 31, 37, 3, 49, 45]
14: [37, 40, 7, 31, 41, 19, 42, 3]
14: [42, 31, 37, 3, 7, 45, 40, 22]
15: [31, 37, 7, 35, 23, 45, 11, 34]
15: [46, 28, 26, 16]
16: [42, 31, 37, 3, 7, 43, 40, 45]
16: [16, 21, 43]
17: [21, 16, 8]
17: [42, 31, 37, 49, 3, 45]
18: [42, 31, 3, 37, 7, 9, 45, 20]
18: [42, 31, 3, 37, 40, 7, 9]
19: [31, 0, 19,

IndexError: list index out of range

### Narrow down user range: 5 purchased items, top 20, overlap percentage inceases most: 2/20->5/20;

In [None]:
overlap_count_lgcn = {}
overlap_percentage_lgcn = {}
overlap_count_lgcn_tri = {}
overlap_percentage_lgcn_tri = {}
incresed_percentage = {}

for k in lgcn_res_sorted_item_name.keys():
# for k in range (1, 2):
    v1 = lgcn_res_sorted_item_name[k]
    v2 = lgcn_tri_res_sorted_item_name[k]
    # print(set(v1[2][:20]))
    # print(set(v1[1][0:]))
    overlap_count_lgcn[k] = len(set(v1[2][:20]) & set(v1[1][0:]))
    overlap_percentage_lgcn[k] = overlap_count_lgcn[k] / len(set(v1[1][0:]))
    # print(overlap_percentage_lgcn[k])
    overlap_count_lgcn_tri[k] = len(set(v2[2][:20]) & set(v2[1][0:]))
    overlap_percentage_lgcn_tri[k] = overlap_count_lgcn_tri[k] / len(set(v2[1][0:]))
    # print(overlap_percentage_lgcn_tri[k])
    incresed_percentage[k] = overlap_percentage_lgcn_tri[k] - overlap_percentage_lgcn[k]

# 找出increase_percentage最大的100个user
sorted_incresed_percentage = sorted(incresed_percentage.items(), key=lambda x: x[1], reverse=True)
# sorted_incresed_percentage[:100]
# 把这些user写进一个list
top_100_incresed_percentage = [k for k, v in sorted_incresed_percentage[:100]]
print(top_100_incresed_percentage)

[255, 330, 511, 520, 2194, 2520, 2993, 3558, 4079, 4195, 5345, 6030, 6073, 6272, 6543, 8204, 8432, 8853, 9226, 9774, 10218, 10636, 11195, 11372, 11454, 11464, 12485, 12609, 14534, 14952, 14981, 15858, 15993, 18282, 19348, 20124, 20423, 13340, 443, 2228, 5393, 5997, 7005, 9707, 10616, 12081, 13152, 14353, 356, 494, 569, 584, 742, 968, 1325, 1384, 1400, 1407, 1787, 1832, 1987, 2361, 2823, 2902, 3019, 3108, 3816, 3929, 3984, 4296, 4310, 4436, 4655, 5443, 5974, 6183, 6191, 6230, 6240, 6686, 6859, 8090, 8348, 8358, 8408, 8498, 8971, 9221, 9395, 9580, 9737, 9908, 9990, 10339, 10497, 10570, 10753, 11039, 11049, 11072]


### For those newly corrected chosen items, check if they are related with assigned personas;

In [None]:
new_correct_items = {}
removed_wrong_items = {}

for k in top_100_incresed_percentage:
    v1 = lgcn_res_sorted_item_name[k]
    v2 = lgcn_tri_res_sorted_item_name[k]
    new_correct_items[k] = [i for i in v2[2][:20] if i not in v1[2][:20] and i in v2[1]]
    removed_wrong_items[k] = [i for i in v1[2][:20] if i not in v2[2][:20] and i not in v1[1]]

In [None]:
# new_correct_items
# 检查这些new_correct_items里的item被assign的persona，以及有多少item被assign的persona也是这个user的persona
persona_related_item_count = {}
# new_correct_items_persona_count
for k in top_100_incresed_percentage:
    # print(f'{k}: {uid2pidx_sorted[k]}')
    for i in new_correct_items[k]:
        # print(f'{i}: {tidx2pidx_sorted[items_kv[i]]}')
        if len(set(tidx2pidx_sorted[items_kv[i]]) & set(uid2pidx_sorted[k])) > 0:
            persona_related_item_count[k] = persona_related_item_count.get(k, 0) + 1
            persona_related_item_count[k] = persona_related_item_count[k] / len(new_correct_items[k]) # ratio

# 算一下平均的ratio
average_ratio_related_with_persona = sum(persona_related_item_count.values()) / len(persona_related_item_count)
print(average_ratio_related_with_persona)
# len(persona_related_item_count)
# persona_related_item_count

KeyError: 330

In [None]:
# 把这些new correct对于每个user的item写进json
with open(AB_test_dir + 'new_correct_items.json', 'w') as f:
    json.dump(new_correct_items, f)

In [None]:
# removed_wrong_items

### Check about top 20 occurred items

In [12]:
# 从G_item里找出最多的20个item
item_count = {}
for k in G_item.keys():
    item_count[k] = len(G_item[k])
sorted_item_count = sorted(item_count.items(), key=lambda x: x[1], reverse=True)
# sorted_item_count[:20]
# 把最多的20个item写进一个list
overall_top_20_item = [k for k, v in sorted_item_count[:20]]
# 然后把id转换成name
overall_top_20_item_name = [item_names[i] for i in overall_top_20_item]
print(overall_top_20_item_name)

['Banana', 'Bag of Organic Bananas', 'Organic Strawberries', 'Organic Baby Spinach', 'Large Lemon', 'Limes', 'Strawberries', 'Organic Hass Avocado', 'Organic Avocado', 'Organic Blueberries', 'Organic Garlic', 'Organic Yellow Onion', 'Organic Zucchini', 'Organic Raspberries', 'Cucumber Kirby', 'Organic Grape Tomatoes', 'Yellow Onions', 'Seedless Red Grapes', 'Organic Lemon', 'Organic Baby Carrots']


In [None]:
# 把这些removed wrong对于每个user的item写进json
with open(AB_test_dir + 'removed_wrong_popular_items.json', 'w') as f:
    # 如果这些remove_wrong item是最热门的20个item之一，则写进json，否则不写
    removed_wrong_items_top_20 = {}
    for k in removed_wrong_items.keys():
        removed_wrong_items_top_20[k] = [i for i in removed_wrong_items[k] if i in overall_top_20_item_name]
    json.dump(removed_wrong_items_top_20, f)

In [None]:
# 首先检查对于lgcn_res_sorted_item_name的top20预测，有多少正确的item在overall_top_20_item_name里，有多少不正确的item在overall_top_20_item里
correct_item_in_top_20_item = {}
wrong_item_in_top_20_item = {}
# for k in top_100_incresed_percentage:
for k in lgcn_res_sorted_item_name.keys():
    v1 = lgcn_res_sorted_item_name[k]
    correct_item_in_top_20_item[k] = [i for i in v1[2][:20] if i in overall_top_20_item_name and i in v1[1]]
    wrong_item_in_top_20_item[k] = [i for i in v1[2][:20] if i in overall_top_20_item_name and i not in v1[1]]

# correct_item_in_top_20_item
count_correct = 0
for k in correct_item_in_top_20_item.keys():
    count_correct += len(correct_item_in_top_20_item[k])
print(count_correct)

# wrong_item_in_top_20_item
count_wrong = 0
for k in wrong_item_in_top_20_item.keys():
    count_wrong += len(wrong_item_in_top_20_item[k])
print(count_wrong)

14578
164107


In [None]:
# 类似的对于lgcn_tri_res_sorted_item_name的top20预测，有多少正确的item在overall_top_20_item_name里，有多少不正确的item在overall_top_20_item里
correct_item_in_top_20_item_tri = {}
wrong_item_in_top_20_item_tri = {}
# for k in top_100_incresed_percentage:
for k in lgcn_tri_res_sorted_item_name.keys():
    v2 = lgcn_tri_res_sorted_item_name[k]
    correct_item_in_top_20_item_tri[k] = [i for i in v2[2][:20] if i in overall_top_20_item_name and i in v2[1]]
    wrong_item_in_top_20_item_tri[k] = [i for i in v2[2][:20] if i in overall_top_20_item_name and i not in v2[1]]
13
# correct_item_in_top_20_item_tri
count_correct_tri = 0
for k in correct_item_in_top_20_item_tri.keys():
    count_correct_tri += len(correct_item_in_top_20_item_tri[k])
print(count_correct_tri)
# wrong_item_in_top_20_item_tri
count_wrong_tri = 0
for k in wrong_item_in_top_20_item_tri.keys():
    count_wrong_tri += len(wrong_item_in_top_20_item_tri[k])
print(count_wrong_tri)

13371
123091


### Specific Case Study:

In [None]:
# Part 1: New Correct Items Due to Right Persona

# Case 0:
# user 572 persona
print(uid2pidx_sorted[572]) # [1, 3, 9, 14]
# new correct items
print(tidx2pidx_sorted[items_kv["TEA TIME PARTY BUNTING"]]) # [3, 14, 10] -> 3, 14
# where 3 is "Seasonal and Festive Decorator" and 14 is "Party and Event Planners"
print("====================================")

# Case 1:
# user 960 persona
print(uid2pidx_sorted[960]) # [2, 3, 4, 5, 6]
# new correct items
print(tidx2pidx_sorted[items_kv["PENNY FARTHING BIRTHDAY CARD"]]) # [2, 7, 12] -> 2
print(tidx2pidx_sorted[items_kv["CARD WEDDING DAY"]]) # [14, 7, 4] -> 4
# where 2 is "Vintage and Retro Enthusiast" and 4 is "Crafting and DIY Hobbyist"
print("====================================")


[1, 3, 9, 14]
[3, 14, 10]
[3, 2, 4, 5, 6]
[2, 7, 12]
[14, 7, 4]


In [None]:
# Part 2: Wrong Removed Popular Items

# Case 0:
# user 572 persona
print(uid2pidx_sorted[572]) # [1, 3, 9, 14]
# removed wrong items
print(tidx2pidx_sorted[items_kv["ASSORTED COLOUR BIRD ORNAMENT"]]) # [1, 18, 7, 9] -> 7, 18
print(tidx2pidx_sorted[items_kv["HEART OF WICKER LARGE"]]) # [1, 2, 4] -> 2, 4
# where 2 is "Vintage and Retro Enthusiast", 4 is "Crafting and DIY Hobbyist", 7 is "Child and Family-centric Shopper" and 18 is "Animal and Pet Advocates"
print("====================================")

# Case 1:
# user 479 persona
print(uid2pidx_sorted[479]) # [1, 5, 7]
# removed wrong items
print(tidx2pidx_sorted[items_kv["NATURAL SLATE HEART CHALKBOARD"]]) # [1, 4, 12] -> 5, 7
# where 5 is "Baking and Confectionery Fans" and 7 is "Child and Family-centric Shopper"
print("====================================")





[1, 3, 9, 14]
[1, 18, 7, 9]
[1, 2, 4]
[1, 5, 7]
[1, 4, 12]


### Only those with 10 test_list

In [13]:
# Part 3: Narrow down a range for user: only select those with 5-10 items in the test set
# 选择test set里item数量=10的user，以及他们在lgcn_res_sorted_item_name和lgcn_tri_res_sorted_item_name里的item，保存为两个json
user_item_count = {}
for k in lgcn_res_sorted_item_name.keys():
    user_item_count[k] = len(lgcn_res_sorted_item_name[k][1])
# user_item_count
selected_user = [k for k, v in user_item_count.items() if v >= 10 and v <= 20]
len(selected_user)

selected_lgcn_res_sorted_item_name = {k: lgcn_res_sorted_item_name[k] for k in selected_user}
selected_lgcn_tri_res_sorted_item_name = {k: lgcn_tri_res_sorted_item_name[k] for k in selected_user}

# 保存到json
# with open(AB_test_dir + 'selected_lgcn_res_sorted_item_name.json', 'w') as f:
#     json.dump(selected_lgcn_res_sorted_item_name, f)
# with open(AB_test_dir + 'selected_lgcn_tri_res_sorted_item_name.json', 'w') as f:
#     json.dump(selected_lgcn_tri_res_sorted_item_name, f)


In [14]:
# 看看user一致不一致
assert sorted(selected_lgcn_res_sorted_item_name.keys()) == sorted(selected_lgcn_tri_res_sorted_item_name.keys())

In [15]:
# Part 4: 在这些test_list长度为10的case中，找到一个典型的user，对于这个user，看他的pred_list里的top 10，对比lgcn和lgcn_tri，看看有没有new correct items和removed wrong popular items

# 首先看看这些user在lgcn和lgcn_tri里的top 10有多少overlap
overlap_count_lgcn_10 = {}
overlap_percentage_lgcn_10 = {}
overlap_count_lgcn_tri_10 = {}
overlap_percentage_lgcn_tri_10 = {}
incresed_percentage_10 = {}

for k in selected_lgcn_res_sorted_item_name.keys():
    v1 = selected_lgcn_res_sorted_item_name[k]
    v2 = selected_lgcn_tri_res_sorted_item_name[k]
    # print(set(v1[2][:20]))
    # print(set(v1[1][0:]))
    overlap_count_lgcn_10[k] = len(set(v1[2][:10]) & set(v1[1][0:]))
    overlap_percentage_lgcn_10[k] = overlap_count_lgcn_10[k] / len(set(v1[1][0:]))
    # print(overlap_percentage_lgcn[k])
    overlap_count_lgcn_tri_10[k] = len(set(v2[2][:10]) & set(v2[1][0:]))
    overlap_percentage_lgcn_tri_10[k] = overlap_count_lgcn_tri_10[k] / len(set(v2[1][0:]))
    # print(overlap_percentage_lgcn_tri[k])
    incresed_percentage_10[k] = overlap_percentage_lgcn_tri_10[k] - overlap_percentage_lgcn_10[k]

# 找出increase_percentage最大的100个user
sorted_incresed_percentage_10 = sorted(incresed_percentage_10.items(), key=lambda x: x[1], reverse=True)
print(sorted_incresed_percentage_10[:100])

# 把这些user写进一个list
top_100_incresed_percentage_10 = [k for k, v in sorted_incresed_percentage_10[:100]]
# print(top_100_incresed_percentage_10)

[(1657, 0.30000000000000004), (12972, 0.30000000000000004), (8123, 0.2727272727272727), (10780, 0.23076923076923078), (12074, 0.21428571428571427), (561, 0.2), (3897, 0.2), (10268, 0.2), (10372, 0.2), (14710, 0.2), (20407, 0.2), (10665, 0.19999999999999998), (11644, 0.19999999999999998), (15594, 0.19999999999999998), (18636, 0.19999999999999998), (2735, 0.18181818181818182), (2986, 0.18181818181818182), (6570, 0.18181818181818182), (8405, 0.18181818181818182), (14933, 0.18181818181818182), (16884, 0.18181818181818182), (2852, 0.1818181818181818), (385, 0.16666666666666669), (4786, 0.16666666666666669), (5681, 0.16666666666666669), (16226, 0.16666666666666669), (1012, 0.16666666666666666), (12706, 0.16666666666666666), (15251, 0.16666666666666666), (16924, 0.16666666666666666), (17984, 0.16666666666666666), (10370, 0.15789473684210525), (5712, 0.15384615384615385), (9187, 0.15384615384615385), (11690, 0.15384615384615385), (12831, 0.15384615384615385), (13776, 0.15384615384615385), (166

In [16]:
len(sorted_incresed_percentage_10)

6708

In [30]:
# Part 5: 具体看看某个user和其推荐情况
specific_user = 12972

# 打印1657这个user的test_list
print("For user " + str(specific_user) + ": items that he really purchased are:" + str(selected_lgcn_res_sorted_item_name[specific_user][1]))
# 再打印1657这个user的pred_list（使用LGCN的top 10）
print("For user " + str(specific_user) + ": items that LGCN recommended are:" + str(selected_lgcn_res_sorted_item_name[specific_user][2][:20]))
# 再打印1657这个user的pred_list（使用LGCN_tri的top 10）
print("For user " + str(specific_user) + ": items that LGCN_tri recommended are:" + str(selected_lgcn_tri_res_sorted_item_name[specific_user][2][:20]))
print("====================================")

# user specific_user persona
print("And user " +str(specific_user) + " has persona: " + str(uid2pidx_sorted[specific_user])) # [5, 3, 9, 14, 8, 7]
# where 5 is "Baking and Confectionery Fans", 3 is "Seasonal and Festive Decorator";
# 7 is "Child and Family-centric Shopper", 8 is "Gardening Lover", 
# 9 is "Comfort and Coziness Seeker" and 14 is "Party and Event Planners"

# find new correct items
new_correct_items_specific = [i for i in selected_lgcn_tri_res_sorted_item_name[specific_user][2][:10] if i not in selected_lgcn_res_sorted_item_name[specific_user][2][:10] and i in selected_lgcn_tri_res_sorted_item_name[specific_user][1][:10]]
print("For user " + str(specific_user) + ", new correct items are:" + str(new_correct_items_specific))
# find removed wrong popular items
removed_wrong_items_specfic = [i for i in selected_lgcn_res_sorted_item_name[specific_user][2][:10] if i not in selected_lgcn_tri_res_sorted_item_name[specific_user][2][:10] and i not in selected_lgcn_res_sorted_item_name[specific_user][1][:10] and i in overall_top_20_item_name]
print("For user " + str(specific_user) + ", removed wrong popular items are:" + str(removed_wrong_items_specfic))
print("====================================")

# new correct item analysis
print("NEW CORRECT ITEMS:")
persona_related_item_count_specific = {}
for i in new_correct_items_specific:
    print("Item: " + i + " is related with persona: " + str(tidx2pidx_sorted[items_kv[i]]))
    if len(set(tidx2pidx_sorted[items_kv[i]]) & set(uid2pidx_sorted[specific_user])) > 0:
        print("And this item is related with user "+ str(specific_user) + "'s persona: " + str(set(tidx2pidx_sorted[items_kv[i]]) & set(uid2pidx_sorted[specific_user])))
    print("====================================")

# removed wrong items analysis
print("REMOVED WRONG ITEMS:")
if len(removed_wrong_items_specfic) == 0:
    print("No removed wrong popular items for user " + str(specific_user))
for i in removed_wrong_items_specfic:
    print("Item: " + i + " is related with persona: " + str(tidx2pidx_sorted[items_kv[i]]))
    if len(set(tidx2pidx_sorted[items_kv[i]]) & set(uid2pidx_sorted[specific_user])) > 0:
        print("And this item is related with user "+ str(specific_user) + "'s persona: " + str(set(tidx2pidx_sorted[items_kv[i]]) & set(uid2pidx_sorted[specific_user])))
    print("====================================")


For user 12972: items that he really purchased are:['Organic Cucumber', 'Organic Granny Smith Apple', 'Pine Fresh Natural Toilet Cleaner', 'Hass Avocado', 'Organic Carrot Bunch', 'Organic Red Onion', 'Organic Small Bunch Celery', 'Thin Rice Noodles', 'Organic Lemon', 'Organic Raspberries']
For user 12972: items that LGCN recommended are:['Organic Baby Spinach', 'Organic Strawberries', 'Organic Garlic', 'Organic Yellow Onion', 'Limes', 'Organic Avocado', 'Banana', 'Large Lemon', 'Organic Grape Tomatoes', 'Organic Lemon', 'Organic Red Onion', 'Organic Raspberries', 'Organic Small Bunch Celery', 'Apple Honeycrisp Organic', 'Organic Blueberries', 'Organic Baby Carrots', 'Organic Cucumber', 'Organic Whole Milk', 'Organic Baby Arugula', 'Asparagus']
For user 12972: items that LGCN_tri recommended are:['Organic Baby Spinach', 'Organic Yellow Onion', 'Organic Lemon', 'Organic Garlic', 'Organic Strawberries', 'Limes', 'Organic Red Onion', 'Organic Small Bunch Celery', 'Organic Raspberries', 'Or

In [None]:
persona2idx_whole = {'The Homebody': 0,
       'The Tea Lover': 1,
       'The Dairy-free Shopper': 2,
       'The Eco-friendly Shopper': 3,
       'The International Cuisine Lover': 4,
       'The Pet Owner': 5,
       'The Home Entertainer': 6,
       'The Vegan': 7,
       'The Lunchbox Packer': 8,
       'The Smoothie Maker': 9,
       'The Busy Parent': 10,
       'The High-protein Shopper': 11,
       'The Seafood Lover': 12,
       'The Wine Connoisseur': 13,
       'The Nut-free Shopper': 14,
       'The Luxury Lover': 15,
       'The Comfort Food Lover': 16,
       'The Fitness Fanatic': 17,
       'The Artisanal Cheese Lover': 18,
       'The Gourmet Chef': 19,
       'The Frozen Food Fan': 20,
       'The Snack Adventurer': 21,
       'The Baby Care Provider': 22,
       'The Breakfast Lover': 23,
       'The Party Planner': 24,
       'The DIY Cocktail Mixer': 25,
       'The Quick Meals Shopper': 26,
       'The Tech Savvy': 27,
       'The Grill Master': 28,
       'The Fair-trade Supporter': 29,
       'The Hydration Focused': 30,
       'The Health Enthusiast': 31,
       'The Allergy-conscious Shopper': 32,
       'The Spice Explorer': 33,
       'The Keto Diet Follower': 34,
       'The Gluten-free Shopper': 35,
       'The Paleo Diet Follower': 36,
       'The Organic Foodie': 37,
       'The Zero-waste Advocate': 38,
       'The Craft Beer Enthusiast': 39,
       'The Fresh Produce Fanatic': 40,
       'The Localvore': 41,
       'The Conscious Consumer': 42,
       'The Baker': 43,
       'The Supplements User': 44,
       'The Non-GMO Shopper': 45,
       'The Meat Lover': 46,
       'The Sugar-free Shopper': 47,
       'The Low-sodium Shopper': 48,
       'The Health Nut': 49,
       'The Coffee Aficionado': 50}

In [None]:
# 类似的看看701这个user和其推荐情况

# 打印701这个user的test_list
print("For user 701, items that he really purchased are:" + str(selected_lgcn_res_sorted_item_name[701][1]))
# 再打印701这个user的pred_list（使用LGCN的top 10）
print("For user 701, items that LGCN recommended are:" + str(selected_lgcn_res_sorted_item_name[701][2][:10]))
# 再打印701这个user的pred_list（使用LGCN_tri的top 10）
print("For user 701, items that LGCN_tri recommended are:" + str(selected_lgcn_tri_res_sorted_item_name[701][2][:10]))

# user 701 persona
print("And user 701 has persona: " + str(uid2pidx_sorted[701])) # [3, 5, 4, 2, 1, 9, 7]

# 首先找出对于701有哪些new correct items
new_correct_items_701 = [i for i in selected_lgcn_tri_res_sorted_item_name[701][2][:10] if i not in selected_lgcn_res_sorted_item_name[701][2][:10] and i in selected_lgcn_tri_res_sorted_item_name[701][1][:10]]
print("For user 701, new correct items are:" + str(new_correct_items_701))
# 再找出对于701有哪些removed wrong popular items
removed_wrong_items_701 = [i for i in selected_lgcn_res_sorted_item_name[701][2][:10] if i not in selected_lgcn_tri_res_sorted_item_name[701][2][:10] and i not in selected_lgcn_res_sorted_item_name[701][1][:10] and i in overall_top_20_item_name]
print("For user 701, removed wrong popular items are:" + str(removed_wrong_items_701))

# new correct items
print("NEW CORRECT ITEMS:")

print("Item: PINK FLORAL FELTCRAFT SHOULDER BAG: " + str(tidx2pidx_sorted[items_kv["PINK FLORAL FELTCRAFT SHOULDER BAG"]])) # [16, 4, 7] -> 4, 7
print("And this item is related with user 701's persona: 'Crafting and DIY Hobbyist' and 'Collector of Unique and Novelty Items'")
print("Item: FELTCRAFT PRINCESS OLIVIA DOLL: " + str(tidx2pidx_sorted[items_kv["FELTCRAFT PRINCESS OLIVIA DOLL"]])) # [4, 7, 6] -> 4, 7
print("And this item is related with user 701's persona: 'Crafting and DIY Hobbyist' and 'Collector of Unique and Novelty Items'")

# removed wrong items
print("REMOVED WRONG ITEMS:")
print("Item: REGENCY CAKESTAND 3 TIER: " + str(tidx2pidx_sorted[items_kv["REGENCY CAKESTAND 3 TIER"]])) # [5, 1, 2, 0, 10] -> 0, 10
print("And this item is related with user 701's persona: 'Kitchen and Culinary Enthusiast' and 'Tea and Coffee Lover'")
print("Item: JAM MAKING SET WITH JARS: " + str(tidx2pidx_sorted[items_kv["JAM MAKING SET WITH JARS"]])) # [4, 5, 0, 13] -> 0, 13
print("And this item is related with user 701's persona: 'Kitchen and Culinary Enthusiast' and 'Eco-friendly and Sustainable Goods Supporter'")
# print("Item: WHITE HANGING HEART T-LIGHT HOLDER: " + str(tidx2pidx_sorted[items_kv["WHITE HANGING HEART T-LIGHT HOLDER"]])) # [1, 2, 3] -> None
# where 0 is "Kitchen and Culinary Enthusiast" and 10 is "Tea and Coffee Lover" and 13 is "Eco-friendly and Sustainable Goods Supporter"

print("====================================")


For user 701, items that he really purchased are:['WOODEN BOX OF DOMINOES', 'CHRISTMAS CRAFT WHITE FAIRY', '3 STRIPEY MICE FELTCRAFT', 'PAPER BUNTING VINTAGE PAISLEY', 'PINK FLORAL FELTCRAFT SHOULDER BAG', 'FELTCRAFT PRINCESS OLIVIA DOLL', 'PACK OF 12 SUKI TISSUES', 'TRADITIONAL WOODEN SKIPPING ROPE', 'FELTCRAFT DOLL EMILY', 'WORLD WAR 2 GLIDERS ASSTD DESIGNS']
For user 701, items that LGCN recommended are:['PINK CREAM FELT CRAFT TRINKET BOX', '3 STRIPEY MICE FELTCRAFT', 'PINK BLUE FELT CRAFT TRINKET BOX', 'REGENCY CAKESTAND 3 TIER', 'FELTCRAFT DOLL MOLLY', 'JAM MAKING SET WITH JARS', 'PARTY BUNTING', 'FELTCRAFT DOLL EMILY', 'WHITE HANGING HEART T-LIGHT HOLDER', 'TRADITIONAL KNITTING NANCY']
For user 701, items that LGCN_tri recommended are:['PINK CREAM FELT CRAFT TRINKET BOX', 'PINK BLUE FELT CRAFT TRINKET BOX', 'PARTY BUNTING', 'FELTCRAFT DOLL MOLLY', '3 STRIPEY MICE FELTCRAFT', 'SPOTTY BUNTING', 'PINK FLORAL FELTCRAFT SHOULDER BAG', 'FELTCRAFT DOLL EMILY', 'FELTCRAFT PRINCESS OLIVIA

In [None]:
# 1. 这两个item的popularity在所有item中分别处于什么位置：首先把这两个item_name对应到index，然后在sorted_item_count里找到这两个item的位置
print("For item 'PINK FLORAL FELTCRAFT SHOULDER BAG':")
print(items_kv["PINK FLORAL FELTCRAFT SHOULDER BAG"])
# print(item_count[items_kv["PINK FLORAL FELTCRAFT SHOULDER BAG"]])
print("The popularity of this item is: " + str(item_count[items_kv["PINK FLORAL FELTCRAFT SHOULDER BAG"]]))
print(sorted_item_count[601])
print("====================================")

print("For item 'FELTCRAFT PRINCESS OLIVIA DOLL':")
print(items_kv["FELTCRAFT PRINCESS OLIVIA DOLL"])
# print(item_count[items_kv["FELTCRAFT PRINCESS OLIVIA DOLL"]])
print("The popularity of this item is: " + str(item_count[items_kv["FELTCRAFT PRINCESS OLIVIA DOLL"]]))
print(sorted_item_count[379])
print("====================================")


# print(overall_top_20_item_name)
# print("====================================")

# For original recommended popular 3 items:['REGENCY CAKESTAND 3 TIER', 'JAM MAKING SET WITH JARS', 'WHITE HANGING HEART T-LIGHT HOLDER']
print("For item 'REGENCY CAKESTAND 3 TIER':")
# print(items_kv["REGENCY CAKESTAND 3 TIER"])
# print(sorted_item_count[items_kv["REGENCY CAKESTAND 3 TIER"]])
print("The popularity of this item is: " + str(item_count[items_kv["REGENCY CAKESTAND 3 TIER"]]))
print("====================================")


print("For item 'JAM MAKING SET WITH JARS':")
# print(items_kv["JAM MAKING SET WITH JARS"])
# print(sorted_item_count[items_kv["JAM MAKING SET WITH JARS"]])
print("The popularity of this item is: " + str(item_count[items_kv["JAM MAKING SET WITH JARS"]]))
print("====================================")


print("For item 'WHITE HANGING HEART T-LIGHT HOLDER':")
# print(items_kv["WHITE HANGING HEART T-LIGHT HOLDER"])
# print(sorted_item_count[items_kv["WHITE HANGING HEART T-LIGHT HOLDER"]])
print("The popularity of this item is: " + str(item_count[items_kv["WHITE HANGING HEART T-LIGHT HOLDER"]]))
print("====================================")


# user 701
user_index = 701
# item 'PINK FLORAL FELTCRAFT SHOULDER BAG'
item_index = items_kv["PINK FLORAL FELTCRAFT SHOULDER BAG"]
# item 'FELTCRAFT PRINCESS OLIVIA DOLL'
item_index_2 = items_kv["FELTCRAFT PRINCESS OLIVIA DOLL"]

# user 701 <-> item 'PINK FLORAL FELTCRAFT SHOULDER BAG'
if G_user.get(user_index) is not None:
    if item_index in G_user[user_index]:
        print("User 701 and item 'PINK FLORAL FELTCRAFT SHOULDER BAG' are connected.")
    else:
        print("User 701 and item 'PINK FLORAL FELTCRAFT SHOULDER BAG' are not connected.")
print("====================================")

# user 701 <-> item 'FELTCRAFT PRINCESS OLIVIA DOLL'
if G_user.get(user_index) is not None:
    if item_index_2 in G_user[user_index]:
        print("User 701 and item 'FELTCRAFT PRINCESS OLIVIA DOLL' are connected.")
    else:
        print("User 701 and item 'FELTCRAFT PRINCESS OLIVIA DOLL' are not connected.")
print("====================================")

# 在train的graph里找到这两个item的连接的user，看有几跳
with open('dataset/Instacart/tri_graph_uidx2tidx_train.json', 'r') as f:
    uidx2tidx_train = json.load(f)

print(item_index, item_index_2)




In [None]:
# 在train的graph里找到这两个item的连接的user，看有几跳
with open('dataset/Instacart/tri_graph_uidx2tidx_train.json', 'r') as f:
    uidx2tidx_train = json.load(f)
# Check the training set of user 1657

dict_values([[9, 8, 38, 28, 33, 37, 57, 26, 47, 53, 6, 22, 54, 49, 21, 2, 4, 34, 11, 63, 29, 13, 24, 12, 42, 56, 19, 15, 31, 16, 20, 10, 41, 36, 43, 45, 62, 17, 60, 0, 50, 35, 32, 1, 66, 14, 61, 5, 46, 55, 52, 27, 23, 59], [5587, 332, 10831, 8254, 4340, 234, 11712, 10835, 3531, 1121, 2863, 8659, 1097, 3712, 403, 4903, 484, 3147, 11824, 196, 6952, 14505, 2162, 5281, 312, 3246, 435, 213, 2160, 3630, 11274, 1583, 944, 11502, 7262, 14900, 320, 44, 2271, 17223, 14304, 3261, 3209, 7513, 3180, 1039, 3814], [2333, 2032, 10036, 1831, 7879, 9122, 11447, 15609, 6868, 2845, 405, 11920, 1318, 5766, 6805, 2210, 1331, 2904, 16315, 5484, 27496, 6569, 1851, 7695, 28706, 4131, 218, 11889, 1026, 3877, 3769, 2775, 13324, 12479, 130, 114, 113, 3568, 5496, 9952, 6647, 6577, 16840, 766, 5492, 3876, 7689, 2153, 1097, 5229, 163, 3814, 44, 1852, 7673, 5651, 17163, 10928, 5342, 721, 6642, 5, 6808, 57, 18663, 15453, 473, 164, 1598, 1681, 742, 18373, 4990, 7134, 6155, 875, 635, 27370, 1432, 786, 3199, 9659, 2327, 

In [None]:
import itertools

value_1657 = next(itertools.islice(uidx2tidx_train.values(), 1656, None))
# 把value 1657里的item index转换成item name
item_names_1657 = [item_names[i] for i in value_1657]
print(item_names_1657)
print(len(item_names_1657))

['Blueberry Muffins', 'Baby Cucumbers', 'Glass Cleaner', 'Heavy Duty Scrub Sponge', 'Cinnamon Raisin Bagels', 'Whole Grain Cheddar Baked Snack Crackers', 'XL Pick-A-Size Paper Towel Rolls', 'Heavy Duty Aluminum Foil', 'Fresh Asparagus', 'Tall Kitchen Bag With Febreze Odor Shield', 'Cherrios Honey Nut', 'Organic Sweet Cherries', 'Strawberries', 'Sparkling Mineral Water', 'Cheez-It Cheddar Cracker', 'Plain Bagels', 'Organic Blueberries']
17


In [None]:
# 首先检查在uidx2tidx_train里，对于用户701，这两个item是否有连接
if uidx2tidx_train.get(str(user_index)) is not None:
    if item_index in uidx2tidx_train[str(user_index)]:
        print("User 701 and item 'PINK FLORAL FELTCRAFT SHOULDER BAG' are connected.")
    else:
        print("User 701 and item 'PINK FLORAL FELTCRAFT SHOULDER BAG' are not connected.")
print("====================================")

if uidx2tidx_train.get(str(user_index)) is not None:
    if item_index_2 in uidx2tidx_train[str(user_index)]:
        print("User 701 and item 'FELTCRAFT PRINCESS OLIVIA DOLL' are connected.")
    else:
        print("User 701 and item 'FELTCRAFT PRINCESS OLIVIA DOLL' are not connected.")
print("====================================")

# 但是可以检查uidx2tidx，是否在701的item里有别的item，这个item连接到另一个user，然后另一个user的item里有这两个item
items_for_701 = uidx2tidx_train[str(user_index)]
num_of_connection_user = 0

for item in items_for_701:
    for user in uidx2tidx_train.keys():
        if user != str(user_index):
            if item in uidx2tidx_train[user]:
                if item_index in uidx2tidx_train[user]: # and item_index_2 in uidx2tidx_train[user]:
                    # print("User 701 and item 'PINK FLORAL FELTCRAFT SHOULDER BAG' and 'FELTCRAFT PRINCESS OLIVIA DOLL' are connected through another user.")
                    # print("And this user is: " + str(user))
                    num_of_connection_user += 1

print("User 701 and item 'PINK FLORAL FELTCRAFT SHOULDER BAG' and 'FELTCRAFT PRINCESS OLIVIA DOLL' are connected through another user.")
print("And there are " + str(num_of_connection_user) + " users that connect them.")


User 701 and item 'PINK FLORAL FELTCRAFT SHOULDER BAG' are not connected.
User 701 and item 'FELTCRAFT PRINCESS OLIVIA DOLL' are not connected.
User 701 and item 'PINK FLORAL FELTCRAFT SHOULDER BAG' and 'FELTCRAFT PRINCESS OLIVIA DOLL' are connected through another user.
And there are 758 users that connect them.


In [None]:
wrong_item_index_1 = items_kv["REGENCY CAKESTAND 3 TIER"]
wrong_item_index_2 = items_kv["JAM MAKING SET WITH JARS"]
wrong_item_index_3 = items_kv["WHITE HANGING HEART T-LIGHT HOLDER"]

# 再检查wrong item是不是和user 701有连接
if uidx2tidx_train.get(str(user_index)) is not None:
    if wrong_item_index_1 in uidx2tidx_train[str(user_index)]:
        print("User 701 and item 'REGENCY CAKESTAND 3 TIER' are connected.")
    else:
        print("User 701 and item 'REGENCY CAKESTAND 3 TIER' are not connected.")
print("====================================")

if uidx2tidx_train.get(str(user_index)) is not None:
    if wrong_item_index_2 in uidx2tidx_train[str(user_index)]:
        print("User 701 and item 'JAM MAKING SET WITH JARS' are connected.")
    else:
        print("User 701 and item 'JAM MAKING SET WITH JARS' are not connected.")
print("====================================")

if uidx2tidx_train.get(str(user_index)) is not None:
    if wrong_item_index_3 in uidx2tidx_train[str(user_index)]:
        print("User 701 and item 'WHITE HANGING HEART T-LIGHT HOLDER' are connected.")
    else:
        print("User 701 and item 'WHITE HANGING HEART T-LIGHT HOLDER' are not connected.")
print("====================================")

# 但是可以检查uidx2tidx，是否在701的item里有别的item，这个item连接到另一个user，然后另一个user的item里有这三个错误item
items_for_701 = uidx2tidx_train[str(user_index)]
num_of_connection_user_wrong = 0

for item in items_for_701:
    for user in uidx2tidx_train.keys():
        if user != str(user_index):
            if item in uidx2tidx_train[user]:
                if wrong_item_index_3 in uidx2tidx_train[user]: # and wrong_item_index_2 in uidx2tidx_train[user] and wrong_item_index_3 in uidx2tidx_train[user]:
                    # print("User 701 and item 'REGENCY CAKESTAND 3 TIER' and 'JAM MAKING SET WITH JARS' and 'WHITE HANGING HEART T-LIGHT HOLDER' are connected through another user.")
                    # print("And this user is: " + str(user))
                    num_of_connection_user_wrong += 1

print("User 701 and item 'REGENCY CAKESTAND 3 TIER' and 'JAM MAKING SET WITH JARS' and 'WHITE HANGING HEART T-LIGHT HOLDER' are connected through another user.")
print("And there are " + str(num_of_connection_user_wrong) + " users that connect them.")

User 701 and item 'REGENCY CAKESTAND 3 TIER' are not connected.
User 701 and item 'JAM MAKING SET WITH JARS' are not connected.
User 701 and item 'WHITE HANGING HEART T-LIGHT HOLDER' are not connected.
User 701 and item 'REGENCY CAKESTAND 3 TIER' and 'JAM MAKING SET WITH JARS' and 'WHITE HANGING HEART T-LIGHT HOLDER' are connected through another user.
And there are 1878 users that connect them.


In [None]:
# uidx为701的user的uid
for user in user_ids_kv.keys():
    if user_ids_kv[user] == 1657:
        print(user)
        break

1657
