In [47]:
import os
import json
from tqdm import tqdm
import re
import pickle

In [48]:
## nodes: items, brands
## item features: title, description, category, price, img
## item edges: also_viewed, buy_after_viewing, also_bought, bought_together
## brand features: name

In [49]:
raw_data_dir="/Users/yehaoran/Desktop/KGAgentEcno/Graph-CoT-main/data/raw_data/amazon"
save_dir="/Users/yehaoran/Desktop/KGAgentEcno/Graph-CoT-main/data/processed_data/amazon"

In [50]:
## read raw data files

def read_json_lines(file, id_key):
    data = {}
    with open(file) as f:
        readin = f.readlines()
        for line in tqdm(readin):
            try:
                tmp = json.loads(line)  # 使用json.loads替代eval
                data[tmp[id_key]] = tmp
            except json.JSONDecodeError as e:
                print(f"解析JSON失败: {e}")
                continue
    return data

item_raw_data = read_json_lines(os.path.join(raw_data_dir, 'meta_Magazine_Subscriptions.json'), 'asin')
# 检查数据是否正确读取
print(f"Total items loaded: {len(item_raw_data)}")
print("First 10 keys:")
keys_list = list(item_raw_data.keys())
for i, key in enumerate(keys_list[:10]):
    print(f"{i+1}: {key}")

100%|██████████| 3385/3385 [00:00<00:00, 123482.05it/s]

Total items loaded: 2320
First 10 keys:
1: B00005N7NQ
2: B00005N7OC
3: B00005N7OD
4: B00005N7O9
5: B00005N7O6
6: B00005N7P0
7: B00005N7QG
8: B00005N7PI
9: B00005N7OP
10: B00005N7Q5





In [51]:
keys_list[:5] if len(keys_list) >= 5 else keys_list

['B00005N7NQ', 'B00005N7OC', 'B00005N7OD', 'B00005N7O9', 'B00005N7O6']

In [52]:
# 使用实际存在的键
if keys_list:
    first_key = keys_list[0]
    print(f"Sample data for key '{first_key}':")
    item_raw_data[first_key]

Sample data for key 'B00005N7NQ':


In [53]:
related_name = set()
for itt in tqdm(item_raw_data):
    if "also_view" in item_raw_data[itt]:
        related_name.add('also_view')
    if "also_buy" in item_raw_data[itt]:
        related_name.add('also_buy')
print(related_name)

100%|██████████| 2320/2320 [00:00<00:00, 2452933.02it/s]

{'also_buy', 'also_view'}





In [54]:
## construct node dictionary
## item features: title, description, category, price, img
## item edges: also_viewed, buy_after_viewing, also_bought, bought_together
## item neighbors: item, brand

## brand features: name
## brand neighbors: item

item_nodes = {}
brand_nodes = {}

brand_name2id = {}

for item_id in tqdm(item_raw_data):
    
    # brand nodes
    if 'brand' in item_raw_data[item_id] and item_raw_data[item_id]['brand'] != '':
        if item_raw_data[item_id]['brand'] not in brand_name2id:
            idd = f'brand_{len(brand_nodes)}'
            brand_name2id[item_raw_data[item_id]['brand']] = idd
            brand_nodes[idd] = {'features': {}, 'neighbors': {}}
            brand_nodes[idd]['features']['name'] = item_raw_data[item_id]['brand']
            brand_nodes[idd]['neighbors']['item'] = [item_id]
        else:
            brand_nodes[brand_name2id[item_raw_data[item_id]['brand']]]['neighbors']['item'].append(item_id)

    # item nodes
    item_nodes[item_id] = {'features': {}, 'neighbors': {}}
    ## add features
    # 添加清理HTML标签的函数
    import re
    def clean_html_tags(text):
        """移除HTML标签"""
        if not text:
            return ''
        # 移除HTML标签
        clean = re.compile('<.*?>')
        return re.sub(clean, '', text).strip()

    def get_title_from_data(item_data):
        """
        从数据中提取标题，优先级如下：
        1. 如果title字段有实际内容，使用清理后的title
        2. 如果没有，尝试从description的第一句获取
        3. 如果还没有，使用brand + main_cat作为标题
        4. 最后使用ASIN作为标题
        """
        # 方案1：尝试清理title字段
        if "title" in item_data and item_data["title"]:
            cleaned_title = clean_html_tags(item_data["title"])
            if cleaned_title.strip():  # 如果清理后有内容
                return cleaned_title

        # 方案2：从description第一句提取
        if "description" in item_data and item_data["description"]:
            descriptions = item_data["description"]
            if isinstance(descriptions, list) and len(descriptions) > 0:
                first_desc = descriptions[0]
                if first_desc and len(first_desc.strip()) > 0:
                    # 取前50个字符作为标题
                    title_from_desc = first_desc.strip()[:50]
                    if len(first_desc) > 50:
                        title_from_desc += "..."
                    return title_from_desc
            elif isinstance(descriptions, str) and descriptions.strip():
                title_from_desc = descriptions.strip()[:50]
                if len(descriptions) > 50:
                    title_from_desc += "..."
                return title_from_desc

        # 方案3：使用brand + category
        title_parts = []
        if "brand" in item_data and item_data["brand"]:
            title_parts.append(item_data["brand"])
        if "main_cat" in item_data and item_data["main_cat"]:
            title_parts.append(item_data["main_cat"])

        if title_parts:
            return " - ".join(title_parts)

        # 方案4：最后使用ASIN
        return item_data.get("asin", "Unknown Item")

    # 在处理title时使用清理函数
    # 使用新的标题提取函数
    item_nodes[item_id]['features']['title'] = get_title_from_data(item_raw_data[item_id])
    item_nodes[item_id]['features']['description'] = item_raw_data[item_id]['description'] if "description" in item_raw_data[item_id] else ''
    item_nodes[item_id]['features']['price'] = item_raw_data[item_id]['price'] if "price" in item_raw_data[item_id] else ''
    item_nodes[item_id]['features']['img'] = item_raw_data[item_id]['imageURL'][0] if "imageURL" in item_raw_data[item_id] and len(item_raw_data[item_id]['imageURL']) > 0 else ''
    item_nodes[item_id]['features']['category'] = item_raw_data[item_id]['category'] if "category" in item_raw_data[item_id] else []
    item_nodes[item_id]['features']['main_cat'] = item_raw_data[item_id]['main_cat'] if "main_cat" in item_raw_data[item_id] else ''
    item_nodes[item_id]['features']['details'] = item_raw_data[item_id]['details'] if "details" in item_raw_data[item_id] else {}
    ## add neighbors
    item_nodes[item_id]['neighbors']['also_viewed_item'] = item_raw_data[item_id]['also_view'] if "also_view" in item_raw_data[item_id] else []
    item_nodes[item_id]['neighbors']['buy_after_viewing_item'] = []
    item_nodes[item_id]['neighbors']['also_bought_item'] = item_raw_data[item_id]['also_buy'] if "also_buy" in item_raw_data[item_id] else []
    item_nodes[item_id]['neighbors']['bought_together_item'] = []
    item_nodes[item_id]['neighbors']['brand'] = [brand_name2id[item_raw_data[item_id]['brand']]] if ('brand' in item_raw_data[item_id] and item_raw_data[item_id]['brand'] != '') else []

# make the edges bidirectional
for item_id in tqdm(item_nodes):
    for rel in ['also_viewed_item', 'also_bought_item', 'bought_together_item']:
        for nid in item_nodes[item_id]['neighbors'][rel]:
            if nid not in item_nodes:
                item_nodes[item_id]['neighbors'][rel].remove(nid)
                continue
            if item_id not in item_nodes[nid]['neighbors'][rel]:
                item_nodes[nid]['neighbors'][rel].append(item_id)

100%|██████████| 2320/2320 [00:00<00:00, 84036.77it/s]
100%|██████████| 2320/2320 [00:00<00:00, 139188.19it/s]


In [55]:
## save graph
#pickle.dump({
#    'item_nodes': item_nodes,
#    'brand_nodes': brand_nodes,
#}, open(os.path.join(save_dir, 'graph.pkl'),"wb"))

json.dump({
    'item_nodes': item_nodes,
    'brand_nodes': brand_nodes,
}, open(os.path.join(save_dir, 'magazine_graph.json'),"w"), indent=4)