# 数据合并

合并``polish``文件夹下的JSON文件。

In [1]:
import json

# The two JSON files under polish/ that are merged into a single list.
files = [
    'polish/原质监总局地理标志保护产品批准公告.json',
    'polish/国家知识产权局地理标志保护产品批准公告.json',
]

# Load each file in the order listed and append its entries to one flat list.
products = []
for source_path in files:
    with open(source_path, mode='r', encoding='utf-8') as source:
        batch = json.load(source)
    products += batch

# Persist the merged list. ensure_ascii=False writes non-ASCII characters
# as-is instead of \uXXXX escapes.
with open('products.json', mode='w', encoding='utf-8') as merged:
    json.dump(products, merged, ensure_ascii=False, indent=4)

total = len(products)
print(f"合并后总条目数: {total}")
print("结果已保存为至 products.json")

合并后总条目数: 4567
结果已保存为至 products.json


# 数据清洗

对``product_name``、``province``、``city``三个字段均相同的产品进行去重。

In [2]:
import json

# Load the merged data written by the previous step.
with open('products.json', mode='r', encoding='utf-8') as src:
    products = json.load(src)

# Map each (product_name, province, city) triple to the FIRST entry carrying it.
# dict keeps insertion order, so the surviving entries stay in their original
# relative order; setdefault leaves an existing key untouched, so later
# duplicates are ignored.
first_by_key = {}
for entry in products:
    identity = (
        entry.get("product_name"),
        entry.get("province"),
        entry.get("city"),
    )
    first_by_key.setdefault(identity, entry)

unique_products = list(first_by_key.values())

# Report how many entries there were before and after de-duplication.
print(f"原始条目数: {len(products)}")
print(f"去重后条目数: {len(unique_products)}")

# Overwrite products.json with the de-duplicated list.
with open('products.json', mode='w', encoding='utf-8') as dst:
    json.dump(unique_products, dst, ensure_ascii=False, indent=2)

print("清洗完成！结果已保存为 products.json")

原始条目数: 4567
去重后条目数: 4404
清洗完成！结果已保存为 products.json


剔除非中国产品：仅保留``province``字段非空且属于中国省级行政区（含港澳台）的条目。

In [3]:
import os
import json

# Full official names of China's province-level divisions (including
# Hong Kong, Macao and Taiwan). Membership is tested by exact string match,
# so a province value written in any other form will not match.
CHINA_PROVINCES = {
    "北京市", "天津市", "上海市", "重庆市",
    "河北省", "山西省", "辽宁省", "吉林省", "黑龙江省",
    "江苏省", "浙江省", "安徽省", "福建省", "江西省",
    "山东省", "河南省", "湖北省", "湖南省", "广东省",
    "海南省", "四川省", "贵州省", "云南省", "陕西省",
    "甘肃省", "青海省", "台湾省",
    "内蒙古自治区", "广西壮族自治区", "西藏自治区",
    "宁夏回族自治区", "新疆维吾尔自治区",
    "香港特别行政区", "澳门特别行政区"
}

# Load the de-duplicated data written by the previous step.
with open('products.json', 'r', encoding='utf-8') as f:
    products = json.load(f)

# Keep only entries whose "province" is present, non-empty, and listed in
# CHINA_PROVINCES. The truthiness check comes first so that entries without
# the key (or with None / "") are dropped before the membership test.
filtered_data = [
    item for item in products
    if item.get("province") and item["province"] in CHINA_PROVINCES
]

# Report how many entries there were before and after filtering.
print(f"原始条目数: {len(products)}")
print(f"过滤后条目数: {len(filtered_data)}")

# Overwrite products.json in place with the filtered list (this always runs;
# the unfiltered version on disk is replaced).
with open('products.json', 'w', encoding='utf-8') as f:
    json.dump(filtered_data, f, ensure_ascii=False, indent=2)

print("已保存到 products.json")

原始条目数: 4404
过滤后条目数: 4400
已保存到 products.json
