# Data cleaning

先通过stata生成变量amount，num_goods, price_goods,读入数据，检查基本信息。
中国政府采购、公共采购主要分为货物、服务、工程三大类，有不同的公开招标金额标准。各省不同年份的标准不同，其中货物和服务的标准相同，工程按内容不同有不同标准。鉴于根据公开的采购信息，难以逐一判断工程类合同的具体内容，选择通过关键词判断合同是否为工程相关，直接排除工程部分，保留货物和服务采购数据。
下面的程序主要进行以下数据清洗：
1. 提取"项目名称","采购人甲方","采购人地址"中的省市县信息，结合2020年中国省市县三级行政区划表，匹配采购数据的地区(省级)
2. 根据"主要标的名称"和"项目名称"中词汇判断采购类型。
   出现的词频，选出top 100关键词，对关键词进行手动分类，排除其中容易出现歧义的部分，比如油，可能是“燃油采购”（货物），也可能是“加油服务”（服务）。对包含关键词的采购项目进行分类，在通过机器学习进行分类。

中国政府采购分为分散采购和集中采购两种，在大部分省份，当采购金额低于50万元时由地方分散采购，所以在我们的数据中，保留了50万元以上的采购项目。各省集中采购公开招标的门槛最大为400万元，以100万元为基础的带宽，保留数据到500万元。

In [3]:
import pandas as pd

pd.set_option("display.max_columns", None)   
pd.set_option("display.max_rows", 100)
pd.set_option("display.width", None)     

In [4]:

csv_file = "/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_clean1.csv"

df = pd.read_csv(csv_file, low_memory=False)

df_filtered = df[(df['amount'] >= 50) & (df['amount'] <= 500) & (df['年份'] >= 2020)]

KeyboardInterrupt: 

## 确定省份

In [None]:
import geopandas as gpd

shp_path = "/Users/yxy/UChi/Summer2025/Procurement/raw/Countylevel_Admin_2020/China2020County.shp"
gdf = gpd.read_file(shp_path)

gdf = gdf[['省级', '地级', '县级']]

In [None]:
import re
def extract_prov_city_county(text: str):
    if not isinstance(text, str):  
        return None, None, None
    text = re.sub(r"[-_\s·、，,\.]()（）*。", "", text) 
    # text = re.sub(r"[^\u4e00-\u9fa5]", "", text)

    prov_pattern = r"(.*?(省|自治区|市))"    
    city_pattern = r"(.*?(市|地区|盟|州))"  
    county_pattern = r"(.*?(县|区|旗))"     
    
    prov, city, county = None, None, None
    municipalities = ["北京市","天津市","上海市","重庆市"]

    prov_match = re.match(prov_pattern, text)
    if prov_match:
        if '市' in prov_match.group(1) and prov_match.group(1) not in municipalities:
            prov = None
        else: 
            prov = prov_match.group(1)
            text = text[len(prov):]
    city_match = re.match(city_pattern, text)
    if city_match:
        city_candidate = city_match.group(1)
        if city_candidate.endswith("州") and text.startswith("市", len(city_candidate)):
            city = city_candidate + "市"
            text = text[len(city_candidate) + 1:] 
        else:
            city = city_candidate
            text = text[len(city):]
        
    county_match = re.match(county_pattern, text)
    if county_match:
        county = county_match.group(1)

    return prov, city, county


In [None]:

def fill_location(row):
    prov, city, county = None, None, None

    p1, c1, ct1 = extract_prov_city_county(row["采购人地址"])
    prov, city, county = p1, c1, ct1

    if prov is None or city is None or county is None:
        p2, c2, ct2 = extract_prov_city_county(row["采购人甲方"])
        if prov is None: prov = p2
        if city is None: city = c2
        if county is None: county = ct2

    if prov is None or city is None or county is None:
        p3, c3, ct3 = extract_prov_city_county(row["项目名称"])
        if prov is None: prov = p3
        if city is None: city = c3
        if county is None: county = ct3

    return pd.Series([prov, city, county])


In [None]:
def match_region(row, gdf):
    if row["prov"]:
        match = gdf[gdf["省级"] == row["prov"]]
        if not match.empty:
            return match["省级"].iloc[0]
    if row["city"]:
        match = gdf[gdf["地级"].str.contains(str(row["city"]).replace("市",""), na=False, regex=False)]
        if not match.empty:
            return match["省级"].iloc[0]
        match = gdf[gdf["县级"].str.contains(str(row["city"]).replace("市",""), na=False, regex=False)]
        if not match.empty:
            return match["省级"].iloc[0]
    if row["county"]:
        match = gdf[gdf["县级"] == row["county"]]
        if not match.empty:
            return match["省级"].iloc[0]

    return None


In [None]:
df_filtered[["prov", "city", "county"]] = df_filtered.apply(fill_location, axis=1)
df_filtered["region"] = df_filtered.apply(lambda x: match_region(x, gdf), axis=1)


In [None]:
import jieba

def match_region_by_jieba(row, gdf):
    fields = ["采购人地址", "采购人甲方"]

    for field in fields:
        text = row.get(field, "")
        if not isinstance(text, str) or text.strip() == "":
            continue

        words = jieba.lcut(text)
        if not words:
            continue
        first_word = words[0]

        match = gdf[gdf["省级"].str.contains(first_word, na=False, regex=False)]
        if not match.empty:
            return match["省级"].iloc[0]

    return None

In [None]:
mask = df_filtered["region"].isna()
df_filtered.loc[mask, "region"] = df_filtered[mask].apply(
    lambda x: match_region_by_jieba(x, gdf), axis=1
)
df_filtered = df_filtered[
    ~df_filtered["region"].isin(["澳门特别行政区", "台湾省","新疆维吾尔自治区","西藏自治区"])
]


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/4g/6_8lhyp147394q93651p5kyw0000gn/T/jieba.cache
Loading model cost 0.527 seconds.
Prefix dict has been built successfully.


## 采购方式

In [None]:
methodlist = [
    "公开招标", "协议供货", "单一来源", "定点采购",
    "电子卖场", "竞争性磋商", "竞争性谈判", "询价", "邀请招标"
]

not_in_list = df_filtered.loc[~df_filtered["采购方式"].isin(methodlist)]

print("row count not in methodlist:", len(not_in_list))
print("ratio:", len(not_in_list) / len(df_filtered))


row count not in methodlist: 114808
ratio: 0.20292596016353137


In [None]:
def clean_method(method: str, methodlist):
    if not isinstance(method, str):
        return None
    for m in methodlist:
        if m in method:  
            return m
    return method  

methodlist = ["公开招标", "协议供货", "单一来源", "定点采购",
              "电子卖场", "竞争性磋商", "竞争性谈判", "询价", "邀请招标"]

mask = ~df_filtered["采购方式"].isin(methodlist)

df_filtered.loc[mask, "采购方式"] = df_filtered.loc[mask, "采购方式"].apply(
    lambda x: clean_method(x, methodlist)
)


In [None]:
df_filtered.to_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_regionall.csv", index=False, encoding="utf-8-sig")

## 确定类型
### top 200 items
给出现频率前200的标的物名称手动标注类别，使用了chatgpt+人工检查，对一部分“无”，“详情见合同”，标注了类别 “无分类”

In [1]:
out_path_kw = "/Users/yxy/UChi/Summer2025/Procurement/dta/keywords.csv"

In [22]:
df_filtered = pd.read_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_regionall.csv", low_memory=False)

In [5]:
df_filtered = df_filtered.dropna(subset=["region"])

In [6]:
df_filtered['年份'].value_counts(dropna=False)

年份
2023    204721
2022    169965
2021    114735
2020     51999
2024     25794
Name: count, dtype: int64

In [23]:
df_filtered.shape

(593818, 26)

In [8]:
import pandas as pd
out_path_kw = "/Users/yxy/UChi/Summer2025/Procurement/dta/keywords2.csv"
top100_items = df_filtered["主要标的名称"].value_counts().head(200).reset_index()
top100_items.columns = ["keyword", "count"]

top100_items["category"] = ""

top100_items.to_csv(out_path_kw, index=False, encoding="utf-8-sig")

print("Top 200 items exported to keywords2.csv for manual categorization.")

Top 200 items exported to keywords2.csv for manual categorization.


In [24]:
classified = pd.read_csv(out_path_kw)
df_filtered = df_filtered.merge(classified[["keyword", "category"]], 
              left_on="主要标的名称", 
              right_on="keyword", 
              how="left")
df_filtered.rename(columns={"category": "cat"}, inplace=True)
df_filtered.drop(columns=["keyword"], inplace=True)

In [25]:
import pandas as pd

# 读入两个csv
df1 = pd.read_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/keywords2.csv")   
df2 = pd.read_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/cat_list.csv")  

df1 = df1[["keyword", "category"]]
df2 = df2[["keyword", "category"]]

df_all = pd.concat([df1, df2], ignore_index=True).drop_duplicates(subset=["keyword"])

df_all.to_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/keywords_with_cat2.csv", index=False)


In [26]:
import pandas as pd
import jieba
import re
import numpy as np


kw_df = pd.read_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/keywords_with_cat2.csv")

def clean_text(text):
    if not isinstance(text, str):
        return ""
    return re.sub(r"[^\u4e00-\u9fa50-9]", "", text)

def match_cat_from_text(text, kw_df):
    kw_df = kw_df.rename(columns={"category": "cat"})
    text = clean_text(text)
    if not text:
        return None
    words = jieba.lcut(text)
    for w in words:
        if w == "项目":
            continue
        match = kw_df[kw_df["keyword"].str.contains(w, na=False)]
        if not match.empty:
            valid = match[match["cat"] != "无分类"]
            if not valid.empty:
                return valid["cat"].iloc[0]
    return None


def assign_cat(row, kw_df):
    for col in ["主要标的名称", "项目名称"]:
        text = row.get(col, "")
        cat = match_cat_from_text(text, kw_df)
        if cat is not None:
            return cat
    return None



In [27]:
mask = df_filtered["cat"].isna() | (df_filtered["cat"] == "未分类")

df_filtered.loc[mask, "cat"] = df_filtered.loc[mask].apply(
    lambda x: assign_cat(x, kw_df), axis=1
)


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/4g/6_8lhyp147394q93651p5kyw0000gn/T/jieba.cache
Loading model cost 0.396 seconds.
Prefix dict has been built successfully.


In [28]:
df_filtered['cat'].value_counts(dropna=False)

cat
服务      231816
货物      172302
工程       85306
未分类      65238
None     39156
Name: count, dtype: int64

In [None]:
df_unmatched = df_filtered[df_filtered['cat'].isna()]
df_unmatched.sample(10)

In [29]:
df_filtered.to_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_bunchingfinal.csv", index=False, encoding="utf-8-sig")


In [None]:
df_filtered_2cat = df_filtered[df_filtered["cat"].isin(["服务", "货物"])]
df_filtered_2cat.to_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_2cat.csv", index=False, encoding="utf-8-sig")

## policy threshold

In [None]:
import pandas as pd


df = pd.read_excel(
    "/Users/yxy/UChi/Summer2025/Procurement/raw/China_Procurement_Thresholds_2014_2024.xlsx"
)

df = df[["prov", "year", "cat", "threshold"]]

years = list(range(2020, 2025))
records = []

for prov, g1 in df.groupby("prov"):
    for cat, g2 in g1.groupby("cat"):
        g2 = g2.sort_values("year")
        min_year, max_year = g2["year"].min(), g2["year"].max()
        min_thr = g2.loc[g2["year"] == min_year, "threshold"].iloc[0]
        max_thr = g2.loc[g2["year"] == max_year, "threshold"].iloc[0]
        vals = []
        for y in years:
            if y <= min_year:
                thr = min_thr
            elif y >= max_year:
                thr = max_thr
            else:
                thr = g2[g2["year"] <= y].iloc[-1]["threshold"]
            vals.append(thr)
            records.append([prov, cat, y, thr, None]) 
        change_flag = int(len(set(vals)) > 1)

        for i in range(len(years)):
            records[-len(years) + i][4] = change_flag

threshold = pd.DataFrame(records, columns=["prov", "cat", "year", "threshold", "changed"])

threshold.to_csv(
    "/Users/yxy/UChi/Summer2025/Procurement/dta/threshold.csv",
    index=False,
    encoding="utf-8-sig"
)


In [None]:
threshold['changed'].sum()

## To measure efficiency

给每个合同分出具体的类别，大语言模型

In [None]:
import pandas as pd

df = pd.read_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_region.csv", low_memory=False)

In [None]:
df.shape

(565763, 26)

In [None]:
import pandas as pd
import re
import numpy as np

def keep_number(s):
    if pd.isna(s):
        return np.nan
    match = re.findall(r"\d+\.?\d*", str(s))
    if len(match) == 1:          # 只允许一个数
        return float(match[0])   # 转成数值
    else:                        # 没有或超过1个数
        return np.nan

# 先保存原始列
df["数量_raw"] = df["主要标的数量"]
df["单价_raw"] = df["主要标的单价"]

# 清理得到新列
df["主要标的数量"] = df["数量_raw"].apply(keep_number).astype(float)
df["主要标的单价"] = df["单价_raw"].apply(keep_number).astype(float)

# 查看变化的内容
changed_qty = df[df["数量_raw"].astype(str) != df["主要标的数量"].astype(str)][["数量_raw", "主要标的数量","单价_raw", "主要标的单价"]]
changed_price = df[df["单价_raw"].astype(str) != df["主要标的单价"].astype(str)][["数量_raw", "主要标的数量","单价_raw", "主要标的单价"]]

# 统计变化数量
n_changed_qty = changed_qty.shape[0]
n_changed_price = changed_price.shape[0]

print(f"数量列发生变化的行数: {n_changed_qty}")
print(f"单价列发生变化的行数: {n_changed_price}")


数量列发生变化的行数: 564838
单价列发生变化的行数: 523261


In [None]:
df = df.drop(columns=["数量_raw", "单价_raw"])
df = df.dropna(subset=["主要标的数量", "主要标的单价"])


In [None]:
df.shape

(469879, 26)

### method 1: keyword

In [None]:
import pandas as pd

# ---------- 构建关键词字典 ----------
catalog = pd.read_csv("/Users/yxy/UChi/Summer2025/Procurement/raw/all_tables_with_category.csv")

keyword_dict = {}

for _, row in catalog.iterrows():
    code = str(row["编  码"]).strip()
    name = str(row["品目名称"]).strip() if pd.notna(row["品目名称"]) else None

    if not name:  # 跳过空名称
        continue

    # 检查编码结尾
    if len(name) <= 2:
        continue
    if code.endswith("00000") or code.endswith("0000") or code.endswith("00"):
        if len(name) <= 2:
            continue  # 跳过过短的品目名称

    # 直接用品目名称作为关键词
    keyword_dict[name] = (code, name)

# 转成 DataFrame
df_keywords = pd.DataFrame(
    [(kw, code, name) for kw, (code, name) in keyword_dict.items()],
    columns=["关键词", "编码", "品目名称"]
)

# 去重
df_keywords = df_keywords.drop_duplicates(subset=["关键词", "编码"])

# 保存到 CSV
output_path = "/Users/yxy/UChi/Summer2025/Procurement/dta/keyword_dict.csv"
df_keywords.to_csv(output_path, index=False, encoding="utf-8-sig")

print("关键词总数:", df_keywords.shape[0])
print("输出文件:", output_path)


关键词总数: 4035
输出文件: /Users/yxy/UChi/Summer2025/Procurement/dta/keyword_dict.csv


#### classify by 精简后的关键词表

In [None]:
import pandas as pd

keywords = pd.read_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/keyword_dict.csv")

# 构建关键词字典
keyword_dict = dict(zip(keywords["关键词"], zip(keywords["编码"], keywords["品目名称"])))

def match_category(text):
    if pd.isna(text):
        return None, None
    text = str(text).strip()

    for kw, (code, name) in keyword_dict.items():
        if text == name:
            return code, name

    # 没有匹配到
    return None, None


def classify_row(row):
    # 优先主要标的名称
    code, name = match_category(row["主要标的名称"])
    if code is None:
        code, name = match_category(row["项目名称"])
    if code is None:  # 没匹配到
        return pd.Series(["未分类", "未分类"])
    return pd.Series([code, name])

# 应用到采购数据
df[["subcategory_code", "subcategory_name"]] = df.apply(classify_row, axis=1)

# 大类推断
def get_category(code):
    if isinstance(code, str):
        if code.startswith("A"):
            return "货物"
        elif code.startswith("B"):
            return "工程"
        elif code.startswith("C"):
            return "服务"
    return "未分类"

df["category"] = df["subcategory_code"].apply(get_category)

# 覆盖率
coverage = (df["subcategory_code"] != "未分类").mean()
print(f"匹配覆盖率: {coverage:.2%}")


匹配覆盖率: 12.99%


In [None]:
df['category'].value_counts(dropna=False)

category
未分类    408863
服务      30869
工程      17259
货物      12888
Name: count, dtype: int64

In [None]:
df['subcategory_name'].value_counts(dropna=False).head(20)

subcategory_name
未分类       349609
物业管理服务     15786
医疗设备       12036
装修工程        5773
房屋修缮        4784
计算机         3850
消防车         3643
其他服务        3176
构筑物         3035
修缮工程        2679
咨询服务        2626
保安服务        2247
其他建筑工程      2091
安全服务        1852
服务器         1678
显微镜         1524
触控一体机       1460
运行维护服务      1405
餐饮服务        1334
运营服务        1235
Name: count, dtype: int64

In [31]:
df[df['subcategory_name'] != '未分类'].to_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_matched_items.csv", index=False, encoding="utf-8-sig")
df[df['subcategory_name'] == '未分类'].to_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_unmatched_items.csv", index=False, encoding="utf-8-sig")

### method 2: embedding

#### embedding category data

In [None]:
import time
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# ========== 参数设置 ==========
client = OpenAI(api_key="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")

catalog_file = "/Users/yxy/UChi/Summer2025/Procurement/raw/all_tables_with_category.csv"
data_file = "/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_unmatched_items.csv"

catalog_out = "/Users/yxy/UChi/Summer2025/Procurement/dta/catalog_with_embeddings.parquet"
data_out = "/Users/yxy/UChi/Summer2025/Procurement/dta/procurement_with_embeddings.parquet"

batch_size = 1000
model = "text-embedding-3-small"


# ========== 通用函数 ==========
def get_embeddings(texts, retries=5, delay=5):
    """批量获取 embedding，带重试机制"""
    for attempt in range(retries):
        try:
            resp = client.embeddings.create(model=model, input=texts)
            return [d.embedding for d in resp.data]
        except Exception as e:
            wait = delay * (2 ** attempt)
            print(f"请求失败: {e}, {wait}s 后重试...")
            time.sleep(wait)
    raise RuntimeError("多次重试失败")


# ========== 1. 目录数据 ==========
def embed_catalog():
    df_cat = pd.read_csv(catalog_file)
    
    df_cat["text"] = (
        df_cat["编  码"].fillna("").astype(str)
        + " " + df_cat["品目名称"].fillna("").astype(str)
        + " " + df_cat["说  明"].fillna("").astype(str)
    )

    texts = [str(x) if x is not None else "" for x in df_cat["text"].tolist()]

    embeddings = []
    for i in range(0, len(texts), batch_size):  # batch_size = 比如 500 或 1000
        batch = texts[i:i+batch_size]
        embs = get_embeddings(batch)
        embeddings.extend(embs)

    df_cat["embedding"] = embeddings
    # 写前做一致性检查
    assert len(embeddings) == len(df_cat)

    df_cat["embedding"] = embeddings
    try:
        df_cat.to_parquet(catalog_out, index=False)   # 可能触发 ArrowKeyError
    except Exception as e:
        print("Parquet 保存失败，错误信息:", e)
        # fallback 存成其他格式，避免丢数据
        df_cat.to_pickle("/Users/yxy/UChi/Summer2025/Procurement/dta/catalog_with_embeddings.pkl")
        df_cat.to_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/catalog_with_embeddings.csv", index=False)

    print("目录 embedding 完成（至少保存成 pickle/csv）")


embed_catalog()



目录 embedding 完成（至少保存成 pickle/csv）


#### embedding procurement data

In [None]:
df_sample = pd.read_csv(data_file).sample(10000)
df_sample.to_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/procurement_sample.csv", index=False)

##### batch

In [19]:
import os

def split_csv(data_file, out_dir, chunk_size=10000):
    os.makedirs(out_dir, exist_ok=True)
    df_iter = pd.read_csv(data_file, chunksize=chunk_size, encoding="utf-8")
    for idx, chunk in enumerate(df_iter):
        chunk_file = os.path.join(out_dir, f"part_{idx:04d}.pkl")
        chunk.to_pickle(chunk_file)
        print(f"保存切分文件: {chunk_file}")

In [None]:
df = pd.read_csv(data_file, low_memory=False)
df.shape

(408863, 29)

In [20]:
part_file = "/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_unmatched_items.csv"
split_csv(part_file, "/Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta", chunk_size=10000)

保存切分文件: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0000.pkl
保存切分文件: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0001.pkl
保存切分文件: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0002.pkl
保存切分文件: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0003.pkl
保存切分文件: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0004.pkl
保存切分文件: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0005.pkl
保存切分文件: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0006.pkl
保存切分文件: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0007.pkl
保存切分文件: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0008.pkl
保存切分文件: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0009.pkl
保存切分文件: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0010.pkl
保存切分文件: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0011.pkl
保存切分文件: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0012.pkl
保存切分文件: /Users/yxy/UChi/S

##### embedding

In [None]:
import os
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
import time
from openai import OpenAI
from tqdm import tqdm
import numpy as np


def get_embeddings_pro(texts, retries=5, delay=5):
    """获取 embedding，带输入/输出检查"""
    for attempt in range(retries):
        try:
            resp = client.embeddings.create(model=model, input=texts)
            embs = [d.embedding for d in resp.data]

            if len(embs) != len(texts):
                raise ValueError(
                    f"输入 {len(texts)}，输出 {len(embs)}，不一致"
                )
            return embs
        except Exception as e:
            wait = delay * (2 ** attempt)
            print(f"请求失败: {repr(e)}, {wait}s 后重试...")
            time.sleep(wait)
    raise RuntimeError("多次重试失败")

def process_part_2emb(
    part_file,
    batch_size=1000,
    out_dir="/Users/yxy/UChi/Summer2025/Procurement/dta/batch_emb"
):
    df = pd.read_pickle(part_file)

    # 初始化 embedding 列
    for col in ["embedding_project", "embedding_item"]:
        if col not in df.columns:
            df[col] = None

    for i in range(0, len(df), batch_size):
        end = min(i + batch_size, len(df))

        batch_project = df["项目名称"].iloc[i:end].fillna("").astype(str).tolist()
        batch_item = df["主要标的名称"].iloc[i:end].fillna("").astype(str).tolist()

        try:
            embs_project = get_embeddings_pro(batch_project)
            embs_item = get_embeddings_pro(batch_item)
            embs_project = [list(v) for v in embs_project]
            embs_item    = [list(v) for v in embs_item]

            if len(embs_project) == (end - i):
                df.loc[df.index[i:end], "embedding_project"] = pd.Series(embs_project, index=df.index[i:end])
            else:
                print(f"{part_file} 项目 {i}:{end} 长度不匹配, 输入={end-i}, 输出={len(embs_project)}")

            if len(embs_item) == (end - i):
                df.loc[df.index[i:end], "embedding_item"]    = pd.Series(embs_item, index=df.index[i:end])
            else:
                print(f"{part_file} 标的 {i}:{end} 长度不匹配, 输入={end-i}, 输出={len(embs_item)}")

        except Exception as e:
            print(f"{part_file} 任务 {i}:{end} 出错: {e}")


    # 确保输出目录存在
    os.makedirs(out_dir, exist_ok=True)

    # 拼接新文件名
    base_name = os.path.basename(part_file).replace(".pkl", "_emb.pkl")
    out_file = os.path.join(out_dir, base_name)

    df.to_pickle(out_file)
    print(f"batch处理完成: {out_file}")
    return out_file





In [22]:

from concurrent.futures import ThreadPoolExecutor, as_completed
import glob
import os

# ========= 参数 =========
client = OpenAI(api_key="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")
model = "text-embedding-3-small"
max_workers = 5   # 并行线程数，根据 API 限流调整
part_dir = "/Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta"       # 存放拆分小文件的目录
out_dir  = "/Users/yxy/UChi/Summer2025/Procurement/dta/batch_emb"   # 每个小文件处理结果输出目录
merged_file = "/Users/yxy/UChi/Summer2025/Procurement/dta/merged.pkl"

# ========= 并行处理所有 part =========
part_files = sorted(glob.glob(os.path.join(part_dir, "*.pkl")))
print(f"共找到 {len(part_files)} 个小文件")


with ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(process_part_2emb, f, 1000, out_dir): f for f in part_files}
    for f in as_completed(futures):
        try:
            result = f.result()
            print(f"{futures[f]} 完成: {result}")
        except Exception as e:
            print(f"{futures[f]} 出错: {e}")

print("所有小文件处理完成")




共找到 41 个小文件
batch处理完成: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_emb/part_0000_emb.pkl
/Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0000.pkl 完成: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_emb/part_0000_emb.pkl
batch处理完成: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_emb/part_0001_emb.pkl
/Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0001.pkl 完成: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_emb/part_0001_emb.pkl
batch处理完成: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_emb/part_0002_emb.pkl
/Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0002.pkl 完成: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_emb/part_0002_emb.pkl
batch处理完成: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_emb/part_0003_emb.pkl
/Users/yxy/UChi/Summer2025/Procurement/dta/batch_dta/part_0003.pkl 完成: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_emb/part_0003_emb.pkl
batch处理完成: /Users/yxy/UChi/Summer2025/Procurement/dta/batch_emb/part_0004_emb.pkl
/Users/yxy

In [None]:
df_sample_emb = pd.read_pickle("/Users/yxy/UChi/Summer2025/Procurement/dta/merged.pkl")
df_sample_emb.head()


#### classify

In [4]:
cat_df = pd.read_pickle("/Users/yxy/UChi/Summer2025/Procurement/dta/catalog_with_embeddings.pkl")
cat_df.head()

Unnamed: 0,table_id,编 码,品目名称,说 明,品目类别,text,embedding
0,table_1,A01000000,房屋和构筑物,,货物,A01000000 房屋和构筑物,"[0.04326699674129486, 0.0030790253076702356, 0..."
1,table_1,A01010000,房屋,,货物,A01010000 房屋,"[0.020689578726887703, -0.021339982748031616, ..."
2,table_1,A01010100,办公用房,包括办公室、服务用房、设备用房、附属用房等办公用房。,货物,A01010100 办公用房 包括办公室、服务用房、设备用房、附属用房等办公用房。,"[-0.03157906234264374, 0.00473583210259676, 0...."
3,table_1,A01010200,业务用房,,货物,A01010200 业务用房,"[-0.01865195669233799, -0.00908493809401989, 0..."
4,table_1,A01010201,警察业务用房,包括公安、安全等业务工作用房。,货物,A01010201 警察业务用房 包括公安、安全等业务工作用房。,"[-0.025146184489130974, -0.017792731523513794,..."


In [None]:
proc = pd.read_pickle("/Users/yxy/UChi/Summer2025/Procurement/dta/merged.pkl")
proc.head()

In [6]:
proc.shape

(10000, 32)

In [3]:
import pandas as pd
import numpy as np

# ====== 加载数据 ======
catalog =  pd.read_pickle("/Users/yxy/UChi/Summer2025/Procurement/dta/catalog_with_embeddings.pkl")
proc = pd.read_pickle("/Users/yxy/UChi/Summer2025/Procurement/dta/merged.pkl")

# ====== 预处理 embedding ======
catalog_emb = np.vstack(catalog["embedding"].apply(np.array).values)

# 保存原始信息，后面要赋值用
catalog_codes = catalog["编  码"].values
catalog_names = catalog["品目名称"].values
catalog_cats  = catalog["品目类别"].values

proj_emb = np.vstack(proc["embedding_project"].apply(np.array).values)
item_emb = np.vstack(proc["embedding_item"].apply(np.array).values)

# ====== 归一化 ======
def normalize(mat):
    return mat / np.linalg.norm(mat, axis=1, keepdims=True)

catalog_emb = normalize(catalog_emb)
proj_emb = normalize(proj_emb)
item_emb = normalize(item_emb)

# ====== 计算相似度 ======
sim_proj = proj_emb @ catalog_emb.T
sim_item = item_emb @ catalog_emb.T
sim_combined = np.maximum(sim_proj, sim_item)

# ====== 找到最佳匹配索引 ======
best_idx = np.argmax(sim_combined, axis=1)
best_score = np.max(sim_combined, axis=1)

# ====== 根据索引取目录表的列 ======
proc["subcategory_code"] = catalog_codes[best_idx]
proc["subcategory_name"] = catalog_names[best_idx]
proc["category"]         = catalog_cats[best_idx]
proc["pred_score"]       = best_score

# ====== 保存结果 ======
proc.to_pickle("procurement_classified.pkl")
proc.to_csv("procurement_classified.csv", index=False)

proc[["项目名称", "主要标的名称", 
            "subcategory_code", "subcategory_name", "category", "pred_score"]].sample(10)


Unnamed: 0,项目名称,主要标的名称,subcategory_code,subcategory_name,category,pred_score
390,井陉县非物质文化遗产博物馆项目家具采购,井陉县非物质文化遗产博物馆项目家具采购,A03021700,家具（文物）,货物,0.630838
2258,固原市原州区综合执法局环卫专业作业车辆政府采购项目三标段（高压清洗车）,清洗车,A02030629,街道清洗清扫车,货物,0.673537
1087,保定市第三中心医院医疗设备采购项目（第二批）,超声波诊断仪,A02320500,医用超声波仪器及设备,货物,0.666203
6396,湛江经开区水库标准化建设项目,湛江经开区水库标准化建设,B02090800,水库工程施工,工程,0.582859
829,河北省中医院财政贴息贷款项目(一)电子鼻咽喉镜\\64排及以上CT等设备,超声诊断系统,A02052403,真空检测设备,货物,0.577025
7600,河北省中医院2023年省级医疗机构发展项目,神经肌肉刺激治疗仪,A02320000,医疗设备,货物,0.515028
3480,广州市启明学校2023年视障学生生活托管服务,视障学生生活托管服务,C02030000,中等教育服务,服务,0.480576
6209,华阴市柳叶河西片区历史遗留废弃矿山生态保护修复项目勘查设计,华阴市柳叶河西片区历史遗留废弃矿山生态保护修复项目勘查设计,B02090500,滞蓄洪区工程施工,工程,0.522945
508,广东省未成年犯管教所（白云监狱）物业管理服务项目,广东省未成年犯管教所（白云监狱）物业管理服务,B01023000,监狱用房施工,工程,0.577706
4262,全民健康平台软件原有基础上升级,全民健康软件平台原有基础上升级,A08060301,基础软件,货物,0.468871


In [17]:
import pandas as pd

# 读取分类结果
proc = pd.read_pickle("procurement_classified.pkl")

# 统计 pred_score < 0.6 的数量
low_score_count = (proc["pred_score"] <  0.6).sum()
low_score_ratio = low_score_count / len(proc)

print(f"低于 0.6 的数量: {low_score_count}")
print(f"占比: {low_score_ratio:.2%}")


低于 0.6 的数量: 8013
占比: 80.13%


In [4]:
proc["pred_score"].describe()

count    10000.000000
mean         0.550874
std          0.063897
min          0.335380
25%          0.507778
50%          0.547022
75%          0.588977
max          0.987418
Name: pred_score, dtype: float64

In [None]:
proc[proc["pred_score"] >= 0.6].sample(10)

Unnamed: 0,合同编号,项目编号,项目名称,采购人甲方,采购人地址,采购人联系方式,供应商乙方,供应商地址,供应商联系方式,主要标的名称,...,prov,city,county,region,subcategory_code,subcategory_name,category,embedding_project,embedding_item,pred_score
9397,[230826]zzgj[GK]20230002,[230826]zzgj[GK]20230002,桦川县公租房采购项目,桦川县住房和城乡建设局,桦川建设大厦7楼,15145464477,黑龙江天堃房地产开发有限公司,桦川县华溪名苑,18945-077770,住宅,...,,,桦川县,黑龙江省,A01010500,住宅,货物,"[-0.03411426022648811, -0.033378422260284424, ...","[0.020089391618967056, 0.029727019369602203, 0...",0.636485
3460,N5119212023000062-1,N5119212023000062,通江县铁佛镇潘家河水库枢纽除险加固工程,通江县水利局,通江县红峰大厦,0827-7220996,四川旺乐建筑工程有限公司,成都市锦江区红星路一段35号附1号,15182757999,通江县铁佛镇潘家河水库枢纽除险加固工程,...,,,通江县,四川省,B02090100,水利枢纽工程施工,工程,"[-0.012981835752725601, 0.02558642067015171, 0...","[-0.012981835752725601, 0.02558642067015171, 0...",0.603723
6677,AZQZCS-C-F-220013,AZQZCS-C-F-220013,阿拉善左旗兽医社会化服务工作项目,阿拉善左旗农牧局,阿拉善左旗巴彦浩特镇西城区农牧大楼7楼,13948020123,阿拉善左旗巴彦浩特镇塔文额日敦兽药店,阿拉善左旗西城区,13948004618,兽医社会化服务,...,,,阿拉善左旗,内蒙古自治区,C09030100,兽医和动物病防治服务,服务,"[0.015827743336558342, 0.007691537961363792, 0...","[0.008424617350101471, 0.007616101298481226, 0...",0.590235
4897,N5133112022000064,N5133112022000064,甘孜州大数据中心数字甘孜总体规划编制服务采购项目,甘孜藏族自治州人民政府办公室,康定市炉城镇光明路136号,08362866265,四川通信科研规划设计有限责任公司,成都高新区天韵路186号高新国际广场E座5/6楼,02885935120,城市规划和设计服务,...,,康定市,,四川省,C13010000,区域规划和设计服务,服务,"[-0.043347470462322235, -0.02950195223093033, ...","[0.015374705195426941, 0.0017902054823935032, ...",0.673692
375,20201124HT(WZ)000010,D6403000141003043-1,同心县住房和城乡建设局户外灭蚊器采购项目,同心县住房和城乡建设局,同心县住房和城乡建设局,0953-8022479,宁夏文旭科贸有限公司,吴忠市利通区利华街东侧吴灵公路南侧湖苑名邸九号住宅楼2102号,13309539906,户外蚊虫诱灭器,...,,,同心县,宁夏回族自治区,A02061912,除害虫用灯,货物,"[0.018913060426712036, -0.0009647423285059631,...","[0.033046696335077286, 0.02068505994975567, -0...",0.597496
5013,N5117222022000071,N5117222022000071,全数字高档彩色多普勒超声诊断仪等设备采购项目,宣汉县中医院,宣汉县蒲江街道西华大道700号,0818-5223135,四川柯迈生物科技有限公司,成都市金牛区星辉大厦0725号,13981723090,半导体激光脱毛仪,...,,,宣汉县,四川省,A02100309,激光仪器,货物,"[-0.04066682979464531, -0.026238828897476196, ...","[0.0090708676725626, 0.017399396747350693, -0....",0.618648
1,N5101162023000017-1,N5101162023000017,2022年第三批医疗设备采购项目,成都市双流区中医医院,成都市双流区东升街道花园路二段,69803260,四川省恒世康医药有限责任公司,成都市金牛区友联一街18号8幢7层702-711号,028-83184908,一体化便携监护仪等,...,,成都市,双流区,四川省,A02102400,航天仪器,货物,"[-0.05551446974277496, -0.009064814075827599, ...","[0.027859341353178024, 0.025057915598154068, 0...",0.599179
2457,BAZB20304101,HB2020093610040038,1号住院楼医学影像科DSA、CT机房防护修缮工程,河北医科大学第二医院,河北医科大学第二医院,0311-66002163,河北鸿程建筑工程有限公司,石家庄市新华区中华大街298号颐宏大厦02单元1909,0311-89296927,河北医科大学第二医院1号住院楼医学影像科DSA、CT机房防护修缮工程,...,,,,河北省,B01021200,医疗卫生用房施工,工程,"[0.0049405391328036785, 0.005491976160556078, ...","[0.004262268077582121, -0.018243659287691116, ...",0.590985
2083,HBCJ【2021】1208号-03,HB2021128230010005,鹿泉经济开发区投资项目咨询评估机构委托服务项目,河北鹿泉经济开发区管理委员会本级,河北鹿泉经济开发区管理委员会本级,0311-67361673,大成工程咨询有限公司,郑州市金水区经三路15号1号楼A区12层1202号,15131158251,投资项目咨询评估机构委托服务（投资额 3000 万元-1 亿（含））；,...,,,河北鹿泉经济开发区,河北省,C20030900,评审咨询服务,服务,"[-0.028036698698997498, 0.019285915419459343, ...","[-0.004890156909823418, 0.03959038108587265, 0...",0.596204
1926,WHZCHNS-C-G-220040,WHZCHNS-C-G-220040,拉僧庙镇农作物收储加工基础设施项目消防水池建设项目,乌海市海南区拉僧庙镇人民政府,内蒙古自治区乌海市 海南区拉僧庙镇,13947308883,内蒙古泰利工程建设有限公司,内蒙古呼和浩特市,15648883434,拉僧庙镇农作物收储加工基础设施项目消防水池建设项目,...,内蒙古自治区,乌海市,海南区,内蒙古自治区,B02090800,水库工程施工,工程,"[-0.034181147813797, 0.010854965075850487, 0.0...","[-0.03419281542301178, 0.010889963246881962, 0...",0.580302


In [7]:
df = pd.read_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_unmatched_items.csv", low_memory=False)

In [8]:
df.shape

(408863, 29)

In [25]:
import pandas as pd
import numpy as np
import glob
import os
from tqdm import tqdm   # pip install tqdm

# ====== 读取目录 embedding，只加载一次 ======
catalog = pd.read_pickle("/Users/yxy/UChi/Summer2025/Procurement/dta/catalog_with_embeddings.pkl")
catalog_emb = np.vstack(catalog["embedding"].apply(np.array).values)
catalog_emb = catalog_emb / np.linalg.norm(catalog_emb, axis=1, keepdims=True)

codes = catalog["编  码"].values
names = catalog["品目名称"].values
cats  = catalog["品目类别"].values

def normalize(m):
    return m / np.linalg.norm(m, axis=1, keepdims=True)

os.makedirs("/Users/yxy/UChi/Summer2025/Procurement/dta/classified", exist_ok=True)

# ====== 批量处理所有文件 ======
files = glob.glob("/Users/yxy/UChi/Summer2025/Procurement/dta/batch_emb/*.pkl")

for fname in tqdm(files, desc="Processing files"):
    proc = pd.read_pickle(fname)

    proj_emb = np.vstack(proc["embedding_project"].apply(np.array).values)
    item_emb = np.vstack(proc["embedding_item"].apply(np.array).values)

    proj_emb = normalize(proj_emb)
    item_emb = normalize(item_emb)

    sim_proj = proj_emb @ catalog_emb.T
    sim_item = item_emb @ catalog_emb.T
    sim_comb = np.maximum(sim_proj, sim_item)

    best_idx = np.argmax(sim_comb, axis=1)
    best_score = np.max(sim_comb, axis=1)

    proc["subcategory_code"] = codes[best_idx]
    proc["subcategory_name"] = names[best_idx]
    proc["category"] = cats[best_idx]
    proc["pred_score"] = best_score

    # 删除 embedding 列
    proc = proc.drop(columns=["embedding_project", "embedding_item"])

    # 输出到 classified 文件夹
    base = os.path.basename(fname).replace(".pkl", "_classified.csv")
    out = os.path.join("/Users/yxy/UChi/Summer2025/Procurement/dta/classified", base)
    proc.to_csv(out, index=False)

print("全部文件处理完成。")


Processing files: 100%|██████████| 41/41 [03:29<00:00,  5.11s/it]

全部文件处理完成。





In [26]:
import pandas as pd
import glob
import os

# ====== 找到所有 CSV 文件 ======
files = glob.glob("/Users/yxy/UChi/Summer2025/Procurement/dta/classified/*.csv")

# ====== 读入并合并 ======
dfs = []
for f in files:
    df = pd.read_csv(f)
    dfs.append(df)

merged = pd.concat(dfs, ignore_index=True)

# ====== 保存合并后的结果 ======
merged.to_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/classified_emb.csv", index=False)

print(f"合并完成，共 {len(files)} 个文件，总行数 {len(merged)}")


合并完成，共 41 个文件，总行数 408863


In [28]:
low_score_count = (merged["pred_score"] <  0.6).sum()
low_score_ratio = low_score_count / len(merged)

print(f"低于 0.6 的数量: {low_score_count}")
print(f"占比: {low_score_ratio:.2%}")

低于 0.6 的数量: 325989
占比: 79.73%


In [None]:
merged[merged["pred_score"] >= 0.6].sample(10)

In [32]:
import pandas as pd


merged_well = merged[merged["pred_score"] >= 0.6]

df_mat = pd.read_csv(
    "/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_matched_items.csv",
    low_memory=False
)

# 上下拼接
df_final = pd.concat([df_mat, merged_well], ignore_index=True)

# 保存
df_final.to_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_classified.csv", index=False)

print(f"拼接完成，总行数 {len(df_final)}，保存为 china_procurement_classified.csv")


拼接完成，总行数 92874，保存为 china_procurement_classified.csv


In [None]:
df_final['subcategory_name'].value_counts(dropna=False).head(50)

In [39]:
df_final['category'].value_counts(dropna=False)

category
货物    54110
服务    19589
工程    19175
Name: count, dtype: int64