# Data cleaning

先通过stata生成变量amount，num_goods, price_goods,读入数据，检查基本信息。中国政府采购、公共采购主要分为货物、服务、工程三大类，分别有不同的公开招标金额标准。
根据主要标的名称和项目名称中词汇出现的词频，选出top 100关键词，对关键词进行手动分类，排除其中容易出现歧义的部分，比如油，可能是“燃油采购”（货物），也可能是“加油服务”（服务）。对包含关键词的采购项目进行分类，在通过机器学习进行分类。

Policy：
关于印发《地方预算单位政府集中采购目录及标准指引（2020年版）》的通知 （2021年实施）
各地应逐步规范集中采购范围，取消市、县级集中采购目录，实现集中采购目录省域范围相对统一，充分发挥集中采购制度优势，不断提升集中采购服务质量和专业水平
省级单位政府采购货物、服务项目分散采购限额标准不应低于50万元，市县级单位政府采购货物、服务项目分散采购限额标准不应低于30万元，政府采购工程项目分散采购限额标准不应低于60万元

In [4]:
import pandas as pd

pd.set_option("display.max_columns", None)   
pd.set_option("display.max_rows", 100)
pd.set_option("display.width", None)     

In [2]:

csv_file = "/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_clean1.csv"

df = pd.read_csv(csv_file, low_memory=False)


## Check data

In [3]:
df.columns

Index(['合同编号', '项目编号', '项目名称', '采购人甲方', '采购人地址', '采购人联系方式', '供应商乙方', '供应商地址',
       '供应商联系方式', '主要标的名称', '规格型号或服务要求', '主要标的数量', '主要标的单价', '合同金额万元',
       '履约期限地点等简要信息', '采购方式', '合同签订日期', '合同公告日期', '年份', 'amount', 'num_goods',
       'price_goods'],
      dtype='object')

In [15]:
df_filtered = df[(df['amount'] >= 50) & (df['amount'] <= 500) & (df['年份'] >= 2020)]

In [16]:
df_filtered.shape

(568380, 22)

In [26]:
df_filtered['年份'].value_counts()

年份
2023    204080
2022    169625
2021    115001
2020     54029
2024     25645
Name: count, dtype: int64

In [25]:
df_filtered['amount'].describe()

count    568380.000000
mean        167.217049
std         105.774489
min          50.000000
25%          84.500000
50%         134.560000
75%         220.000000
max         500.000000
Name: amount, dtype: float64

## Generate training set
### top 200 items
给出现频率前200的标的物名称手动标注类别，使用了chatgpt+人工检查，对一部分“无”，“详情见合同”，标注了类别notsure

In [211]:
out_path_kw = "/Users/yxy/UChi/Summer2025/Procurement/dta/keywords.csv"

In [218]:
import pandas as pd
out_path_kw = "/Users/yxy/UChi/Summer2025/Procurement/dta/keywords.csv"
top100_items = df_filtered["主要标的名称"].value_counts().head(200).reset_index()
top100_items.columns = ["keyword", "count"]

top100_items["category"] = ""

top100_items.to_csv(out_path_kw, index=False, encoding="utf-8-sig")

print("Top 200 items exported to keywords.csv for manual categorization.")


Top 200 items exported to keywords.csv for manual categorization.


In [233]:

classified = pd.read_csv(out_path_kw)

df_filtered = df_filtered.merge(classified[["keyword", "category"]], 
              left_on="主要标的名称", 
              right_on="keyword", 
              how="left")

df_filtered.rename(columns={"category": "cat"}, inplace=True)

df_filtered.drop(columns=["keyword"], inplace=True)



In [234]:
df_filtered['cat'].value_counts(dropna=False)

cat
NaN        475544
服务          59403
notsure     24573
货物          21737
工程          17656
Name: count, dtype: int64

In [None]:
df_uncat = df_filtered[df_filtered['cat'].isna() | (df_filtered['cat'] == 'notsure')]
print(df_uncat['主要标的名称'].value_counts().iloc[300:350])

In [None]:
df_uncat['主要标的名称'].value_counts().head(50)

### top 100 words
把还未完成分类的部分，主要标的物名称拆分成词组，选词频前100的词汇，手动标注类别

In [236]:
import re
from collections import Counter

df_nacat = df_filtered[df_filtered['cat'].isna()]

all_words = []
for name in df_nacat["主要标的名称"].astype(str):
    words = re.split(r"[、，,（）() ]", name)
    all_words.extend(words)

counter = Counter(all_words)
top100 = [w for w, _ in counter.most_common(100) if w.strip() != ""]

mask = df_nacat["主要标的名称"].astype(str).apply(
    lambda x: any(w in x for w in top100)
)
covered_rows = mask.sum()
total_rows = mask.shape[0]

print("rows containing top 100 words: {}".format(covered_rows))
print("cover rate : {:.2f}%".format(covered_rows / total_rows * 100))


rows containing top 100 words: 349918
cover rate : 73.58%


In [237]:
top100_words = counter.most_common(100)
top100_words = pd.DataFrame(top100_words, columns=["keyword", "count"])
top100_words["category"] = ""
top100_words.to_csv(out_path_kw, mode="a", index=False, header=False, encoding="utf-8-sig")

print("Top 100 words exported to keywords.csv for manual categorization.")

Top 100 words exported to keywords.csv for manual categorization.


In [242]:

kw_df = pd.read_csv(out_path_kw)

mapping = dict(zip(kw_df["keyword"].astype(str), kw_df["category"].astype(str)))

def classify_name(name):
    if pd.isna(name) or not isinstance(name, str) or name.strip() == "":
        return pd.NA
    
    words = re.split(r"[、，,（）() ]", name)
    categories = set()

    for w in words:
        if w in mapping:
            categories.add(mapping[w])
    
    if len(categories) == 1:
        return categories.pop()
    elif len(categories) > 1:
        return "notsure"
    else:
        return pd.NA

mask = df_filtered["cat"].isna()
df_filtered.loc[mask, "cat"] = df_filtered.loc[mask, "主要标的名称"].apply(classify_name)


print(df_filtered["cat"].value_counts(dropna=False))


cat
<NA>       448454
服务          61176
notsure     43047
货物          27644
工程          18592
Name: count, dtype: int64


In [246]:
valid_cats = ["货物", "工程", "服务"]

df_filtered[df_filtered["cat"].isin(valid_cats)]["年份"].value_counts()



年份
2022    43230
2023    37938
2021    17856
2024     5360
2020     3016
2019        8
2016        3
2018        1
Name: count, dtype: int64

### 用项目名称分类

In [253]:
invalid_keywords = {
    "详见合同",
    "详见附件",
    "无",
    "1",
    "详见合同文本。",
    "无无",
    "见附件",
    "详见合同附件",
    "-",
    "/",
    "详见招标文件",
    "见合同",
    "1,采购数量1;",
    "详见附件。",
    "第一包",
    "详见中标公告",
    "见附件合同",
    "货物交付"
}
df_filtered.loc[df['cat'].isin(invalid_keywords), 'cat'] = 'notsure'

In [None]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


train_df = kw_df[kw_df["category"].isin(["货物","服务","工程"])]
X_train = train_df["keyword"].astype(str)
y_train = train_df["category"]

vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1,3))
X_train_vec = vectorizer.fit_transform(X_train)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)

def classify_ml(text):
    if pd.isna(text) or not isinstance(text, str) or text.strip() == "":
        return pd.NA
    vec = vectorizer.transform([text])
    return clf.predict(vec)[0]

def classify_row(row):
    # 原来已经有明确分类的直接保留
    if pd.notna(row["cat"]) and row["cat"] in ["货物", "服务", "工程"]:
        return row["cat"]

    # 否则尝试重新分类
    cat = classify_name(row["主要标的名称"])
    if pd.isna(cat) or cat == "notsure":
        if (isinstance(row["主要标的名称"], str) and row["主要标的名称"] in invalid_keywords) or cat == "notsure":
            return classify_ml(row["项目名称"])
        else:
            return pd.NA
    else:
        return cat


df_filtered["cat_final"] = df_filtered.apply(classify_row, axis=1)



In [265]:
df_filtered['cat'].value_counts(dropna=False)

cat
<NA>       448454
服务          61176
notsure     43047
货物          27644
工程          18592
Name: count, dtype: int64

In [264]:
df_filtered['cat_final'].value_counts(dropna=False)

cat_final
<NA>    452278
服务       68105
货物       62294
工程       16236
Name: count, dtype: int64

In [None]:
out_path_kw = "/Users/yxy/UChi/Summer2025/Procurement/dta/keywords.csv"

In [None]:
import pandas as pd
out_path_kw = "/Users/yxy/UChi/Summer2025/Procurement/dta/keywords.csv"
top100_items = df_filtered["主要标的名称"].value_counts().head(200).reset_index()
top100_items.columns = ["keyword", "count"]

top100_items["category"] = ""

top100_items.to_csv(out_path_kw, index=False, encoding="utf-8-sig")

print("Top 200 items exported to keywords.csv for manual categorization.")


Top 200 items exported to keywords.csv for manual categorization.


In [None]:

classified = pd.read_csv(out_path_kw)

df_filtered = df_filtered.merge(classified[["keyword", "category"]], 
              left_on="主要标的名称", 
              right_on="keyword", 
              how="left")

df_filtered.rename(columns={"category": "cat"}, inplace=True)

df_filtered.drop(columns=["keyword"], inplace=True)



In [None]:
df_filtered['cat'].value_counts(dropna=False)

cat
NaN        475544
服务          59403
notsure     24573
货物          21737
工程          17656
Name: count, dtype: int64

In [None]:
df_uncat = df_filtered[df_filtered['cat'].isna() | (df_filtered['cat'] == 'notsure')]
print(df_uncat['主要标的名称'].value_counts().iloc[300:350])

In [None]:
df_uncat['主要标的名称'].value_counts().head(50)

### top 100 words
把还未完成分类的部分，主要标的物名称拆分成词组，选词频前100的词汇，手动标注类别

In [None]:
import re
from collections import Counter

df_nacat = df_filtered[df_filtered['cat'].isna()]

all_words = []
for name in df_nacat["主要标的名称"].astype(str):
    words = re.split(r"[、，,（）() ]", name)
    all_words.extend(words)

counter = Counter(all_words)
top100 = [w for w, _ in counter.most_common(100) if w.strip() != ""]

mask = df_nacat["主要标的名称"].astype(str).apply(
    lambda x: any(w in x for w in top100)
)
covered_rows = mask.sum()
total_rows = mask.shape[0]

print("rows containing top 100 words: {}".format(covered_rows))
print("cover rate : {:.2f}%".format(covered_rows / total_rows * 100))


rows containing top 100 words: 349918
cover rate : 73.58%


In [None]:
top100_words = counter.most_common(100)
top100_words = pd.DataFrame(top100_words, columns=["keyword", "count"])
top100_words["category"] = ""
top100_words.to_csv(out_path_kw, mode="a", index=False, header=False, encoding="utf-8-sig")

print("Top 100 words exported to keywords.csv for manual categorization.")

Top 100 words exported to keywords.csv for manual categorization.


In [None]:

kw_df = pd.read_csv(out_path_kw)

mapping = dict(zip(kw_df["keyword"].astype(str), kw_df["category"].astype(str)))

def classify_name(name):
    if pd.isna(name) or not isinstance(name, str) or name.strip() == "":
        return pd.NA
    
    words = re.split(r"[、，,（）() ]", name)
    categories = set()

    for w in words:
        if w in mapping:
            categories.add(mapping[w])
    
    if len(categories) == 1:
        return categories.pop()
    elif len(categories) > 1:
        return "notsure"
    else:
        return pd.NA

mask = df_filtered["cat"].isna()
df_filtered.loc[mask, "cat"] = df_filtered.loc[mask, "主要标的名称"].apply(classify_name)


print(df_filtered["cat"].value_counts(dropna=False))


cat
<NA>       448454
服务          61176
notsure     43047
货物          27644
工程          18592
Name: count, dtype: int64


In [None]:
valid_cats = ["货物", "工程", "服务"]

df_filtered[df_filtered["cat"].isin(valid_cats)]["年份"].value_counts()



年份
2022    43230
2023    37938
2021    17856
2024     5360
2020     3016
2019        8
2016        3
2018        1
Name: count, dtype: int64

### 用项目名称分类

In [None]:
invalid_keywords = {
    "详见合同",
    "详见附件",
    "无",
    "1",
    "详见合同文本。",
    "无无",
    "见附件",
    "详见合同附件",
    "-",
    "/",
    "详见招标文件",
    "见合同",
    "1,采购数量1;",
    "详见附件。",
    "第一包",
    "详见中标公告",
    "见附件合同",
    "货物交付"
}
df_filtered.loc[df['cat'].isin(invalid_keywords), 'cat'] = 'notsure'

In [None]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


train_df = kw_df[kw_df["category"].isin(["货物","服务","工程"])]
X_train = train_df["keyword"].astype(str)
y_train = train_df["category"]

vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1,3))
X_train_vec = vectorizer.fit_transform(X_train)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)

def classify_ml(text):
    if pd.isna(text) or not isinstance(text, str) or text.strip() == "":
        return pd.NA
    vec = vectorizer.transform([text])
    return clf.predict(vec)[0]

def classify_row(row):
    # 原来已经有明确分类的直接保留
    if pd.notna(row["cat"]) and row["cat"] in ["货物", "服务", "工程"]:
        return row["cat"]

    # 否则尝试重新分类
    cat = classify_name(row["主要标的名称"])
    if pd.isna(cat) or cat == "notsure":
        if (isinstance(row["主要标的名称"], str) and row["主要标的名称"] in invalid_keywords) or cat == "notsure":
            return classify_ml(row["项目名称"])
        else:
            return pd.NA
    else:
        return cat


df_filtered["cat_final"] = df_filtered.apply(classify_row, axis=1)



In [None]:
df_filtered['cat'].value_counts(dropna=False)

cat
<NA>       448454
服务          61176
notsure     43047
货物          27644
工程          18592
Name: count, dtype: int64

In [None]:
df_filtered['cat_final'].value_counts(dropna=False)

cat_final
<NA>    452278
服务       68105
货物       62294
工程       16236
Name: count, dtype: int64

## 确定省份

In [5]:
import geopandas as gpd

shp_path = "/Users/yxy/UChi/Summer2025/Procurement/raw/Countylevel_Admin_2020/China2020County.shp"
gdf = gpd.read_file(shp_path)

gdf.columns


Index(['地名', '区划码', '县级', '县级码', '县级类', '地级', '地级码', '地级类', '省级', '省级码', '省级类',
       '曾用名', '备注', 'ENG_NAME', 'VAR_NAME', 'code', 'NAME_3', 'VAR_NAME3',
       'GID_3', 'TYPE_3', 'NAME_2', 'VAR_NAME2', 'GID_2', 'TYPE_2', 'NAME_1',
       'VAR_NAME1', 'GID_1', 'TYPE_1', 'year', 'geometry'],
      dtype='object')

In [23]:
gdf.shape

(2857, 30)

In [6]:
gdf = gdf[['省级', '地级', '县级']]

In [7]:
import re
def extract_prov_city_county(text: str):
    if not isinstance(text, str):  
        return None, None, None
    text = re.sub(r"[-_\s·、，,\.]()（）*。", "", text) 
    # text = re.sub(r"[^\u4e00-\u9fa5]", "", text)

    prov_pattern = r"(.*?(省|自治区|市))"    
    city_pattern = r"(.*?(市|地区|盟|州))"  
    county_pattern = r"(.*?(县|区|旗))"     
    
    prov, city, county = None, None, None
    municipalities = ["北京市","天津市","上海市","重庆市"]

    prov_match = re.match(prov_pattern, text)
    if prov_match:
        if '市' in prov_match.group(1) and prov_match.group(1) not in municipalities:
            prov = None
        else: 
            prov = prov_match.group(1)
            text = text[len(prov):]
    city_match = re.match(city_pattern, text)
    if city_match:
        city_candidate = city_match.group(1)
        if city_candidate.endswith("州") and text.startswith("市", len(city_candidate)):
            city = city_candidate + "市"
            text = text[len(city_candidate) + 1:] 
        else:
            city = city_candidate
            text = text[len(city):]
        
    county_match = re.match(county_pattern, text)
    if county_match:
        county = county_match.group(1)

    return prov, city, county

examples = [
    "南沙区横沥镇明珠湾起步区横沥岛西侧",
    "稀土高新区区属中小学校幼儿园校舍消防安全评估",
    "土默特左旗",
    "北京市朝阳区",
    "天津市",
    1,
    "石家庄市长安区",
    "兰州市市政工程服务中心",
    "广东省-佛山市-禅城区广东省佛山市禅城区汾江中路139号",
    "广州市越秀区广州大道中367号B26G"
]

for ex in examples:
    print(ex, "->", extract_prov_city_county(ex))



南沙区横沥镇明珠湾起步区横沥岛西侧 -> (None, None, '南沙区')
稀土高新区区属中小学校幼儿园校舍消防安全评估 -> (None, None, '稀土高新区')
土默特左旗 -> (None, None, '土默特左旗')
北京市朝阳区 -> ('北京市', None, '朝阳区')
天津市 -> ('天津市', None, None)
1 -> (None, None, None)
石家庄市长安区 -> (None, '石家庄市', '长安区')
兰州市市政工程服务中心 -> (None, '兰州市', None)
广东省-佛山市-禅城区广东省佛山市禅城区汾江中路139号 -> ('广东省', '-佛山市', '-禅城区')
广州市越秀区广州大道中367号B26G -> (None, '广州市', '越秀区')


In [None]:
df_sampled = df_filtered.sample(10,random_state=42)
df_sampled

In [88]:

def fill_location(row):
    prov, city, county = None, None, None

    p1, c1, ct1 = extract_prov_city_county(row["采购人地址"])
    prov, city, county = p1, c1, ct1

    if prov is None or city is None or county is None:
        p2, c2, ct2 = extract_prov_city_county(row["采购人甲方"])
        if prov is None: prov = p2
        if city is None: city = c2
        if county is None: county = ct2

    if prov is None or city is None or county is None:
        p3, c3, ct3 = extract_prov_city_county(row["项目名称"])
        if prov is None: prov = p3
        if city is None: city = c3
        if county is None: county = ct3

    return pd.Series([prov, city, county])



df_filtered[["prov", "city", "county"]] = df_filtered.apply(fill_location, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[["prov", "city", "county"]] = df_filtered.apply(fill_location, axis=1)


In [None]:
df_filtered.sample(5)

In [102]:
def match_region(row, gdf):
    if row["prov"]:
        match = gdf[gdf["省级"] == row["prov"]]
        if not match.empty:
            return match["省级"].iloc[0]
    if row["city"]:
        match = gdf[gdf["地级"].str.contains(str(row["city"]).replace("市",""), na=False, regex=False)]
        if not match.empty:
            return match["省级"].iloc[0]
        match = gdf[gdf["县级"].str.contains(str(row["city"]).replace("市",""), na=False, regex=False)]
        if not match.empty:
            return match["省级"].iloc[0]
    if row["county"]:
        match = gdf[gdf["县级"] == row["county"]]
        if not match.empty:
            return match["省级"].iloc[0]

    return None



df_filtered["region"] = df_filtered.apply(lambda x: match_region(x, gdf), axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["region"] = df_filtered.apply(lambda x: match_region(x, gdf), axis=1)


In [114]:
df_filtered.to_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_region.csv", index=False, encoding="utf-8-sig")

In [8]:
df_filtered = pd.read_csv("/Users/yxy/UChi/Summer2025/Procurement/dta/china_procurement_region.csv", low_memory=False)

In [9]:
df_filtered['region'].value_counts(dropna=False)

region
广东省         103265
河北省          98872
四川省          63229
NaN          40601
甘肃省          35541
北京市          33474
内蒙古自治区       29781
陕西省          24342
宁夏回族自治区      17988
辽宁省          17254
福建省          15739
黑龙江省         15019
贵州省          14011
江苏省          10076
海南省           8738
上海市           4242
山东省           4074
云南省           3999
重庆市           3823
吉林省           3496
湖北省           3311
江西省           3157
浙江省           2180
天津市           2172
安徽省           2089
湖南省           1918
河南省           1530
广西壮族自治区       1343
新疆维吾尔自治区      1127
山西省            867
西藏自治区          689
青海省            433
Name: count, dtype: int64

In [16]:
df_unmatched = df_filtered[df_filtered['region'].isna()]
df_unmatched.sample(5)

Unnamed: 0,合同编号,项目编号,项目名称,采购人甲方,采购人地址,采购人联系方式,供应商乙方,供应商地址,供应商联系方式,主要标的名称,规格型号或服务要求,主要标的数量,主要标的单价,合同金额万元,履约期限地点等简要信息,采购方式,合同签订日期,合同公告日期,年份,amount,num_goods,price_goods,prov,city,county,region
191735,HW20230275,ZKQ2023-020403093ZF（H）,武汉大学学生宿舍实木靠背椅采购项目,武汉大学,武汉大学,027-68754597,江西宏辉实业有限公司,江西省抚州市南城县株良镇,17607948819,学生宿舍实木靠背椅,型号YH-01 规格440×460×850mm,1套,896000,89.6,武汉,公开招标,2023-04-06,2023-04-23,2023,89.599998,,896000.0,,,,
41495,gyc2021-0151,SDSM2021-3183,山东大学齐鲁医院超声心动图信息系统升级项目,山东大学齐鲁医院,济南文化西路107号,0531-82169126,山东玺诺信息技术有限公司,山东济南华强国际A栋901,山东玺诺信息技术有限公司,美迪康医学影像管理系统,美迪康,1,644500,64.45,山东大学齐鲁医院,公开招标,2021-04-12,2021-06-28,2021,64.449997,1.0,644500.0,,,,
566823,HiGT0250-GDS-CO020,4008830171695698322219,高效低碳燃气轮机试验装置国家重大科技基础设施项目-高效新型循环试验台天然气调压站,中国科学院工程热物理研究所,北四环西路11号,13681033652,天津市华旭燃气设备制造厂,天津市北辰区高端装备制造产业园兴河路12号,022-26995310,天然气调压站,/,1,1690000.00,169.0,江苏连云港，交货进度90天。,竞争性磋商,2023-12-22,2024-01-15,2024,169.0,1.0,1690000.0,,,,
168683,N5109232022000165-1,N5109232022000165,“无差别综合窗口‘省内通办’试点”建设采购项目(二次),大英县行政审批局,大县天星大道行政服务大楼,13778705912,中国广电四川网络股份有限公司大英县分公司,大英县卓筒大道5号,19218119677,“无差别综合窗口‘省内通办’试点”建设,详见合同,1.0000项,1278000.000000元,127.8,大英县行政审批局,竞争性磋商,2023-01-03,2023-01-10,2023,127.8,,,“无差别综合窗口‘省,,大县,
559628,HT202306050,SDJDHD20230647-Z407,山东大学全合成生物筛选平台采购项目,山东大学,山东大学中心校区明德楼,0531-88365560,上海山里宏生物科技有限公司,上海市奉贤区奉金路559号2幢,18785762349,全合成生物筛选平台,Molecular Devices 等,1套,399999.00美元,286.399284,详见附件,公开招标,2024-01-02,2024-01-18,2024,286.39929,,,,,山东大学中心校区,


### use jieba to cut rows that are not matched

In [11]:
import jieba

def match_region_by_jieba(row, gdf):
    fields = ["采购人地址", "采购人甲方", "项目名称"]

    for field in fields:
        text = row.get(field, "")
        if not isinstance(text, str) or text.strip() == "":
            continue

        words = jieba.lcut(text)
        if not words:
            continue
        first_word = words[0]

        match = gdf[gdf["省级"].str.contains(first_word, na=False, regex=False)]
        if not match.empty:
            return match["省级"].iloc[0]

    return None

In [12]:
mask = df_filtered["region"].isna()
df_filtered.loc[mask, "region"] = df_filtered[mask].apply(
    lambda x: match_region_by_jieba(x, gdf), axis=1
)


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/4g/6_8lhyp147394q93651p5kyw0000gn/T/jieba.cache
Loading model cost 0.375 seconds.
Prefix dict has been built successfully.


In [None]:
df_filtered = df_filtered[
    ~df_filtered["region"].isin(["澳门特别行政区", "台湾省"])
]

df_filtered['region'].value_counts(dropna=False)

### test

# test code

In [239]:

keywords = ["项目", "二次", "一期", "二期", "一标段", "二标段", "第二批"]
pattern = "|".join(keywords)

mask = df["主要标的名称"].astype(str).str.contains(pattern, na=False)

df_subset = df[mask]

df_subset[["项目名称","主要标的名称", "合同金额万元"]].head(20)


Unnamed: 0,项目名称,主要标的名称,合同金额万元
12,江苏省消防救援总队2023年度全省器材（第二批）采购项目,江苏省消防救援总队2023年全省器材（第二批）,0.4539
25,2023年公用基础设施建设工程,公用基础设施建设项目EPC 总承包,937.25
28,成都市天府新区消防救援大队2023年伙食物资保障及配送服务项目,2023年伙食物资保障及配送服务项目,328.59
33,白山市森林消防支队长白山池南区大队迁建营区配套设施建设项目,吉林省森林消防总队白山市支队长白山池南区大队迁建营区配套设施项目,225.0
99,江苏省消防救援总队2023年度全省器材（第二批）采购项目,江苏省消防救援总队2023年全省器材（第二批）,4.2
103,北京林业大学学7、10号楼部分电梯采购安装项目,学7、10号楼部分电梯采购安装项目,187.52
203,第七次全国荒漠化和沙化调查技术准备,荒漠化石漠化等调查管理项目,180.0
242,天津市消防救援总队2023年常规消耗性器材第二批部门集中采购项目（高新支队）,常规消耗性器材（高新第二批第8包）,18.0
248,云南复杂地形下精细化温度预报产品研发,云南复杂地形下精细化温度预报产品研发项目,44.8
277,一张图综合决策移动APP建设,一张图综合决策移动APP建设项目,179.0


In [None]:
df.head()

In [None]:
df["主要标的名称"].value_counts()

In [None]:
df['项目名称'].unique()

In [None]:
df['项目名称'].nunique()

In [None]:
import re
from collections import Counter
import pandas as pd

# 统计所有词
all_words = []
for name in df["主要标的名称"].dropna().astype(str):
    words = re.split(r"[、，,（）() ]", name)
    all_words.extend(words)

# 前100高频词
counter = Counter(all_words)
top100 = [(w, c) for w, c in counter.most_common(100) if w.strip() != ""]

# 转成 DataFrame
kw_df = pd.DataFrame(top100, columns=["keyword", "count"])

# 新建一列 category，空着等你手动填
kw_df["category"] = ""

# 导出 CSV
kw_df.to_csv("keywords.csv", index=False, encoding="utf-8-sig")

print("已导出 keywords.csv，可手动编辑 category 列")


In [None]:
import re
from collections import Counter
import pandas as pd
import os

def export_top_keywords(df, col_name, csv_path="/Users/yxy/UChi/Summer2025/Procurement/dta/keywords_classified.csv", topn=100):
    """
    从 df[col_name] 提取高频词，生成/追加到 csv 文件
    """
    # 分词
    all_words = []
    for name in df[col_name].dropna().astype(str):
        words = re.split(r"[、，,（）() ]", name)
        all_words.extend(words)

    # 高频词
    counter = Counter(all_words)
    topn_words = [(w, c) for w, c in counter.most_common(topn) if w.strip() != ""]
    new_df = pd.DataFrame(topn_words, columns=["keyword", "count"])
    new_df["category"] = ""

    # 如果文件已存在 → 读入旧文件并合并
    if os.path.exists(csv_path):
        old_df = pd.read_csv(csv_path)
        combined = pd.concat([old_df, new_df], ignore_index=True)
        # 去重（同一个 keyword 保留第一次出现）
        combined = combined.drop_duplicates(subset=["keyword"], keep="first")
    else:
        combined = new_df

    # 保存
    combined.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print(f"已更新 {csv_path}, 当前总词数: {len(combined)}")




In [None]:
import pandas as pd
import re

# 读取关键词分类文件
kw_file = "/Users/yxy/UChi/Summer2025/Procurement/dta/keywords_classified.csv"
kw_df = pd.read_csv(kw_file)

# 构造关键词 -> 类别映射
mapping = dict(zip(kw_df["keyword"].astype(str), kw_df["category"].astype(str)))

def classify_name(name):
    if pd.isna(name) or not isinstance(name, str) or name.strip() == "":
        return pd.NA
    
    words = re.split(r"[、，,（）() ]", name)
    categories = set()

    for w in words:
        if w in mapping:
            categories.add(mapping[w])
    
    if len(categories) == 1:
        return categories.pop()
    elif len(categories) > 1:
        return "notsure"
    else:
        return pd.NA

# 应用分类
df["cat"] = df["主要标的名称"].apply(classify_name)

# 检查结果
print(df["cat"].value_counts(dropna=False))


In [None]:
df.loc[df['cat'] == 'notsure', 'cat'] = '服务'


In [None]:
df["cat"].value_counts(dropna=False)

In [None]:
df_miss1 = df[df["cat"].isna()]
df_miss1.shape

In [None]:
export_top_keywords(df_miss1, "主要标的名称", "/Users/yxy/UChi/Summer2025/Procurement/dta/keywords_classified.csv", topn=100)

In [None]:
mask = df["主要标的名称"].astype(str).str.contains("汽油", na=False)
result = df.loc[mask]

print("匹配行数:", len(result))
result.head()


In [None]:
import pandas as pd
import re

kw_file = "/Users/yxy/UChi/Summer2025/Procurement/dta/keywords_classified.csv"
kw_df = pd.read_csv(kw_file)

mapping = dict(zip(kw_df["keyword"].astype(str), kw_df["category"].astype(str)))

def classify_name(name, current_cat):
    if isinstance(current_cat, str) and current_cat.strip() != "":
        return current_cat
    
    if pd.isna(name) or not isinstance(name, str) or name.strip() == "":
        return pd.NA
    
    words = re.split(r"[、，,（）() ]", name)
    categories = set()

    for w in words:
        if w in mapping:
            categories.add(mapping[w])
    
    if len(categories) == 1:
        return categories.pop()
    elif len(categories) > 1:
        return "notsure"
    else:
        return pd.NA

# 应用分类（对已有 cat=NA 的进行补充）
df["cat"] = df.apply(lambda row: classify_name(row["主要标的名称"], row.get("cat", "")), axis=1)

# 检查结果
print(df["cat"].value_counts(dropna=False))


In [None]:
pip install jieba


In [None]:
import jieba

text = "吉林省救援总队2023年全省抢险救灾装备项目（三）"
words = jieba.lcut(text)
print(words)
