In [17]:
# %%
import re

import pandas as pd

In [18]:
# ruff :noqa: E501
# Province -> town part of the address. Every named group is optional.
prov_town = (
    # 省: 2-5 characters other than '省', ending in 省 or 自治区.
    r'(?P<省>[^省]{2,5}(省|自治区))?'
    # 市: 2-3 characters other than '市', ending in 市.
    r'(?P<市>[^市]{2,3}市)?'
    # 县: 2-4 CJK characters ending in 县, 市 or 区.
    r'(?P<县>[\u4e00-\u9fa5]{2,4}(县|市|区))?'
    # 镇: 2-4 CJK characters ending in 街道 or 镇.
    r'(?P<镇>[\u4e00-\u9fa5]{2,4}(街道|镇))?'
)
# Village -> street part of the address.
cun_street = (
    # 行政村: shortest run of 2-4 CJK characters ending in 行政村, 社区, or a
    # 村 that is not directly preceded by 自然.
    r'(?P<行政村>[\u4e00-\u9fa5]{2,4}?(行政村|(?<!自然)村|社区))?'
    # 自然村: 2-4 CJK characters ending in 自然村, 村, 居委会 or 委员会.
    r'(?P<自然村>[\u4e00-\u9fa5]{2,4}(自然村|村|居委会|委员会))?'
    # 路: 1-4 CJK characters ending in 路, 街, 大道, 道 or 巷.
    r'(?P<路>[\u4e00-\u9fa5]{1,4}(路|街|大道|道|巷))?'
)
# House number, compound name and whatever text is left over.
num_none = (
    # 号: 1-4 digits followed by 号.
    r'(?P<号>\d{1,4}号)?'
    # 区: shortest run of 2-4 CJK/ASCII-alphanumeric characters ending in one
    # of the listed compound suffixes.
    r'(?P<区>[\u4e00-\u9fa5A-Za-z\d]{2,4}?(小区|区|园|洲|苑|府|宅|塘|公寓|屋|城))?'
    # ignore: any remaining run of CJK characters and digits.
    r'(?P<ignore>[\u4e00-\u9fa5\d]*)?'
)
# Complete address pattern: the three fragments above, in order.
pattern = ''.join((prov_town, cun_street, num_none))
# Literal keyword -> name of the field that keyword is appended to when it is
# found in the unparsed remainder. Empty by default.
keywords = dict()


def extract(addr: str, address_pattern=pattern) -> dict:
    """Split a Chinese address string into named components.

    The address is processed in four steps:

    1. ``address_pattern`` is matched at the start of ``addr``; every named
       group becomes a key of the result, with ``''`` for groups that did
       not take part in the match.
    2. Each key of the module-level ``keywords`` mapping that occurs in the
       ``'ignore'`` remainder is appended to the field the mapping names and
       removed (first occurrence only) from the remainder.
    3. ``address_pattern`` is searched again in the remainder. Fields that
       are still empty are filled; ``'路'`` and ``'自然村'`` are extended when
       they already hold a different value.
    4. If no ``'区'`` was found and the remainder starts with
       ``<text><digits>号``, that prefix is moved into ``'号'``.

    Args:
        addr: Address text to parse.
        address_pattern: Regular expression whose named groups define the
            result keys. The body reads the ``'ignore'``, ``'区'`` and ``'号'``
            keys, so a custom pattern is expected to define those groups.

    Returns:
        A dict mapping every group name to the extracted text (``''`` when
        nothing was found). ``'ignore'`` holds the text that could not be
        assigned to any field. When ``address_pattern`` does not match at
        all, every field is ``''`` and ``'ignore'`` is the whole address.
    """
    match = re.match(address_pattern, addr)
    if match is None:
        # Nothing recognised: report empty fields instead of failing on the
        # missing 'ignore' key further down.
        res = dict.fromkeys(re.compile(address_pattern).groupindex, '')
        res['ignore'] = addr
        return res
    res = match.groupdict(default='')

    # Keyword pass.
    for key, value in keywords.items():
        if key in res['ignore']:
            res[value] += key
            res['ignore'] = res['ignore'].replace(key, '', 1)

    # Second pass over the remainder.
    match2 = re.search(address_pattern, res['ignore'])
    if match2:
        for key, value in match2.groupdict(default='').items():
            # Empty groups change nothing, and the remainder group is what
            # is left after the other groups have been removed below.
            if not value or key == 'ignore':
                continue
            if res[key] == '':
                res[key] = value
            elif res[key] != value and key in {'路', '自然村'}:
                res[key] += value
            else:
                continue
            res['ignore'] = res['ignore'].replace(value, '')

    # Special case: a house number with a textual prefix left in the remainder.
    match3 = re.match(r'[\u4e00-\u9fa5\d]+?\d+号', res['ignore'])
    # groupdict(default='') never yields None, so test for emptiness.
    if match3 and not res['区']:
        # Append so that a house number captured earlier is not lost.
        res['号'] += match3.group()
        # match3 is anchored at the start, so drop exactly the matched prefix.
        res['ignore'] = res['ignore'][match3.end():]
    return res


In [19]:
import sys

# Command-line inputs, both optional:
#   argv[1] - address to look up (spaces are removed); '' when absent
#   argv[2] - path of the Excel workbook; 'address.xlsx' when absent
# NOTE(review): under a Jupyter kernel sys.argv usually carries the kernel's
# own launch arguments rather than user input - confirm how this cell is
# meant to be executed.
word = sys.argv[1].replace(' ', '') if len(sys.argv) > 1 else ''
fp_excel = sys.argv[2] if len(sys.argv) > 2 else 'address.xlsx'


In [20]:
# Raw address strings: the '联系地址' column of the workbook at fp_excel.
series = pd.read_excel(fp_excel)['联系地址']
# Parse every address with extract() and expand the resulting dicts into one
# column per address component.
df_excel = (
    series
    .map(extract)
    .apply(pd.Series)
)
# Keep the original text next to its parsed components.
df_excel['地址'] = series

In [26]:
# Example query; components may be written in any order.
word = '塘下自然村游埠镇'
query_dict = extract(word)
# Only the components recognised in the query take part in the lookup.
search_dict = [[k, v] for k, v in query_dict.items() if v]
df_res = df_excel.copy(deep=True)
for k, v in search_dict:
    # Build the mask from the frame being filtered, so the step does not
    # depend on index alignment between df_excel and the narrowed df_res.
    df_res = df_res.loc[df_res[k] == v]
print(df_res['地址'])


3    浙江省金华市兰溪市游埠镇前童村塘下自然村
Name: 地址, dtype: object
