In [None]:
import re
from pathlib import Path

import pandas as pd


In [None]:
def cut_head(s: str) -> str:
    """Strip leading noise from a raw cell and keep the address-like run.

    Everything before the first Chinese character is skipped, then the
    following uninterrupted run of Chinese characters and digits is kept.
    If an 11-digit number (a phone number) occurs anywhere in the input and
    is not already part of that run, it is appended to the result.

    Args:
        s: Raw cell value; it is coerced with ``str()`` so non-string
            values (e.g. NaN, numbers) are accepted.

    Returns:
        The cleaned text, or ``''`` when no Chinese/digit run is found.
    """
    pat_ignore = r'[^\u4e00-\u9fa5]*([\u4e00-\u9fa5\d]*)[^\u4e00-\u9fa5\d]?'
    s = str(s)
    match = re.match(pat_ignore, s)
    if not match:
        return ''
    addr = match.group(1)
    # The original pattern was '/d{11}' (literal slash + eleven 'd's), which
    # could never match a phone number; '\d{11}' is the intended digit class.
    search_phone = re.search(r'(\d{11})', s)
    if search_phone:
        phone = search_phone.group(1)
        # The captured run may already contain the number (digits are part
        # of the run); do not duplicate it in that case.
        if phone not in addr:
            addr += phone
    return addr


# ruff: noqa: E501
# Province-to-town fragment. Every named group is optional:
#   省: 2-5 characters other than 省, ending in 省 or 自治区
#   市: 2-3 characters other than 市, ending in 市
#   县: 2-4 Chinese characters ending in 县, 市 or 区
#   镇: 2-4 Chinese characters ending in 街道 or 镇
prov_town = r'(?P<省>[^省]{2,5}(省|自治区))?(?P<市>[^市]{2,3}市)?(?P<县>[\u4e00-\u9fa5]{2,4}(县|市|区))?(?P<镇>[\u4e00-\u9fa5]{2,4}(街道|镇))?'
# Village-and-street fragment. Every named group is optional:
#   行政村: shortest run of 2-4 Chinese characters ending in 行政村, 社区,
#           or a 村 that is not preceded by 自然
#   自然村: 2-4 Chinese characters ending in 自然村, 村, 居委会 or 委员会
#   路: 1-4 Chinese characters ending in 路, 街, 大道, 道 or 巷
cun_street = r'(?P<行政村>[\u4e00-\u9fa5]{2,4}?(行政村|(?<!自然)村|社区))?(?P<自然村>[\u4e00-\u9fa5]{2,4}(自然村|村|居委会|委员会))?(?P<路>[\u4e00-\u9fa5]{1,4}(路|街|大道|道|巷))?'
# Number-and-remainder fragment. Every named group is optional:
#   号: 1-4 digits followed by 号
#   区: shortest run of 2-4 Chinese characters, ASCII letters or digits
#       ending in one of the listed suffixes (小区, 区, 园, ...)
#   ignore: whatever Chinese characters and digits are left over
num_none = r'(?P<号>\d{1,4}号)?(?P<区>[\u4e00-\u9fa5A-Za-z\d]{2,4}?(小区|区|园|洲|苑|府|宅|塘|公寓|屋|城))?(?P<ignore>[\u4e00-\u9fa5\d]*)?'
# Full address pattern: the three fragments applied in sequence.
pattern = prov_town + cun_street + num_none
# Maps a literal keyword to the name of the result field it is appended to
# when the keyword is found in the leftover 'ignore' text. Empty by default.
keywords: dict[str, str] = {}


def extract(addr: str, address_pattern=pattern) -> dict:
    """Split a cleaned address string into named fields.

    The address is matched against ``address_pattern`` from its start; each
    named group of the pattern becomes a key of the result (unmatched groups
    are ``''``). The text captured in the ``ignore`` group is then refined:

    1. every key of the module-level ``keywords`` mapping found in the
       leftover text is moved to the field the mapping names;
    2. the pattern is run once more over the leftover text to fill fields
       that are still empty (``路`` and ``自然村`` are extended instead of
       being skipped when already set);
    3. if no ``区`` was found and the leftover text starts with
       "<text><digits>号", that prefix becomes the ``号`` field.

    Args:
        addr: Address text, normally the output of ``cut_head``.
        address_pattern: Regular expression with named groups; defaults to
            the module-level ``pattern``.

    Returns:
        A dict mapping each group name to the extracted text. When the
        pattern does not match at all, every field is ``''``.
    """
    regex = re.compile(address_pattern)
    match = regex.match(addr)
    if not match:
        # Previously this path left the result empty and crashed with a
        # KeyError on 'ignore'; return an all-empty record instead.
        empty = dict.fromkeys(regex.groupindex, '')
        empty.setdefault('ignore', '')
        return empty

    res: dict[str, str] = match.groupdict(default='')
    # Custom patterns may lack an 'ignore' group; the steps below need it.
    res.setdefault('ignore', '')

    # Recognise configured keywords inside the leftover text.
    for key, value in keywords.items():
        if key in res['ignore']:
            res[value] = res.get(value, '') + key
            res['ignore'] = res['ignore'].replace(key, '', 1)

    # Second pass: parse the leftover text with the same pattern.
    match2 = regex.search(res['ignore'])
    if match2:
        for key, value in match2.groupdict(default='').items():
            if res[key] == '':
                res[key] = value
                res['ignore'] = res['ignore'].replace(value, '')
            elif res[key] != value and key in {'路', '自然村'}:
                res[key] += value
                res['ignore'] = res['ignore'].replace(value, '')

    # Special case: promote a leading "<text><digits>号" to the 号 field.
    # groupdict(default='') never yields None, so the original
    # `res['区'] is None` test was always False; test for emptiness instead.
    match3 = re.match(r'[\u4e00-\u9fa5\d]+?\d+号', res['ignore'])
    if match3 and not res.get('区'):
        res['号'] = match3.group()
        res['ignore'] = res['ignore'].replace(match3.group(), '')
    return res


def convert_file(file_path: str) -> None:
    """Parse the addresses in an Excel file and write the result next to it.

    The first column of the sheet returned by ``pd.read_excel`` is cleaned
    with ``cut_head`` and split with ``extract``. The parsed fields plus the
    original text (column ``raw``) are written to ``convert_<name>`` in the
    same directory as the input, without the index.

    Args:
        file_path: Path of the Excel file to read.
    """
    columns = ['省', '市', '县', '镇', '行政村', '自然村', '路', '号', '区', 'ignore']
    raw: pd.Series = pd.read_excel(file_path).iloc[:, 0]
    records = [extract(cut_head(value)) for value in raw]
    # Build the frame in one step and select columns by name. This avoids
    # creating one Series per row (`.apply(pd.Series)`) and keeps the column
    # layout valid even when the sheet has no data rows.
    parsed = pd.DataFrame(records, index=raw.index, columns=columns)
    parsed['raw'] = raw
    src = Path(file_path)
    parsed.to_excel(src.with_name('convert_' + src.name), index=False)

In [None]:
if __name__ == '__main__':
    # Demo run on a single sample address; to process a workbook instead,
    # call convert_file('地址.xlsx').
    addr = '浙江省金华市兰溪市马涧镇严宅行政村塘下自然村8号'
    cleaned = cut_head(addr)
    asf = extract(cleaned)
    print(asf)