In [1]:
import re
import unicodedata

In [28]:
class JapaneseAddressParser:
    """Regex-based splitter for Japanese postal addresses.

    ``parse_address`` normalizes the input and then peels off, in order, the
    prefecture, the city/ward/town/village, the town area, the block numbers
    (chome / banchi / go) and whatever trails the block numbers (treated as
    the building name).
    """

    def __init__(self):
        # Prefecture: 2-3 characters outside the printable-ASCII range followed
        # by 県, any two characters followed by 府, or the literals 東京都 / 北海道.
        self.PREFECTURE_PATTERN = re.compile(r"([^ -~]{2,3}県|..府|東京都|北海道)(.+)")
        # City: an explicit list of city names that must be tried before the
        # generic lazy alternatives, then 郡+町/村, 市+区, and finally the
        # shortest prefix ending in 市/区/町/村.
        self.CITY_PATTERN = re.compile(
            r"^((?:旭川|伊達|石狩|盛岡|奥州|田村|南相馬|那須塩原|東村山|武蔵村山|羽村|十日町|上越|富山|野々市|大町|蒲郡|四日市|姫路|大和郡山|廿日市|下松|岩国|田川|大村)市|.+?郡(?:玉村|大町|.+?)[町村]|.+?市.+?区|.+?[市区町村])(.+)"
        )
        # Fine-tune the block-number matching here.
        self.ADDRESS_DETAILS_PATTERN = re.compile(
            r"(\d+|[一二三四五六七八九十百千万])+(\d+|[一二三四五六七八九十百千万]|丁目|丁|番地|番|号|-|‐|ー|−|の|東|西|南|北)*(丁目|丁|番地|番|号|\d+)"
        )

    @staticmethod
    def cleansing(location: str) -> str:
        """Return ``location`` normalized for pattern matching.

        Applies NFKC normalization, removes punctuation and brackets,
        upper-cases the text, strips the section markers 大字 / 小字 / 字,
        unifies a few spelling variants (ケ→ヶ, 之/ノ→の, 通り/通リ→通,
        上ル→上る, 下ル→下る), removes all whitespace and finally turns an
        ASCII ``~`` into the full-width ``～``.
        """
        location = unicodedata.normalize("NFKC", location)
        location = re.sub(r"[.,。、:・*゛'_+/\\+]", "", location)
        location = location.upper()
        location = re.sub(r"[【】≪≫《》〔〕\[\]<>「」()]", "", location)
        # Remove the markers as whole tokens. A character class here would
        # delete every 大 and 小 too, corrupting names such as 大阪府.
        location = re.sub(r"大字|小字|字", "", location)
        location = location.replace("ケ", "ヶ")
        location = location.replace("之", "の").replace("ノ", "の")
        location = location.replace("通り", "通").replace("通リ", "通")
        location = location.replace("上ル", "上る")
        location = location.replace("下ル", "下る")
        location = re.sub(r"\s+", "", location)
        location = location.replace("~", "～")

        return location

    @staticmethod
    def convert_kanji_to_int(string: str) -> str:
        """Rewrite the kanji numerals inside ``string`` as Arabic digits.

        Single-digit kanji (including the formal variants 壱, 弐, 参 and 拾)
        are translated character by character; the positional units 十, 百,
        千 and 万 are then expanded into the matching number of zeros.
        Non-numeric characters are left where they are.
        """
        result = string.translate(
            str.maketrans("零〇一壱二弐三参四五六七八九拾", "00112233456789十")
        )
        # Units must stay in ascending order: the smaller units are resolved
        # into digits first so the larger ones can absorb them as a suffix.
        convert_table = {"十": "0", "百": "00", "千": "000", "万": "0000"}
        unit_pattern = re.compile("|".join(convert_table))
        while unit_pattern.search(result):
            for unit, zeros in convert_table.items():
                # digits + unit + digits: pad the gap so the suffix lands in
                # the right column (e.g. 2百5 -> 205).
                for high, low in re.findall(rf"(\d+){unit}(\d+)", result):
                    result = result.replace(
                        high + unit + low, high + zeros[len(low):] + low
                    )
                # digits + unit: plain multiplication (2十 -> 20).
                for high in re.findall(rf"(\d+){unit}", result):
                    result = result.replace(high + unit, high + zeros)
                # unit + digits: implicit leading 1 (十9 -> 19).
                for low in re.findall(rf"{unit}(\d+)", result):
                    result = result.replace(
                        unit + low, "1" + zeros[len(low):] + low
                    )
                # A bare unit stands for 1 followed by its zeros.
                result = result.replace(unit, "1" + zeros)
        return result

    @staticmethod
    def modify_block(block: str) -> str:
        """Return every digit run found in ``block`` joined with hyphens."""
        return "-".join(re.findall(r"\d+", block))

    def parse_address(self, address: str) -> dict:
        """Split ``address`` into its components.

        Returns a dict with the keys ``prefecture``, ``city``, ``town``,
        ``block`` and ``building``. ``prefecture`` and ``city`` are ``None``
        when their pattern does not match. When no block numbers are found,
        ``town``, ``block`` and ``building`` are all empty strings.
        """
        address = self.cleansing(address)

        prefecture_match = self.PREFECTURE_PATTERN.match(address)
        if prefecture_match:
            prefecture = prefecture_match.group(1)
            remaining_address = prefecture_match.group(2)
        else:
            prefecture = None
            remaining_address = address

        city_match = self.CITY_PATTERN.match(remaining_address)
        if city_match:
            city = city_match.group(1)
            remaining_address = city_match.group(2)
        else:
            city = None

        # Search once and branch on the result instead of relying on a
        # catch-all exception handler to detect "no block numbers".
        details_match = self.ADDRESS_DETAILS_PATTERN.search(remaining_address)
        if details_match is None:
            town = ""
            block = ""
            building = ""
        else:
            start, end = details_match.span()
            town = remaining_address[:start]
            building = remaining_address[end:]
            block = self.modify_block(self.convert_kanji_to_int(details_match[0]))

        return {
            "prefecture": prefecture,
            "city": city,
            "town": town,
            "block": block,
            "building": building,
        }

# Smoke test: parse one sample address and print each extracted component.
parser = JapaneseAddressParser()
# Alternative sample exercising kanji numerals and a trailing building name:
# address = "東京都港区間之町通丸太町下る大津町六丁目十九番地の５号ABCビル201号室"
address = "東京都港区桜丁町19-6"
parsed_address = parser.parse_address(address)

# One print call, one report line per argument.
print(
    f"元の住所: {address}",
    f"都道府県: {parsed_address['prefecture']}",
    f"市区町村: {parsed_address['city']}",
    f"町域: {parsed_address['town']}",
    f"丁目・番地・号: {parsed_address['block']}",
    f"ビル名等: {parsed_address['building']}",
    sep="\n",
)

# NOTE: block numbers (chome / banchi / go) written in kanji numerals and joined by hyphens are not handled.
# NOTE: town areas whose name contains "<number> + 丁" are not handled.

元の住所: 東京都港区桜丁町19-6
都道府県: 東京都
市区町村: 港区
町域: 桜丁町
丁目・番地・号: 19-6
ビル名等: 
