In [45]:
import re

GERMAN_STATES = {
    "baden-württemberg", "bayern", "berlin", "brandenburg", "bremen",
    "hamburg", "hessen", "mecklenburg-vorpommern", "niedersachsen",
    "nordrhein-westfalen", "rheinland-pfalz", "saarland", "sachsen",
    "sachsen-anhalt", "schleswig-holstein", "thüringen"
}

# States whose name is also the name of the city.
CITY_STATES = {"berlin", "hamburg", "bremen"}

# Output keys, in the order they appear in the returned dict.
_LOCATION_FIELDS = ("index", "city", "state", "country", "address")

# Patterns are compiled once at import time instead of on every call.
_INDEX_RE = re.compile(r"\d{5}")
_DIGIT_RE = re.compile(r"[0-9]")
_LETTER_RE = re.compile(r"[A-Za-zÄÖÜäöüß]")
_STREET_SUFFIX_RE = re.compile(r"(straße|str\.|weg|allee|platz|ring|gasse)$")


def _is_index(s: str) -> bool:
    """Return True if *s* is exactly five digits (a postal code)."""
    return bool(_INDEX_RE.fullmatch(s))


def _is_country(s: str) -> bool:
    """Return True if *s* is "deutschland", ignoring case."""
    return s.lower() == "deutschland"


def _is_state(s: str) -> bool:
    """Return True if *s* is one of GERMAN_STATES, ignoring case."""
    return s.lower() in GERMAN_STATES


def _is_address(s: str) -> bool:
    """Return True if *s* looks like a street address.

    A part counts as an address when it mixes digits and letters, or when
    its lower-cased form ends with a typical street word.
    """
    if _DIGIT_RE.search(s) and _LETTER_RE.search(s):
        return True
    return bool(_STREET_SUFFIX_RE.search(s.lower()))


def parse_location_field(location: str) -> dict:
    """Split a comma-separated location string into named fields.

    The comma-separated parts are classified by content rather than by
    position: a five-digit part is the postal index, a known state name is
    the state, "Deutschland" is the country, an address-like part is the
    address, and the first remaining part is the city.

    Args:
        location: Raw location text such as
            "04109, Leipzig, Sachsen, Deutschland, null". Parts equal to
            "null" (any case) are ignored.

    Returns:
        A dict with the keys "index", "city", "state", "country" and
        "address". Every field that could not be determined is the string
        "unknown"; a non-string, blank or all-"null" input yields
        "unknown" for every field.

    Special rules:
        * If the state is a city-state (see CITY_STATES), the city is set
          to the state name.
        * If the last part is "null", the address is reported as
          "unknown" and no part is consumed as the address.

    NOTE: without a trailing "null", a digit-free city name that ends in a
    street word (for example "Germering") is still classified as the
    address, because only the text of each part is inspected.
    """
    if not isinstance(location, str) or not location.strip():
        return dict.fromkeys(_LOCATION_FIELDS, "unknown")

    raw_parts = [p.strip() for p in location.split(",") if p.strip()]
    had_null_at_end = bool(raw_parts) and raw_parts[-1].lower() == "null"

    # Drop the "null" placeholders.
    parts = [p for p in raw_parts if p.lower() != "null"]
    if not parts:
        return dict.fromkeys(_LOCATION_FIELDS, "unknown")

    result = dict.fromkeys(_LOCATION_FIELDS)

    # --- Stage 1: index, state, country, address (first match wins) ---
    for p in parts:
        if result["index"] is None and _is_index(p):
            result["index"] = p
        elif result["state"] is None and _is_state(p):
            result["state"] = p
        elif result["country"] is None and _is_country(p):
            result["country"] = p
        elif (
            # A trailing "null" means the address is absent, so an
            # address-like part must stay available as a city candidate
            # instead of being stored and then thrown away.
            not had_null_at_end
            and result["address"] is None
            and _is_address(p)
        ):
            result["address"] = p

    # --- Stage 2: city = first part that is not one of the other fields ---
    # Classifiers are re-applied (rather than comparing against the stored
    # strings) so that a repeated state/country in different casing, or a
    # second postal code, is not mistaken for the city.
    for p in parts:
        if _is_index(p) or _is_state(p) or _is_country(p):
            continue
        if p == result["address"]:
            continue
        result["city"] = p
        break

    # --- Rule 1: for city-states the city is the state itself ---
    if result["state"] and result["state"].lower() in CITY_STATES:
        result["city"] = result["state"]

    # --- Rule 2: trailing "null" -> address unknown ---
    # Already guaranteed: stage 1 never fills the address in that case,
    # so it is still None here and becomes "unknown" below.

    # --- None -> "unknown" ---
    return {k: ("unknown" if v is None else v) for k, v in result.items()}


In [47]:
# ------------------------------------------------------------
# TEST CASES FOR CHECKING parse_location_field
# ------------------------------------------------------------

# Sample inputs, each a comma-separated location string; "null" marks a
# missing part.
loc_list = [
    "04109, Leipzig, Sachsen, Deutschland, null",
    "86159, Augsburg, Bayern, Bayern, Deutschland, null",
    "09117, Chemnitz, Sachsen, Sachsen, Deutschland, null",
    "33415, Verl, Nordrhein-Westfalen, Deutschland, Schinkenstraße 16",
    "47805, Krefeld, Nordrhein-Westfalen, Deutschland, Untergath",
    "50674, Köln, Nordrhein-Westfalen, Deutschland, null",
    "Deutschland, null",
    # additional cases
    "80331, München, Bayern, Deutschland",
    "Berlin, Deutschland",
    "20095, Hamburg, null",
    "01067, Dresden, Sachsen",
    "null",
    "99634, Straußfurt, Thüringen, Deutschland, Industriegebiet",
    "",
]

def pretty_print_location(parsed: dict):
    """Print a parsed location as a brace-wrapped listing, one field per line.

    Fields are shown in the fixed order index, city, state, country,
    address. A key missing from ``parsed`` is rendered as 'unknown'.
    """
    field_order = ("index", "city", "state", "country", "address")
    rows = [
        f" '{name}': '{parsed.get(name, 'unknown')}',"
        for name in field_order
    ]
    print("\n".join(["{", *rows, "}"]))

# Run the parser over every sample and dump input and result side by side
# for visual inspection; a dashed rule separates the cases.
for loc in loc_list:
    parsed = parse_location_field(loc)
    print(f"INPUT : {loc}\nOUTPUT:")
    pretty_print_location(parsed)
    print("-" * 60)


INPUT : 04109, Leipzig, Sachsen, Deutschland, null
OUTPUT:
{
 'index': '04109',
 'city': 'Leipzig',
 'state': 'Sachsen',
 'country': 'Deutschland',
 'address': 'unknown',
}
------------------------------------------------------------
INPUT : 86159, Augsburg, Bayern, Bayern, Deutschland, null
OUTPUT:
{
 'index': '86159',
 'city': 'Augsburg',
 'state': 'Bayern',
 'country': 'Deutschland',
 'address': 'unknown',
}
------------------------------------------------------------
INPUT : 09117, Chemnitz, Sachsen, Sachsen, Deutschland, null
OUTPUT:
{
 'index': '09117',
 'city': 'Chemnitz',
 'state': 'Sachsen',
 'country': 'Deutschland',
 'address': 'unknown',
}
------------------------------------------------------------
INPUT : 33415, Verl, Nordrhein-Westfalen, Deutschland, Schinkenstraße 16
OUTPUT:
{
 'index': '33415',
 'city': 'Verl',
 'state': 'Nordrhein-Westfalen',
 'country': 'Deutschland',
 'address': 'Schinkenstraße 16',
}
------------------------------------------------------------
INPU