In [5]:
# -*- coding: utf-8 -*-
from pathlib import Path
import unicodedata, re, difflib, json

# -------- helpers -----------------------------------------------------------
def normalise(name: str) -> str:
    """Return a lower-case, accent-free, punctuation-free form of a club name.

    The name is lower-cased and NFKD-decomposed so that accents become
    separate combining marks, which are then discarded.  Every character that
    is neither a word character nor whitespace is replaced by a space, and
    runs of whitespace are collapsed to a single space with no leading or
    trailing blanks.
    """
    decomposed = unicodedata.normalize("NFKD", name.lower())
    # Combining marks have a non-zero canonical combining class; drop them.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    # Punctuation becomes a space so "arles-avignon" stays two words.
    spaced = re.sub(r"[^\w\s]", " ", "".join(base_chars))
    # split() with no argument discards leading, trailing and repeated whitespace.
    return " ".join(spaced.split())

# Hand-picked fixes where fuzzy-matching alone is ambiguous.
#
# Used by the mapping loop below alongside exact and fuzzy matching.
#   key   -> name as it appears in the odds list, lower-cased (the loop
#            compares keys against the lower-cased odds name)
#   value -> name as it appears in the clean list; it is passed through
#            normalise() before the lookup, so accents and hyphens in the
#            value are fine
# An alias whose normalised value is not present in the clean list has no
# effect.
# NOTE(review): the "nan" key looks like a stringified missing value rather
# than a club abbreviation — confirm that mapping it to "nantes" is intended.
ALIASES = {
    "hamburg":               "hamburger sv",
    "nan":                   "nantes",
    "celta":                 "celta vigo",
    "man city":              "manchester city",
    "ath bilbao":            "athletic club",
    "wigan":                 "wigan athletic",
    "leeds":                 "leeds united",
    "murcia":                "real murcia",
    "derby":                 "derby county",
    "mainz":                 "mainz 05",
    "fc koln":               "köln",
    "bielefeld":             "arminia",
    "norwich":               "norwich city",
    "leicester":             "leicester city",
    "clermont":              "clermont foot",
    "ath madrid":            "atlético madrid",
    "aachen":                "aa aachen",
    "cottbus":               "energie cottbus",
    "braunschweig":          "btsv",
    "ipswich":               "ipswich town",
    "hull":                  "hull city",
    "sp gijon":              "sporting gijón",
    "sociedad":              "real sociedad",
    "santander":             "racing sant",
    "luton":                 "luton town",
    "arles":                 "arles-avignon",
    "ajaccio gfco":          "gazélec ajaccio",
    "fortuna dusseldorf":    "düsseldorf",
    "vallecano":             "rayo vallecano",
    "stoke":                 "stoke city",
    "verona":                "hellas verona",
    "man united":            "manchester utd",
    "evian thonon gaillard": "evian",
    "cardiff":               "cardiff city",
    "swansea":               "swansea city",
    "hertha":                "hertha bsc",
}

# -------- load the two source lists -----------------------------------------
def _read_names(path):
    """Return the stripped, non-blank lines of the UTF-8 text file at *path*."""
    stripped = (raw.strip() for raw in path.read_text(encoding="utf-8").splitlines())
    return [entry for entry in stripped if entry]


def _by_normalised(names):
    """Index *names* by normalise(name); a later duplicate key replaces an earlier one."""
    index = {}
    for team in names:
        index[normalise(team)] = team
    return index


odds_path  = Path("/Users/luisenriquekaiser/Documents/soccer_betting_forecast/data/processed/list_of_teams_odds.txt")
clean_path = Path("/Users/luisenriquekaiser/Documents/soccer_betting_forecast/data/processed/list_of_teams.txt")

# One team name per line in each file.
odds  = _read_names(odds_path)
clean = _read_names(clean_path)

# Normalised name -> original spelling, for quick lookup
odds_norm  = _by_normalised(odds)
clean_norm = _by_normalised(clean)

# -------- build the mapping -------------------------------------------------
# TEAM_NAME_MAP maps each odds-list name (original spelling) to the matching
# clean-list name (original spelling).  Every entry of `odds` is processed
# individually, so two odds names that normalise to the same key both get a
# mapping instead of one silently replacing the other.
#
# Resolution order for each name:
#   1) exact match of the normalised forms
#   2) hand-picked alias from ALIASES (curated, so it outranks the heuristic)
#   3) closest fuzzy match with a similarity ratio of at least 0.80
# Names that cannot be resolved are collected in UNMATCHED_TEAMS rather than
# being dropped without a trace.
TEAM_NAME_MAP = {}
UNMATCHED_TEAMS = []

# Materialise the candidates once instead of on every iteration.
clean_keys = list(clean_norm)

for original in odds:
    norm_key = normalise(original)

    # 1) perfect normalised match
    if norm_key in clean_norm:
        TEAM_NAME_MAP[original] = clean_norm[norm_key]
        continue

    # 2) manual alias: try the lower-cased spelling first, then the
    #    normalised one so accent/punctuation variants also hit the table
    alias = ALIASES.get(original.lower(), ALIASES.get(norm_key))
    if alias is not None:
        target = clean_norm.get(normalise(alias))
        if target is not None:
            TEAM_NAME_MAP[original] = target
            continue
        # Alias target missing from the clean list: fall through to fuzzy.

    # 3) close fuzzy match
    best = difflib.get_close_matches(norm_key, clean_keys, n=1, cutoff=0.80)
    if best:
        TEAM_NAME_MAP[original] = clean_norm[best[0]]
        continue

    UNMATCHED_TEAMS.append(original)

# -------- output ------------------------------------------------------------
# Serialise once and reuse the text for both the console and the file.
# ensure_ascii=False keeps accented club names human-readable.
mapping_json = json.dumps(TEAM_NAME_MAP, ensure_ascii=False, indent=2)

print(mapping_json)

# Save the mapping to a file under
# /Users/luisenriquekaiser/Documents/soccer_betting_forecast/data/processed/
output_path = Path("/Users/luisenriquekaiser/Documents/soccer_betting_forecast/data/processed/team_name_map.json")
# The JSON contains non-ASCII characters, so pin the encoding explicitly;
# otherwise the platform default is used, which may not be UTF-8.
output_path.write_text(mapping_json, encoding="utf-8")


{
  "almeria": "almería",
  "everton": "everton",
  "hercules": "hércules",
  "bastia": "bastia",
  "bolton": "bolton",
  "hamburg": "hamburger sv",
  "nan": "nantes",
  "boulogne": "boulogne",
  "nott'm forest": "nott'ham forest",
  "espanol": "espanyol",
  "west brom": "west brom",
  "huesca": "huesca",
  "napoli": "napoli",
  "watford": "watford",
  "greuther furth": "greuther fürth",
  "grenoble": "grenoble",
  "celta": "celta vigo",
  "lorient": "lorient",
  "man city": "manchester city",
  "livorno": "livorno",
  "getafe": "getafe",
  "ath bilbao": "athletic club",
  "siena": "siena",
  "m'gladbach": "gladbach",
  "wigan": "wigan athletic",
  "atalanta": "atalanta",
  "duisburg": "msv duisburg",
  "leeds": "leeds united",
  "huddersfield": "huddersfield",
  "murcia": "real murcia",
  "eibar": "eibar",
  "derby": "derby county",
  "mainz": "mainz 05",
  "fc koln": "köln",
  "schalke 04": "schalke 04",
  "real madrid": "real madrid",
  "toulouse": "toulouse",
  "empoli": "empoli",


AttributeError: 'PosixPath' object has no attribute 'close'