# In [None]:  (notebook cell marker left over from the export)
import json
import re
import pandas as pd
from datetime import datetime, timezone

# =========================
# CONFIG
# =========================
# Path of the text file holding the raw Expekt JSON document (read by main()).
IN_PATH = "expekt.txt"
# Path of the CSV file that main() writes the parsed rows to.
OUT_CSV = "expekt_player_props.csv"

# Column names, in output order, of the DataFrame built by the parser.
COLUMNS = [
    "event",
    "player",
    "selectionLabel",
    "odds_decimal",
    "status_selection",
    "marketLabel",
    "deadline",
]

# =========================
# HELPERS
# =========================
def load_json(path: str) -> dict:
    """
    Read and decode a JSON document from a text file.

    The file is decoded as "utf-8-sig" so that a leading UTF-8 byte order
    mark (common when the dump was saved from a Windows editor) is dropped.
    Files without a BOM are read exactly as plain UTF-8. A BOM is not
    whitespace, so it would survive ``str.strip()`` and make ``json.loads``
    reject the text.

    Args:
        path: Path of the file to read.

    Returns:
        The decoded JSON value. The parser in this file expects a JSON
        object (dict); this function does not enforce that.

    Raises:
        ValueError: If the file is empty or contains only whitespace.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, "r", encoding="utf-8-sig") as f:
        txt = f.read().strip()
    # The file handle is no longer needed once the text is in memory.
    if not txt:
        raise ValueError(f"File is empty: {path}")
    return json.loads(txt)

def to_float(x):
    """
    Convert ``x`` to a numeric value with pandas.

    Values that cannot be parsed are coerced to NaN instead of raising.
    """
    numeric = pd.to_numeric(x, errors="coerce")
    return numeric

def normalize_selection_label_from_threshold(n: int) -> str:
    """
    Turn an Expekt "n+" threshold into the equivalent over-line label.

    Mapping: 1+ -> "Over 0.5", 2+ -> "Over 1.5", 3+ -> "Over 2.5".
    """
    line = n - 0.5
    return f"Over {line}"

def normalize_player_name(name: str) -> str:
    """
    Rewrite 'Surname, Given name' as 'Given name Surname'.

    Expekt often uses the 'Surname, Given name' form; flipping it makes the
    names line up better with Betsson/Betano. Only the first comma is used
    as the separator. A name without a comma, or with an empty part on
    either side of it, is returned stripped but otherwise unchanged.
    Non-string input is returned as is.
    """
    if not isinstance(name, str):
        return name

    cleaned = name.strip()
    surname, comma, given = cleaned.partition(",")
    surname, given = surname.strip(), given.strip()
    if comma and given and surname:
        return f"{given} {surname}"
    return cleaned

def build_event_name(doc: dict) -> str:
    """
    Build a 'Home vs Away' event name from the document's participants.

    Resolution order:
      1. Participants whose ``position`` is "HOME" / "AWAY".
      2. The first two participants that have a usable name.
      3. ``doc["name"]`` (stripped) if it is a non-blank string.
      4. The literal "unknown_event".

    Participants that are not dicts (for example JSON ``null`` entries) or
    that lack a non-blank string ``name`` are ignored instead of raising.

    Args:
        doc: Decoded event document.

    Returns:
        The event name as described above.
    """
    raw_name = doc.get("name")
    if isinstance(raw_name, str) and raw_name.strip():
        # Some feeds already carry a name, but HOME/AWAY is preferred when available.
        fallback = raw_name.strip()
    else:
        fallback = "unknown_event"

    parts = doc.get("participants") or []
    if not isinstance(parts, list) or not parts:
        return fallback

    home = None
    away = None
    names = []  # every usable participant name, in feed order
    for p in parts:
        if not isinstance(p, dict):
            continue
        nm = p.get("name")
        if not isinstance(nm, str) or not nm.strip():
            continue
        nm = nm.strip()
        names.append(nm)

        pos = p.get("position")
        if pos == "HOME":
            home = nm
        elif pos == "AWAY":
            away = nm

    if home and away:
        return f"{home} vs {away}"

    # Positions missing: fall back to the first two named participants.
    if len(names) >= 2:
        return f"{names[0]} vs {names[1]}"

    return fallback

def get_deadline(doc: dict):
    """
    Return the event's ``startTime`` as stripped text, or None.

    Expekt typically provides ``startTime`` as an ISO string such as
    '2026-01-08T20:00:00Z'. It is kept as text in the deadline column (like
    the other CSVs). A missing, non-string or blank value yields None.
    """
    start = doc.get("startTime")
    if not isinstance(start, str):
        return None
    trimmed = start.strip()
    return trimmed or None

def is_full_match_market(market_name: str) -> bool:
    """
    Tell whether a market name refers to the whole match.

    Returns False when the lower-cased name contains any fragment that
    clearly points at a half, period, quarter or overtime. Non-string input
    is treated as a full-match market (True).
    """
    if not isinstance(market_name, str):
        return True

    excluded_fragments = (
        "1. halvleg", "2. halvleg", "halvleg",
        "1st half", "2nd half", "first half", "second half",
        "periode", "period", "quarter", "q1", "q2", "q3", "q4",
        "overtime", "over time", "ekstra tid", "forlænget",
        "inkl. overtid", "incl. overtime",
    )

    lowered = market_name.lower()
    for fragment in excluded_fragments:
        if fragment in lowered:
            return False
    return True

def classify_market_and_threshold(market_name: str):
    """
    Derive the normalized market type and the "n+" threshold from a market name.

    Finds:
      - market_type, matching the Betsson-style labels
      - threshold n (1, 2, 3, ...) taken from '1+' etc.

    Examples:
      'Spiller har 1+ skud' -> ('Spillers samlede antal skud', 1)
      'Spiller har 2+ skud på mål' -> ('Antal afslutninger på mål', 2)
      'Spiller laver 1+ assists' -> ('Spillers samlede antal assister', 1)

    Args:
        market_name: Raw market name from the feed.

    Returns:
        ``(market_type, n)``, or ``(None, None)`` when the input is not a
        string, has no "n+" threshold, or matches no known market type.
    """
    if not isinstance(market_name, str):
        return (None, None)

    s = market_name.strip()

    # Threshold: find "X+". There is deliberately no trailing \b: "+" is a
    # non-word character, so \b after it would only match when a word
    # character follows directly ("1+skud") and would miss the normal
    # "1+ skud" form where a space follows.
    m = re.search(r"\b(\d+)\+", s)
    if not m:
        return (None, None)
    n = int(m.group(1))

    # Order matters: "skud på mål" must be tested before the generic "skud".
    market_patterns = (
        (r"skud\s+p[åa]\s+m[åa]l", "Antal afslutninger på mål"),      # shots on target
        (r"\bskud\b", "Spillers samlede antal skud"),                 # total shots
        (r"\bassist", "Spillers samlede antal assister"),             # assists
        # fouls / free kicks (in case Expekt offers them at some point)
        (r"\bfrispark\b", "Spiller Frispark Begået"),
        (r"\bfoul", "Spiller Frispark Begået"),
    )
    for pattern, market_type in market_patterns:
        if re.search(pattern, s, flags=re.IGNORECASE):
            return (market_type, n)

    return (None, None)

# =========================
# PARSER
# =========================
def parse_expekt_player_props(doc: dict) -> pd.DataFrame:
    """
    Flatten the player-prop markets of an Expekt event document into rows.

    For each entry in ``doc["markets"]`` that has a name, passes the
    full-match filter and can be classified into a market type with a
    non-zero "n+" threshold, one row is produced per outcome that has a
    non-blank ``name`` (used as the player) and a numeric ``formatDecimal``.

    Entries in ``markets`` or ``outcomes`` that are not dicts (for example
    JSON ``null``) are skipped instead of raising.

    Args:
        doc: Decoded event document.

    Returns:
        A DataFrame with the columns listed in ``COLUMNS``. It is empty
        (but still has those columns) when nothing qualifies. Rows are
        unique per (event, marketLabel, selectionLabel); when duplicates
        occur the row with the lowest ``odds_decimal`` is kept. The result
        is sorted by marketLabel, player, selectionLabel.
    """
    event_name = build_event_name(doc)
    deadline = get_deadline(doc)

    rows_out = []

    markets = doc.get("markets") or []
    if not isinstance(markets, list):
        markets = []

    for market in markets:
        # Guard against null / non-object entries before calling .get on them.
        if not isinstance(market, dict):
            continue

        market_name = market.get("name")
        if not isinstance(market_name, str) or not market_name.strip():
            continue

        # Full-match markets only.
        if not is_full_match_market(market_name):
            continue

        market_type, n = classify_market_and_threshold(market_name)
        if not market_type or not n:
            continue

        selection_label = normalize_selection_label_from_threshold(n)

        outcomes = market.get("outcomes") or []
        if not isinstance(outcomes, list) or not outcomes:
            continue

        for o in outcomes:
            if not isinstance(o, dict):
                continue

            raw_player = o.get("name")
            if not isinstance(raw_player, str) or not raw_player.strip():
                continue
            player = normalize_player_name(raw_player)

            # Expekt uses formatDecimal for decimal odds.
            odds_dec = to_float(o.get("formatDecimal"))
            if pd.isna(odds_dec):
                continue

            # Outcome status wins, then market status, then the default "Open".
            status_sel = o.get("status") or market.get("status") or "Open"

            market_label = f"{market_type} | {player}"

            rows_out.append({
                "event": event_name,
                "player": player,
                "selectionLabel": selection_label,
                "odds_decimal": odds_dec,
                "status_selection": status_sel,
                "marketLabel": market_label,
                "deadline": deadline,
            })

    df = pd.DataFrame(rows_out, columns=COLUMNS)
    if df.empty:
        return df

    # Dedupe: there should be only one row per (event, marketLabel, selectionLabel).
    # Sorting by odds ascending first means keep="first" retains the lowest odds.
    df = df.sort_values(["event", "marketLabel", "selectionLabel", "odds_decimal"], kind="stable")
    df = df.drop_duplicates(subset=["event", "marketLabel", "selectionLabel"], keep="first").reset_index(drop=True)

    # Tidy ordering, same as the other exports.
    df = df.sort_values(["marketLabel", "player", "selectionLabel"], kind="stable").reset_index(drop=True)
    return df

def main():
    """
    Run the export: read IN_PATH, parse the player props, write OUT_CSV.

    Prints the number of rows saved followed by the first 25 rows.
    """
    props = parse_expekt_player_props(load_json(IN_PATH))

    props.to_csv(OUT_CSV, index=False, encoding="utf-8")

    row_count = len(props)
    print(f"Saved {row_count} rows to {OUT_CSV}")
    print(props.head(25))

# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
