In [33]:
import json
import re
import unicodedata
import pandas as pd
from datetime import datetime, timezone

# =========================
# CONFIG
# =========================
EVENT_PATH = "event_response.json"

BETSSON_MARKETS_IN_ORDER = [
    {"name": "fouls",         "accordion_path": "accordion_response_fouls.json"},
    {"name": "shots",         "accordion_path": "accordion_response_shots.json"},
    {"name": "shotsOnTarget", "accordion_path": "accordion_response_shotsOnTarget.json"},
    {"name": "assists",       "accordion_path": "accordion_response_assists.json"},
]

BETANO_IN_PATH = "betano.txt"
EXPEKT_IN_PATH = "expekt.txt"

OUT_CSV = "betsson_betano_expekt_combined.csv"

OUT_COLUMNS = [
    "event",
    "player",
    "selectionLabel",
    "marketLabel",
    "deadline",
    "status_selection_betsson",
    "odds_decimal_betsson",
    "status_selection_betano",
    "odds_decimal_betano",
    "status_selection_expekt",
    "odds_decimal_expekt",
]

MERGE_WITH_DEADLINE = False  # False = merge uden deadline

# =========================
# GENERIC HELPERS
# =========================
def load_json(path: str):
    with open(path, "r", encoding="utf-8") as f:
        txt = f.read().strip()
        if not txt:
            raise ValueError(f"Fil er tom: {path}")
        return json.loads(txt)

def to_float(x):
    return pd.to_numeric(x, errors="coerce")

def ms_to_iso_utc(ms):
    if ms is None:
        return None
    try:
        ms = int(ms)
        return datetime.fromtimestamp(ms / 1000, tz=timezone.utc).isoformat()
    except Exception:
        return None

def normalize_event_name(name: str) -> str:
    """
    Betano "Arsenal FC - Liverpool FC" -> "Arsenal vs Liverpool"
    """
    if not isinstance(name, str):
        return None
    s = name.strip()
    s = re.sub(r"\bFC\b", "", s, flags=re.IGNORECASE)
    s = re.sub(r"\s+", " ", s).strip()
    s = s.replace(" - ", " vs ")
    s = re.sub(r"\s+", " ", s).strip()
    return s

def norm_text(x):
    """
    Gør strings merge-sikre:
    - unicode normalisering
    - trim
    - kollaps flere mellemrum
    - fjern usynlige chars
    """
    if x is None:
        return ""
    s = str(x)
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("\u00a0", " ")  # non-breaking space -> space
    s = re.sub(r"\s+", " ", s).strip()
    return s

def norm_df_keys(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = df[c].apply(norm_text)
    return df

# =========================
# BETSSON PARSER
# =========================
def parse_line_from_marketid(market_id: str):
    if not isinstance(market_id, str):
        return None
    m = re.search(r"-([0-9]+(?:\.[0-9]+)?)-\d+$", market_id)
    return float(m.group(1)) if m else None

def selection_decimal_betsson(sel: dict):
    mp = sel.get("marketSelectionPriceFormats") or sel.get("oddsFormats") or {}
    if isinstance(mp, dict):
        if "1" in mp:
            return mp["1"]
        if "decimal" in mp:
            return mp["decimal"]
    return None

def extract_player_from_market_betsson(market: dict):
    group_labels = (market.get("marketSpecifics", {}) or {}).get("groupLabels", {}) or {}
    player = group_labels.get("2")
    if not player:
        lab = market.get("label") or market.get("marketFriendlyName") or ""
        if "|" in lab:
            _, right = lab.split("|", 1)
            player = right.strip()
    return player

def build_event_map(event_json: dict) -> dict:
    m = {}

    def name_from_event(evt: dict):
        participants = evt.get("participants") or []
        home = None
        away = None
        for p in participants:
            if p.get("side") == 1:
                home = p.get("label")
            elif p.get("side") == 2:
                away = p.get("label")
        if not home and len(participants) > 0:
            home = participants[0].get("label")
        if not away and len(participants) > 1:
            away = participants[1].get("label")
        if home and away:
            return f"{home} vs {away}"
        return None

    def id_from_event(evt: dict):
        return evt.get("eventId") or evt.get("id")

    data = event_json.get("data") if isinstance(event_json, dict) else None

    if isinstance(data, dict) and isinstance(data.get("event"), dict):
        evt = data["event"]
        eid = id_from_event(evt)
        ename = name_from_event(evt)
        if eid and ename:
            m[eid] = ename

    if isinstance(data, dict) and isinstance(data.get("events"), list):
        for evt in data["events"]:
            if not isinstance(evt, dict):
                continue
            eid = id_from_event(evt)
            ename = name_from_event(evt)
            if eid and ename:
                m[eid] = ename

    if isinstance(event_json, list):
        for evt in event_json:
            if not isinstance(evt, dict):
                continue
            eid = id_from_event(evt)
            ename = name_from_event(evt)
            if eid and ename:
                m[eid] = ename

    return m

def get_event_id_from_accordion(accordion: dict):
    accs = (accordion.get("data", {}) or {}).get("accordions", {}) or {}
    for _, group in accs.items():
        markets = group.get("markets") or []
        if markets:
            return markets[0].get("eventId")
    return None

def parse_betsson_one_accordion(accordion: dict, source_order: int, event_map: dict) -> pd.DataFrame:
    accordions = (accordion.get("data", {}) or {}).get("accordions", {}) or {}

    event_id = get_event_id_from_accordion(accordion)
    event_text = event_map.get(event_id) if event_id else None
    if not event_text:
        event_text = event_id

    market_rows = []
    selection_rows = []

    for _, group in accordions.items():
        for m in group.get("markets", []):
            market_id = m.get("id")
            player = extract_player_from_market_betsson(m)

            market_rows.append({
                "source_order": source_order,
                "event": event_text,
                "event_norm": normalize_event_name(event_text),
                "marketId": market_id,
                "marketLabel": m.get("label"),
                "deadline": m.get("deadline"),
                "player": player,
                "line": parse_line_from_marketid(market_id),
            })

        for s in group.get("selections", []):
            selection_rows.append({
                "marketId": s.get("marketId"),
                "selectionLabel": s.get("label"),
                "odds_decimal_betsson": selection_decimal_betsson(s),
                "status_selection_betsson": s.get("status"),
            })

    df_markets = pd.DataFrame(market_rows)
    df_selections = pd.DataFrame(selection_rows)

    if df_markets.empty or df_selections.empty:
        return pd.DataFrame()

    df_all = df_selections.merge(df_markets, on="marketId", how="left")
    df_all["odds_decimal_betsson"] = to_float(df_all["odds_decimal_betsson"])
    return df_all

def parse_betsson_all() -> pd.DataFrame:
    event_json = load_json(EVENT_PATH)
    event_map = build_event_map(event_json)

    parts = []
    for i, item in enumerate(BETSSON_MARKETS_IN_ORDER, start=1):
        accordion = load_json(item["accordion_path"])
        df_part = parse_betsson_one_accordion(accordion, source_order=i, event_map=event_map)
        if not df_part.empty:
            parts.append(df_part)

    if not parts:
        return pd.DataFrame()

    df = pd.concat(parts, ignore_index=True)

    keep = [
        "source_order", "event", "event_norm", "player",
        "selectionLabel", "marketLabel", "deadline",
        "status_selection_betsson", "odds_decimal_betsson"
    ]
    df = df[keep].copy()

    df = norm_df_keys(df, ["event_norm", "player", "selectionLabel", "marketLabel", "deadline"])
    return df

# =========================
# SHARED MARKET NORMALIZATION (Betano + Expekt)
# =========================
def normalize_market_header_to_betsson(header: str):
    if not isinstance(header, str) or not header.strip():
        return header
    h = header.strip()

    # shots on target
    if re.search(r"\bskud\s+p[åa]\s+m[åa]l\b", h, flags=re.IGNORECASE):
        return "Antal afslutninger på mål"

    # total shots
    if re.search(r"\bskud\b", h, flags=re.IGNORECASE):
        return "Spillers samlede antal skud"

    # assists
    if re.search(r"\bassist", h, flags=re.IGNORECASE):
        return "Spillers samlede antal assister"

    # fouls
    if re.search(r"\bfrispark\b", h, flags=re.IGNORECASE) or re.search(r"\bfoul", h, flags=re.IGNORECASE) or re.search(r"\bforseels", h, flags=re.IGNORECASE):
        return "Spiller Frispark Begået"

    return h

def is_full_match_text_ok(*texts):
    txt = " | ".join([t for t in texts if isinstance(t, str)]).lower()
    bad_tokens = [
        "1. halvleg", "2. halvleg", "halvleg",
        "1st half", "2nd half", "first half", "second half",
        "periode", "period", "quarter", "q1", "q2", "q3", "q4",
        "overtime", "over time", "ekstra tid", "forlænget",
        "inkl. overtid", "incl. overtime",
    ]
    return not any(t in txt for t in bad_tokens)

def normalize_selection_label_plus(label: str):
    """
    1+ -> Over 0.5
    2+ -> Over 1.5
    3+ -> Over 2.5
    """
    if not isinstance(label, str):
        return label
    lab = label.strip()
    m = re.fullmatch(r"(\d+)\+", lab)
    if m:
        x = int(m.group(1))
        return f"Over {x - 0.5}"
    return lab

# =========================
# BETANO PARSER
# =========================
def build_event_name_betano(evt: dict) -> str:
    name = evt.get("name") or evt.get("shortName")
    if isinstance(name, str) and name.strip():
        return name.strip()

    parts = evt.get("participants") or []
    if isinstance(parts, list) and len(parts) >= 2:
        labels = []
        for p in sorted(parts, key=lambda x: x.get("sortOrder", 9999)):
            lab = p.get("label")
            if isinstance(lab, str) and lab.strip():
                labels.append(lab.strip())
        if len(labels) >= 2:
            return f"{labels[0]} - {labels[1]}"

    slug = evt.get("slug")
    if isinstance(slug, str) and slug.strip():
        return slug.strip()

    return str(evt.get("id") or evt.get("eventId") or "unknown_event")

def pick_market_header_betano(market: dict) -> str:
    tl = market.get("tableLayout") or {}
    header = tl.get("headerTitle")
    if isinstance(header, str) and header.strip():
        return header.strip()
    nm = market.get("name")
    if isinstance(nm, str) and nm.strip():
        return nm.strip()
    return None

def selection_price_decimal_betano(sel: dict):
    p = sel.get("price")
    if isinstance(p, (int, float)):
        return p
    if isinstance(p, str):
        return p.strip()
    if isinstance(p, dict):
        for k in ["decimal", "odds", "value"]:
            if k in p:
                return p.get(k)
    odds_formats = sel.get("oddsFormats") or {}
    if isinstance(odds_formats, dict) and "decimal" in odds_formats:
        return odds_formats.get("decimal")
    return None

def strip_leading_player_from_header(header: str, players_in_market):
    if not isinstance(header, str) or not header.strip():
        return header
    header = header.strip()

    players_sorted = sorted(
        [p for p in players_in_market if isinstance(p, str) and p.strip()],
        key=lambda x: len(x),
        reverse=True
    )
    for p in players_sorted:
        p = p.strip()
        if header.lower().startswith(p.lower() + " "):
            return header[len(p):].strip()
    return header

def parse_betano_all() -> pd.DataFrame:
    doc = load_json(BETANO_IN_PATH)
    evt = ((doc.get("data") or {}).get("event")) or doc.get("event") or {}
    event_name = build_event_name_betano(evt)
    event_norm = normalize_event_name(event_name)

    rows_out = []
    markets = evt.get("markets") or []
    if not isinstance(markets, list):
        markets = []

    for market in markets:
        tl = market.get("tableLayout") or {}
        rows = tl.get("rows") or []
        if not isinstance(rows, list) or len(rows) == 0:
            continue

        raw_header = pick_market_header_betano(market)

        # kun hele kampen
        if not is_full_match_text_ok(raw_header, market.get("name")):
            continue

        players_in_market = []
        for r in rows:
            nm = r.get("title") or r.get("name")
            if isinstance(nm, str) and nm.strip():
                players_in_market.append(nm.strip())

        header_no_player = strip_leading_player_from_header(raw_header, players_in_market)
        market_type = normalize_market_header_to_betsson(header_no_player)

        deadline = ms_to_iso_utc(
            market.get("marketCloseTimeMillis")
            or market.get("closeTimeMillis")
            or market.get("deadlineMillis")
        )
        status_market = market.get("status") or market.get("marketStatus") or "Open"

        for r in rows:
            player = r.get("title") or r.get("name")
            if not (isinstance(player, str) and player.strip()):
                continue
            player = player.strip()

            market_label = f"{market_type} | {player}" if market_type else player

            group_selections = r.get("groupSelections") or []
            if not isinstance(group_selections, list):
                continue

            for gs in group_selections:
                sels = gs.get("selections") or []
                if not isinstance(sels, list):
                    continue

                for sel in sels:
                    raw_label = sel.get("name") or sel.get("label")
                    if not (isinstance(raw_label, str) and raw_label.strip()):
                        continue

                    selection_label = normalize_selection_label_plus(raw_label)
                    odds_dec = selection_price_decimal_betano(sel)

                    rows_out.append({
                        "event_betano": event_name,
                        "event_norm": event_norm,
                        "player": player,
                        "selectionLabel": selection_label,
                        "marketLabel": market_label,
                        "deadline": deadline,
                        "status_selection_betano": status_market,
                        "odds_decimal_betano": odds_dec,
                    })

    df = pd.DataFrame(rows_out)
    if df.empty:
        return df

    df["odds_decimal_betano"] = to_float(df["odds_decimal_betano"])
    df = df.dropna(subset=["odds_decimal_betano"]).reset_index(drop=True)

    df = norm_df_keys(df, ["event_norm", "player", "selectionLabel", "marketLabel", "deadline"])

    # dedupe efter full match filter
    df = df.sort_values(["event_norm", "marketLabel", "selectionLabel", "odds_decimal_betano"], kind="stable")
    df = df.drop_duplicates(subset=["event_norm", "marketLabel", "selectionLabel"], keep="first").reset_index(drop=True)
    return df

# =========================
# EXPEKT PARSER
# =========================
def build_event_name_expekt(doc: dict) -> str:
    # typisk har Expekt participants og evt name
    name = doc.get("name") or doc.get("eventName")
    if isinstance(name, str) and name.strip():
        return name.strip()

    parts = doc.get("participants") or []
    if isinstance(parts, list) and len(parts) >= 2:
        # prøv home/away hvis findes
        home = None
        away = None
        for p in parts:
            side = p.get("side") or p.get("type")
            lab = p.get("name") or p.get("label")
            if side in ("HOME", "home", 1) and lab:
                home = lab
            if side in ("AWAY", "away", 2) and lab:
                away = lab
        if not home:
            home = parts[0].get("name") or parts[0].get("label")
        if not away and len(parts) > 1:
            away = parts[1].get("name") or parts[1].get("label")
        if home and away:
            return f"{home} vs {away}"

    return "unknown_event"

def iso_from_expekt_time(ts):
    # Expekt: "2026-01-08T20:00:00Z" -> "2026-01-08T20:00:00+00:00"
    if not isinstance(ts, str):
        return None
    s = ts.strip()
    if s.endswith("Z"):
        s = s[:-1] + "+00:00"
    return s

def player_name_from_outcome(name: str):
    # "Mac Allister, Alexis" -> "Alexis Mac Allister"
    if not isinstance(name, str):
        return None
    n = name.strip()
    if "," in n:
        last, first = [p.strip() for p in n.split(",", 1)]
        if first and last:
            return f"{first} {last}"
    return n

def selection_label_from_x_plus(x: int) -> str:
    return f"Over {x - 0.5}"

def status_to_open(s):
    s2 = str(s).strip().upper()
    if s2 in ["OPEN", "ACTIVE", "TRADING", "ONGOING"]:
        return "Open"
    return str(s).strip() if s is not None else "Open"

def parse_expekt_all() -> pd.DataFrame:
    doc = load_json(EXPEKT_IN_PATH)

    markets = doc.get("markets") or []
    if not isinstance(markets, list):
        markets = []

    # event name (samme som din ipynb)
    event_raw = doc.get("name")
    if not isinstance(event_raw, str) or not event_raw.strip():
        parts = doc.get("participants") or []
        if isinstance(parts, list) and len(parts) >= 2:
            a = parts[0].get("name")
            b = parts[1].get("name")
            event_raw = f"{a} vs {b}"
        else:
            event_raw = "unknown_event"

    event_name = normalize_event_name(event_raw)
    event_norm = normalize_event_name(event_name)

    # deadline (samme som din ipynb)
    deadline = iso_from_expekt_time(doc.get("startTime"))

    # KUN de præcise markets du allerede ved virker (ingen gæt)
    patterns = [
        (r"^Spiller har (\d+)\+ skud på mål$", "Antal afslutninger på mål"),
        (r"^Spiller har (\d+)\+ skud$", "Spillers samlede antal skud"),
        (r"^Spiller laver (\d+)\+ assists$", "Spillers samlede antal assister"),
        (r"^Spiller begår (\d+)\+ forseelser$", "Spiller Frispark Begået"),
    ]

    rows_out = []

    for m in markets:
        market_name = m.get("name")
        if not isinstance(market_name, str) or not market_name.strip():
            continue
        market_name = market_name.strip()

        market_type = None
        x_val = None

        for pat, mtype in patterns:
            mm = re.match(pat, market_name)
            if mm:
                market_type = mtype
                x_val = int(mm.group(1))
                break

        if market_type is None or x_val is None:
            continue

        selection_label = selection_label_from_x_plus(x_val)
        status_market = status_to_open(m.get("status"))

        outcomes = m.get("outcomes") or []
        if not isinstance(outcomes, list) or len(outcomes) == 0:
            continue

        for o in outcomes:
            player_raw = o.get("name")
            player = player_name_from_outcome(player_raw)
            if not player:
                continue

            odds_dec = to_float(o.get("formatDecimal"))
            if pd.isna(odds_dec):
                continue

            market_label = f"{market_type} | {player}"

            rows_out.append({
                "event_expekt": event_name,
                "event_norm": event_norm,
                "player": player,
                "selectionLabel": selection_label,
                "marketLabel": market_label,
                "deadline": deadline,
                "status_selection_expekt": status_market,
                "odds_decimal_expekt": odds_dec,
            })

    df = pd.DataFrame(rows_out)
    if df.empty:
        return df

    # samme dedupe idé som du allerede bruger
    df = df.sort_values(["event_norm", "marketLabel", "selectionLabel", "odds_decimal_expekt"], kind="stable")
    df = df.drop_duplicates(subset=["event_norm", "marketLabel", "selectionLabel"], keep="first").reset_index(drop=True)

    # vigtig: hård normalisering af merge-keys
    for c in ["event_norm", "player", "selectionLabel", "marketLabel", "deadline"]:
        if c in df.columns:
            df[c] = df[c].astype(str).str.strip()

    return df

    df["odds_decimal_expekt"] = to_float(df["odds_decimal_expekt"])
    df = df.dropna(subset=["odds_decimal_expekt"]).reset_index(drop=True)

    df = norm_df_keys(df, ["event_norm", "player", "selectionLabel", "marketLabel", "deadline"])

    # dedupe pr nøgle så merge ikke eksploderer
    df = df.sort_values(["event_norm", "marketLabel", "selectionLabel", "odds_decimal_expekt"], kind="stable")
    df = df.drop_duplicates(subset=["event_norm", "marketLabel", "selectionLabel"], keep="first").reset_index(drop=True)

    return df

# =========================
# MERGE + OUTPUT
# =========================
def main():
    df_betsson = parse_betsson_all()
    df_betano = parse_betano_all()
    df_expekt = parse_expekt_all()

    if df_betsson.empty:
        raise ValueError("Betsson-data er tom. Tjek dine accordion_response_*.json og event_response.json")
    if df_betano.empty:
        raise ValueError("Betano-data er tom. Tjek betano.txt")
    if df_expekt.empty:
        raise ValueError("Expekt-data er tom. Tjek expekt.txt eller din Expekt-parser")

    # Merge keys
    if MERGE_WITH_DEADLINE:
        keys = ["event_norm", "player", "selectionLabel", "marketLabel", "deadline"]
    else:
        keys = ["event_norm", "player", "selectionLabel", "marketLabel"]

    # 1) Betsson left
    df = df_betsson.copy()

    # 2) Merge Betano onto Betsson
    df = df.merge(df_betano, on=keys, how="left")
    betano_matches = df["odds_decimal_betano"].notna().sum()
    print(f"Merge matches Betano: {betano_matches}/{len(df)}")

    # 3) Merge Expekt onto (Betsson + Betano)
    df = df.merge(df_expekt, on=keys, how="left")
    expekt_matches = df["odds_decimal_expekt"].notna().sum()
    print(f"Merge matches Expekt: {expekt_matches}/{len(df)}")

    # Sikr event kolonne
    if "event" not in df.columns:
        if "event_x" in df.columns:
            df["event"] = df["event_x"]
        else:
            df["event"] = df.get("event_betano")

    # deadline: hvis ikke merget på deadline, behold Betssons deadline
    if "deadline" not in df.columns and "deadline_x" in df.columns:
        df["deadline"] = df["deadline_x"]

    # sørg for alle OUT_COLUMNS findes
    for col in OUT_COLUMNS:
        if col not in df.columns:
            df[col] = pd.NA

    df_out = df[OUT_COLUMNS].copy()

    # sortering i Betsson market order
    if "source_order" in df.columns:
        df_out = df_out.assign(source_order=df["source_order"].values)
        df_out = df_out.sort_values(
            ["source_order", "event", "player", "marketLabel", "selectionLabel"],
            kind="stable",
            na_position="last"
        ).drop(columns=["source_order"])

    df_out.to_csv(OUT_CSV, index=False, encoding="utf-8")
    print(f"Saved {len(df_out)} rows to {OUT_CSV}")
    print(df_out.head(20))

if __name__ == "__main__":
    main()

Merge matches Betano: 275/465
Merge matches Expekt: 204/465
Saved 465 rows to betsson_betano_expekt_combined.csv
                    event                player selectionLabel  \
40   Arsenal vs Liverpool   Alexis Mac Allister       Over 0.5   
41   Arsenal vs Liverpool   Alexis Mac Allister       Over 1.5   
39   Arsenal vs Liverpool   Alexis Mac Allister       Over 2.5   
126  Arsenal vs Liverpool           Amara Nallo       Over 0.5   
127  Arsenal vs Liverpool           Amara Nallo       Over 1.5   
125  Arsenal vs Liverpool           Amara Nallo       Over 2.5   
94   Arsenal vs Liverpool  Andre HarrimanAnnous       Over 0.5   
96   Arsenal vs Liverpool  Andre HarrimanAnnous       Over 1.5   
95   Arsenal vs Liverpool  Andre HarrimanAnnous       Over 2.5   
65   Arsenal vs Liverpool      Andrew Robertson       Over 0.5   
66   Arsenal vs Liverpool      Andrew Robertson       Over 1.5   
55   Arsenal vs Liverpool             Ben White       Over 0.5   
56   Arsenal vs Liverpool    

  df = pd.concat(parts, ignore_index=True)


In [36]:
import pandas as pd
import numpy as np
import re

# --- 1) Input / Output ---
IN_MERGED_CSV = "betsson_betano_expekt_combined.csv"   # ret hvis din fil hedder noget andet
OUT_ALL_WITH_RATIOS = "merged_with_ratios.csv"
OUT_VALUEBETS = "valuebets_ratio_ge_2.csv"

THRESHOLD = 1.5

# --- 2) Robust odds parsing ---
def parse_odds_series(s: pd.Series) -> pd.Series:
    """
    Gør odds kolonner robuste:
    - "1,33" -> 1.33
    - fjerner fx spaces og andet støj
    - tomme værdier -> NaN
    """
    if s is None:
        return pd.Series(dtype="float64")

    s = s.astype(str)

    # tomme/NaN-agtige strings
    s = s.replace({"": np.nan, "nan": np.nan, "None": np.nan, "NaN": np.nan})

    # dansk komma -> punktum
    s = s.str.replace(",", ".", regex=False)

    # fjern alt undtagen cifre og punktum
    s = s.str.replace(r"[^0-9.]+", "", regex=True)

    # hvis der er flere punktummer, behold kun første som decimal-separator
    # (sjældent, men kan ske ved dårlig formatting)
    def fix_multi_dots(x):
        if not isinstance(x, str) or x == "":
            return np.nan
        if x.count(".") <= 1:
            return x
        parts = x.split(".")
        return parts[0] + "." + "".join(parts[1:])

    s = s.apply(fix_multi_dots)

    return pd.to_numeric(s, errors="coerce")


# --- 3) Load ---
df = pd.read_csv(IN_MERGED_CSV)

# Forventede odds kolonner (tilpas kun hvis dine hedder noget andet)
ODDS_COLS = [
    ("Betsson", "odds_decimal_betsson"),
    ("Betano",  "odds_decimal_betano"),
    ("Expekt",  "odds_decimal_expekt"),
]

missing = [col for _, col in ODDS_COLS if col not in df.columns]
if missing:
    raise ValueError(f"Mangler odds kolonner i input: {missing}")

# --- 4) Rens odds ---
for _, col in ODDS_COLS:
    df[col] = parse_odds_series(df[col])

# --- 5) Find best og second best odds pr række ---
odds_matrix = df[[col for _, col in ODDS_COLS]].to_numpy(dtype=float)

# best odds pr række
best_idx = np.nanargmax(np.where(np.isnan(odds_matrix), -np.inf, odds_matrix), axis=1)
best_odds = odds_matrix[np.arange(len(df)), best_idx]

# second best odds: sæt best til -inf og tag max igen
odds_for_second = odds_matrix.copy()
odds_for_second[np.arange(len(df)), best_idx] = -np.inf
second_best = np.nanmax(np.where(np.isnan(odds_for_second), -np.inf, odds_for_second), axis=1)
second_best = np.where(second_best == -np.inf, np.nan, second_best)

# ratio: best over second best (giver mening for "dobbelt så meget som næste bedste")
ratio_best_over_second = best_odds / second_best

# bookmaker navne
book_names = [name for name, _ in ODDS_COLS]
best_bookmaker = np.array(book_names, dtype=object)[best_idx]

# find second bookmaker (den der har second_best)
second_idx = np.nanargmax(np.where(np.isnan(odds_for_second), -np.inf, odds_for_second), axis=1)
second_bookmaker = np.array(book_names, dtype=object)[second_idx]

# Hvis second_best var NaN (kun 1 udbyder havde odds), så giv også second_bookmaker NaN
second_bookmaker = np.where(np.isnan(second_best), np.nan, second_bookmaker)

df["best_bookmaker"] = best_bookmaker
df["best_odds_decimal"] = best_odds
df["second_bookmaker"] = second_bookmaker
df["second_best_odds_decimal"] = second_best
df["ratio_best_over_second"] = ratio_best_over_second

# --- 6) Filter valuebets ---
valid = (
    np.isfinite(df["best_odds_decimal"]) &
    np.isfinite(df["second_best_odds_decimal"]) &
    (df["best_odds_decimal"] > 0) &
    (df["second_best_odds_decimal"] > 0)
)

df_value = df[valid & (df["ratio_best_over_second"] >= THRESHOLD)].copy()
df_value = df_value.sort_values("ratio_best_over_second", ascending=False, kind="stable")

# --- 7) Save ---
df.to_csv(OUT_ALL_WITH_RATIOS, index=False, encoding="utf-8")
df_value.to_csv(OUT_VALUEBETS, index=False, encoding="utf-8")

print(f"Gemte: {OUT_ALL_WITH_RATIOS} ({len(df)} rækker)")
print(f"Valuebets ratio >= {THRESHOLD}: {OUT_VALUEBETS} ({len(df_value)} rækker)")

# --- 8) Print top 20 ---
cols_show = [c for c in ["event", "player", "selectionLabel", "marketLabel", "deadline"] if c in df_value.columns]
cols_show += [
    "odds_decimal_betsson",
    "odds_decimal_betano",
    "odds_decimal_expekt",
    "best_bookmaker",
    "best_odds_decimal",
    "second_bookmaker",
    "second_best_odds_decimal",
    "ratio_best_over_second",
]

print("\nTOP 20 valuebets:")
print(df_value[cols_show].head(200).to_string(index=False))

Gemte: merged_with_ratios.csv (465 rækker)
Valuebets ratio >= 1.5: valuebets_ratio_ge_2.csv (39 rækker)

TOP 20 valuebets:
               event             player selectionLabel                                          marketLabel                  deadline  odds_decimal_betsson  odds_decimal_betano  odds_decimal_expekt best_bookmaker  best_odds_decimal second_bookmaker  second_best_odds_decimal  ratio_best_over_second
Arsenal vs Liverpool     Michael Laffey       Over 2.5         Spillers samlede antal skud | Michael Laffey                       NaN                  3.20                14.50                  NaN         Betano              14.50          Betsson                      3.20                4.531250
Arsenal vs Liverpool      Tommy Pilling       Over 2.5          Spillers samlede antal skud | Tommy Pilling                       NaN                  3.20                11.75                  NaN         Betano              11.75          Betsson                      3.20     