In [2]:
import pandas as pd
import numpy as np
import re
import unicodedata

# =========================
# CONFIG
# =========================
# Input CSV files, one per bookmaker.
BETINIA_CSV, EXPEKT_CSV, BETANO_CSV = (
    "betinia_player_props.csv",
    "expekt_player_props.csv",
    "betano_player_props.csv",
)

# Destination file for the combined, filtered table.
OUT_COMBINED = "betinia_expekt_betano_combined.csv"

# When True, "deadline" is added to the merge keys.
MERGE_WITH_DEADLINE = False

# Row filters applied before writing the output:
#   ratio_best_over_second >= RATIO_THRESHOLD
#   second_best_payout     <= SECOND_BEST_MAX
RATIO_THRESHOLD = 1.5
SECOND_BEST_MAX = 3.5

# Columns (in this order) that end up in the output CSV.
FINAL_COLUMNS = (
    # identifying keys
    ["event", "player", "selectionLabel", "marketLabel"]
    # odds per bookmaker
    + ["payout_betinia", "payout_expekt", "payout_betano"]
    # derived comparison columns
    + ["best_bookmaker", "best_payout", "second_best_payout", "ratio_best_over_second"]
)

# =========================
# EVENT NORMALIZATION
# =========================
def strip_accents(s: str) -> str:
    """Return *s* with combining marks (accents) removed.

    The text is decomposed with NFKD and every combining character is
    discarded. Non-string input is returned unchanged.
    """
    if isinstance(s, str):
        decomposed = unicodedata.normalize("NFKD", s)
        return "".join(filter(lambda ch: not unicodedata.combining(ch), decomposed))
    return s

# Club-type words that clean_team_name() removes from the start and end of a
# team name. Tokens are compared after accent stripping and lower-casing, so
# "balompié" has to be listed as "balompie".
DROP_TOKENS = {
    # generic club words
    "fc", "cf", "sc", "ac", "afc", "cfc",
    "ud", "cd", "rcd", "sv", "fk", "sk",
    "club", "klub", "calcio", "association",

    # typical sponsor names / extras
    "balompie",
    "deportivo",
}

# Some words are risky to drop unconditionally (e.g. "real" can be an
# essential part of the name), so they are NOT part of DROP_TOKENS.
# clean_team_name() removes them only as a trailing word, and only when at
# least two words remain afterwards.
DANGEROUS_TOKENS = {"real"}

def clean_team_name(team: str) -> str:
    """Normalize a team name so bookmaker spelling variants compare equal.

    For example "Levante UD" and "Levante" give the same result, and
    "Real Betis Balompié" gives the same result as "Real Betis".

    Returns the title-cased name, or None when *team* is not a string or
    nothing is left after cleaning.
    """
    if not isinstance(team, str):
        return None

    text = strip_accents(team.strip()).lower()

    # Remove parenthesised parts first, then any remaining punctuation
    # (only letters, digits, underscore and whitespace survive).
    text = re.sub(r"\([^)]*\)", " ", text)
    text = re.sub(r"[^\w\s]", " ", text)

    tokens = text.split()
    if not tokens:
        return None

    # Shrink a [start, end) window instead of popping from the list.
    start, end = 0, len(tokens)

    # Leading club words; always keep at least one token.
    while end - start >= 2 and tokens[start] in DROP_TOKENS:
        start += 1

    # Trailing club words; always keep at least one token.
    while end - start >= 2 and tokens[end - 1] in DROP_TOKENS:
        end -= 1

    # "Dangerous" words are dropped only at the end, and only while at least
    # two tokens remain afterwards.
    while end - start >= 3 and tokens[end - 1] in DANGEROUS_TOKENS:
        end -= 1

    # Back to readable title case.
    return " ".join(tokens[start:end]).title() or None

def normalize_event(event: str) -> str:
    """
    Gør event-navn til "Hold1 vs Hold2" og normaliserer hvert hold.
    """
    if not isinstance(event, str):
        return None

    s = event.strip()

    # ensart separatorer
    s = s.replace(" vs. ", " vs ")
    s = s.replace(" v ", " vs ")
    s = s.replace(" - ", " vs ")

    # split på vs
    parts = [p.strip() for p in s.split("vs")]
    if len(parts) != 2:
        # fallback: bare accent-strip og whitespace
        s2 = strip_accents(s)
        s2 = re.sub(r"\s+", " ", s2).strip()
        return s2

    home_raw, away_raw = parts
    home = clean_team_name(home_raw)
    away = clean_team_name(away_raw)

    if not home or not away:
        s2 = strip_accents(s)
        s2 = re.sub(r"\s+", " ", s2).strip()
        return s2

    return f"{home} vs {away}"

# =========================
# LOAD
# =========================
# Read the three bookmaker exports in a fixed order.
df_betinia, df_expekt, df_betano = (
    pd.read_csv(csv_path) for csv_path in (BETINIA_CSV, EXPEKT_CSV, BETANO_CSV)
)

# Abort early if any export has no rows; the first empty one is reported.
for _frame, _message in (
    (df_betinia, "Betinia CSV er tom."),
    (df_expekt, "Expekt CSV er tom."),
    (df_betano, "Betano CSV er tom."),
):
    if _frame.empty:
        raise ValueError(_message)
del _frame, _message

# =========================
# NORMALIZE / TYPES
# =========================
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a bookmaker frame; the input is not modified.

    - "event", "player", "selectionLabel", "marketLabel" and "deadline" are
      cast to str and stripped of surrounding whitespace.
    - "event" is then passed through normalize_event() so the merge keys of
      the different bookmakers line up.
    - "odds_decimal" is converted to numeric; unparsable values become NaN.

    Columns that are absent are skipped.
    """
    cleaned = df.copy()

    # NOTE: astype(str) also turns missing values into text (e.g. "nan").
    for name in ("event", "player", "selectionLabel", "marketLabel", "deadline"):
        if name in cleaned.columns:
            cleaned[name] = cleaned[name].astype(str).str.strip()

    # Normalize the event here (important for the merge).
    if "event" in cleaned.columns:
        cleaned["event"] = cleaned["event"].apply(normalize_event)

    # odds -> float
    if "odds_decimal" in cleaned.columns:
        cleaned["odds_decimal"] = pd.to_numeric(cleaned["odds_decimal"], errors="coerce")

    return cleaned

# Clean each frame and give its odds/status columns a bookmaker-specific
# name so they stay distinct after the merge.
df_betinia = clean_df(df_betinia).rename(
    {
        "odds_decimal": "odds_decimal_betinia",
        "status_selection": "status_selection_betinia",
    },
    axis="columns",
)

df_expekt = clean_df(df_expekt).rename(
    {
        "odds_decimal": "odds_decimal_expekt",
        "status_selection": "status_selection_expekt",
    },
    axis="columns",
)

df_betano = clean_df(df_betano).rename(
    {
        "odds_decimal": "odds_decimal_betano",
        "status_selection": "status_selection_betano",
    },
    axis="columns",
)

# =========================
# MERGE KEYS
# =========================
# Merge keys; "deadline" is only part of the key when requested.
keys = ["event", "player", "selectionLabel", "marketLabel"] + (
    ["deadline"] if MERGE_WITH_DEADLINE else []
)

# =========================
# OUTER MERGE
# =========================
# Outer joins keep selections that only some of the bookmakers offer.
df_merged = (
    df_betinia
    .merge(df_expekt, on=keys, how="outer")
    .merge(df_betano, on=keys, how="outer")
)

# =========================
# PAYOUT COLS
# =========================
# One numeric payout column per bookmaker.
for _bookmaker in ("betinia", "expekt", "betano"):
    df_merged[f"payout_{_bookmaker}"] = pd.to_numeric(
        df_merged.get(f"odds_decimal_{_bookmaker}"), errors="coerce"
    )
del _bookmaker

# =========================
# BEST + SECOND BEST
# =========================
payout_cols = ["payout_betinia", "payout_expekt", "payout_betano"]
payout_matrix = df_merged[payout_cols].copy()
# Non-positive odds are treated like missing quotes.
payout_matrix = payout_matrix.where(payout_matrix > 0)

arr = payout_matrix.to_numpy(dtype=float)
# Number of usable quotes per row.
quote_count = (~np.isnan(arr)).sum(axis=1)

df_merged["best_payout"] = payout_matrix.max(axis=1)

# Best bookmaker per row. DataFrame.idxmax(axis=1) is deprecated for rows that
# are entirely NaN (and documented to raise in a future pandas version), so the
# position is taken with numpy on a copy where NaN is replaced by -inf. Rows
# without any quote get NaN. Ties go to the first column, as with idxmax.
bookmaker_names = np.array(["Betinia", "Expekt", "Betano"], dtype=object)
best_idx = np.where(np.isnan(arr), -np.inf, arr).argmax(axis=1)
df_merged["best_bookmaker"] = np.where(
    quote_count >= 1, bookmaker_names[best_idx], np.nan
)

# Second best, vectorized: np.sort puts NaN last, so a row with k valid quotes
# has them in ascending order at positions 0..k-1, and the second largest sits
# at position k-2 (needs k >= 2).
sorted_vals = np.sort(arr, axis=1)
second_idx = np.clip(quote_count - 2, 0, None)
second_best = np.where(
    quote_count >= 2,
    sorted_vals[np.arange(len(df_merged)), second_idx],
    np.nan,
)

df_merged["second_best_payout"] = second_best

# Rows where both a best and a second-best positive payout exist.
valid_best = (
    df_merged["best_payout"].notna() &
    df_merged["second_best_payout"].notna() &
    (df_merged["best_payout"] > 0) &
    (df_merged["second_best_payout"] > 0)
)

# Ratio best / second best; NaN where the row is not valid.
df_merged["ratio_best_over_second"] = (
    df_merged["best_payout"] / df_merged["second_best_payout"]
).where(valid_best)

# =========================
# FILTERS
# =========================
# Keep rows with a valid best/second-best pair, a large enough ratio and a
# second-best payout that is not too high.
_keep_rows = (
    valid_best
    & (df_merged["ratio_best_over_second"] >= RATIO_THRESHOLD)
    & (df_merged["second_best_payout"] <= SECOND_BEST_MAX)
)
df_filtered = df_merged.loc[_keep_rows].copy()

# =========================
# OUTPUT
# =========================
# Output columns only, largest ratio first (stable sort keeps tie order).
df_out = (
    df_filtered[FINAL_COLUMNS]
    .copy()
    .sort_values("ratio_best_over_second", ascending=False, kind="stable")
    .reset_index(drop=True)
)

df_out.to_csv(OUT_COMBINED, index=False, encoding="utf-8")

print(f"Rows written to CSV: {len(df_out)}")
print(f"Saved CSV: {OUT_COMBINED}")
print(df_out.head(100).to_string(index=False))

Rows written to CSV: 248
Saved CSV: betinia_expekt_betano_combined.csv
                     event               player selectionLabel                                            marketLabel  payout_betinia  payout_expekt  payout_betano best_bookmaker  best_payout  second_best_payout  ratio_best_over_second
Rayo Vallecano vs Mallorca       Alfonso Espino       Over 2.5           Spillers samlede antal skud | Alfonso Espino             NaN           2.15          10.25         Betano      10.2500              2.1500                4.767442
           Udinese vs Pisa     Michel Aebischer            Yes                      Spiller scorer | Michel Aebischer          2.5455          11.00            NaN         Expekt      11.0000              2.5455                4.321351
       Levante vs Espanyol        Ramon Terrats       Over 3.5            Spillers samlede antal skud | Ramon Terrats             NaN           3.20          13.00         Betano      13.0000              3.2000          

In [1]:
import pandas as pd
import numpy as np
import unicodedata
import re

# =========================
# CONFIG
# =========================
# Input CSV files, one per bookmaker.
BETINIA_CSV, EXPEKT_CSV, BETANO_CSV = (
    "betinia_player_props.csv",
    "expekt_player_props.csv",
    "betano_player_props.csv",
)

# Destination file for the combined, filtered table.
OUT_COMBINED = "betinia_expekt_betano_combined.csv"

# When True, "deadline" is added to the merge keys.
MERGE_WITH_DEADLINE = False

# Row filters applied before writing the output:
#   ratio_best_over_second >= RATIO_THRESHOLD
#   second_best_payout     <= SECOND_BEST_MAX
RATIO_THRESHOLD = 1.5
SECOND_BEST_MAX = 3.5

# Columns (in this order) that end up in the output CSV.
FINAL_COLUMNS = (
    # identifying columns
    ["event", "marketLabel", "selectionLabel"]
    # odds per bookmaker
    + ["payout_betinia", "payout_expekt", "payout_betano"]
    # derived comparison columns
    + ["best_bookmaker", "best_payout", "second_best_payout", "ratio_best_over_second"]
)

# =========================
# EVENT NORMALIZATION
# =========================
def strip_accents(s: str) -> str:
    """Drop combining marks from *s* after NFKD decomposition.

    Anything that is not a string is handed back untouched.
    """
    if not isinstance(s, str):
        return s
    kept = [
        ch
        for ch in unicodedata.normalize("NFKD", s)
        if unicodedata.combining(ch) == 0
    ]
    return "".join(kept)

# Club-type words that clean_team_name() removes from the start and end of a
# team name (compared after accent stripping and lower-casing).
DROP_TOKENS = {
    # short club abbreviations
    "fc", "cf", "sc", "ac", "afc", "cfc",
    "ud", "cd", "rcd",
    "sv", "fk", "sk",
    # spelled-out club words
    "club", "klub", "calcio", "association", "deportivo",
    # "balompié" becomes "balompie" after accent stripping
    "balompie",
}

# "real" is risky to drop in general, so it is only dropped as a trailing word.
DANGEROUS_TOKENS = {"real"}

def clean_team_name(team: str) -> str:
    """Normalize a team name so bookmaker spelling variants compare equal.

    Accents, parenthesised parts and punctuation are removed, leading and
    trailing DROP_TOKENS are dropped, and a trailing DANGEROUS_TOKENS word is
    dropped while at least two words remain. The result is title-cased.

    Returns None when *team* is not a string or nothing is left.
    """
    if not isinstance(team, str):
        return None

    text = strip_accents(team.strip()).lower()
    text = re.sub(r"\([^)]*\)", " ", text)   # parenthesised parts
    text = re.sub(r"[^\w\s]", " ", text)     # remaining punctuation

    tokens = text.split()
    if not tokens:
        return None

    # Work on a [first, last) window rather than mutating the list.
    first, last = 0, len(tokens)

    # Leading tokens; never drop the final remaining word.
    while last - first >= 2 and tokens[first] in DROP_TOKENS:
        first += 1

    # Trailing tokens; never drop the final remaining word.
    while last - first >= 2 and tokens[last - 1] in DROP_TOKENS:
        last -= 1

    # Trailing "real": only while at least two words remain afterwards.
    while last - first >= 3 and tokens[last - 1] in DANGEROUS_TOKENS:
        last -= 1

    return " ".join(tokens[first:last]).title() or None

def normalize_event(event: str) -> str:
    """
    Gør event-navn til "Hold1 vs Hold2" og normaliserer hvert hold.
    """
    if not isinstance(event, str):
        return None

    s = event.strip()
    s = s.replace(" vs. ", " vs ")
    s = s.replace(" v ", " vs ")
    s = s.replace(" - ", " vs ")

    parts = [p.strip() for p in s.split("vs")]
    if len(parts) != 2:
        s2 = strip_accents(s)
        s2 = re.sub(r"\s+", " ", s2).strip()
        return s2

    home_raw, away_raw = parts
    home = clean_team_name(home_raw)
    away = clean_team_name(away_raw)

    if not home or not away:
        s2 = strip_accents(s)
        s2 = re.sub(r"\s+", " ", s2).strip()
        return s2

    return f"{home} vs {away}"

# =========================
# LOAD
# =========================
# Read the three bookmaker exports in a fixed order.
df_betinia, df_expekt, df_betano = (
    pd.read_csv(csv_path) for csv_path in (BETINIA_CSV, EXPEKT_CSV, BETANO_CSV)
)

# Abort early if any export has no rows; the first empty one is reported.
for _frame, _message in (
    (df_betinia, "Betinia CSV er tom."),
    (df_expekt, "Expekt CSV er tom."),
    (df_betano, "Betano CSV er tom."),
):
    if _frame.empty:
        raise ValueError(_message)
del _frame, _message

# =========================
# CLEAN
# =========================
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a bookmaker frame; the input is not modified.

    - "event", "player", "selectionLabel", "marketLabel" and "deadline" are
      cast to str and stripped of surrounding whitespace.
    - "odds_decimal" is converted to numeric; unparsable values become NaN.
    - "event" is passed through normalize_event() so the merge keys of the
      different bookmakers line up.

    Columns that are absent are skipped.
    """
    result = df.copy()
    present = set(result.columns)

    # NOTE: astype(str) also turns missing values into text (e.g. "nan").
    for col in ("event", "player", "selectionLabel", "marketLabel", "deadline"):
        if col in present:
            result[col] = result[col].astype(str).str.strip()

    if "odds_decimal" in present:
        result["odds_decimal"] = pd.to_numeric(result["odds_decimal"], errors="coerce")

    if "event" in present:
        result["event"] = result["event"].apply(normalize_event)

    return result

# Clean each frame and give its odds/status columns a bookmaker-specific
# name so they stay distinct after the merge.
df_betinia = clean_df(df_betinia).rename(
    {
        "odds_decimal": "odds_decimal_betinia",
        "status_selection": "status_selection_betinia",
    },
    axis="columns",
)
df_expekt = clean_df(df_expekt).rename(
    {
        "odds_decimal": "odds_decimal_expekt",
        "status_selection": "status_selection_expekt",
    },
    axis="columns",
)
df_betano = clean_df(df_betano).rename(
    {
        "odds_decimal": "odds_decimal_betano",
        "status_selection": "status_selection_betano",
    },
    axis="columns",
)

# =========================
# MERGE
# =========================
# Merge keys; "deadline" is only part of the key when requested.
keys = ["event", "player", "selectionLabel", "marketLabel"] + (
    ["deadline"] if MERGE_WITH_DEADLINE else []
)

# Outer joins keep selections that only some of the bookmakers offer.
df_merged = (
    df_betinia
    .merge(df_expekt, on=keys, how="outer")
    .merge(df_betano, on=keys, how="outer")
)

# =========================
# PAYOUT COLS
# =========================
# One numeric payout column per bookmaker.
for _bookmaker in ("betinia", "expekt", "betano"):
    df_merged[f"payout_{_bookmaker}"] = pd.to_numeric(
        df_merged.get(f"odds_decimal_{_bookmaker}"), errors="coerce"
    )
del _bookmaker

# -------------------------
# NEW REQUIREMENT: all three must be present (no NaN) and be > 0
# -------------------------
_payouts = df_merged[["payout_betinia", "payout_expekt", "payout_betano"]]
all_three_present = (_payouts.notna() & (_payouts > 0)).all(axis=1)
del _payouts

# =========================
# BEST + SECOND BEST
# =========================
payout_cols = ["payout_betinia", "payout_expekt", "payout_betano"]
payout_matrix = df_merged[payout_cols].copy()
# Non-positive odds are treated like missing quotes.
payout_matrix = payout_matrix.where(payout_matrix > 0)

arr = payout_matrix.to_numpy(dtype=float)
# Number of usable quotes per row.
quote_count = (~np.isnan(arr)).sum(axis=1)

df_merged["best_payout"] = payout_matrix.max(axis=1)

# Best bookmaker per row. DataFrame.idxmax(axis=1) is deprecated for rows that
# are entirely NaN (and documented to raise in a future pandas version), so the
# position is taken with numpy on a copy where NaN is replaced by -inf. Rows
# without any quote get NaN. Ties go to the first column, as with idxmax.
bookmaker_names = np.array(["Betinia", "Expekt", "Betano"], dtype=object)
best_idx = np.where(np.isnan(arr), -np.inf, arr).argmax(axis=1)
df_merged["best_bookmaker"] = np.where(
    quote_count >= 1, bookmaker_names[best_idx], np.nan
)

# Second best, vectorized: np.sort puts NaN last, so a row with k valid quotes
# has them in ascending order at positions 0..k-1, and the second largest sits
# at position k-2 (needs k >= 2).
sorted_vals = np.sort(arr, axis=1)
second_idx = np.clip(quote_count - 2, 0, None)
second_best = np.where(
    quote_count >= 2,
    sorted_vals[np.arange(len(df_merged)), second_idx],
    np.nan,
)

df_merged["second_best_payout"] = second_best

# Rows where both a best and a second-best positive payout exist.
valid_best = (
    df_merged["best_payout"].notna() &
    df_merged["second_best_payout"].notna() &
    (df_merged["best_payout"] > 0) &
    (df_merged["second_best_payout"] > 0)
)

# Ratio best / second best; NaN where the row is not valid.
df_merged["ratio_best_over_second"] = (
    df_merged["best_payout"] / df_merged["second_best_payout"]
).where(valid_best)

# =========================
# FILTERS
# =========================
# Keep rows quoted by all three bookmakers that have a valid best/second-best
# pair, a large enough ratio and a second-best payout that is not too high.
_keep_rows = (
    all_three_present  # NEW: require all three bookmakers
    & valid_best
    & (df_merged["ratio_best_over_second"] >= RATIO_THRESHOLD)
    & (df_merged["second_best_payout"] <= SECOND_BEST_MAX)
)
df_filtered = df_merged.loc[_keep_rows].copy()

# =========================
# OUTPUT
# =========================
# Output columns only, largest ratio first (stable sort keeps tie order).
df_out = (
    df_filtered[FINAL_COLUMNS]
    .copy()
    .sort_values("ratio_best_over_second", ascending=False, kind="stable")
    .reset_index(drop=True)
)

df_out.to_csv(OUT_COMBINED, index=False, encoding="utf-8")

print(f"Rows written to CSV: {len(df_out)}")
print(f"Saved CSV: {OUT_COMBINED}")
print(df_out.head(100).to_string(index=False))

Rows written to CSV: 30
Saved CSV: betinia_expekt_betano_combined.csv
                    event                                       marketLabel selectionLabel  payout_betinia  payout_expekt  payout_betano best_bookmaker  best_payout  second_best_payout  ratio_best_over_second
     Villarreal vs Alaves  Spillers samlede antal skud | Georges Mikautadze       Over 4.5          1.7143           2.30           7.30         Betano         7.30              2.3000                3.173913
     Villarreal vs Alaves  Spillers samlede antal skud | Georges Mikautadze       Over 3.5          1.3200           1.53           3.80         Betano         3.80              1.5300                2.483660
      Levante vs Espanyol      Spillers samlede antal skud | Pablo Martinez       Over 2.5          1.7500           2.30           5.00         Betano         5.00              2.3000                2.173913
     Villarreal vs Alaves        Spillers samlede antal skud | Nicolas Pepe       Over 3.5    