In [6]:
from pathlib import Path
from collections import defaultdict, deque
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [7]:
def process_full_features(df_base, new_df=None, k=5, init_elo=1500, adv_const=0.05, elo_scale=150.0, K=20):
    df_base["Date"] = pd.to_datetime(df_base["Date"], dayfirst=True)
    
    if new_df is not None:
        new_df["Date"] = pd.to_datetime(new_df["Date"], dayfirst=True)
        df_base = pd.concat([df_base, new_df], ignore_index=True)
    
    df = df_base.sort_values("Date").reset_index(drop=True)
    df.index.name = "MatchID"

    ### Part 1: Win rate home/away
    long = pd.concat([
        pd.DataFrame({
            "MatchID": df.index,
            "Date": df["Date"],
            "Team": df["HomeTeam"],
            "is_home": True,
            "GF": df["Full Time Home Goals"],
            "GA": df["Full Time Away Goals"]
        }),
        pd.DataFrame({
            "MatchID": df.index,
            "Date": df["Date"],
            "Team": df["AwayTeam"],
            "is_home": False,
            "GF": df["Full Time Away Goals"],
            "GA": df["Full Time Home Goals"]
        })
    ], ignore_index=True)

    long["ResultNum"] = np.sign(long["GF"] - long["GA"])
    long["WinFlag"] = (long["ResultNum"] == 1).astype(float)
    long = long.sort_values(["Team", "Date", "MatchID"]).reset_index(drop=True)

    # Win rate k trận sân nhà
    def compute_win_rate_k(df, is_home=True):
        result = []
        for team, g in df[df["is_home"] == is_home].groupby("Team", sort=False):
            wins_hist = []
            for _, r in g.iterrows():
                rate = sum(wins_hist) / len(wins_hist) if wins_hist else 0.0
                result.append((r["MatchID"], team, rate))
                wins_hist.append(r["WinFlag"])
                if len(wins_hist) > k:
                    wins_hist.pop(0)
        return pd.DataFrame(result, columns=["MatchID", "Team", f'{"home" if is_home else "away"}_win_rate_k'])

    home_rate_df = compute_win_rate_k(long, True)
    away_rate_df = compute_win_rate_k(long, False)

    df = df.merge(home_rate_df.rename(columns={"Team": "HomeTeam"}), on=["MatchID", "HomeTeam"], how="left") \
           .merge(away_rate_df.rename(columns={"Team": "AwayTeam"}), on=["MatchID", "AwayTeam"], how="left")
    
    df[["home_win_rate_k", "away_win_rate_k"]] = df[["home_win_rate_k", "away_win_rate_k"]].fillna(0.0)
    df["Home_adv"] = df["home_win_rate_k"] - df["away_win_rate_k"] + adv_const
    df["Home_adv_elo"] = elo_scale * df["Home_adv"]

    ### Part 2: ELO
    teams = pd.unique(pd.concat([df["HomeTeam"], df["AwayTeam"]], ignore_index=True))
    elo = {team: float(init_elo) for team in teams}

    eH, eA = [], []
    for _, row in df.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]
        Rh, Ra = elo[h], elo[a]
        eH.append(Rh)
        eA.append(Ra)

        h_adv = row["Home_adv_elo"]
        Rdiff = (Rh + h_adv) - Ra
        We = 1.0 / (1.0 + 10 ** (-Rdiff / 400.0))
        hg, ag = row["Full Time Home Goals"], row["Full Time Away Goals"]
        W = 1.0 if hg > ag else (0.5 if hg == ag else 0.0)

        margin = abs(hg - ag)
        G = (np.log1p(margin) if margin > 0 else 1.0) * (2.2 / (0.001 * abs(Rdiff) + 2.2))
        delta = K * G * (W - We)

        elo[h] += delta
        elo[a] -= delta

    df["Elo_H_before"] = eH
    df["Elo_A_before"] = eA

    ### Part 3: Team split stats (WinStreak, Goals scored/conceded, etc.)
    def expand_team_view(df):
        home = pd.DataFrame({
            "MatchID": df.index, "Date": df["Date"], "Team": df["HomeTeam"],
            "Opponent": df["AwayTeam"], "is_home": True,
            "GS": df["Full Time Home Goals"], "GA": df["Full Time Away Goals"]
        })
        away = pd.DataFrame({
            "MatchID": df.index, "Date": df["Date"], "Team": df["AwayTeam"],
            "Opponent": df["HomeTeam"], "is_home": False,
            "GS": df["Full Time Away Goals"], "GA": df["Full Time Home Goals"]
        })
        return pd.concat([home, away], ignore_index=True)

    split = expand_team_view(df)
    split["ResultNum"] = np.sign(split["GS"] - split["GA"])
    split = split.sort_values(["Team", "Date", "MatchID"]).reset_index(drop=True)

    GS_shift = split.groupby("Team")["GS"].shift(1)
    GA_shift = split.groupby("Team")["GA"].shift(1)

    rolling_count = GS_shift.groupby(split["Team"]).rolling(window=k, min_periods=1).count().reset_index(level=0, drop=True)
    split["GS_k"] = GS_shift.groupby(split["Team"]).rolling(window=k, min_periods=1).sum().reset_index(level=0, drop=True)
    split["GA_k"] = GA_shift.groupby(split["Team"]).rolling(window=k, min_periods=1).sum().reset_index(level=0, drop=True)
    split["GD_k"] = split["GS_k"] - split["GA_k"]

    WinFlag = (split["ResultNum"] == 1).astype(int).groupby(split["Team"]).shift(1)
    LossFlag = (split["ResultNum"] == -1).astype(int).groupby(split["Team"]).shift(1)
    split["Wins_k"] = WinFlag.groupby(split["Team"]).rolling(window=k, min_periods=1).sum().reset_index(level=0, drop=True)
    split["Losses_k"] = LossFlag.groupby(split["Team"]).rolling(window=k, min_periods=1).sum().reset_index(level=0, drop=True)
    split["WinRate_k"] = split["Wins_k"] / rolling_count
    split["GS_avg_k"] = split["GS_k"] / rolling_count
    split["GA_avg_k"] = split["GA_k"] / rolling_count

    # Win/Lose Streak
    prev_result = split.groupby("Team")["ResultNum"].shift(1).fillna(0).astype(int)
    win_streak = np.zeros(len(split))
    lose_streak = np.zeros(len(split))
    for _, idxs in split.groupby("Team").indices.items():
        w = l = 0
        for pos in idxs:
            x = int(prev_result.iloc[pos])
            w = w + 1 if x == 1 else 0
            l = l + 1 if x == -1 else 0
            win_streak[pos] = min(w, k)
            lose_streak[pos] = min(l, k)

    split["WinStreak"] = win_streak.astype(int)
    split["LoseStreak"] = lose_streak.astype(int)

    # Merge lại về từng trận đấu
    features = ["MatchID","Team","is_home","GS_k","GA_k","GD_k","Wins_k","Losses_k",
                "WinRate_k","WinStreak","LoseStreak","GS_avg_k","GA_avg_k"]

    home_df = split[split["is_home"]][features].rename(columns={
        "Team":"HomeTeam","GS_k":"GoalsScore_H","GA_k":"GoalsAgainst_H","GD_k":"GoalDifference_H",
        "Wins_k":"Wins_H","Losses_k":"Losses_H","WinRate_k":"WinRate_H","WinStreak":"WinStreak_H",
        "LoseStreak":"LoseStreak_H","GS_avg_k":"GoalsScore_H_avg","GA_avg_k":"GoalsAgainst_H_avg"
    }).drop(columns="is_home")

    away_df = split[~split["is_home"]][features].rename(columns={
        "Team":"AwayTeam","GS_k":"GoalsScore_A","GA_k":"GoalsAgainst_A","GD_k":"GoalDifference_A",
        "Wins_k":"Wins_A","Losses_k":"Losses_A","WinRate_k":"WinRate_A","WinStreak":"WinStreak_A",
        "LoseStreak":"LoseStreak_A","GS_avg_k":"GoalsScore_A_avg","GA_avg_k":"GoalsAgainst_A_avg"
    }).drop(columns="is_home")

    df = (
        df.reset_index()
        .merge(home_df, on=["MatchID","HomeTeam"], how="left")
        .merge(away_df, on=["MatchID","AwayTeam"], how="left")
    )

    ### Part 4: Head2Head
    df_sorted = df.sort_values(["Date","MatchID"]).reset_index()
    matches_hist = defaultdict(deque)
    prev_matchs, h2h_gs_H_total, h2h_gs_A_total, h2h_gs_H_avg, h2h_gs_A_avg = {}, {}, {}, {}, {}

    for _, row in df_sorted.iterrows():
        h, a = row["HomeTeam"], row["AwayTeam"]
        key = tuple(sorted((h, a)))
        past = list(matches_hist[key])

        total = sum(1 if t[0] == h and t[1] == 1 else -1 if t[0] == a and t[1] == 1 else 0 for t in past)
        goals_H_total = sum(t[2] if t[0] == h else t[3] for t in past)
        goals_A_total = sum(t[3] if t[0] == h else t[2] for t in past)
        count = len(past)

        idx = row["index"]
        prev_matchs[idx] = total
        h2h_gs_H_total[idx] = goals_H_total
        h2h_gs_A_total[idx] = goals_A_total
        h2h_gs_H_avg[idx] = goals_H_total / count if count else 0.0
        h2h_gs_A_avg[idx] = goals_A_total / count if count else 0.0

        matches_hist[key].append((h, np.sign(row["Full Time Home Goals"] - row["Full Time Away Goals"]),
                                  row["Full Time Home Goals"], row["Full Time Away Goals"]))

    df["H2H_score"] = df.index.to_series().map(prev_matchs)
    df["H2H_GS_H_total"] = df.index.to_series().map(h2h_gs_H_total)
    df["H2H_GS_A_total"] = df.index.to_series().map(h2h_gs_A_total)
    df["H2H_GS_H_avg"] = df.index.to_series().map(h2h_gs_H_avg)
    df["H2H_GS_A_avg"] = df.index.to_series().map(h2h_gs_A_avg)

    # Final selected columns
    final_cols = [
        "Date","HomeTeam","AwayTeam","Elo_H_before","Elo_A_before", 
        "GoalsScore_H","GoalsAgainst_H","GoalDifference_H",
        "WinStreak_H","LoseStreak_H","Wins_H","Losses_H","WinRate_H",
        "GoalsScore_H_avg", "GoalsAgainst_H_avg", 
        "Home_adv_elo",
        "GoalsScore_A","GoalsAgainst_A","GoalDifference_A",
        "WinStreak_A","LoseStreak_A","Wins_A","Losses_A","WinRate_A",
        "GoalsScore_A_avg", "GoalsAgainst_A_avg",
        "H2H_score","H2H_GS_H_total", "H2H_GS_A_total", "H2H_GS_H_avg",
        "H2H_GS_A_avg", "Full Time Home Goals", "Full Time Away Goals"
    ]
    return df[final_cols].sort_values("Date").reset_index(drop=True)

In [10]:
def prepare_features_for_prediction(
    folder="data_season",
    new_data=None,
    baseline_path="data_extra/new_season.csv",
    k=5,
    init_elo=1500,
    adv_const=0.05,
    elo_scale=150.0,
    K=20
):
    # Load historical data
    files = sorted(Path(folder).glob("*.csv"), key=lambda p: p.stem)
    df_hist_raw = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)
    df_hist_raw["Date"] = pd.to_datetime(df_hist_raw["Date"], dayfirst=True)

    # Nếu có new_data
    if new_data is not None:
        new_df = pd.DataFrame(new_data)

        # Nếu không có Date, tự động sinh từ ngày cuối cùng
        if "Date" not in new_df.columns or new_df["Date"].isna().all():
            last_date = df_hist_raw["Date"].max()
            new_df["Date"] = [last_date + pd.Timedelta(days=i+1) for i in range(len(new_df))]
        else:
            new_df["Date"] = pd.to_datetime(new_df["Date"], dayfirst=True)
    else:
        new_df = None

    # Process full historical + new matches
    df_hist = process_full_features(
        df_hist_raw, 
        new_df=new_df, 
        k=k, init_elo=init_elo,
        adv_const=adv_const, elo_scale=elo_scale, K=K
    ).fillna(0)

    # Load baseline
    df_baseline = pd.read_csv(baseline_path)

    # ===== Các bước giữ nguyên như cũ =====
    # Elo
    elo_latest = {}
    for team in pd.unique(pd.concat([df_hist["HomeTeam"], df_hist["AwayTeam"]], ignore_index=True)):
        last_home = df_hist[df_hist["HomeTeam"] == team][["Elo_H_before"]].tail(1)
        last_away = df_hist[df_hist["AwayTeam"] == team][["Elo_A_before"]].tail(1)
        last = pd.concat([last_home.rename(columns={"Elo_H_before": "Elo"}),
                          last_away.rename(columns={"Elo_A_before": "Elo"})])
        elo_latest[team] = last["Elo"].iloc[-1] if not last.empty else init_elo

    df_baseline["Elo_H_before"] = df_baseline["HomeTeam"].map(elo_latest).fillna(init_elo)
    df_baseline["Elo_A_before"] = df_baseline["AwayTeam"].map(elo_latest).fillna(init_elo)

    # Win rate
    def get_win_rate(df, team, is_home=True):
        cond = (df["HomeTeam"] == team) if is_home else (df["AwayTeam"] == team)
        win = (df[cond]["Full Time Home Goals"] > df[cond]["Full Time Away Goals"]) if is_home else \
              (df[cond]["Full Time Away Goals"] > df[cond]["Full Time Home Goals"])
        return win.tail(k).mean() if not win.empty else 0.0

    home_win_rate_k = df_baseline["HomeTeam"].apply(lambda t: get_win_rate(df_hist, t, True))
    away_win_rate_k = df_baseline["AwayTeam"].apply(lambda t: get_win_rate(df_hist, t, False))
    df_baseline["Home_adv_elo"] = (home_win_rate_k - away_win_rate_k + adv_const) * elo_scale

    # Head-to-Head
    def get_h2h_stats(df, h, a):
        matches = df[((df["HomeTeam"] == h) & (df["AwayTeam"] == a)) |
                     ((df["HomeTeam"] == a) & (df["AwayTeam"] == h))].tail(k)
        score = gs_H = gs_A = 0
        for _, r in matches.iterrows():
            is_home = r["HomeTeam"] == h
            result = np.sign(r["Full Time Home Goals"] - r["Full Time Away Goals"])
            result = result if is_home else -result
            score += result
            gs_H += r["Full Time Home Goals"] if is_home else r["Full Time Away Goals"]
            gs_A += r["Full Time Away Goals"] if is_home else r["Full Time Home Goals"]
        n = len(matches)
        return pd.Series({
            "H2H_score": score,
            "H2H_GS_H_total": gs_H,
            "H2H_GS_A_total": gs_A,
            "H2H_GS_H_avg": gs_H / n if n else 0.0,
            "H2H_GS_A_avg": gs_A / n if n else 0.0
        })

    h2h_df = df_baseline.apply(lambda r: get_h2h_stats(df_hist, r["HomeTeam"], r["AwayTeam"]), axis=1)
    df_baseline = pd.concat([df_baseline, h2h_df], axis=1)

    # Merge last stats
    latest_home = df_hist.drop_duplicates("HomeTeam", keep="last").set_index("HomeTeam")
    latest_away = df_hist.drop_duplicates("AwayTeam", keep="last").set_index("AwayTeam")

    home_features = [
        "GoalsScore_H", "GoalsAgainst_H", "GoalDifference_H",
        "WinStreak_H", "LoseStreak_H", "Wins_H", "Losses_H", "WinRate_H",
        "GoalsScore_H_avg", "GoalsAgainst_H_avg"
    ]
    away_features = [
        "GoalsScore_A", "GoalsAgainst_A", "GoalDifference_A",
        "WinStreak_A", "LoseStreak_A", "Wins_A", "Losses_A", "WinRate_A",
        "GoalsScore_A_avg", "GoalsAgainst_A_avg"
    ]

    df_baseline = df_baseline.merge(latest_home[home_features], left_on="HomeTeam", right_index=True, how="left")
    df_baseline = df_baseline.merge(latest_away[away_features], left_on="AwayTeam", right_index=True, how="left")

    # Feature engineering
    df_baseline["Elo_diff"] = df_baseline["Elo_H_before"] - df_baseline["Elo_A_before"]
    df_baseline["Elo_ratio"] = df_baseline["Elo_H_before"] / df_baseline["Elo_A_before"]
    df_baseline["Goals_likelyhood_H"] = df_baseline["GoalsScore_H_avg"] + df_baseline["GoalsAgainst_A_avg"]
    df_baseline["Goals_likelyhood_A"] = df_baseline["GoalsScore_A_avg"] + df_baseline["GoalsAgainst_H_avg"]
    df_baseline["Home_adv_elo_sum"] = df_baseline["Elo_H_before"] + df_baseline["Home_adv_elo"]

    # Encode
    team_cols = ["HomeTeam", "AwayTeam"]
    le = LabelEncoder()
    all_teams = pd.concat([df_baseline[col] for col in team_cols]).unique()
    le.fit(all_teams)
    for col in team_cols:
        df_baseline[col + "_code"] = le.transform(df_baseline[col])

    return df_baseline

In [None]:
new_data = {
    "HomeTeam": ["Liverpool", "Liverpool", "Liverpool"],
    "AwayTeam": ["Man United", "Man United", "Man United"],
    "Full Time Home Goals": [4, 3, 3],
    "Full Time Away Goals": [1, 1, 2]
}

df_final = prepare_features_for_prediction(
    folder="data_season",
    new_data=new_data,
    baseline_path="./data_extra/new_season.csv"
)

df_final

  df_hist_raw["Date"] = pd.to_datetime(df_hist_raw["Date"], dayfirst=True)


Unnamed: 0,HomeTeam,AwayTeam,Elo_H_before,Elo_A_before,Home_adv_elo,H2H_score,H2H_GS_H_total,H2H_GS_A_total,H2H_GS_H_avg,H2H_GS_A_avg,...,WinRate_A,GoalsScore_A_avg,GoalsAgainst_A_avg,Elo_diff,Elo_ratio,Goals_likelyhood_H,Goals_likelyhood_A,Home_adv_elo_sum,HomeTeam_code,AwayTeam_code
0,Man United,Fulham,1516.145896,1570.812465,-22.5,3.0,5.0,2.0,1.0,0.4,...,0.2,0.8,1.6,-54.666569,0.965199,2.4,2.6,1493.645896,13,8
1,Ipswich,Liverpool,1424.131119,1789.398152,-52.5,-4.0,2.0,12.0,0.5,3.0,...,0.6,2.2,1.4,-365.267033,0.795872,1.8,4.6,1371.631119,9,11
2,Arsenal,Wolves,1751.126111,1528.812706,-22.5,5.0,8.0,0.0,1.6,0.0,...,0.6,1.6,1.0,222.313405,1.145416,3.0,3.0,1728.626111,0,19
3,Everton,Brighton,1588.959812,1613.889183,-22.5,0.0,3.0,7.0,0.6,1.4,...,0.6,2.2,1.8,-24.929371,0.984553,3.0,3.4,1566.459812,7,4
4,Newcastle,Southampton,1662.139473,1325.997749,127.5,5.0,11.0,3.0,2.2,0.6,...,0.0,0.4,1.6,336.141725,1.253501,3.0,1.6,1789.639473,14,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,Newcastle,Everton,1662.139473,1588.959812,37.5,-2.0,1.0,3.0,0.2,0.6,...,0.4,1.4,1.2,73.179661,1.046055,2.6,2.6,1699.639473,14,7
376,Nott'm Forest,Chelsea,1610.240746,1650.343774,-22.5,-3.0,4.0,7.0,0.8,1.4,...,0.8,1.4,0.8,-40.103029,0.975700,2.2,2.8,1587.740746,15,5
377,Southampton,Arsenal,1325.997749,1751.126111,-52.5,-4.0,7.0,13.0,1.4,2.6,...,0.4,2.0,1.2,-425.128362,0.757226,1.6,3.4,1273.497749,16,0
378,Tottenham,Brighton,1547.537104,1613.889183,-22.5,-3.0,8.0,15.0,1.6,3.0,...,0.6,2.2,1.8,-66.352079,0.958887,2.4,4.6,1525.037104,17,4
