In [23]:
import pandas as pd
from datetime import datetime
import numpy as np

In [24]:
# Set end_period as a date object.
# end_period = pd.to_datetime("2025-12-14").date()

In [25]:
from datetime import date, timedelta

today = date.today()
today_day_of_week = today.strftime("%A")          # e.g. "Sunday"
end_period = pd.to_datetime((today + timedelta(days=1)).isoformat()).date() # e.g. "2025-12-21"

print("Today:", today_day_of_week)
print("end_date:", end_period)


Today: Friday
end_date: 2026-01-03


In [26]:
# # Path to the .xls file
# file_path = r"C:\Users\leere\OneDrive\Desktop\RAW DATA\ml_goals.xls"
#
# # Load the Excel file into a DataFrame
# df = pd.read_excel(file_path)
#
# # Display the first few rows of the DataFrame
# df.head()


In [27]:
import pandas as pd
from pathlib import Path
import importlib.util


def load_excel_auto_convert(file_path: str, *, convert_to_xlsx: bool = True) -> pd.DataFrame:
    """
    Load an Excel file robustly.

    Behaviour:
      - .xlsx / .xlsm → normal pd.read_excel
      - .xls →
          * try engine="calamine" first (python-calamine)
          * if that fails, try engine="xlrd"
          * optionally save a .xlsx copy IF openpyxl is installed
      - anything else → try pd.read_csv
    """
    path = Path(file_path)

    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    suffix = path.suffix.lower()

    # Modern Excel formats
    if suffix in {".xlsx", ".xlsm"}:
        return pd.read_excel(path)

    # Legacy .xls handling
    if suffix == ".xls":
        df = None
        last_error = None

        # 1) Prefer calamine
        for engine in ("calamine", "xlrd"):
            try:
                df = pd.read_excel(path, engine=engine)
                print(f"Loaded {path.name} using engine='{engine}'.")
                break
            except Exception as e:
                last_error = e
                print(f"Engine '{engine}' failed: {e}")

        if df is None:
            raise RuntimeError(
                f"Failed to read legacy .xls file '{path}'. "
                f"Last error: {last_error}\n"
                f"Install python-calamine (and optionally xlrd), "
                f"or open the file in Excel and re-save as .xlsx."
            ) from last_error

        # 2) Optionally: convert to .xlsx IF openpyxl is available
        if convert_to_xlsx:
            has_openpyxl = importlib.util.find_spec("openpyxl") is not None
            if not has_openpyxl:
                print(
                    "openpyxl is not installed – skipping .xlsx export. "
                    "Data is loaded into a DataFrame and ready to use."
                )
            else:
                try:
                    xlsx_path = path.with_suffix(".xlsx")
                    df.to_excel(xlsx_path, index=False)
                    print(f"Converted {path.name} → {xlsx_path.name}")
                except Exception as e:
                    # Don't crash just because the export failed
                    print(f"Could not write .xlsx copy: {e}")

        return df

    # Fallback: non-Excel → assume CSV
    print(f"{path.name} is not an Excel file, trying CSV reader.")
    return pd.read_csv(path)

file_path = r"C:\Users\leere\OneDrive\Desktop\RAW DATA\ml_goals.xls"
df = load_excel_auto_convert(file_path)

df.head()


Loaded ml_goals.xls using engine='calamine'.
openpyxl is not installed – skipping .xlsx export. Data is loaded into a DataFrame and ready to use.


Unnamed: 0,country,sezonul,datameci,orameci,etapa,txtechipa1,txtechipa2,scor1,scor2,scorp1,...,yellowa2,ballph,ballph1,ballph2,ballpa,ballpa1,ballpa2,stare,codechipa1,codechipa2
0,Mex1,25,2024-07-05,2345,1,Puebla,Santos Laguna,1,0,0,...,2,40,39,41,60,61,59,J,41017,41008
1,Mex1,25,2024-07-06,200,1,Queretaro,Tijuana de Caliente,1,2,0,...,0,37,33,41,63,67,59,J,41011,41006
2,Mex1,25,2024-07-06,410,1,Juarez,Atlas,2,2,1,...,0,55,66,44,45,34,56,J,41020,41002
3,Mex1,25,2024-07-07,0,1,San Luis,Club America,2,1,1,...,1,37,41,33,63,59,67,J,41019,41012
4,Mex1,25,2024-07-07,200,1,G. Chivas,Toluca,0,0,0,...,2,52,46,58,48,54,42,J,41013,41007


In [28]:
df['country'].unique()

array(['Mex1', 'Rom1', 'Bul1', 'Czh1', 'Den1', 'Pol1', 'Slo1', 'Swi2',
       'Swi1', 'Bel1', 'Hun1', 'Slk1', 'Ger2', 'Ger3', 'Aus1', 'Aus2',
       'Cro1', 'Sco2', 'Sco1', 'Eng2', 'Eng3', 'Eng4', 'Ned1', 'Ned2',
       'Por1', 'Tur1', 'Tur2', 'Spa1', 'Spa2', 'Eng1', 'Ita2', 'Fra1',
       'Fra2', 'Ita1', 'Gre1', 'Ger1', 'Isr1', 'Aut1', 'Arg1', 'Ire1',
       'Jap1', 'Jap2', 'Kor1', 'Chl1', 'USA1', 'Chi1', 'Nor1', 'Swe1',
       'Swe2', 'Bra1', 'Ice1'], dtype=object)

In [29]:
column_dict = {
    "country": "country",
    "league": "league",
    "sezonul": "season",
    "datameci": "date",
    "orameci": "ko_time",
    "etapa": "round",
    "txtechipa1": "home_team",
    "txtechipa2": "away_team",
    "scor1": "home_goals_ft",
    "scor2": "away_goals_ft",
    "scorp1": "home_goals_ht",
    "scorp2": "away_goals_ht",
    "place1": "home_team_place_total",
    "place1a": "home_team_place_home",
    "place2": "away_team_place_total",
    "place2d": "away_team_place_away",
    "cotaa": "home_odds",
    "cotae": "draw_odds",
    "cotad": "away_odds",
    # "cotao0": "",
    # "cotao1": "",
    "cotao": "over_25_odds",
    # "cotao3": "",
    # "cotao4": "",
    # "cotau0": "",
    # "cotau1": "",
    "cotau": "under_25_odds",
    # "cotau3": "",
    # "cotau4": "",
    # "gg": "",
    # "ng": "",
    "elohomeo": "elo_home",
    "eloawayo": "elo_away",
    "formah": "form_home",
    "formaa": "form_away",
    "suth": "shots_home",
    "suth1": "shots_home_1h",
    "suth2": "shots_home_2h",
    "suta": "shots_away",
    "suta1": "shots_away_1h",
    "suta2": "shots_away_2h",
    "sutht": "shots_on_target_home",
    "sutht1": "shots_on_target_home_1h",
    "sutht2": "shots_on_target_home_2h",
    "sutat": "shots_on_target_away",
    "sutat1": "shots_on_target_away_1h",
    "sutat2": "shots_on_target_away_2h",
    "corh": "corners_home",
    "corh1": "corners_home_1h",
    "corh2": "corners_home_2h",
    "cora": "corners_away",
    "cora1": "corners_away_1h",
    "cora2": "corners_away_2h",
    "foulsh": "fouls_home",
    "foulsh1": "fouls_home_1h",
    "foulsh2": "fouls_home_2h",
    "foulsa": "fouls_away",
    "foulsa1": "fouls_away_1h",
    "foulsa2": "fouls_away_2h",
    "yellowh": "yellow_cards_home",
    "yellowh1": "yellow_cards_home_1h",
    "yellowh2": "yellow_cards_home_2h",
    "yellowa": "yellow_cards_away",
    "yellowa1": "yellow_cards_away_1h",
    "yellowa2": "yellow_cards_away_2h",
    "ballph": "possession_home",
    "ballph1": "possession_home_1h",
    "ballph2": "possession_home_2h",
    "ballpa": "possession_away",
    "ballpa1": "possession_away_1h",
    "ballpa2": "possession_away_2h",
    "gsh": "goals_scored_total_home",
    "gch": "goals_conceded_total_home",
    "gsa": "goals_scored_total_away",
    "gca": "goals_conceded_total_away",
    # "stare": "",
    # "codechipa1": "",
    # "codechipa2": ""
}

df = df.rename(columns=column_dict).filter(items=column_dict.values())
data = df.copy()
data

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,yellow_cards_home_2h,yellow_cards_away,yellow_cards_away_1h,yellow_cards_away_2h,possession_home,possession_home_1h,possession_home_2h,possession_away,possession_away_1h,possession_away_2h
0,Mex1,25,2024-07-05,2345,1,Puebla,Santos Laguna,1,0,0,...,1,3,1,2,40,39,41,60,61,59
1,Mex1,25,2024-07-06,200,1,Queretaro,Tijuana de Caliente,1,2,0,...,2,0,0,0,37,33,41,63,67,59
2,Mex1,25,2024-07-06,410,1,Juarez,Atlas,2,2,1,...,1,2,2,0,55,66,44,45,34,56
3,Mex1,25,2024-07-07,0,1,San Luis,Club America,2,1,1,...,3,2,1,1,37,41,33,63,59,67
4,Mex1,25,2024-07-07,200,1,G. Chivas,Toluca,0,0,0,...,0,2,0,2,52,46,58,48,54,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25068,Spa2,26,2026-05-31,1700,42,Granada,Gijon,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25069,Spa2,26,2026-05-31,1700,42,Leganes,Mirandes,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25070,Spa2,26,2026-05-31,1700,42,Santander,Cadiz,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25071,Spa2,26,2026-05-31,1700,42,R. Sociedad B,Leonesa,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Convert 'date' column to datetime object
data['date'] = pd.to_datetime(data['date'], format="%d/%m/%Y", errors='coerce')

# Order by date
data = data.sort_values(by='date')

# Filter out future dates (ensure data does not go beyond today)
today = datetime.today().date()
data = data[data['date'].dt.date <= end_period]

# Create a mask for matches that have been played (i.e. date is less than today)
played_mask = data['date'].dt.date < today

# Calculate home points for played matches only.
data.loc[played_mask, 'points_home'] = np.where(
    data.loc[played_mask, 'home_goals_ft'] > data.loc[played_mask, 'away_goals_ft'], 3,
    np.where(data.loc[played_mask, 'home_goals_ft'] == data.loc[played_mask, 'away_goals_ft'], 1, 0)
)

# Calculate away points for played matches only.
data.loc[played_mask, 'points_away'] = np.where(
    data.loc[played_mask, 'away_goals_ft'] > data.loc[played_mask, 'home_goals_ft'], 3,
    np.where(data.loc[played_mask, 'away_goals_ft'] == data.loc[played_mask, 'home_goals_ft'], 1, 0)
)

In [31]:
data

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,yellow_cards_away_1h,yellow_cards_away_2h,possession_home,possession_home_1h,possession_home_2h,possession_away,possession_away_1h,possession_away_2h,points_home,points_away
0,Mex1,25,2024-07-05,2345,1,Puebla,Santos Laguna,1,0,0,...,1,2,40,39,41,60,61,59,3.0,0.0
1,Mex1,25,2024-07-06,200,1,Queretaro,Tijuana de Caliente,1,2,0,...,0,0,37,33,41,63,67,59,0.0,3.0
2,Mex1,25,2024-07-06,410,1,Juarez,Atlas,2,2,1,...,2,0,55,66,44,45,34,56,1.0,1.0
3,Mex1,25,2024-07-07,0,1,San Luis,Club America,2,1,1,...,1,1,37,41,33,63,59,67,3.0,0.0
4,Mex1,25,2024-07-07,200,1,G. Chivas,Toluca,0,0,0,...,0,2,52,46,58,48,54,42,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20251,Fra1,26,2026-01-03,1600,17,AS Monaco,Lyon,0,0,0,...,0,0,0,0,0,0,0,0,,
20248,Spa2,26,2026-01-03,1300,20,Leonesa,R. Sociedad B,0,0,0,...,0,0,0,0,0,0,0,0,,
20247,Spa2,26,2026-01-03,2000,20,Cordoba,Burgos CF,0,0,0,...,0,0,0,0,0,0,0,0,,
20246,Spa2,26,2026-01-03,1515,20,Castellon,Huesca,0,0,0,...,0,0,0,0,0,0,0,0,,


In [32]:
data['country'].unique()

array(['Mex1', 'Rom1', 'Swi2', 'Slo1', 'Pol1', 'Den1', 'Czh1', 'Bul1',
       'Swi1', 'Hun1', 'Bel1', 'Slk1', 'Sco2', 'Aus2', 'Aus1', 'Ger3',
       'Ger2', 'Cro1', 'Sco1', 'Ned2', 'Por1', 'Tur1', 'Tur2', 'Ned1',
       'Eng2', 'Eng3', 'Eng4', 'Spa1', 'Spa2', 'Eng1', 'Ita2', 'Fra1',
       'Fra2', 'Gre1', 'Ita1', 'Ger1', 'Isr1', 'Aut1', 'Arg1', 'Jap1',
       'Ire1', 'Jap2', 'Kor1', 'Chl1', 'USA1', 'Chi1', 'Swe2', 'Swe1',
       'Nor1', 'Bra1', 'Ice1'], dtype=object)

In [33]:
# =============================================================================
# 1. Data Preparation: Build Team-Level Data (Home & Away)
# =============================================================================
# Prepare home records.
home_df = data[['country', 'season', 'date', 'home_team', 'away_team',
                'home_goals_ft', 'away_goals_ft', 'home_goals_ht', 'away_goals_ht',
                'shots_home', 'shots_home_1h', 'shots_home_2h',
                'shots_on_target_home', 'shots_on_target_home_1h', 'shots_on_target_home_2h',
                'corners_home', 'corners_home_1h', 'corners_home_2h']].copy()
home_df.rename(columns={
    'home_team': 'Team',
    'away_team': 'Opponent',
    'home_goals_ft': 'GoalsScored',
    'away_goals_ft': 'GoalsConceded',
    'home_goals_ht': 'FirstHalfGoalsScored',
    'away_goals_ht': 'FirstHalfGoalsConceded',
    'shots_home': 'Shots',
    'shots_home_1h': 'Shots_1h',
    'shots_home_2h': 'Shots_2h',
    'shots_on_target_home': 'ShotsOnTarget',
    'shots_on_target_home_1h': 'ShotsOnTarget_1h',
    'shots_on_target_home_2h': 'ShotsOnTarget_2h',
    'corners_home': 'Corners',
    'corners_home_1h': 'Corners_1h',
    'corners_home_2h': 'Corners_2h'
}, inplace=True)
home_df['is_home'] = 1

# Prepare away records.
away_df = data[['country', 'season', 'date', 'away_team', 'home_team',
                'away_goals_ft', 'home_goals_ft', 'away_goals_ht', 'home_goals_ht',
                'shots_away', 'shots_away_1h', 'shots_away_2h',
                'shots_on_target_away', 'shots_on_target_away_1h', 'shots_on_target_away_2h',
                'corners_away', 'corners_away_1h', 'corners_away_2h']].copy()
away_df.rename(columns={
    'away_team': 'Team',
    'home_team': 'Opponent',
    'away_goals_ft': 'GoalsScored',
    'home_goals_ft': 'GoalsConceded',
    'away_goals_ht': 'FirstHalfGoalsScored',
    'home_goals_ht': 'FirstHalfGoalsConceded',
    'shots_away': 'Shots',
    'shots_away_1h': 'Shots_1h',
    'shots_away_2h': 'Shots_2h',
    'shots_on_target_away': 'ShotsOnTarget',
    'shots_on_target_away_1h': 'ShotsOnTarget_1h',
    'shots_on_target_away_2h': 'ShotsOnTarget_2h',
    'corners_away': 'Corners',
    'corners_away_1h': 'Corners_1h',
    'corners_away_2h': 'Corners_2h'
}, inplace=True)
away_df['is_home'] = 0

# Combine both.
team_df = pd.concat([home_df, away_df], ignore_index=True)
team_df.sort_values(by=['country', 'season', 'Team', 'date'], inplace=True)

# Define rolling window sizes.
window_long = 5   # for long-term trends
window_short = 3  # for short-term momentum

# =============================================================================
# 2. Rolling Feature Computation Functions
# =============================================================================
def compute_slope(x):
    """Compute slope using simple linear regression."""
    if len(x) < 2:
        return np.nan
    xs = np.arange(len(x))
    return np.polyfit(xs, x, 1)[0]

def compute_rolling_features_metric(df_sub, full_col, first_half_col, prefix):
    """
    Compute rolling features for a given metric.
    Returns a DataFrame of new columns.
    """
    new_cols = {}
    # Full-match features.
    new_cols[f'{prefix}_Rolling_{full_col}_Mean'] = df_sub[full_col].rolling(window=window_long, min_periods=1).mean().shift(1)
    new_cols[f'{prefix}_Rolling_{full_col}_Std']  = df_sub[full_col].rolling(window=window_long, min_periods=1).std().shift(1)
    new_cols[f'{prefix}_Rolling_{full_col}_Mean_Short'] = df_sub[full_col].rolling(window=window_short, min_periods=1).mean().shift(1)
    new_cols[f'{prefix}_Momentum_{full_col}'] = new_cols[f'{prefix}_Rolling_{full_col}_Mean_Short'] - new_cols[f'{prefix}_Rolling_{full_col}_Mean']
    new_cols[f'{prefix}_Trend_Slope_{full_col}'] = df_sub[full_col].rolling(window=window_long, min_periods=2).apply(compute_slope, raw=True).shift(1)
    # First-half features.
    new_cols[f'{prefix}_Rolling_{first_half_col}_Mean'] = df_sub[first_half_col].rolling(window=window_long, min_periods=1).mean().shift(1)
    new_cols[f'{prefix}_Rolling_{first_half_col}_Std']  = df_sub[first_half_col].rolling(window=window_long, min_periods=1).std().shift(1)
    new_cols[f'{prefix}_Rolling_{first_half_col}_Mean_Short'] = df_sub[first_half_col].rolling(window=window_short, min_periods=1).mean().shift(1)
    new_cols[f'{prefix}_Momentum_{first_half_col}'] = new_cols[f'{prefix}_Rolling_{first_half_col}_Mean_Short'] - new_cols[f'{prefix}_Rolling_{first_half_col}_Mean']
    new_cols[f'{prefix}_Trend_Slope_{first_half_col}'] = df_sub[first_half_col].rolling(window=window_long, min_periods=2).apply(compute_slope, raw=True).shift(1)
    return pd.DataFrame(new_cols, index=df_sub.index)

def add_rolling_features_split(group):
    """Compute overall, home-, and away-specific rolling features plus outcome percentages."""
    group = group.sort_values(by='date').reset_index(drop=True)

    # Overall features.
    overall_features = pd.concat([
        compute_rolling_features_metric(group, 'GoalsScored', 'FirstHalfGoalsScored', 'Overall'),
        compute_rolling_features_metric(group, 'Shots', 'Shots_1h', 'Overall'),
        compute_rolling_features_metric(group, 'Corners', 'Corners_1h', 'Overall'),
        compute_rolling_features_metric(group, 'ShotsOnTarget', 'ShotsOnTarget_1h', 'Overall')
    ], axis=1)
    group = pd.concat([group, overall_features], axis=1)

    home_mask = group['is_home'] == 1
    away_mask = group['is_home'] == 0

    # Home-specific.
    if home_mask.any():
        home_feats = pd.concat([
            compute_rolling_features_metric(group.loc[home_mask], 'GoalsScored', 'FirstHalfGoalsScored', 'Home'),
            compute_rolling_features_metric(group.loc[home_mask], 'Shots', 'Shots_1h', 'Home'),
            compute_rolling_features_metric(group.loc[home_mask], 'Corners', 'Corners_1h', 'Home'),
            compute_rolling_features_metric(group.loc[home_mask], 'ShotsOnTarget', 'ShotsOnTarget_1h', 'Home')
        ], axis=1)
        group.loc[home_mask, home_feats.columns] = home_feats

    # Away-specific.
    if away_mask.any():
        away_feats = pd.concat([
            compute_rolling_features_metric(group.loc[away_mask], 'GoalsScored', 'FirstHalfGoalsScored', 'Away'),
            compute_rolling_features_metric(group.loc[away_mask], 'Shots', 'Shots_1h', 'Away'),
            compute_rolling_features_metric(group.loc[away_mask], 'Corners', 'Corners_1h', 'Away'),
            compute_rolling_features_metric(group.loc[away_mask], 'ShotsOnTarget', 'ShotsOnTarget_1h', 'Away')
        ], axis=1)
        group.loc[away_mask, away_feats.columns] = away_feats

    # Additional outcome percentages for goals.
    thresh_dict = {}
    for thresh in [1.5, 2.5, 3.5]:
        thresh_dict[f'Overall_Percent_Over_{thresh}'] = group['GoalsScored'].gt(thresh).shift(1).expanding(min_periods=1).mean()
        thresh_dict[f'Overall_Rolling5_Percent_Over_{thresh}'] = group['GoalsScored'].gt(thresh).shift(1).rolling(window=5, min_periods=1).mean()
        if home_mask.any():
            thresh_dict[f'Home_Percent_Over_{thresh}'] = group.loc[home_mask, 'GoalsScored'].gt(thresh).shift(1).expanding(min_periods=1).mean()
            thresh_dict[f'Home_Rolling5_Percent_Over_{thresh}'] = group.loc[home_mask, 'GoalsScored'].gt(thresh).shift(1).rolling(window=5, min_periods=1).mean()
        if away_mask.any():
            thresh_dict[f'Away_Percent_Over_{thresh}'] = group.loc[away_mask, 'GoalsScored'].gt(thresh).shift(1).expanding(min_periods=1).mean()
            thresh_dict[f'Away_Rolling5_Percent_Over_{thresh}'] = group.loc[away_mask, 'GoalsScored'].gt(thresh).shift(1).rolling(window=5, min_periods=1).mean()
    group = pd.concat([group, pd.DataFrame(thresh_dict, index=group.index)], axis=1)

    # Outcome percentages for goals.
    outcome_dict = {}
    for thresh in [0.5, 1.5, 2.5, 3.5]:
        outcome_dict[f'TeamPct_Over_{thresh}'] = group['GoalsScored'].gt(thresh).shift(1).expanding(min_periods=1).mean()
        if home_mask.any():
            outcome_dict[f'Home_TeamPct_Over_{thresh}'] = group.loc[home_mask, 'GoalsScored'].gt(thresh).shift(1).expanding(min_periods=1).mean()
        if away_mask.any():
            outcome_dict[f'Away_TeamPct_Over_{thresh}'] = group.loc[away_mask, 'GoalsScored'].gt(thresh).shift(1).expanding(min_periods=1).mean()
    group = pd.concat([group, pd.DataFrame(outcome_dict, index=group.index)], axis=1)

    # Outcome percentages for corners.
    corners_thresh = [3.5, 4.5, 5.5, 6.5]
    corners_dict = {}
    for thresh in corners_thresh:
        corners_dict[f'CornersPct_Over_{thresh}'] = group['Corners'].gt(thresh).shift(1).expanding(min_periods=1).mean()
        corners_dict[f'CornersRolling5Pct_Over_{thresh}'] = group['Corners'].gt(thresh).shift(1).rolling(window=5, min_periods=1).mean()
        if home_mask.any():
            corners_dict[f'Home_CornersPct_Over_{thresh}'] = group.loc[home_mask, 'Corners'].gt(thresh).shift(1).expanding(min_periods=1).mean()
            corners_dict[f'Home_CornersRolling5Pct_Over_{thresh}'] = group.loc[home_mask, 'Corners'].gt(thresh).shift(1).rolling(window=5, min_periods=1).mean()
        if away_mask.any():
            corners_dict[f'Away_CornersPct_Over_{thresh}'] = group.loc[away_mask, 'Corners'].gt(thresh).shift(1).expanding(min_periods=1).mean()
            corners_dict[f'Away_CornersRolling5Pct_Over_{thresh}'] = group.loc[away_mask, 'Corners'].gt(thresh).shift(1).rolling(window=5, min_periods=1).mean()
    group = pd.concat([group, pd.DataFrame(corners_dict, index=group.index)], axis=1)

    return group

# Apply group-wise computations.
team_df = team_df.groupby(['country', 'season', 'Team'], group_keys=False).apply(add_rolling_features_split).reset_index(drop=True)
team_df = team_df.copy()  # ensure defragmentation

# =============================================================================
# 3. Compute Team-Level Corners Outcome Features (from Match Data)
# =============================================================================
# Build a match-level DataFrame for corners outcomes.
match_df = data.copy()
match_df['Total_Corners'] = match_df['corners_home'] + match_df['corners_away']
match_df.sort_values(by=['country', 'season', 'date'], inplace=True)

# Create a team perspective by combining home and away records.
home_matches = match_df[['country', 'season', 'date', 'home_team', 'Total_Corners']].copy()
home_matches.rename(columns={'home_team': 'Team'}, inplace=True)
away_matches = match_df[['country', 'season', 'date', 'away_team', 'Total_Corners']].copy()
away_matches.rename(columns={'away_team': 'Team'}, inplace=True)
team_corners_matches = pd.concat([home_matches, away_matches], ignore_index=True)
team_corners_matches.sort_values(by=['country', 'season', 'Team', 'date'], inplace=True)

# For thresholds 9.5, 10.5, and 11.5, compute season-level and rolling percentages.
for thr in [9.5, 10.5, 11.5]:
    indicator = f'Over_{thr}'
    team_corners_matches[indicator] = (team_corners_matches['Total_Corners'] > thr).astype(int)
    team_corners_matches[f'SeasonPct_{indicator}'] = team_corners_matches.groupby(
        ['country', 'season', 'Team']
    )[indicator].transform(lambda x: x.shift(1).expanding(min_periods=1).mean())
    team_corners_matches[f'Rolling5Pct_{indicator}'] = team_corners_matches.groupby(
        ['country', 'season', 'Team']
    )[indicator].transform(lambda x: x.shift(1).rolling(window=5, min_periods=1).mean())

# Select only the keys and outcome columns for merging.
cols_to_merge = ['country', 'season', 'date', 'Team',
                 'SeasonPct_Over_9.5', 'Rolling5Pct_Over_9.5',
                 'SeasonPct_Over_10.5', 'Rolling5Pct_Over_10.5',
                 'SeasonPct_Over_11.5', 'Rolling5Pct_Over_11.5']

# Merge the corners outcome features into team_df.
team_df = team_df.merge(team_corners_matches[cols_to_merge],
                        on=['country', 'season', 'date', 'Team'],
                        how='left')

# =============================================================================
# 4. Process Home and Away Features for Match-Level Merging
# =============================================================================
# -- Home-Team Process --
home_subset = team_df[team_df['is_home'] == 1].copy()
home_subset.drop(columns=['Opponent'], inplace=True)
home_subset.rename(columns={'Team': 'home_team'}, inplace=True)
home_key = ['country', 'season', 'date', 'home_team', 'is_home']
# Include features starting with Overall_, Home_, SeasonPct_Over_, or Rolling5Pct_Over_
home_feats = [col for col in home_subset.columns if col not in home_key and
              (col.startswith("Overall_") or col.startswith("Home_") or
               col.startswith("SeasonPct_Over_") or col.startswith("Rolling5Pct_Over_"))]
home_features = home_subset[home_key + home_feats].copy()
def clean_home_name(col):
    return "home_" + (col[len("Home_"):] if col.startswith("Home_") else col)
home_features.rename(columns={col: clean_home_name(col) for col in home_feats}, inplace=True)

# -- Away-Team Process --
away_subset = team_df[team_df['is_home'] == 0].copy()
away_subset.drop(columns=['Opponent'], inplace=True)
away_subset.rename(columns={'Team': 'away_team'}, inplace=True)
away_key = ['country', 'season', 'date', 'away_team', 'is_home']
away_feats = [col for col in away_subset.columns if col not in away_key and
              (col.startswith("Overall_") or col.startswith("Away_") or
               col.startswith("SeasonPct_Over_") or col.startswith("Rolling5Pct_Over_"))]
away_features = away_subset[away_key + away_feats].copy()
def clean_away_name(col):
    return "away_" + (col[len("Away_"):] if col.startswith("Away_") else col)
away_features.rename(columns={col: clean_away_name(col) for col in away_feats}, inplace=True)

# =============================================================================
# 5. Merge Home and Away Features into the Match-Level DataFrame
# =============================================================================
# Start with the original match data.
match_merge_df = data.copy()
# Merge home features.
match_merge_df = match_merge_df.merge(home_features, on=['country', 'season', 'date', 'home_team'], how='left')
# Merge away features.
match_merge_df = match_merge_df.merge(away_features, on=['country', 'season', 'date', 'away_team'], how='left')

# (Optional) Display a sample.
#print(match_merge_df.head())


  team_df = team_df.groupby(['country', 'season', 'Team'], group_keys=False).apply(add_rolling_features_split).reset_index(drop=True)


In [34]:
match_merge_df_filter = match_merge_df[match_merge_df['home_team'] == "Auckland FC"]
match_merge_df_filter

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_CornersPct_Over_5.5,away_CornersRolling5Pct_Over_5.5,away_CornersPct_Over_6.5,away_CornersRolling5Pct_Over_6.5,away_SeasonPct_Over_9.5,away_Rolling5Pct_Over_9.5,away_SeasonPct_Over_10.5,away_Rolling5Pct_Over_10.5,away_SeasonPct_Over_11.5,away_Rolling5Pct_Over_11.5
2728,Aut1,25,2024-10-18,23,1,Auckland FC,Brisbane,2,0,1,...,,,,,,,,,,
3307,Aut1,25,2024-10-27,300,2,Auckland FC,Sydney,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4495,Aut1,25,2024-11-30,400,6,Auckland FC,Newcastle Jets,2,0,0,...,0.0,0.0,0.0,0.0,0.75,0.75,0.75,0.75,0.75,0.75
4832,Aut1,25,2024-12-07,400,7,Auckland FC,Wellington,2,1,1,...,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.2,0.2,0.2
5396,Aut1,25,2024-12-21,400,9,Auckland FC,Western United,0,4,0,...,0.5,0.5,0.25,0.25,0.5,0.8,0.5,0.8,0.5,0.8
5721,Aut1,25,2025-01-01,400,11,Auckland FC,Melbourne Victory,0,0,0,...,0.4,0.4,0.4,0.4,0.444444,0.4,0.444444,0.4,0.333333,0.4
6126,Aut1,25,2025-01-18,400,15,Auckland FC,Melbourne City,3,0,3,...,0.5,0.4,0.333333,0.4,0.333333,0.2,0.25,0.2,0.166667,0.2
6675,Aut1,25,2025-02-01,400,17,Auckland FC,Macarthur FC,2,1,1,...,0.428571,0.6,0.285714,0.4,0.733333,1.0,0.666667,1.0,0.466667,0.6
7704,Aut1,25,2025-02-22,400,20,Auckland FC,Wellington,6,1,3,...,0.0,0.0,0.0,0.0,0.4375,0.8,0.4375,0.8,0.25,0.2
8097,Aut1,25,2025-03-01,400,21,Auckland FC,Adelaide,4,4,2,...,0.375,0.4,0.25,0.4,0.647059,0.8,0.529412,0.8,0.411765,0.8


In [35]:
match_merge_df

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_CornersPct_Over_5.5,away_CornersRolling5Pct_Over_5.5,away_CornersPct_Over_6.5,away_CornersRolling5Pct_Over_6.5,away_SeasonPct_Over_9.5,away_Rolling5Pct_Over_9.5,away_SeasonPct_Over_10.5,away_Rolling5Pct_Over_10.5,away_SeasonPct_Over_11.5,away_Rolling5Pct_Over_11.5
0,Mex1,25,2024-07-05,2345,1,Puebla,Santos Laguna,1,0,0,...,,,,,,,,,,
1,Mex1,25,2024-07-06,200,1,Queretaro,Tijuana de Caliente,1,2,0,...,,,,,,,,,,
2,Mex1,25,2024-07-06,410,1,Juarez,Atlas,2,2,1,...,,,,,,,,,,
3,Mex1,25,2024-07-07,0,1,San Luis,Club America,2,1,1,...,,,,,,,,,,
4,Mex1,25,2024-07-07,200,1,G. Chivas,Toluca,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20276,Fra1,26,2026-01-03,1600,17,AS Monaco,Lyon,0,0,0,...,0.500000,0.6,0.500000,0.6,0.500000,0.4,0.437500,0.2,0.375000,0.2
20277,Spa2,26,2026-01-03,1300,20,Leonesa,R. Sociedad B,0,0,0,...,0.111111,0.2,0.111111,0.2,0.368421,0.4,0.263158,0.2,0.210526,0.2
20278,Spa2,26,2026-01-03,2000,20,Cordoba,Burgos CF,0,0,0,...,0.222222,0.2,0.111111,0.2,0.368421,0.6,0.315789,0.4,0.210526,0.4
20279,Spa2,26,2026-01-03,1515,20,Castellon,Huesca,0,0,0,...,0.333333,0.6,0.222222,0.4,0.473684,1.0,0.473684,1.0,0.315789,0.6


In [36]:
## -----------------------------
# 1. Process Home-Team Features (with clean naming)
# -----------------------------
home_subset = team_df[team_df['is_home'] == 1].copy()
home_subset = home_subset.drop(columns=['Opponent'])
home_subset.rename(columns={'Team': 'home_team'}, inplace=True)

# Key columns that remain unchanged
home_key_cols = ['country', 'season', 'date', 'home_team', 'is_home']

# Update the feature column selection to include the merged outcome columns.
home_feature_cols = [col for col in home_subset.columns
                     if col not in home_key_cols and
                     (col.startswith("Overall_") or
                      col.startswith("Home_") or
                      col.startswith("SeasonPct_Over_") or
                      col.startswith("Rolling5Pct_Over_"))]

# Create a DataFrame with key columns and desired features
home_features = home_subset[home_key_cols + home_feature_cols].copy()

# Function to clean column names by removing any existing "Home_" prefix
def clean_home_name(col):
    if col.startswith("Home_"):
        col = col[len("Home_"):]
    return "home_" + col

# Build a renaming dictionary for home features
rename_mapping_home = {col: clean_home_name(col) for col in home_feature_cols}
home_features.rename(columns=rename_mapping_home, inplace=True)


# -----------------------------
# 2. Process Away-Team Features (with clean naming)
# -----------------------------
away_subset = team_df[team_df['is_home'] == 0].copy()
away_subset = away_subset.drop(columns=['Opponent'])
away_subset.rename(columns={'Team': 'away_team'}, inplace=True)

# Key columns that remain unchanged
away_key_cols = ['country', 'season', 'date', 'away_team', 'is_home']

# Update the feature column selection to include the merged outcome columns.
away_feature_cols = [col for col in away_subset.columns
                     if col not in away_key_cols and
                     (col.startswith("Overall_") or
                      col.startswith("Away_") or
                      col.startswith("SeasonPct_Over_") or
                      col.startswith("Rolling5Pct_Over_"))]

# Create a DataFrame with key columns and desired features
away_features = away_subset[away_key_cols + away_feature_cols].copy()

# Function to clean column names by removing any existing "Away_" prefix
def clean_away_name(col):
    if col.startswith("Away_"):
        col = col[len("Away_"):]
    return "away_" + col

# Build a renaming dictionary for away features
rename_mapping_away = {col: clean_away_name(col) for col in away_feature_cols}
away_features.rename(columns=rename_mapping_away, inplace=True)


# -----------------------------
# 3. Merge Processed Home- and Away-Team Features Back into the Match-Level DataFrame
# -----------------------------
# Start with your original match-level data
match_df = data.copy()

# Merge home features on the common keys: country, season, date, and home_team.
match_df = match_df.merge(home_features, on=['country', 'season', 'date', 'home_team'], how='left')

#Merge away features on the common keys: country, season, date, and away_team.
match_df = match_df.merge(away_features, on=['country', 'season', 'date', 'away_team'], how='left')

# # match_df now contains cleanly named columns such as "home_Rolling_GoalsScored_Mean" along with your corners outcome features.
# print(match_df.head())


In [37]:
team_filter2 = match_df[(match_df["home_team"]=="Auckland FC") | (match_df["away_team"]=="Auckland FC")]
team_filter2

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_CornersPct_Over_5.5,away_CornersRolling5Pct_Over_5.5,away_CornersPct_Over_6.5,away_CornersRolling5Pct_Over_6.5,away_SeasonPct_Over_9.5,away_Rolling5Pct_Over_9.5,away_SeasonPct_Over_10.5,away_Rolling5Pct_Over_10.5,away_SeasonPct_Over_11.5,away_Rolling5Pct_Over_11.5
2728,Aut1,25,2024-10-18,23,1,Auckland FC,Brisbane,2,0,1,...,,,,,,,,,,
3307,Aut1,25,2024-10-27,300,2,Auckland FC,Sydney,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3527,Aut1,25,2024-11-02,400,3,Wellington,Auckland FC,0,2,0,...,,,,,1.0,1.0,0.5,0.5,0.5,0.5
4326,Aut1,25,2024-11-24,500,5,Macarthur FC,Auckland FC,0,1,0,...,0.0,0.0,0.0,0.0,0.666667,0.666667,0.333333,0.333333,0.333333,0.333333
4495,Aut1,25,2024-11-30,400,6,Auckland FC,Newcastle Jets,2,0,0,...,0.0,0.0,0.0,0.0,0.75,0.75,0.75,0.75,0.75,0.75
4832,Aut1,25,2024-12-07,400,7,Auckland FC,Wellington,2,1,1,...,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.2,0.2,0.2
5252,Aut1,25,2024-12-15,600,8,Melbourne City,Auckland FC,2,2,1,...,0.0,0.0,0.0,0.0,0.5,0.4,0.333333,0.4,0.166667,0.2
5396,Aut1,25,2024-12-21,400,9,Auckland FC,Western United,0,4,0,...,0.5,0.5,0.25,0.25,0.5,0.8,0.5,0.8,0.5,0.8
5622,Aut1,25,2024-12-28,600,10,Central Coast,Auckland FC,1,4,1,...,0.0,0.0,0.0,0.0,0.5,0.4,0.375,0.4,0.25,0.2
5721,Aut1,25,2025-01-01,400,11,Auckland FC,Melbourne Victory,0,0,0,...,0.4,0.4,0.4,0.4,0.444444,0.4,0.444444,0.4,0.333333,0.4


In [38]:
match_df

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_CornersPct_Over_5.5,away_CornersRolling5Pct_Over_5.5,away_CornersPct_Over_6.5,away_CornersRolling5Pct_Over_6.5,away_SeasonPct_Over_9.5,away_Rolling5Pct_Over_9.5,away_SeasonPct_Over_10.5,away_Rolling5Pct_Over_10.5,away_SeasonPct_Over_11.5,away_Rolling5Pct_Over_11.5
0,Mex1,25,2024-07-05,2345,1,Puebla,Santos Laguna,1,0,0,...,,,,,,,,,,
1,Mex1,25,2024-07-06,200,1,Queretaro,Tijuana de Caliente,1,2,0,...,,,,,,,,,,
2,Mex1,25,2024-07-06,410,1,Juarez,Atlas,2,2,1,...,,,,,,,,,,
3,Mex1,25,2024-07-07,0,1,San Luis,Club America,2,1,1,...,,,,,,,,,,
4,Mex1,25,2024-07-07,200,1,G. Chivas,Toluca,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20276,Fra1,26,2026-01-03,1600,17,AS Monaco,Lyon,0,0,0,...,0.500000,0.6,0.500000,0.6,0.500000,0.4,0.437500,0.2,0.375000,0.2
20277,Spa2,26,2026-01-03,1300,20,Leonesa,R. Sociedad B,0,0,0,...,0.111111,0.2,0.111111,0.2,0.368421,0.4,0.263158,0.2,0.210526,0.2
20278,Spa2,26,2026-01-03,2000,20,Cordoba,Burgos CF,0,0,0,...,0.222222,0.2,0.111111,0.2,0.368421,0.6,0.315789,0.4,0.210526,0.4
20279,Spa2,26,2026-01-03,1515,20,Castellon,Huesca,0,0,0,...,0.333333,0.6,0.222222,0.4,0.473684,1.0,0.473684,1.0,0.315789,0.6


In [39]:
features = ['round', 'home_team_place_total', 'home_team_place_home', 'away_team_place_total', 'away_team_place_away', 'home_odds', 'draw_odds', 'away_odds', 'over_25_odds', 'under_25_odds', 'elo_home', 'elo_away', 'form_home', 'form_away', 'home_Overall_Rolling_GoalsScored_Mean', 'home_Overall_Rolling_GoalsScored_Std', 'home_Overall_Rolling_GoalsScored_Mean_Short', 'home_Overall_Momentum_GoalsScored', 'home_Overall_Trend_Slope_GoalsScored', 'home_Overall_Rolling_FirstHalfGoalsScored_Mean', 'home_Overall_Rolling_FirstHalfGoalsScored_Std', 'home_Overall_Rolling_FirstHalfGoalsScored_Mean_Short', 'home_Overall_Momentum_FirstHalfGoalsScored', 'home_Overall_Trend_Slope_FirstHalfGoalsScored', 'home_Overall_Rolling_Shots_Mean', 'home_Overall_Rolling_Shots_Std', 'home_Overall_Rolling_Shots_Mean_Short', 'home_Overall_Momentum_Shots', 'home_Overall_Trend_Slope_Shots', 'home_Overall_Rolling_Shots_1h_Mean', 'home_Overall_Rolling_Shots_1h_Std', 'home_Overall_Rolling_Shots_1h_Mean_Short', 'home_Overall_Momentum_Shots_1h', 'home_Overall_Trend_Slope_Shots_1h', 'home_Overall_Rolling_Corners_Mean', 'home_Overall_Rolling_Corners_Std', 'home_Overall_Rolling_Corners_Mean_Short', 'home_Overall_Momentum_Corners', 'home_Overall_Trend_Slope_Corners', 'home_Overall_Rolling_Corners_1h_Mean', 'home_Overall_Rolling_Corners_1h_Std', 'home_Overall_Rolling_Corners_1h_Mean_Short', 'home_Overall_Momentum_Corners_1h', 'home_Overall_Trend_Slope_Corners_1h', 'home_Overall_Rolling_ShotsOnTarget_Mean', 'home_Overall_Rolling_ShotsOnTarget_Std', 'home_Overall_Rolling_ShotsOnTarget_Mean_Short', 'home_Overall_Momentum_ShotsOnTarget', 'home_Overall_Trend_Slope_ShotsOnTarget', 'home_Overall_Rolling_ShotsOnTarget_1h_Mean', 'home_Overall_Rolling_ShotsOnTarget_1h_Std', 'home_Overall_Rolling_ShotsOnTarget_1h_Mean_Short', 'home_Overall_Momentum_ShotsOnTarget_1h', 'home_Overall_Trend_Slope_ShotsOnTarget_1h', 'home_Rolling_GoalsScored_Mean', 'home_Rolling_GoalsScored_Std', 'home_Rolling_GoalsScored_Mean_Short', 'home_Momentum_GoalsScored', 'home_Trend_Slope_GoalsScored', 'home_Rolling_FirstHalfGoalsScored_Mean', 'home_Rolling_FirstHalfGoalsScored_Std', 'home_Rolling_FirstHalfGoalsScored_Mean_Short', 'home_Momentum_FirstHalfGoalsScored', 'home_Trend_Slope_FirstHalfGoalsScored', 'home_Rolling_Shots_Mean', 'home_Rolling_Shots_Std', 'home_Rolling_Shots_Mean_Short', 'home_Momentum_Shots', 'home_Trend_Slope_Shots', 'home_Rolling_Shots_1h_Mean', 'home_Rolling_Shots_1h_Std', 'home_Rolling_Shots_1h_Mean_Short', 'home_Momentum_Shots_1h', 'home_Trend_Slope_Shots_1h', 'home_Rolling_Corners_Mean', 'home_Rolling_Corners_Std', 'home_Rolling_Corners_Mean_Short', 'home_Momentum_Corners', 'home_Trend_Slope_Corners', 'home_Rolling_Corners_1h_Mean', 'home_Rolling_Corners_1h_Std', 'home_Rolling_Corners_1h_Mean_Short', 'home_Momentum_Corners_1h', 'home_Trend_Slope_Corners_1h', 'home_Rolling_ShotsOnTarget_Mean', 'home_Rolling_ShotsOnTarget_Std', 'home_Rolling_ShotsOnTarget_Mean_Short', 'home_Momentum_ShotsOnTarget', 'home_Trend_Slope_ShotsOnTarget', 'home_Rolling_ShotsOnTarget_1h_Mean', 'home_Rolling_ShotsOnTarget_1h_Std', 'home_Rolling_ShotsOnTarget_1h_Mean_Short', 'home_Momentum_ShotsOnTarget_1h', 'home_Trend_Slope_ShotsOnTarget_1h', 'home_Overall_Percent_Over_1.5', 'home_Overall_Rolling5_Percent_Over_1.5', 'home_Percent_Over_1.5', 'home_Rolling5_Percent_Over_1.5', 'home_Overall_Percent_Over_2.5', 'home_Overall_Rolling5_Percent_Over_2.5', 'home_Percent_Over_2.5', 'home_Rolling5_Percent_Over_2.5', 'home_Overall_Percent_Over_3.5', 'home_Overall_Rolling5_Percent_Over_3.5', 'home_Percent_Over_3.5', 'home_Rolling5_Percent_Over_3.5', 'home_TeamPct_Over_0.5', 'home_TeamPct_Over_1.5', 'home_TeamPct_Over_2.5', 'home_TeamPct_Over_3.5', 'home_CornersPct_Over_3.5', 'home_CornersRolling5Pct_Over_3.5', 'home_CornersPct_Over_4.5', 'home_CornersRolling5Pct_Over_4.5', 'home_CornersPct_Over_5.5', 'home_CornersRolling5Pct_Over_5.5', 'home_CornersPct_Over_6.5', 'home_CornersRolling5Pct_Over_6.5', 'home_SeasonPct_Over_9.5', 'home_Rolling5Pct_Over_9.5', 'home_SeasonPct_Over_10.5', 'home_Rolling5Pct_Over_10.5', 'home_SeasonPct_Over_11.5', 'home_Rolling5Pct_Over_11.5', 'away_Overall_Rolling_GoalsScored_Mean', 'away_Overall_Rolling_GoalsScored_Std', 'away_Overall_Rolling_GoalsScored_Mean_Short', 'away_Overall_Momentum_GoalsScored', 'away_Overall_Trend_Slope_GoalsScored', 'away_Overall_Rolling_FirstHalfGoalsScored_Mean', 'away_Overall_Rolling_FirstHalfGoalsScored_Std', 'away_Overall_Rolling_FirstHalfGoalsScored_Mean_Short', 'away_Overall_Momentum_FirstHalfGoalsScored', 'away_Overall_Trend_Slope_FirstHalfGoalsScored', 'away_Overall_Rolling_Shots_Mean', 'away_Overall_Rolling_Shots_Std', 'away_Overall_Rolling_Shots_Mean_Short', 'away_Overall_Momentum_Shots', 'away_Overall_Trend_Slope_Shots', 'away_Overall_Rolling_Shots_1h_Mean', 'away_Overall_Rolling_Shots_1h_Std', 'away_Overall_Rolling_Shots_1h_Mean_Short', 'away_Overall_Momentum_Shots_1h', 'away_Overall_Trend_Slope_Shots_1h', 'away_Overall_Rolling_Corners_Mean', 'away_Overall_Rolling_Corners_Std', 'away_Overall_Rolling_Corners_Mean_Short', 'away_Overall_Momentum_Corners', 'away_Overall_Trend_Slope_Corners', 'away_Overall_Rolling_Corners_1h_Mean', 'away_Overall_Rolling_Corners_1h_Std', 'away_Overall_Rolling_Corners_1h_Mean_Short', 'away_Overall_Momentum_Corners_1h', 'away_Overall_Trend_Slope_Corners_1h', 'away_Overall_Rolling_ShotsOnTarget_Mean', 'away_Overall_Rolling_ShotsOnTarget_Std', 'away_Overall_Rolling_ShotsOnTarget_Mean_Short', 'away_Overall_Momentum_ShotsOnTarget', 'away_Overall_Trend_Slope_ShotsOnTarget', 'away_Overall_Rolling_ShotsOnTarget_1h_Mean', 'away_Overall_Rolling_ShotsOnTarget_1h_Std', 'away_Overall_Rolling_ShotsOnTarget_1h_Mean_Short', 'away_Overall_Momentum_ShotsOnTarget_1h', 'away_Overall_Trend_Slope_ShotsOnTarget_1h', 'away_Rolling_GoalsScored_Mean', 'away_Rolling_GoalsScored_Std', 'away_Rolling_GoalsScored_Mean_Short', 'away_Momentum_GoalsScored', 'away_Trend_Slope_GoalsScored', 'away_Rolling_FirstHalfGoalsScored_Mean', 'away_Rolling_FirstHalfGoalsScored_Std', 'away_Rolling_FirstHalfGoalsScored_Mean_Short', 'away_Momentum_FirstHalfGoalsScored', 'away_Trend_Slope_FirstHalfGoalsScored', 'away_Rolling_Shots_Mean', 'away_Rolling_Shots_Std', 'away_Rolling_Shots_Mean_Short', 'away_Momentum_Shots', 'away_Trend_Slope_Shots', 'away_Rolling_Shots_1h_Mean', 'away_Rolling_Shots_1h_Std', 'away_Rolling_Shots_1h_Mean_Short', 'away_Momentum_Shots_1h', 'away_Trend_Slope_Shots_1h', 'away_Rolling_Corners_Mean', 'away_Rolling_Corners_Std', 'away_Rolling_Corners_Mean_Short', 'away_Momentum_Corners', 'away_Trend_Slope_Corners', 'away_Rolling_Corners_1h_Mean', 'away_Rolling_Corners_1h_Std', 'away_Rolling_Corners_1h_Mean_Short', 'away_Momentum_Corners_1h', 'away_Trend_Slope_Corners_1h', 'away_Rolling_ShotsOnTarget_Mean', 'away_Rolling_ShotsOnTarget_Std', 'away_Rolling_ShotsOnTarget_Mean_Short', 'away_Momentum_ShotsOnTarget', 'away_Trend_Slope_ShotsOnTarget', 'away_Rolling_ShotsOnTarget_1h_Mean', 'away_Rolling_ShotsOnTarget_1h_Std', 'away_Rolling_ShotsOnTarget_1h_Mean_Short', 'away_Momentum_ShotsOnTarget_1h', 'away_Trend_Slope_ShotsOnTarget_1h', 'away_Overall_Percent_Over_1.5', 'away_Overall_Rolling5_Percent_Over_1.5', 'away_Percent_Over_1.5', 'away_Rolling5_Percent_Over_1.5', 'away_Overall_Percent_Over_2.5', 'away_Overall_Rolling5_Percent_Over_2.5', 'away_Percent_Over_2.5', 'away_Rolling5_Percent_Over_2.5', 'away_Overall_Percent_Over_3.5', 'away_Overall_Rolling5_Percent_Over_3.5', 'away_Percent_Over_3.5', 'away_Rolling5_Percent_Over_3.5', 'away_TeamPct_Over_0.5', 'away_TeamPct_Over_1.5', 'away_TeamPct_Over_2.5', 'away_TeamPct_Over_3.5', 'away_CornersPct_Over_3.5', 'away_CornersRolling5Pct_Over_3.5', 'away_CornersPct_Over_4.5', 'away_CornersRolling5Pct_Over_4.5', 'away_CornersPct_Over_5.5', 'away_CornersRolling5Pct_Over_5.5', 'away_CornersPct_Over_6.5', 'away_CornersRolling5Pct_Over_6.5', 'away_SeasonPct_Over_9.5', 'away_Rolling5Pct_Over_9.5', 'away_SeasonPct_Over_10.5', 'away_Rolling5Pct_Over_10.5', 'away_SeasonPct_Over_11.5', 'away_Rolling5Pct_Over_11.5', 'country_Arg1', 'country_Aus1', 'country_Aus2', 'country_Aut1', 'country_Bel1', 'country_Bra1', 'country_Bul1', 'country_Chi1', 'country_Chl1', 'country_Cro1', 'country_Czh1', 'country_Den1', 'country_Eng1', 'country_Eng2', 'country_Eng3', 'country_Eng4', 'country_Fra1', 'country_Fra2', 'country_Ger1', 'country_Ger2', 'country_Ger3', 'country_Gre1', 'country_Hun1', 'country_Ice1', 'country_Ire1', 'country_Isr1', 'country_Ita1', 'country_Ita2', 'country_Jap1', 'country_Jap2', 'country_Kor1', 'country_Mex1', 'country_Ned1', 'country_Ned2', 'country_Nor1', 'country_Pol1', 'country_Por1', 'country_Rom1', 'country_Sco1', 'country_Sco2', 'country_Slk1', 'country_Slo1', 'country_Spa1', 'country_Spa2', 'country_Swe1', 'country_Swe2', 'country_Swi1', 'country_Swi2', 'country_Tur1', 'country_Tur2', 'country_USA1', 'country_Arg1', 'country_Aus1', 'country_Aus2', 'country_Aut1', 'country_Bel1', 'country_Bra1', 'country_Bul1', 'country_Chi1', 'country_Chl1', 'country_Cro1', 'country_Czh1', 'country_Den1', 'country_Eng1', 'country_Eng2', 'country_Eng3', 'country_Eng4', 'country_Fra1', 'country_Fra2', 'country_Ger1', 'country_Ger2', 'country_Ger3', 'country_Gre1', 'country_Hun1', 'country_Ice1', 'country_Ire1', 'country_Isr1', 'country_Ita1', 'country_Ita2', 'country_Jap1', 'country_Jap2', 'country_Kor1', 'country_Mex1', 'country_Ned1', 'country_Ned2', 'country_Nor1', 'country_Pol1', 'country_Por1', 'country_Rom1', 'country_Sco1', 'country_Sco2', 'country_Slk1', 'country_Slo1', 'country_Spa1', 'country_Spa2', 'country_Swe1', 'country_Swe2', 'country_Swi1', 'country_Swi2', 'country_Tur1', 'country_Tur2', 'country_USA1']

In [40]:
match_df

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_CornersPct_Over_5.5,away_CornersRolling5Pct_Over_5.5,away_CornersPct_Over_6.5,away_CornersRolling5Pct_Over_6.5,away_SeasonPct_Over_9.5,away_Rolling5Pct_Over_9.5,away_SeasonPct_Over_10.5,away_Rolling5Pct_Over_10.5,away_SeasonPct_Over_11.5,away_Rolling5Pct_Over_11.5
0,Mex1,25,2024-07-05,2345,1,Puebla,Santos Laguna,1,0,0,...,,,,,,,,,,
1,Mex1,25,2024-07-06,200,1,Queretaro,Tijuana de Caliente,1,2,0,...,,,,,,,,,,
2,Mex1,25,2024-07-06,410,1,Juarez,Atlas,2,2,1,...,,,,,,,,,,
3,Mex1,25,2024-07-07,0,1,San Luis,Club America,2,1,1,...,,,,,,,,,,
4,Mex1,25,2024-07-07,200,1,G. Chivas,Toluca,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20276,Fra1,26,2026-01-03,1600,17,AS Monaco,Lyon,0,0,0,...,0.500000,0.6,0.500000,0.6,0.500000,0.4,0.437500,0.2,0.375000,0.2
20277,Spa2,26,2026-01-03,1300,20,Leonesa,R. Sociedad B,0,0,0,...,0.111111,0.2,0.111111,0.2,0.368421,0.4,0.263158,0.2,0.210526,0.2
20278,Spa2,26,2026-01-03,2000,20,Cordoba,Burgos CF,0,0,0,...,0.222222,0.2,0.111111,0.2,0.368421,0.6,0.315789,0.4,0.210526,0.4
20279,Spa2,26,2026-01-03,1515,20,Castellon,Huesca,0,0,0,...,0.333333,0.6,0.222222,0.4,0.473684,1.0,0.473684,1.0,0.315789,0.6


In [41]:
import function_library as fl
filtered_data = match_df[(match_df['date'].dt.date >= today) & (match_df['date'].dt.date <= end_period)].copy()
filtered_data = fl.team_name_map(filtered_data)

####
# Create a boolean mask for country == 'Chl1'
mask = (filtered_data['country'] == 'Chl1')

# Only look at those rows, and replace 'Everton' → 'Everton De Vina'
filtered_data.loc[mask, ['home_team','away_team']] = (
    filtered_data.loc[mask, ['home_team','away_team']]
    .replace('Everton', 'Everton De Vina')
)
####
filtered_data

Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_CornersPct_Over_5.5,away_CornersRolling5Pct_Over_5.5,away_CornersPct_Over_6.5,away_CornersRolling5Pct_Over_6.5,away_SeasonPct_Over_9.5,away_Rolling5Pct_Over_9.5,away_SeasonPct_Over_10.5,away_Rolling5Pct_Over_10.5,away_SeasonPct_Over_11.5,away_Rolling5Pct_Over_11.5
20225,Aut1,26,2026-01-02,835,11,Melbourne Victory,Perth Glory,0,0,0,...,0.0,0.0,0.0,0.0,0.5,0.6,0.4,0.6,0.1,0.2
20226,Por1,26,2026-01-02,2045,17,Guimaraes,CD Nacional Funchal,0,0,0,...,0.25,0.2,0.0,0.0,0.6,0.6,0.533333,0.6,0.4,0.4
20227,Fra1,26,2026-01-02,1945,17,Toulouse,Lens,0,0,0,...,0.25,0.4,0.25,0.4,0.5625,0.4,0.5,0.2,0.5,0.2
20228,Por1,26,2026-01-02,1845,17,Gil Vicente,Sporting Lisbon,0,0,0,...,0.625,0.6,0.625,0.6,0.4375,0.0,0.3125,0.0,0.1875,0.0
20229,Spa1,26,2026-01-02,2000,18,Rayo Vallecano,Getafe,0,0,0,...,0.333333,0.4,0.111111,0.2,0.411765,0.6,0.352941,0.4,0.294118,0.4
20230,Ita1,26,2026-01-02,1945,18,Cagliari,AC Milan,0,0,0,...,0.0,0.0,0.0,0.0,0.4375,0.6,0.25,0.0,0.125,0.0
20231,Spa2,26,2026-01-02,1930,20,Eibar,Mirandes,0,0,0,...,0.0,0.0,0.0,0.0,0.315789,0.2,0.263158,0.2,0.210526,0.2
20232,Fra2,26,2026-01-03,1900,18,Pau,Rodez,0,0,0,...,0.25,0.2,0.125,0.2,0.647059,1.0,0.529412,0.8,0.470588,0.6
20233,Fra2,26,2026-01-03,1900,18,Reims,Annecy,0,0,0,...,0.111111,0.0,0.111111,0.0,0.411765,0.8,0.294118,0.6,0.235294,0.4
20234,Fra2,26,2026-01-03,1300,18,ESTAC Troyes,Red Star,0,0,0,...,0.111111,0.2,0.0,0.0,0.352941,0.2,0.235294,0.2,0.117647,0.0


In [42]:
filtered_data[filtered_data.isna().any(axis=1)]


Unnamed: 0,country,season,date,ko_time,round,home_team,away_team,home_goals_ft,away_goals_ft,home_goals_ht,...,away_CornersPct_Over_5.5,away_CornersRolling5Pct_Over_5.5,away_CornersPct_Over_6.5,away_CornersRolling5Pct_Over_6.5,away_SeasonPct_Over_9.5,away_Rolling5Pct_Over_9.5,away_SeasonPct_Over_10.5,away_Rolling5Pct_Over_10.5,away_SeasonPct_Over_11.5,away_Rolling5Pct_Over_11.5
20225,Aut1,26,2026-01-02,835,11,Melbourne Victory,Perth Glory,0,0,0,...,0.0,0.0,0.0,0.0,0.5,0.6,0.4,0.6,0.1,0.2
20226,Por1,26,2026-01-02,2045,17,Guimaraes,CD Nacional Funchal,0,0,0,...,0.25,0.2,0.0,0.0,0.6,0.6,0.533333,0.6,0.4,0.4
20227,Fra1,26,2026-01-02,1945,17,Toulouse,Lens,0,0,0,...,0.25,0.4,0.25,0.4,0.5625,0.4,0.5,0.2,0.5,0.2
20228,Por1,26,2026-01-02,1845,17,Gil Vicente,Sporting Lisbon,0,0,0,...,0.625,0.6,0.625,0.6,0.4375,0.0,0.3125,0.0,0.1875,0.0
20229,Spa1,26,2026-01-02,2000,18,Rayo Vallecano,Getafe,0,0,0,...,0.333333,0.4,0.111111,0.2,0.411765,0.6,0.352941,0.4,0.294118,0.4
20230,Ita1,26,2026-01-02,1945,18,Cagliari,AC Milan,0,0,0,...,0.0,0.0,0.0,0.0,0.4375,0.6,0.25,0.0,0.125,0.0
20231,Spa2,26,2026-01-02,1930,20,Eibar,Mirandes,0,0,0,...,0.0,0.0,0.0,0.0,0.315789,0.2,0.263158,0.2,0.210526,0.2
20232,Fra2,26,2026-01-03,1900,18,Pau,Rodez,0,0,0,...,0.25,0.2,0.125,0.2,0.647059,1.0,0.529412,0.8,0.470588,0.6
20233,Fra2,26,2026-01-03,1900,18,Reims,Annecy,0,0,0,...,0.111111,0.0,0.111111,0.0,0.411765,0.8,0.294118,0.6,0.235294,0.4
20234,Fra2,26,2026-01-03,1300,18,ESTAC Troyes,Red Star,0,0,0,...,0.111111,0.2,0.0,0.0,0.352941,0.2,0.235294,0.2,0.117647,0.0


In [43]:
# filtered_data.dropna(inplace=True)
# filtered_data

In [44]:
import os
import pandas as pd
from joblib import load

# ── CONFIG ────────────────────────────────────────────────────────────────
# Hard-code the exact PKL file you want to use:
PKL_PATH    = r"C:\Users\leere\PycharmProjects\Football_ML3\Goals\Over_2_5\best_model_xgb_calibrated_20251206_162811.pkl"

fixtures_df = filtered_data.copy()   # your fresh fixtures DataFrame
IMPORT_DIR  = r"C:\Users\leere\OneDrive\Desktop\IMPORTS"
OUT_PATH    = os.path.join(IMPORT_DIR, "o25_predictions_v2.csv")

os.makedirs(IMPORT_DIR, exist_ok=True)

# ── HELPERS ───────────────────────────────────────────────────────────────
def align_features(df_in: pd.DataFrame, feature_contract: list[str]) -> pd.DataFrame:
    """
    Align df to the saved training feature list:
      - add missing columns as 0
      - drop extras
      - preserve column order
    """
    X = df_in.reindex(columns=feature_contract, fill_value=0)
    # ensure numeric dtypes where possible
    for c in X.columns:
        if pd.api.types.is_object_dtype(X[c]):
            X[c] = pd.to_numeric(X[c], errors="ignore")
    return X

# ── LOAD MODEL ────────────────────────────────────────────────────────────
if not os.path.exists(PKL_PATH):
    raise RuntimeError(f"PKL file not found at: {PKL_PATH}")

md = load(PKL_PATH)
model     = md['model']          # calibrated estimator
threshold = float(md.get('threshold', 0.5))
feat_list = list(md['features']) # feature contract used in training

# Optional: if training used one-hot on 'country', recreate those columns
if 'country' in fixtures_df.columns and any(f.startswith('country_') for f in feat_list):
    fixtures_df = pd.get_dummies(fixtures_df, columns=['country'], prefix='country')

# ── PREPARE FEATURES ─────────────────────────────────────────────────────
X = align_features(fixtures_df, feat_list)

# Drop rows with missing feature values; keep row alignment
X = X.dropna()
fixtures_df = fixtures_df.loc[X.index].copy()

# ── PREDICT ───────────────────────────────────────────────────────────────
if hasattr(model, "predict_proba"):
    proba = model.predict_proba(X)
    proba = proba[:, 1] if proba.ndim == 2 else proba
    mask  = proba >= threshold
else:
    # Fallback (unlikely): use predict -> bool
    mask  = model.predict(X).astype(bool)
    proba = pd.Series(mask, index=X.index, dtype=float)

positives = fixtures_df.loc[mask].copy()
positives["pred_proba_o25"] = proba[mask]
positives = positives.sort_values("pred_proba_o25", ascending=False)  # optional, nice to have

# ── WRITE IMPORT FILE ─────────────────────────────────────────────────────
if positives.empty:
    print(f"No selections (≥ {threshold:.2f}); nothing to write.")
else:
    # Build your import structure (adjust columns if your fixtures use different names)
    if not {"home_team", "away_team"}.issubset(positives.columns):
        raise KeyError("fixtures_df must contain 'home_team' and 'away_team' columns.")

    out_df = pd.DataFrame({
        'EventName':     positives['home_team'] + ' v ' + positives['away_team'],
        'Provider':      'over_2_5_goals_v2',
        'MarketName':    'Over/Under 2.5 Goals',
        'SelectionName': 'Over 2.5 Goals',
        # 'PredProb':      positives['pred_proba_o25'].round(4),
        # 'Threshold':     threshold,
    })

    out_df.to_csv(OUT_PATH, index=False)
    print(
        f"✓ Wrote {len(out_df)} O2.5 selections to:\n"
        f"   {OUT_PATH}\n"
        f"   (model: {PKL_PATH})"
    )


✓ Wrote 2 O2.5 selections to:
   C:\Users\leere\OneDrive\Desktop\IMPORTS\o25_predictions_v2.csv
   (model: C:\Users\leere\PycharmProjects\Football_ML3\Goals\Over_2_5\best_model_xgb_calibrated_20251206_162811.pkl)
