In [None]:
import pandas as pd
import numpy as np
import re

from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer
from lightgbm import LGBMRegressor, early_stopping
from pathlib import Path

# function to convert datetime
def parse_sa(s: str):
    if pd.isna(s):
        return pd.NaT
    s = re.sub(r'(\b\d{1,2})(st|nd|rd|th)\b', r'\1', s)     # 20th -> 20
    s = s.replace("SAST", "").strip()                       # drop SAST
    dt = pd.to_datetime(s, dayfirst=True, errors="coerce")  # tz-naive
    return dt

In [None]:
# Load data
df = pd.read_csv("data/processed/matches_with_weather_features24.csv")

# Parse datetime and sort for time-aware CV
df["Date_time"] = df["Date_time"].apply(parse_sa)
df = df.sort_values("Date_time").reset_index(drop=True)

In [None]:
# Select features and targets
X_cols = ["Home_team","Away_team","Venue","wx_temp_c","wx_summary",
          "time_bucket","is_in_south_africa","is_main_home_stadium"]

Y_ht_cols = ["Halftime_score_home","Halftime_score_away"]
Y_ft_cols = ["Fulltime_score_home","Fulltime_score_away"]

X = df[X_cols]
Y_ht = df[Y_ht_cols].values
Y_ft = df[Y_ft_cols].values

In [None]:
# Preprocessing
cat = ["Home_team", "Away_team", "Venue", "wx_summary", "time_bucket", "is_in_south_africa", "is_main_home_stadium"]
num = ["wx_temp_c"]

X_native = X.copy()
for c in cat:
    X_native[c] = X_native[c].astype("category")
X_native["wx_temp_c"] = pd.to_numeric(X_native["wx_temp_c"], errors="coerce")

In [49]:
def mae_clip_multi(y_true, y_pred):
    y_pred = np.clip(y_pred, 0, None)
    return np.mean([mean_absolute_error(y_true[:, i], y_pred[:, i]) for i in range(y_true.shape[1])])

scoring = {
    "utility": make_scorer(mae_clip_multi, greater_is_better=False),  # used for refit
    "mae":     make_scorer(mae_clip_multi, greater_is_better=True),   # for reporting
}

tscv = TimeSeriesSplit(n_splits=3)

# Base estimators (one for HT pair, one for FT pair)
base_ht = MultiOutputRegressor(LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1))
base_ft = MultiOutputRegressor(LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1))

In [None]:
# Small grids (start tight; expand only if needed)
grid_ht = {
    "estimator__objective": ["mae"],
    "estimator__learning_rate": [0.05, 0.025],
    "estimator__n_estimators": [800, 1500],
    "estimator__num_leaves": [31, 63],
    "estimator__min_child_samples": [20, 40, 60],
    "estimator__reg_lambda": [0.3, 0.6],
    "estimator__colsample_bytree": [0.9, 1.0],
}
grid_ft = {
    # if Poisson helped FT, flip to ["poisson"]; otherwise keep "mae"
    "estimator__objective": ["mae", "poisson"],
    "estimator__learning_rate": [0.03, 0.02],
    "estimator__n_estimators": [1200, 1800],
    "estimator__num_leaves": [47, 63, 95],
    "estimator__min_child_samples": [20, 40, 80],
    "estimator__reg_lambda": [0.3, 0.6],
    "estimator__colsample_bytree": [0.9, 1.0],
}

gcv_ht = GridSearchCV(base_ht, grid_ht, scoring=scoring, refit="utility", cv=tscv, n_jobs=-1, verbose=1)
gcv_ft = GridSearchCV(base_ft, grid_ft, scoring=scoring, refit="utility", cv=tscv, n_jobs=-1, verbose=1)

gcv_ht.fit(X_native, Y_ht)
gcv_ft.fit(X_native, Y_ft)

print("HT best MAE:", -gcv_ht.best_score_, "\nHT best params:", gcv_ht.best_params_)
print("FT best MAE:", -gcv_ft.best_score_, "\nFT best params:", gcv_ft.best_params_)

Fitting 3 folds for each of 96 candidates, totalling 288 fits
Fitting 3 folds for each of 288 candidates, totalling 864 fits
HT best MAE: 6.655405405405404 
HT best params: {'estimator__colsample_bytree': 0.9, 'estimator__learning_rate': 0.05, 'estimator__min_child_samples': 60, 'estimator__n_estimators': 800, 'estimator__num_leaves': 31, 'estimator__objective': 'mae', 'estimator__reg_lambda': 0.3}
FT best MAE: 8.691861053103732 
FT best params: {'estimator__colsample_bytree': 0.9, 'estimator__learning_rate': 0.03, 'estimator__min_child_samples': 40, 'estimator__n_estimators': 1800, 'estimator__num_leaves': 47, 'estimator__objective': 'poisson', 'estimator__reg_lambda': 0.6}


In [52]:
_POSSIBLE_PARAM_COLS = [
    "param_estimator__objective",
    "param_estimator__learning_rate",
    "param_estimator__n_estimators",
    "param_estimator__num_leaves",
    "param_estimator__min_child_samples",
    "param_estimator__subsample",
    "param_estimator__colsample_bytree",
    "param_estimator__reg_alpha",
    "param_estimator__reg_lambda",
]

def show_gcv_table(gcv, k=10):
    cvres = pd.DataFrame(gcv.cv_results_).sort_values("rank_test_utility")
    base = ["rank_test_utility", "mean_test_mae", "std_test_mae"]
    params = [c for c in _POSSIBLE_PARAM_COLS if c in cvres.columns]
    cols = base + params
    print(cvres[cols].head(k).to_string(index=False))

# Usage:
print("=== HT diagnostics ===")
show_gcv_table(gcv_ht, k=10)

print("\n=== FT diagnostics ===")
show_gcv_table(gcv_ft, k=10)

=== HT diagnostics ===
 rank_test_utility  mean_test_mae  std_test_mae param_estimator__objective  param_estimator__learning_rate  param_estimator__n_estimators  param_estimator__num_leaves  param_estimator__min_child_samples  param_estimator__colsample_bytree  param_estimator__reg_lambda
                 1       6.655405      0.169952                        mae                           0.050                           1500                           63                                  60                                0.9                          0.6
                 1       6.655405      0.169952                        mae                           0.050                           1500                           63                                  60                                0.9                          0.3
                 1       6.655405      0.169952                        mae                           0.050                           1500                           31          