In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer


In [3]:
# --- function to convert datetime ---
def parse_sa(s: str):
    """Parse a match date string such as "20th September 2024 17:00 SAST".

    Ordinal suffixes on the day number (st/nd/rd/th) and the literal "SAST"
    marker are stripped before the text is handed to ``pd.to_datetime`` with
    ``dayfirst=True``.

    Parameters
    ----------
    s : str
        Raw date text. Missing values (None / NaN / NaT) are accepted, and
        values that are not strings (e.g. an already-parsed Timestamp when the
        parsing step is re-applied) are passed to ``pd.to_datetime`` without
        any text clean-up instead of raising ``TypeError``.

    Returns
    -------
    pd.Timestamp or pd.NaT
        The parsed timestamp; ``pd.NaT`` for missing or unparseable input
        (``errors="coerce"``).
    """
    if pd.isna(s):
        return pd.NaT
    if not isinstance(s, str):
        # The regex / replace clean-up below only makes sense for text; calling
        # re.sub on a Timestamp would raise TypeError.
        return pd.to_datetime(s, errors="coerce")
    s = re.sub(r'(\b\d{1,2})(st|nd|rd|th)\b', r'\1', s)     # 20th -> 20
    s = s.replace("SAST", "").strip()                       # drop SAST
    # No timezone is attached after removing the "SAST" marker.
    return pd.to_datetime(s, dayfirst=True, errors="coerce")

In [4]:
# --- Load data ---
# Read the match table, convert the kickoff text to timestamps, and order the
# rows chronologically so that the time-aware CV folds follow calendar order.
df = (
    pd.read_csv("data/processed/matches_with_weather_features24.csv")
    .assign(Date_time=lambda frame: frame["Date_time"].apply(parse_sa))
    .sort_values("Date_time")
    .reset_index(drop=True)
)

In [5]:
# --- Select features and targets ---
# Two separate two-column targets: half-time and full-time (home, away) scores.
Y_ht_cols = ["Halftime_score_home", "Halftime_score_away"]
Y_ft_cols = ["Fulltime_score_home", "Fulltime_score_away"]

# Predictor columns shared by both models.
X_cols = [
    "Home_team",
    "Away_team",
    "Venue",
    "wx_temp_c",
    "wx_summary",
    "time_bucket",
    "is_in_south_africa",
    "is_main_home_stadium",
]

X = df[X_cols]
# Targets are handed to sklearn as plain arrays.
Y_ht, Y_ft = (df[target_cols].values for target_cols in (Y_ht_cols, Y_ft_cols))

In [6]:
# --- Preprocessing ---
# The only numeric feature; it is forwarded to the model unchanged.
num = ["wx_temp_c"]

# Everything else is treated as categorical, including the two is_* flags.
cat = [
    "Home_team",
    "Away_team",
    "Venue",
    "wx_summary",
    "time_bucket",
    "is_in_south_africa",
    "is_main_home_stadium",
]

# OneHotEncoder for the categoricals, passthrough for the numeric column
onehot = OneHotEncoder(handle_unknown="ignore")
prep = ColumnTransformer(
    transformers=[
        ("cat", onehot, cat),
        ("num", "passthrough", num),
    ]
)


In [11]:
# One identically configured (seeded) ExtraTrees regressor per target set.
etr_ht, etr_ft = (
    ExtraTreesRegressor(random_state=42, n_jobs=-1) for _ in range(2)
)

# Both pipelines reference the same preprocessing definition, followed by
# their own regressor under the step name "etr".
pipe_ht = Pipeline(steps=[("prep", prep), ("etr", etr_ht)])
pipe_ft = Pipeline(steps=[("prep", prep), ("etr", etr_ft)])

In [12]:
# --- 5-fold TimeSeriesSplit CV ---
tscv = TimeSeriesSplit(n_splits=5)


def mae_clip(y_true, y_pred):
    """Mean absolute error after raising negative predictions to zero.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Raw model predictions; any value below 0 is replaced by 0 before
        the error is computed.

    Returns
    -------
    The value returned by ``mean_absolute_error(y_true, clipped_predictions)``.
    """
    non_negative = np.maximum(y_pred, 0)
    return mean_absolute_error(y_true, non_negative)


# The same metric registered twice with opposite signs:
#   "utility" - negated loss, so larger is better; used to pick the best model
#   "mae"     - sign left positive, for reporting only
scoring = dict(
    utility=make_scorer(mae_clip, greater_is_better=False),
    mae=make_scorer(mae_clip, greater_is_better=True),
)

In [17]:
# Hyperparameter grid for the "etr" step of the pipelines (keys are written
# as <step name>__<parameter>).
# Grid size: 3 * 2 * 3 * 3 * 3 * 1 = 162 candidate combinations.
param_grid = {
    "etr__n_estimators":     [600, 1000, 1500],
    "etr__criterion":        ["squared_error", "absolute_error"],  # needs sklearn ≥1.1 (NOTE(review): these names may already exist in 1.0 — confirm)
    "etr__max_depth":        [None, 12, 18],
    "etr__min_samples_leaf": [1, 2, 4],
    "etr__max_features":     ["sqrt", 0.5, 1.0],
    "etr__bootstrap":        [False],  # single value: bootstrap is not actually searched
}

In [18]:
# Settings common to both searches; only the pipeline being tuned differs.
search_kwargs = dict(
    param_grid=param_grid,
    scoring=scoring,
    refit="utility",   # select and refit on the negated-MAE scorer
    cv=tscv,
    n_jobs=-1,
    verbose=1,
)

gcv_ht = GridSearchCV(estimator=pipe_ht, **search_kwargs)
gcv_ft = GridSearchCV(estimator=pipe_ft, **search_kwargs)

# Half-time search first, then full-time.
gcv_ht.fit(X, Y_ht)
gcv_ft.fit(X, Y_ft)

# best_score_ belongs to the refit metric ("utility", a negated MAE), so the
# sign is flipped back to report a positive MAE.
best_mae_ht = -gcv_ht.best_score_
best_mae_ft = -gcv_ft.best_score_

print(f"Best params ht: {gcv_ht.best_params_} | MAE: {best_mae_ht:.3f}")
print(f"Best params ft: {gcv_ft.best_params_} | MAE: {best_mae_ft:.3f}")

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best params ht: {'etr__bootstrap': False, 'etr__criterion': 'absolute_error', 'etr__max_depth': None, 'etr__max_features': 'sqrt', 'etr__min_samples_leaf': 4, 'etr__n_estimators': 1000} | MAE: 6.413
Best params ft: {'etr__bootstrap': False, 'etr__criterion': 'squared_error', 'etr__max_depth': 12, 'etr__max_features': 'sqrt', 'etr__min_samples_leaf': 2, 'etr__n_estimators': 1500} | MAE: 8.183


In [20]:
# Columns shown in the leaderboard: rank, MAE mean/std, and the searched
# hyperparameters.
report_cols = [
    "rank_test_utility",
    "mean_test_mae",
    "std_test_mae",
    "param_etr__n_estimators",
    "param_etr__criterion",
    "param_etr__max_depth",
    "param_etr__min_samples_leaf",
    "param_etr__max_features",
    "param_etr__bootstrap",
]

# Full CV tables ordered by the "utility" rank.
cvres_ht = pd.DataFrame(gcv_ht.cv_results_).sort_values("rank_test_utility")
cvres_ft = pd.DataFrame(gcv_ft.cv_results_).sort_values("rank_test_utility")

# Top 10 candidates for half-time, then for full-time.
for ranked in (cvres_ht, cvres_ft):
    print(ranked[report_cols].head(10))

     rank_test_utility  mean_test_mae  std_test_mae  param_etr__n_estimators  \
88                   1       6.413486      0.494875                     1000   
87                   2       6.413550      0.506266                      600   
142                  3       6.413814      0.495259                     1000   
141                  4       6.413857      0.506632                      600   
89                   5       6.415117      0.495805                     1500   
143                  6       6.415380      0.496111                     1500   
114                  7       6.417680      0.510848                      600   
115                  8       6.419134      0.501184                     1000   
116                  9       6.420983      0.502346                     1500   
84                  10       6.438427      0.470991                      600   

    param_etr__criterion param_etr__max_depth  param_etr__min_samples_leaf  \
88        absolute_error                 