In [19]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# --- function to convert datetime ---
def parse_sa(s: str):
    """Parse a raw kickoff timestamp into a pandas Timestamp.

    For text input, English ordinal suffixes on a 1-2 digit number are
    removed ("20th" -> "20"), the literal "SAST" marker is dropped, and the
    remainder is parsed day-first. The "SAST" marker is discarded rather than
    converted, so the result for text input carries no timezone.

    Parameters
    ----------
    s : str
        Raw timestamp text. Missing values (None / NaN / NaT) are accepted.
        Non-string values (for example an already-parsed Timestamp) are
        handed straight to ``pd.to_datetime`` instead of failing in the
        text clean-up with ``TypeError``.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        ``NaT`` when the input is missing or cannot be parsed.
    """
    if pd.isna(s):
        return pd.NaT
    if not isinstance(s, str):
        # The regex / replace clean-up below is text-only; anything else is
        # left to pandas so re-applying the parser to parsed data is harmless.
        return pd.to_datetime(s, dayfirst=True, errors="coerce")

    cleaned = re.sub(r'(\b\d{1,2})(st|nd|rd|th)\b', r'\1', s)  # 20th -> 20
    cleaned = cleaned.replace("SAST", "").strip()              # drop SAST
    # errors="coerce": unparseable text yields NaT rather than raising.
    return pd.to_datetime(cleaned, dayfirst=True, errors="coerce")

In [20]:
import os
from pathlib import Path

# Show the working directory: the relative data paths used later resolve against it.
print(f"CWD: {os.getcwd()}")


CWD: c:\Very old laptop\Betting\urc-score-prediction


In [21]:
# --- Load data ---
# Read the processed matches CSV, convert the raw "Date_time" strings with
# parse_sa, then order rows chronologically and renumber them 0..n-1 so that
# position-based CV folds follow time.
# NOTE(review): parse_sa coerces unparseable strings to NaT and sort_values
# places NaT rows last -- confirm no such rows exist, or they end up in the
# final (most recent) fold.
df = (
    pd.read_csv("data/processed/matches_with_weather_features24.csv")
    .assign(Date_time=lambda frame: frame["Date_time"].apply(parse_sa))
    .sort_values("Date_time")
    .reset_index(drop=True)
)

In [22]:
# --- Select features and targets ---
# Predictor columns fed to the preprocessing step.
X_cols = [
    "Home_team",
    "Away_team",
    "Venue",
    "wx_temp_c",
    "wx_summary",
    "time_bucket",
    "is_in_south_africa",
    "is_main_home_stadium",
]

# Two-column targets (home, away) for the half-time and full-time scores.
Y_ht_cols = ["Halftime_score_home", "Halftime_score_away"]
Y_ft_cols = ["Fulltime_score_home", "Fulltime_score_away"]

# X stays a DataFrame (columns are selected by name downstream);
# targets become plain arrays so they can be indexed by fold positions.
X = df.loc[:, X_cols]
Y_ht = df.loc[:, Y_ht_cols].to_numpy()
Y_ft = df.loc[:, Y_ft_cols].to_numpy()

In [23]:
# --- Preprocessing ---
# Columns that are one-hot encoded.
cat = [
    "Home_team",
    "Away_team",
    "Venue",
    "wx_summary",
    "time_bucket",
    "is_in_south_africa",
    "is_main_home_stadium",
]
# Columns forwarded to the model unchanged.
num = ["wx_temp_c"]

# handle_unknown="ignore": a category not seen during fit is encoded as
# all zeros at predict time instead of raising.
prep = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
        ("num", "passthrough", num),
    ]
)


In [24]:
# --- Model ---
# One forest per target pair (half-time / full-time); the fixed seed keeps
# results repeatable and n_jobs=-1 uses all available cores.
rf_ht = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_ft = RandomForestRegressor(random_state=42, n_jobs=-1)

# Pipeline fits its steps in place rather than on copies, so a single `prep`
# object placed in both pipelines would be re-fitted (and overwritten) by
# whichever pipeline was trained last. Give each pipeline an independent,
# unfitted copy of the preprocessor.
pipe_ht = Pipeline([("prep", clone(prep)), ("rf", rf_ht)])
pipe_ft = Pipeline([("prep", clone(prep)), ("rf", rf_ft)])

In [25]:
# --- 5-fold TimeSeriesSplit CV ---
# Folds are cut by row position, so the data must already be in
# chronological order when it is split.
tscv = TimeSeriesSplit(
    n_splits=5,
)

def cv_mae(pipe, X, Y, cv=None):
    """Per-target mean absolute error pooled over all cross-validation folds.

    For every (train, test) split the pipeline is re-fitted on the training
    rows, predictions for the test rows are floored at zero, and absolute
    errors are accumulated. The sum is divided by the total number of scored
    test rows, so each test row carries equal weight regardless of fold size.

    Parameters
    ----------
    pipe : estimator
        Object with ``fit(X, Y)`` and ``predict(X)``. It is fitted in place
        and is left fitted on the last training split.
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    Y : array-like
        Targets, one row per row of ``X``. Either 2-D (rows x targets) or
        1-D (single target).
    cv : splitter, optional
        Object whose ``split(X)`` yields ``(train_idx, test_idx)`` pairs.
        Defaults to the module-level ``tscv``, looked up at call time.

    Returns
    -------
    numpy.ndarray
        One MAE per target column (length 1 for a 1-D ``Y``).

    Raises
    ------
    ValueError
        If the splitter yields no test rows (the average would be 0/0).
    """
    splitter = tscv if cv is None else cv
    targets = np.asarray(Y)
    n_targets = 1 if targets.ndim == 1 else targets.shape[1]

    abs_err_sum = np.zeros(n_targets)
    n_scored = 0
    for train_idx, test_idx in splitter.split(X):
        pipe.fit(X.iloc[train_idx], targets[train_idx])
        truth = targets[test_idx]
        # Floor predictions at zero, and align their shape with the truth so
        # a 1-D target never broadcasts against a column-shaped prediction.
        preds = np.clip(np.asarray(pipe.predict(X.iloc[test_idx])), 0, None)
        preds = preds.reshape(truth.shape)
        fold_err = np.abs(truth - preds).reshape(len(test_idx), -1)
        abs_err_sum += fold_err.sum(axis=0)
        n_scored += len(test_idx)

    if n_scored == 0:
        raise ValueError("cv_mae: the splitter produced no test samples")
    return abs_err_sum / n_scored   # per-target MAE

# Baseline (untuned) forests: pooled per-target MAE over the CV folds.
mae_ht = cv_mae(pipe_ht, X, Y_ht)  # [MAE_HT_home, MAE_HT_away]
mae_ft = cv_mae(pipe_ft, X, Y_ft)  # [MAE_FT_home, MAE_FT_away]

# Index 0 is the home column and index 1 the away column of each target pair.
ht_home, ht_away = mae_ht[0], mae_ht[1]
ft_home, ft_away = mae_ft[0], mae_ft[1]

print(f"HT MAE home/away: {ht_home:.3f} / {ht_away:.3f}")
print(f"FT MAE home/away: {ft_home:.3f} / {ft_away:.3f}")

HT MAE home/away: 6.915 / 6.570
FT MAE home/away: 9.848 / 7.636


In [38]:
# --- 5-fold TimeSeriesSplit CV ---
# Splitter handed to the grid searches; folds are cut by row position, so
# the data must already be in chronological order.
tscv = TimeSeriesSplit(
    n_splits=5,
)

def mae_clip(y_true, y_pred):
    """Mean absolute error with predictions floored at zero.

    Negative predicted values are replaced by 0 before the error is
    computed; ``y_true`` is used as given.
    """
    floored_pred = np.maximum(y_pred, 0)
    return mean_absolute_error(y_true, floored_pred)

# Two scorers built on the same metric (mae_clip); they differ only in sign.
#   "utility": negated MAE, so "higher score is better" selects the lowest
#              error. This is the one to select the model on.
#   "mae":     un-negated MAE, kept purely so the results table shows a
#              positive, readable number. Rankings derived from it favor the
#              LARGEST error, so never select on it.
scoring = dict(
    utility=make_scorer(mae_clip, greater_is_better=False),
    mae=make_scorer(mae_clip, greater_is_better=True),
)

In [39]:
# Hyper-parameter grid for the pipeline step named "rf"
# (3 * 4 * 3 * 3 = 108 combinations).
param_grid = dict(
    rf__n_estimators=[300, 600, 900],
    rf__max_depth=[None, 12, 18, 24],
    rf__min_samples_leaf=[1, 2, 4],
    rf__max_features=["sqrt", 0.5, 1.0],  # try less/more randomness
)

In [40]:
# Settings common to both searches; only the estimator differs.
# refit="utility": the winning candidate is chosen (and refitted on all data)
# by the negated-MAE scorer.
search_settings = dict(
    param_grid=param_grid,
    scoring=scoring,
    refit="utility",
    cv=tscv,
    n_jobs=-1,
    verbose=1,
)

gcv_ht = GridSearchCV(estimator=pipe_ht, **search_settings)
gcv_ft = GridSearchCV(estimator=pipe_ft, **search_settings)

gcv_ht.fit(X, Y_ht)
gcv_ft.fit(X, Y_ft)

# best_score_ is reported on the refit metric ("utility" = negated MAE),
# so flip the sign to get a positive MAE.
best_mae_ht = -gcv_ht.best_score_
best_mae_ft = -gcv_ft.best_score_

print(f"Best params ht: {gcv_ht.best_params_} | MAE: {best_mae_ht:.3f}")
print(f"Best params ft: {gcv_ft.best_params_} | MAE: {best_mae_ft:.3f}")

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best params ht: {'rf__max_depth': 12, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 4, 'rf__n_estimators': 600} | MAE: 6.538
Best params ft: {'rf__max_depth': 12, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 1, 'rf__n_estimators': 600} | MAE: 8.240


In [43]:
# Columns shown for each candidate: its rank on the selection metric, the
# positive MAE (mean / std across folds), and the hyper-parameters tried.
report_cols = [
    "rank_test_utility",
    "mean_test_mae",
    "std_test_mae",
    "param_rf__n_estimators",
    "param_rf__max_depth",
    "param_rf__min_samples_leaf",
    "param_rf__max_features",
]

# Ten best half-time candidates, best rank first.
cvres_ht = pd.DataFrame(gcv_ht.cv_results_).sort_values("rank_test_utility")
print(cvres_ht[report_cols].head(10))

# Ten best full-time candidates, best rank first.
cvres_ft = pd.DataFrame(gcv_ft.cv_results_).sort_values("rank_test_utility")
print(cvres_ft[report_cols].head(10))

    rank_test_utility  mean_test_mae  std_test_mae  param_rf__n_estimators  \
34                  1       6.538281      0.458019                     600   
7                   2       6.538305      0.458040                     600   
61                  2       6.538305      0.458040                     600   
88                  2       6.538305      0.458040                     600   
59                  5       6.538618      0.469808                     900   
5                   6       6.539508      0.470735                     900   
86                  7       6.539508      0.470735                     900   
32                  8       6.540300      0.471239                     900   
6                   9       6.541063      0.454950                     300   
87                  9       6.541063      0.454950                     300   

   param_rf__max_depth  param_rf__min_samples_leaf param_rf__max_features  
34                  12                           4               