In [7]:
import pandas as pd
import numpy as np
import re

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from pathlib import Path

# --- function to convert datetime ---
def parse_sa(s: str):
    if pd.isna(s):
        return pd.NaT
    s = re.sub(r'(\b\d{1,2})(st|nd|rd|th)\b', r'\1', s)     # 20th -> 20
    s = s.replace("SAST", "").strip()                       # drop SAST
    dt = pd.to_datetime(s, dayfirst=True, errors="coerce")  # tz-naive
    return dt

In [10]:
from pathlib import Path
import os

print("CWD:", os.getcwd())
p = Path("data/processed")
print("processed dir exists?", p.exists())
print("CSV files I can see there:")
print([q.name for q in p.glob("*.csv")])

CWD: c:\Very old laptop\Betting\urc-score-prediction\notebooks
processed dir exists? False
CSV files I can see there:
[]


In [12]:
# --- Load data ---
df = pd.read_csv("../data/processed/matches_with_weather_features24.csv")

# # Parse datetime and sort for time-aware CV
# df["Date_time"] = pd.to_datetime(df["Date_time"])
# df = df.sort_values("Date_time").reset_index(drop=True)

In [13]:
# --- Select features and targets ---
X_cols = ["Home_team","Away_team","Venue","wx_temp_c","wx_summary",
          "time_bucket","is_in_south_africa","is_main_home_stadium"]

Y_ht_cols = ["Halftime_score_home","Halftime_score_away"]
Y_ft_cols = ["Fulltime_score_home","Fulltime_score_away"]

X = df[X_cols]
Y_ht = df[Y_ht_cols].values
Y_ft = df[Y_ft_cols].values

In [14]:
# --- Preprocessing ---
cat = ["Home_team", "Away_team", "Venue", "wx_summary", "time_bucket", "is_in_south_africa", "is_main_home_stadium"]
num = ["wx_temp_c"]

# OneHotEncoder
prep = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
    ("num", "passthrough", num),
])


In [15]:
# --- Model ---
rf_ht = RandomForestRegressor(n_estimators=600, min_samples_leaf=2, n_jobs=-1, random_state=42)
rf_ft = RandomForestRegressor(n_estimators=600, min_samples_leaf=2, n_jobs=-1, random_state=42)

pipe_ht = Pipeline([("prep", prep), ("rf", rf_ht)])
pipe_ft = Pipeline([("prep", prep), ("rf", rf_ft)])

In [16]:
np.zeros(Y_ht.shape[1])

array([0., 0.])

In [17]:
# --- 5-fold TimeSeriesSplit CV ---
tscv = TimeSeriesSplit(n_splits=5)

def cv_mae(pipe, X, Y):
    total_abs = np.zeros(Y.shape[1]); total_n = 0
    for tr, te in tscv.split(X):
        pipe.fit(X.iloc[tr], Y[tr])
        pred = np.clip(pipe.predict(X.iloc[te]), 0, None)
        total_abs += np.sum(np.abs(Y[te] - pred), axis=0)
        print(total_abs)
        total_n += len(te)
        print(total_n)
    return total_abs / total_n   # per-target MAE

mae_ht = cv_mae(pipe_ht, X, Y_ht)  # [MAE_HT_home, MAE_HT_away]
mae_ft = cv_mae(pipe_ft, X, Y_ft)  # [MAE_FT_home, MAE_FT_away]

print(f"HT MAE home/away: {mae_ht[0]:.3f} / {mae_ht[1]:.3f}")
print(f"FT MAE home/away: {mae_ft[0]:.3f} / {mae_ft[1]:.3f}")

[163.23671098 134.27504299]
25
[333.95639021 286.90103644]
50
[463.82674886 470.71627784]
75
[666.6983664  638.84680315]
100
[866.83707924 789.1999884 ]
125
[248.00944444 203.22568585]
25
[517.89800992 368.99565807]
50
[707.67820016 551.39643308]
75
[972.1207601  796.83731999]
100
[1246.90565176  963.03485113]
125
HT MAE home/away: 6.935 / 6.314
FT MAE home/away: 9.975 / 7.704
