In [1]:


import json
import os
from pprint import pprint
from typing import Dict, List, Tuple

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

# --- Paths: everything is resolved against the current working directory ---
_CWD = os.getcwd()
DATA_PATH = os.path.join(_CWD, "dataset", "data.csv")
MODEL_DIR = os.path.join(_CWD, "models")
MODEL_HOME_PATH, MODEL_AWAY_PATH = (
    os.path.join(MODEL_DIR, filename)
    for filename in ("home_model.pkl", "away_model.pkl")
)

# --- Column roles ---
# Regression targets: one model is trained per target.
TARGET_HOME = "Full Time Home Goals"
TARGET_AWAY = "Full Time Away Goals"
# Columns that are one-hot encoded instead of being treated as numeric.
CATEGORICAL_FEATURES = ["HomeTeam", "AwayTeam"]
# Columns that must never be part of the feature matrix.
EXCLUDE_COLUMNS = {"Date", TARGET_HOME, TARGET_AWAY}

# --- Display options for wide frames ---
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 120)





In [2]:
# Load the match table and strip stray whitespace from the header names.
df = pd.read_csv(DATA_PATH).rename(columns=str.strip)

# Dates are stored as day-month-2-digit-year; unparseable values become NaT.
if "Date" in df.columns:
    df["Date"] = pd.to_datetime(df["Date"], format="%d-%m-%y", errors="coerce")

# Everything that is neither a team name nor the date is forced to numeric;
# values that cannot be parsed become NaN.
non_numeric = set(CATEGORICAL_FEATURES) | {"Date"}
for col in [name for name in df.columns if name not in non_numeric]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Rows without a usable date cannot be assigned to a season split.
df = df.dropna(subset=["Date"]).copy()
print("Rows:", len(df))
df.head()

Rows: 4560


Unnamed: 0,Date,HomeTeam,AwayTeam,Elo_H_before,Elo_A_before,GoalsScore_H,GoalsAgainst_H,GoalDifference_H,WinStreak_H,LoseStreak_H,Wins_H,Losses_H,WinRate_H,GoalsScore_H_avg,GoalsAgainst_H_avg,GoalsScore_A,GoalsAgainst_A,GoalDifference_A,WinStreak_A,LoseStreak_A,Wins_A,Losses_A,WinRate_A,GoalsScore_A_avg,GoalsAgainst_A_avg,H2H_last_5,Full Time Home Goals,Full Time Away Goals
0,2013-08-17,Arsenal,Aston Villa,1500.0,1500.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,1,3
1,2013-08-17,Liverpool,Stoke,1500.0,1500.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,1,0
2,2013-08-17,Norwich,Everton,1500.0,1500.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,2,2
3,2013-08-17,Sunderland,Fulham,1500.0,1500.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,1
4,2013-08-17,Swansea,Man United,1500.0,1500.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0,1,4


## EDA

In [3]:
# Quick EDA: column names, non-null counts and dtypes of the loaded frame.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4560 entries, 0 to 4559
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  4560 non-null   datetime64[ns]
 1   HomeTeam              4560 non-null   object        
 2   AwayTeam              4560 non-null   object        
 3   Elo_H_before          4560 non-null   float64       
 4   Elo_A_before          4560 non-null   float64       
 5   GoalsScore_H          4560 non-null   int64         
 6   GoalsAgainst_H        4560 non-null   int64         
 7   GoalDifference_H      4560 non-null   int64         
 8   WinStreak_H           4560 non-null   int64         
 9   LoseStreak_H          4560 non-null   int64         
 10  Wins_H                4560 non-null   int64         
 11  Losses_H              4560 non-null   int64         
 12  WinRate_H             4560 non-null   float64       
 13  GoalsScore_H_avg  

## Data Preprocessing


In [4]:
def split_features_targets(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series, pd.Series]:
    """Separate a match frame into the feature matrix and the two goal targets.

    Rows missing either target are dropped first. The feature matrix keeps
    every column that is not listed in ``EXCLUDE_COLUMNS``, in its original
    order, and is returned as a copy.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the ``TARGET_HOME`` and ``TARGET_AWAY`` columns.

    Returns
    -------
    (X, y_home, y_away)
        Feature frame plus the home and away targets cast to float.
    """
    labelled = df.dropna(subset=[TARGET_HOME, TARGET_AWAY])
    kept = [name for name in labelled.columns if name not in EXCLUDE_COLUMNS]
    features = labelled.loc[:, kept].copy()
    home_goals = labelled[TARGET_HOME].astype(float)
    away_goals = labelled[TARGET_AWAY].astype(float)
    return features, home_goals, away_goals

In [5]:
# Season-based splits: every window runs from 1 July to 30 June, bounds inclusive.

train_start, train_end = pd.Timestamp("2013-07-01"), pd.Timestamp("2023-06-30")
val_start, val_end = pd.Timestamp("2023-07-01"), pd.Timestamp("2024-06-30")
test_start, test_end = pd.Timestamp("2024-07-01"), pd.Timestamp("2025-06-30")

# Series.between is inclusive on both ends by default.
match_dates = df["Date"]
mask_train = match_dates.between(train_start, train_end)
mask_val = match_dates.between(val_start, val_end)
mask_test = match_dates.between(test_start, test_end)

df_train = df.loc[mask_train]
df_val = df.loc[mask_val]
df_test = df.loc[mask_test]

print(len(df_train), len(df_val), len(df_test))


3800 380 380


In [6]:
# Build features: apply the same feature/target split to each partition.

(
    (X_train, yh_train, ya_train),
    (X_val, yh_val, ya_val),
    (X_test, yh_test, ya_test),
) = (split_features_targets(part) for part in (df_train, df_val, df_test))


In [7]:
# Column groups are derived from the training frame: the configured
# categorical columns that are actually present, and everything else.
categorical = [name for name in CATEGORICAL_FEATURES if name in X_train.columns]
numeric = [name for name in X_train.columns if name not in categorical]

# Numeric columns: median imputation, then scaling without centering.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False)),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric),
    ("cat", categorical_transformer, categorical),
])


## Model

In [None]:
def build_model(preprocessor: ColumnTransformer) -> Pipeline:
    """Assemble a preprocessing + XGBoost regression pipeline.

    Parameters
    ----------
    preprocessor : ColumnTransformer
        Template for the first pipeline step. It is cloned rather than
        embedded directly, so the object passed in is never fitted or
        modified by the returned pipeline.

    Returns
    -------
    Pipeline
        Unfitted pipeline with the steps ``"preprocessor"`` and
        ``"regressor"``.
    """
    # Swap in a different regressor here to experiment.
    xgb_model = XGBRegressor(
        n_estimators=400,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_alpha=0.0,
        reg_lambda=1.0,
        random_state=42,
        tree_method="hist",
        n_jobs=4,
    )

    # The estimator that is plugged into the pipeline.
    model = xgb_model

    return Pipeline([
        # clone() yields a fresh, unfitted copy with the same parameters.
        # Without it every pipeline built from the same `preprocessor` would
        # share one transformer instance, and fitting one pipeline would
        # silently refit the preprocessing step of the others.
        ("preprocessor", clone(preprocessor)),
        ("regressor", model),
    ])

In [9]:

# One pipeline per regression target (home goals, away goals).
model_home, model_away = build_model(preprocessor), build_model(preprocessor)

In [10]:
# Train: fit one pipeline per target on the same training features.
model_home.fit(X_train, yh_train)
model_away.fit(X_train, ya_train)


## Evaluate

In [11]:
def compute_metrics(y_true: pd.Series, y_pred: np.ndarray) -> Dict[str, float]:
    """Score predictions against the true values.

    Parameters
    ----------
    y_true : pd.Series
        Observed target values.
    y_pred : np.ndarray
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    Dict[str, float]
        Plain Python floats under the keys ``"MAE"``, ``"RMSE"`` and ``"R2"``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is taken as the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return {"MAE": float(mae), "RMSE": float(rmse), "R2": float(r2)}


In [12]:

def evaluate_split(X: pd.DataFrame, y_home: pd.Series, y_away: pd.Series) -> Dict[str, dict]:
    """Score both fitted pipelines on one data split.

    Uses the module-level ``model_home`` and ``model_away`` pipelines.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame of the split.
    y_home, y_away : pd.Series
        True home and away goals for the same rows.

    Returns
    -------
    Dict[str, dict]
        ``{"home": {...}, "away": {...}}``, each holding ``"MAE"``,
        ``"RMSE"`` and ``"R2"``. For an empty split every value is ``None``,
        because the metrics cannot be computed on zero rows.
    """
    if not len(X):
        # Build a separate placeholder dict per target so they are not aliased.
        return {
            target: {"MAE": None, "RMSE": None, "R2": None}
            for target in ("home", "away")
        }
    return {
        "home": compute_metrics(y_home, model_home.predict(X)),
        "away": compute_metrics(y_away, model_away.predict(X)),
    }


# Validation first, then test; this key order is what ends up in metrics.json.
metrics = {
    "val": evaluate_split(X_val, yh_val, ya_val),
    "test": evaluate_split(X_test, yh_test, ya_test),
}

pprint(metrics)


{'test': {'away': {'MAE': 0.8802809047071557,
                   'R2': 0.06145640204186642,
                   'RMSE': 1.1512603309819536},
          'home': {'MAE': 1.0129366269629252,
                   'R2': -0.004192313243202905,
                   'RMSE': 1.2789067870788136}},
 'val': {'away': {'MAE': 0.9359125093703992,
                  'R2': 0.0659629381549044,
                  'RMSE': 1.2333220326642842},
         'home': {'MAE': 1.0602373274925507,
                  'R2': 0.03466914401937782,
                  'RMSE': 1.339968429525098}}}


In [13]:
# Save models and metrics

os.makedirs(MODEL_DIR, exist_ok=True)

# Persist the two fitted pipelines.
joblib.dump(model_home, MODEL_HOME_PATH)
joblib.dump(model_away, MODEL_AWAY_PATH)
print("Saved:", MODEL_HOME_PATH, MODEL_AWAY_PATH)

# Store the evaluation results next to the models as indented JSON.
metrics_path = os.path.join(MODEL_DIR, "metrics.json")
with open(metrics_path, "w") as metrics_file:
    json.dump(metrics, metrics_file, indent=2)
print("Saved metrics to models/metrics.json")


Saved: /home/hunglk/Documents/VSCode/FPT_Campuslink/ML/Final Project/models/home_model.pkl /home/hunglk/Documents/VSCode/FPT_Campuslink/ML/Final Project/models/away_model.pkl
Saved metrics to models/metrics.json
