In [2]:
import os
import pandas as pd
from pathlib import Path

# Set project root and define directories
# NOTE(review): `...` is Python's Ellipsis literal, not a path string; it is
# presumably a placeholder for the real project location. Confirm and replace
# it with the actual path before running this cell.
project_root = Path(...).resolve()
# Make the project root the working directory, so relative paths used later in
# the notebook resolve against it.
os.chdir(project_root)

# data_dir is where the input CSVs below are read from; output_dir is where this
# notebook writes its results.
data_dir = project_root / "02_preprocessing"
output_dir = project_root / "03_training"
# Create output_dir (and any missing parents); no error if it already exists.
output_dir.mkdir(parents=True, exist_ok=True)

# Load in combined data
combined_data_path = data_dir / "combined_data.csv"
combined_data = pd.read_csv(combined_data_path)

In [3]:
# Bare expression: shows the loaded combined_data frame as the cell output.
combined_data

Unnamed: 0,Team,Season,Date,Time,Round,Day,Venue,Result,GF,GA,...,SoT_Rolling,Dist_Rolling,FK_Rolling,PK_Rolling,PKatt_Rolling,Venue_Code,Opp_Code,Hour,Day_Code,Target
0,Alaves,2023-2024,2023-08-14,19:30,Matchweek 1,Mon,Away,L,0,1,...,,,,,,0,7,19.0,0.0,0
1,Alaves,2023-2024,2023-08-21,19:00,Matchweek 2,Mon,Home,W,4,3,...,,,,,,1,23,19.0,0.0,1
2,Alaves,2023-2024,2023-08-28,19:30,Matchweek 3,Mon,Away,L,0,1,...,,,,,,0,11,19.0,0.0,0
3,Alaves,2023-2024,2023-09-02,18:30,Matchweek 4,Sat,Home,W,1,0,...,,,,,,1,24,18.0,5.0,1
4,Alaves,2023-2024,2023-09-15,21:00,Matchweek 5,Fri,Away,L,0,2,...,,,,,,0,20,21.0,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3439,Villarreal,2024-2025,2024-12-08,16:15,Matchweek 16,Sun,Away,L,0,2,...,4.000000,16.400000,0.333333,0.666667,0.666667,0,2,16.0,6.0,0
3440,Villarreal,2024-2025,2024-12-15,18:30,Matchweek 17,Sun,Home,L,1,2,...,3.666667,16.166667,0.000000,0.333333,0.333333,1,5,18.0,6.0,0
3441,Villarreal,2024-2025,2024-12-18,21:30,Matchweek 12,Wed,Home,D,1,1,...,2.666667,15.533333,0.000000,0.000000,0.000000,1,20,21.0,2.0,0
3442,Villarreal,2024-2025,2024-12-22,18:30,Matchweek 18,Sun,Away,W,5,2,...,3.000000,16.366667,0.000000,0.000000,0.000000,0,16,18.0,6.0,1


In [4]:
# Load in future games: read future_games.csv from data_dir into a DataFrame
future_games__path = data_dir / "future_games.csv"
future_games = pd.read_csv(future_games__path)

In [5]:
# Bare expression: shows the freshly loaded future_games frame as the cell output.
future_games

Unnamed: 0,Wk,Day,Date,Time,Venue,Team,Opponent,Hour,Day_Code,Venue_Code,Opp_Code
0,19.0,Fri,2025-01-10,21:00,Estadio del Rayo Vallecano,Rayo Vallecano,Celta Vigo,21,4,1,6
1,19.0,Sat,2025-01-11,14:00,Estadio de Mendizorroza,Alaves,Girona,14,5,1,12
2,19.0,Sat,2025-01-11,16:15,Estadio Municipal José Zorrilla,Valladolid,Real Betis,16,5,1,5
3,19.0,Sat,2025-01-11,18:30,RCDE Stadium,Espanyol,Leganes,18,5,1,16
4,19.0,Sat,2025-01-11,21:00,Estadio Ramón Sánchez Pizjuán,Sevilla,Valencia,21,5,1,24
5,19.0,Sun,2025-01-12,13:00,Estadio de Gran Canaria,Las Palmas,Getafe,13,6,1,11
6,19.0,Sun,2025-01-12,16:15,Riyadh Air Metropolitan Stadium,Atletico Madrid,Osasuna,16,6,1,19
7,19.0,Mon,2025-01-13,21:00,Reale Arena,Real Sociedad,Villarreal,21,0,1,26
8,20.0,Fri,2025-01-17,21:00,RCDE Stadium,Espanyol,Valladolid,21,4,1,25
9,20.0,Sat,2025-01-18,14:00,Estadio El Sadar,Osasuna,Rayo Vallecano,14,5,1,20


In [7]:
# Match statistics that each have a companion "<stat>_Rolling" column
cols = ["GF", "GA", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
new_cols = [stat + "_Rolling" for stat in cols]

# Model inputs: the four encoded fixture columns followed by the rolling columns
predictors = ["Venue_Code", "Opp_Code", "Hour", "Day_Code", *new_cols]

# Fail fast when combined_data lacks any of the predictor columns
missing_predictors_in_combined_data = set(predictors).difference(combined_data.columns)
if len(missing_predictors_in_combined_data) > 0:
    raise ValueError(f"Missing predictors in combined_data: {missing_predictors_in_combined_data}")

# Label vector and feature matrix used for training
y = combined_data["Target"]
X = combined_data[predictors]

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Randomly hold out 20% of the rows; the fixed seed keeps the split repeatable.
# X_test / y_test are not used again in the cells visible in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train the Random Forest model on the training portion only
rf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=10,
    random_state=42,
)
rf.fit(X_train, y_train)

In [19]:
# Build the prediction feature matrix for the fixtures in future_games.
#
# Reads:  predictors, new_cols, combined_data, future_games
# Writes: recent_rolling, future_games (rolling columns attached), X_future
# Raises: ValueError if a non-rolling predictor is absent from future_games,
#         if the merge did not produce every rolling column, or if X_future
#         still contains missing values.
# The cell can be re-run safely: rolling columns attached by an earlier run are
# dropped before merging again.

# Check if all predictors are present in future_games
missing_predictors_in_future_games = set(predictors) - set(future_games.columns)

# Only the rolling columns are filled in below. Any other missing predictor
# cannot be recovered here, so stop with a clear message instead of failing
# later with a KeyError when X_future is selected.
unrecoverable_predictors = missing_predictors_in_future_games - set(new_cols)
if unrecoverable_predictors:
    raise ValueError(f"Missing predictors in future_games: {unrecoverable_predictors}")
if missing_predictors_in_future_games:
    print(f"Adding missing predictors: {missing_predictors_in_future_games}")

# Get the most recent rolling averages for each team from combined_data.
# GroupBy.last() returns, per column, the last non-null value in each group, so
# after sorting by Date this is each team's latest available value.
# NOTE(review): assumes "Date" sorts chronologically (ISO yyyy-mm-dd strings in
# the displayed rows) — confirm.
recent_rolling = (
    combined_data.sort_values("Date")
    .groupby("Team")[new_cols]
    .last()
    .reset_index()
)

# Merge recent rolling averages with future_games.
# Rolling columns left over from a previous run are dropped first; merging onto
# them would yield *_x / *_y suffixed duplicates and make the check below fail.
future_games = (
    future_games.drop(columns=new_cols, errors="ignore")
    .merge(recent_rolling, on="Team", how="left")
)

# Check for missing rolling averages after merge
missing_rollings = set(new_cols) - set(future_games.columns)
if missing_rollings:
    raise ValueError(f"Rolling averages still missing after merge: {missing_rollings}")

# Report teams that received no rolling averages at all (for example because the
# team name has no match in combined_data) before the fill below hides them.
rows_without_rolling = future_games[new_cols].isna().all(axis=1)
teams_without_rolling = sorted(
    future_games.loc[rows_without_rolling, "Team"].dropna().astype(str).unique()
)
if teams_without_rolling:
    print(f"Warning: no rolling averages found for {teams_without_rolling}; filling with 0")

# Fill missing rolling averages with 0 (or another default value if appropriate)
future_games[new_cols] = future_games[new_cols].fillna(0)

# Prepare future_games for prediction
X_future = future_games[predictors]

# Check for missing values in X_future (e.g. NaN in one of the non-rolling predictors)
if X_future.isnull().any().any():
    raise ValueError("Missing values detected in X_future. Please check preprocessing steps.")


Adding missing predictors: {'FK_Rolling', 'Sh_Rolling', 'Dist_Rolling', 'GF_Rolling', 'SoT_Rolling', 'PK_Rolling', 'PKatt_Rolling', 'GA_Rolling'}


In [20]:
# Bare expression: shows future_games after the rolling-average columns were merged in.
future_games

Unnamed: 0,Wk,Day,Date,Time,Venue,Team,Opponent,Hour,Day_Code,Venue_Code,Opp_Code,GF_Rolling,GA_Rolling,Sh_Rolling,SoT_Rolling,Dist_Rolling,FK_Rolling,PK_Rolling,PKatt_Rolling
0,19.0,Fri,2025-01-10,21:00,Estadio del Rayo Vallecano,Rayo Vallecano,Celta Vigo,21,4,1,6,1.666667,1.666667,14.0,3.666667,18.2,0.666667,0.0,0.0
1,19.0,Sat,2025-01-11,14:00,Estadio de Mendizorroza,Alaves,Girona,14,5,1,12,1.666667,1.666667,7.333333,2.666667,16.966667,0.666667,0.333333,0.333333
2,19.0,Sat,2025-01-11,16:15,Estadio Municipal José Zorrilla,Valladolid,Real Betis,16,5,1,5,0.666667,1.666667,4.0,1.333333,28.466667,0.333333,0.0,0.0
3,19.0,Sat,2025-01-11,18:30,RCDE Stadium,Espanyol,Leganes,18,5,1,16,0.333333,0.666667,9.333333,2.666667,21.2,0.0,0.0,0.0
4,19.0,Sat,2025-01-11,21:00,Estadio Ramón Sánchez Pizjuán,Sevilla,Valencia,21,5,1,24,2.0,2.666667,7.333333,3.333333,19.2,0.0,0.0,0.0
5,19.0,Sun,2025-01-12,13:00,Estadio de Gran Canaria,Las Palmas,Getafe,13,6,1,11,1.0,0.333333,10.0,3.666667,21.833333,1.333333,0.0,0.0
6,19.0,Sun,2025-01-12,16:15,Riyadh Air Metropolitan Stadium,Atletico Madrid,Osasuna,16,6,1,19,2.333333,1.333333,10.0,5.333333,20.466667,0.333333,0.0,0.0
7,19.0,Mon,2025-01-13,21:00,Reale Arena,Real Sociedad,Villarreal,21,0,1,26,1.0,0.666667,12.333333,3.333333,17.766667,0.0,0.0,0.0
8,20.0,Fri,2025-01-17,21:00,RCDE Stadium,Espanyol,Valladolid,21,4,1,25,0.333333,0.666667,9.333333,2.666667,21.2,0.0,0.0,0.0
9,20.0,Sat,2025-01-18,14:00,Estadio El Sadar,Osasuna,Rayo Vallecano,14,5,1,20,1.0,1.333333,9.333333,3.666667,20.533333,0.333333,0.0,0.0


In [21]:
# Save trained_future_games.csv
# The file is written under output_dir.
trained_future_games_path = output_dir / "trained_future_games.csv"

# index=False keeps the DataFrame index out of the CSV.
future_games.to_csv(trained_future_games_path, index=False)
# NOTE(review): the output recorded for this cell shows a different message and
# a path that is not under output_dir, so it looks like it came from an earlier
# version of the cell — re-run to refresh it; confirm.
print(f"Saved to {trained_future_games_path}")

Trained future games saved to /Users/jventurav/La_Liga_Match_Predictor/data/trained_future_games.csv


In [23]:
import joblib

# Destination file names; both are relative, so they resolve against the
# current working directory.
model_path, predictors_path = "random_forest_model.pkl", "predictors.pkl"

# Persist the fitted forest first, then the predictor list
joblib.dump(rf, model_path)
joblib.dump(predictors, predictors_path)

# Confirm where each artifact was written
print(
    f"Model saved to {model_path}",
    f"Predictors saved to {predictors_path}",
    sep="\n",
)

Model saved to random_forest_model.pkl
Predictors saved to predictors.pkl
