In [1]:
# Train a match outcome prediction model using recent form, past encounters, key stats, and unpredictability factors.

import pandas as pd
import joblib
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score

# Load Data (Update these paths based on your directory structure)
data_dir = Path("PATH/TO/02_preprocessing")  # Replace with your actual path
history_path = data_dir / "la_liga_history.csv"
history_path = data_dir / "la_liga_history.csv"
recent_path = data_dir / "la_liga_recent.csv"

history_data = pd.read_csv(history_path)
recent_data = pd.read_csv(recent_path)

if "Target" not in recent_data.columns:
    recent_data["Target"] = (recent_data["Result"] == "W").astype(int)
if "Target" not in history_data.columns:
    history_data["Target"] = (history_data["Result"] == "W").astype(int)

history_data["Date"] = pd.to_datetime(history_data["Date"])
recent_data["Date"] = pd.to_datetime(recent_data["Date"])

combined_data = pd.concat([history_data, recent_data], ignore_index=True)

# Compute Past Encounters Score
def calculate_past_encounter_score(team, opponent, data):
    past_matches = data[((data["Team_Code"] == team) & (data["Opp_Code"] == opponent)) |
                        ((data["Team_Code"] == opponent) & (data["Opp_Code"] == team))]
    
    past_matches = past_matches.sort_values("Date", ascending=False).head(5)
    team_wins = sum((past_matches["Team_Code"] == team) & (past_matches["Target"] == 1))
    
    return (team_wins / 5) * 35  

combined_data["Past_Encounters_Score"] = combined_data.apply(
    lambda row: calculate_past_encounter_score(row["Team_Code"], row["Opp_Code"], history_data), axis=1)

# Compute Recent Form Score
def calculate_recent_form_score(team, data):
    recent_matches = data[data["Team_Code"] == team].sort_values("Date", ascending=False).head(6)
    wins = sum(recent_matches["Target"] == 1)
    goal_difference = recent_matches["GF"].sum() - recent_matches["GA"].sum()
    
    return ((wins / 6) * 30) + ((goal_difference / 10) * 5) 

combined_data["Recent_Form_Score"] = combined_data["Team_Code"].apply(
    lambda team: calculate_recent_form_score(team, recent_data))

# Compute Key Stats Score 
key_stats = ["GF", "GA", "SoT", "PK", "PKatt"]

# Ensure proper scaling
for stat in key_stats:
    min_val = combined_data[stat].min()
    max_val = combined_data[stat].max()
    combined_data[f"{stat}_Scaled"] = (combined_data[stat] - min_val) / (max_val - min_val) if max_val > min_val else 0

combined_data["Key_Stats_Score"] = combined_data[[f"{stat}_Scaled" for stat in key_stats]].sum(axis=1) * 5 

# Unpredictability Factor (5 pts)
combined_data["Unpredictability_Factor"] = np.random.randint(0, 6, size=len(combined_data))

# Train Model
predictors = ["Past_Encounters_Score", "Recent_Form_Score", "Key_Stats_Score", "Unpredictability_Factor"]
X = combined_data[predictors]
y = combined_data["Target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest model
base_model = RandomForestClassifier(n_estimators=100, random_state=42)
model = CalibratedClassifierCV(base_model, cv=5)  
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

output_dir = Path("/Users/jventurav/La_Liga_Match_Predictor/03_training")
output_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(model, output_dir / "match_predictor.pkl")
joblib.dump(predictors, output_dir / "predictors.pkl")

print("Model & predictors saved, success.")

Accuracy: 0.77
Model & predictors saved, success.
