# Match Results Multi-Regression Model with FLAML

This notebook trains two separate regression models with the FLAML AutoML library: one to predict the `FTHome` score and one to predict the `FTAway` score.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [None]:
# Load the prepared match data; the path is resolved relative to the kernel's working directory.
df = pd.read_csv('../data-prep/matches-prepared.csv')

# Quick sanity check: report the dimensions, then preview the leading rows.
print(f"Dataset shape: {df.shape}")
display(df.head(n=5))

## Preprocessing

Selecting features and splitting the data.

In [None]:
# Define features (X) and target variables (y).
# The columns below are excluded because they are not features or could cause data leakage.
# Only names that actually exist in the frame are dropped, so a missing column is not an error.
# NOTE(review): matching is by exact name only; confirm the prepared CSV holds no other
# columns that are only known after kick-off - TODO confirm against the data-prep step.
drop_cols = ['Division', 'MatchDate', 'FTHome', 'FTAway', 'TotalGoals', 'high_score', 'total_goals', 'over25_binary', 'FTResult', 'HTResult', 'match_result']
X = df.drop(columns=[col for col in drop_cols if col in df.columns])
y_home = df['FTHome']
y_away = df['FTAway']

# Identify numerical and categorical features.
# 'number' covers every integer/float width (not only int64/float64), and 'category'
# is accepted alongside 'object', so no feature is silently left out of these lists.
numerical_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Split data into training and testing sets.
# Both targets go through the same call so their rows stay aligned with X.
X_train, X_test, y_home_train, y_home_test, y_away_train, y_away_test = train_test_split(
    X, y_home, y_away, test_size=0.2, random_state=42
)

# Every row must land in exactly one of the two partitions.
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"

print(f"Train set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

## AutoML Training with FLAML (Improved)

Training two separate models for `FTHome` and `FTAway`, each with a 5-minute search budget. Ensembling is currently disabled to get results faster and can be enabled later.

In [None]:
automl_home = AutoML()

# Search configuration for the FTHome regressor.
settings = {
    "time_budget": 300,  # 5 minutes search budget
    "metric": 'mae',
    "task": 'regression',
    "ensemble": False,   # Disabled for now to get results faster, can be set to True later
    "log_file_name": 'flaml_home.log',
    # Fix the hyperparameter-search seed so reruns explore the same configurations.
    # The budget is wall-clock time, so results can still vary between machines.
    "seed": 42,
}

print("Training AutoML for FTHome...")
automl_home.fit(X_train=X_train, y_train=y_home_train, **settings)

In [None]:
automl_away = AutoML()

# Search configuration for the FTAway regressor.
settings = {
    "time_budget": 300,  # 5 minutes search budget
    "metric": 'mae',
    "task": 'regression',
    "ensemble": False,   # Disabled for now to get results faster, can be set to True later
    "log_file_name": 'flaml_away.log',
    # Fix the hyperparameter-search seed so reruns explore the same configurations.
    # The budget is wall-clock time, so results can still vary between machines.
    "seed": 42,
}

print("Training AutoML for FTAway...")
automl_away.fit(X_train=X_train, y_train=y_away_train, **settings)

## Evaluation

Analyzing the results and metrics. Predictions are rounded to the nearest whole number for realistic score prediction.

In [None]:
def evaluate_regression(y_true, y_pred, label):
    """Print and return error metrics for integer-rounded predictions.

    Args:
        y_true: Observed target values.
        y_pred: Raw model predictions; they are rounded to the nearest
            integer before any metric is computed.
        label: Name of the target, used in the printed header.

    Returns:
        A ``(mae, rmse, r2)`` tuple computed on the rounded predictions.
    """
    # Scores are whole numbers, so every metric is computed on rounded predictions.
    whole_scores = np.round(y_pred).astype(int)

    mae = mean_absolute_error(y_true, whole_scores)
    rmse = np.sqrt(mean_squared_error(y_true, whole_scores))
    r2 = r2_score(y_true, whole_scores)

    # Assemble the report first and emit it in one go.
    report_lines = [
        f"\nMetrics for {label} (Rounded Predictions):",
        f"MAE: {mae:.4f}",
        f"RMSE: {rmse:.4f}",
        f"R2 Score: {r2:.4f}",
    ]
    print("\n".join(report_lines))
    return mae, rmse, r2

# Raw (unrounded) test-set predictions from each fitted model.
y_home_pred, y_away_pred = automl_home.predict(X_test), automl_away.predict(X_test)

# Report the metrics for each target.
evaluate_regression(y_true=y_home_test, y_pred=y_home_pred, label="FTHome")
evaluate_regression(y_true=y_away_test, y_pred=y_away_pred, label="FTAway")

## Error Analysis

Visualizing error distributions and residuals using rounded predictions.

In [None]:
# Integer score predictions used by every panel below.
y_home_pred_rounded = np.round(y_home_pred).astype(int)
y_away_pred_rounded = np.round(y_away_pred).astype(int)

# One entry per grid column: (actual scores, rounded predictions, scatter title, histogram title).
panels = [
    (
        y_home_test,
        y_home_pred_rounded,
        'FTHome: Actual vs Predicted (Rounded)',
        'FTHome Error Distribution (Rounded)',
    ),
    (
        y_away_test,
        y_away_pred_rounded,
        'FTAway: Actual vs Predicted (Rounded)',
        'FTAway Error Distribution (Rounded)',
    ),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

for column, (actual, predicted, scatter_title, hist_title) in enumerate(panels):
    # Top row: actual vs predicted, with a dashed red diagonal marking perfect predictions.
    scatter_ax = axes[0, column]
    sns.scatterplot(x=actual, y=predicted, alpha=0.1, ax=scatter_ax)
    upper = actual.max()
    scatter_ax.plot([0, upper], [0, upper], '--r')
    scatter_ax.set_title(scatter_title)
    scatter_ax.set_xlabel('Actual')
    scatter_ax.set_ylabel('Predicted')

    # Bottom row: distribution of the errors, one bar per whole-number error.
    hist_ax = axes[1, column]
    sns.histplot(actual - predicted, kde=True, discrete=True, ax=hist_ax)
    hist_ax.set_title(hist_title)
    hist_ax.set_xlabel('Error (Actual - Predicted)')

fig.tight_layout()
plt.show()

In [None]:
# Name of the learner FLAML selected for each target.
best_home = automl_home.best_estimator
best_away = automl_away.best_estimator

print("\nBest model for FTHome:", best_home)
print("Best model for FTAway:", best_away)

## Prediction Demo

Predicting results for specific matches from the test set. Predictions are rounded to integers.

In [None]:
def show_predictions(num_samples=10, random_state=None):
    """Display actual vs. rounded predicted scores for random test-set matches.

    Reads the notebook-level ``X_test``, ``y_home_test``, ``y_away_test``,
    ``automl_home`` and ``automl_away``. ``X_test`` must contain the
    ``HomeTeam`` and ``AwayTeam`` columns.

    Args:
        num_samples: Number of matches to show. Capped at the size of the
            test set, because rows are drawn without replacement.
        random_state: Optional integer seed. When given, the same matches are
            selected on every call; when ``None`` the global NumPy random
            state is used.

    Returns:
        None. The comparison table is rendered with ``display``.
    """
    n_rows = len(X_test)
    # Drawing without replacement fails if more rows are requested than exist.
    sample_size = min(num_samples, n_rows)

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    # Draw row positions rather than index labels: the features and both targets
    # come from the same split, so positions keep them aligned even if labels repeat.
    positions = rng.choice(n_rows, size=sample_size, replace=False)

    samples = X_test.iloc[positions]
    actual_home = y_home_test.iloc[positions]
    actual_away = y_away_test.iloc[positions]

    pred_home = automl_home.predict(samples)
    pred_away = automl_away.predict(samples)

    # Round to integers
    pred_home_rounded = np.round(pred_home).astype(int)
    pred_away_rounded = np.round(pred_away).astype(int)

    # The team columns carry the original row labels; the arrays are placed by position.
    results = pd.DataFrame({
        'Home Team': samples['HomeTeam'],
        'Away Team': samples['AwayTeam'],
        'Actual Home': actual_home.values,
        'Pred Home': pred_home_rounded,
        'Actual Away': actual_away.values,
        'Pred Away': pred_away_rounded
    })

    display(results)

print("Example predictions for random matches from the test set (Rounded to Integers):")
# Seeded so that a fresh Restart & Run All shows the same matches.
show_predictions(10, random_state=42)