In [1]:
# Cell 1: Imports & Data Loading
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Read the merged fixtures + xG table; the path is relative to the
# notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer="merged_football_understat_last_two_seasons_clean.csv",
)

# Render the first rows as a quick sanity check on the loaded columns.
display(df.head(n=5))

Unnamed: 0,date,home_team,away_team,home_score,away_score,status,season,home_xG,home_xGA,home_result,away_xG,away_xGA,away_result
0,2023-08-11,Burnley FC,Manchester City FC,0,3,FINISHED,2023,0.311032,2.40074,l,2.40074,0.311032,w
1,2023-08-12,Arsenal FC,Nottingham Forest FC,2,1,FINISHED,2023,0.84262,0.966305,w,0.966305,0.84262,l
2,2023-08-12,AFC Bournemouth,West Ham United FC,1,1,FINISHED,2023,1.51025,1.4834,d,,,
3,2023-08-12,Brighton & Hove Albion FC,Luton Town FC,4,1,FINISHED,2023,,,,,,
4,2023-08-12,Everton FC,Fulham FC,0,1,FINISHED,2023,2.59001,1.58144,l,1.58144,2.59001,w


In [2]:
# Cell 2: Preprocessing
# Keep only matches with a recorded final score: these two columns are the
# regression targets, so a row without them cannot be used.
df = df.dropna(subset=['home_score', 'away_score'])

# Encode team names.
# Both encoders are fitted on the union of home and away team names so that a
# club maps to the same integer in either column, and so that a club present
# in only one of the two columns is still known to both encoders. When every
# club appears in both columns this yields exactly the codes that fitting each
# encoder on its own column would.
all_teams = pd.concat([df['home_team'], df['away_team']]).unique()
le_home = LabelEncoder().fit(all_teams)
le_away = LabelEncoder().fit(all_teams)

df['home_team_encoded'] = le_home.transform(df['home_team'])
df['away_team_encoded'] = le_away.transform(df['away_team'])

# Features & target
# NOTE(review): the xG/xGA columns look like statistics of the very match
# whose score is being predicted -- confirm; if so they are not available
# before kick-off and the test MAE will be optimistic.
feature_cols = ['home_team_encoded', 'away_team_encoded', 'home_xG', 'away_xG', 'home_xGA', 'away_xGA']

# Missing xG/xGA values are imputed with 0.
# NOTE(review): 0 is also a legitimate xG value, so this conflates "unknown"
# with "no chances created" -- consider dropping or mean-imputing instead.
X = df[feature_cols].fillna(0)

# No imputation is needed for the targets: rows with a missing score were
# already removed by the dropna above.
y = df[['home_score', 'away_score']]

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
# Cell 3: Model Training
# One random forest per target column (home goals, away goals); fit() returns
# the fitted wrapper, so construction and training are chained.
model = MultiOutputRegressor(
    RandomForestRegressor(random_state=42, n_estimators=200)
).fit(X_train, y_train)

# Score the held-out matches. With the default settings the MAE is averaged
# over both target columns into a single number.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error: {mae:.2f}")


Mean Absolute Error: 0.84


In [4]:
# Cell 4: Prediction Function
def predict_match(home_team: str, away_team: str):
    """Predict the scoreline and outcome of a fixture with the fitted model.

    Relies on the notebook globals ``df``, ``le_home``, ``le_away``,
    ``feature_cols`` and ``model`` created by the preceding cells.

    Parameters
    ----------
    home_team : str
        Home side's name, spelled exactly as in ``df['home_team']``.
    away_team : str
        Away side's name, spelled exactly as in ``df['away_team']``.

    Returns
    -------
    dict or str
        On success, a dict with the keys ``"Predicted Outcome"`` and
        ``"Predicted Scoreline"``. If either name is unknown to its label
        encoder, the string ``"One or both team names not in dataset."`` is
        returned instead of raising.
    """
    if home_team not in le_home.classes_ or away_team not in le_away.classes_:
        return "One or both team names not in dataset."

    # Encode the names with the same encoders used for training.
    home_encoded = le_home.transform([home_team])[0]
    away_encoded = le_away.transform([away_team])[0]

    # Stand-in xG/xGA inputs: each side's mean over ALL of its matches in `df`
    # (home matches for the home side, away matches for the away side).
    # A side with no usable rows yields NaN means, which fall back to 0.
    home_stats = df.loc[df['home_team'] == home_team, ['home_xG', 'home_xGA']].mean().fillna(0)
    away_stats = df.loc[df['away_team'] == away_team, ['away_xG', 'away_xGA']].mean().fillna(0)

    # Build a one-row DataFrame with the training column names, ordered by
    # `feature_cols`. This pairs every value with its column explicitly and
    # avoids scikit-learn's "X does not have valid feature names" warning that
    # a bare ndarray triggers when the model was fitted on a DataFrame.
    match_features = pd.DataFrame(
        [{
            'home_team_encoded': home_encoded,
            'away_team_encoded': away_encoded,
            'home_xG': home_stats['home_xG'],
            'away_xG': away_stats['away_xG'],
            'home_xGA': home_stats['home_xGA'],
            'away_xGA': away_stats['away_xGA'],
        }],
        columns=feature_cols,
    )

    # Round the two regressed goal counts to whole goals. The explicit
    # float -> round -> int chain guarantees built-in ints, so the scoreline
    # never renders as e.g. "3.0" whatever numeric type predict() returns.
    pred_home_score, pred_away_score = (
        int(round(float(score))) for score in model.predict(match_features)[0]
    )

    # Determine winner
    if pred_home_score > pred_away_score:
        outcome = f"{home_team} Win"
    elif pred_home_score < pred_away_score:
        outcome = f"{away_team} Win"
    else:
        outcome = "Draw"

    return {
        "Predicted Outcome": outcome,
        "Predicted Scoreline": f"{home_team} {pred_home_score} - {pred_away_score} {away_team}"
    }


In [9]:
# Cell 5: Example Prediction
# Run the predictor on a sample fixture (home side first) and print the
# returned value.
result = predict_match(home_team="Manchester City FC", away_team="Chelsea FC")
print(result)



{'Predicted Outcome': 'Manchester City FC Win', 'Predicted Scoreline': 'Manchester City FC 3 - 1 Chelsea FC'}
