# Imports

In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import penaltyblog as pb

from penaltyblog.scrapers import FootballData
from penaltyblog.matchflow import Flow, where_equals, get_field

import warnings
from statsbombpy.api_client import NoAuthWarning
warnings.filterwarnings("ignore", category=NoAuthWarning)

## 2.1: Fetch Match Data and Pinnacle Odds

In [2]:
LEAGUE = "ENG Premier League"
SEASON = "2015-2016" 

print(f"Fetching match data and Pinnacle odds for {LEAGUE} {SEASON}...")

df_odds = FootballData(LEAGUE, SEASON).get_fixtures()

odds_cols = [
    "date",
    "team_home",
    "team_away",
    "goals_home",
    "goals_away",
    "psh", 
    "psd", 
    "psa", 
]

df_v4 = df_odds[odds_cols].copy()
df_v4.dropna(subset=['psh', 'psd', 'psa'], inplace=True)
df_v4.sort_values(by='date', inplace=True)
df_v4.reset_index(drop=True, inplace=True)

print(f"\nSuccessfully fetched {len(df_v4)} matches with Pinnacle odds.")
display(df_v4.head())

Fetching match data and Pinnacle odds for ENG Premier League 2015-2016...

Successfully fetched 380 matches with Pinnacle odds.


Unnamed: 0,date,team_home,team_away,goals_home,goals_away,psh,psd,psa
0,2015-08-08,Bournemouth,Aston Villa,0,1,1.95,3.65,4.27
1,2015-08-08,Chelsea,Swansea,2,2,1.39,4.92,10.39
2,2015-08-08,Everton,Watford,2,2,1.7,3.95,5.62
3,2015-08-08,Leicester,Sunderland,4,2,1.99,3.48,4.34
4,2015-08-08,Man United,Tottenham,1,0,1.65,4.09,5.9


## Step 2.2: Combine Odds Data with StatsBomb Match IDs

In [3]:
COMPETITION_ID = 2
SEASON_ID = 27

print("Fetching official match list from StatsBomb...")
sb_matches_raw = (
    Flow.statsbomb.matches(competition_id=COMPETITION_ID, season_id=SEASON_ID)
    .select("match_id", "match_date", "home_team", "away_team")
    .collect()
)
df_sb_matches = pd.DataFrame(sb_matches_raw)

df_sb_matches['team_home'] = df_sb_matches['home_team'].apply(lambda x: x['home_team_name'])
df_sb_matches['team_away'] = df_sb_matches['away_team'].apply(lambda x: x['away_team_name'])
df_sb_matches.rename(columns={'match_date': 'date'}, inplace=True)

df_sb_matches = df_sb_matches[['match_id', 'date', 'team_home', 'team_away']]

df_v4['date'] = pd.to_datetime(df_v4['date'])
df_sb_matches['date'] = pd.to_datetime(df_sb_matches['date'])

print("Merging odds data with StatsBomb match IDs...")
df_master = pd.merge(
    df_v4,
    df_sb_matches,
    on=['date', 'team_home', 'team_away'],
    how='inner'
)

print(f"\nSuccessfully merged {len(df_master)} matches.")
print("We now have the Pinnacle odds and the StatsBomb match_id in one place.")
display(df_master.head())

Fetching official match list from StatsBomb...
Merging odds data with StatsBomb match IDs...

Successfully merged 72 matches.
We now have the Pinnacle odds and the StatsBomb match_id in one place.


Unnamed: 0,date,team_home,team_away,goals_home,goals_away,psh,psd,psa,match_id
0,2015-08-08,Everton,Watford,2,2,1.7,3.95,5.62,3754300
1,2015-08-15,Southampton,Everton,0,3,1.98,3.6,4.21,3754034
2,2015-08-16,Crystal Palace,Arsenal,1,2,5.34,3.97,1.72,3754312
3,2015-08-22,Crystal Palace,Aston Villa,2,1,1.85,3.63,4.89,3754190
4,2015-08-23,Watford,Southampton,0,0,2.76,3.36,2.78,3754189


## Step 3: Normalize Team Names and Re-merge

In [4]:
print("--- Team Names from football-data.co.uk ---")
print(sorted(df_v4['team_home'].unique()))

print("\n--- Team Names from StatsBomb ---")
print(sorted(df_sb_matches['team_home'].unique()))


# 2. Create a mapping to standardize the names
name_mapping = {
    'Man United': 'Manchester United',
    'Man City': 'Manchester City',
    'West Brom': 'West Bromwich Albion',
    'West Ham': 'West Ham United',
    'Stoke': 'Stoke City',
    'Swansea': 'Swansea City',
    'Leicester': 'Leicester City',
    'Norwich': 'Norwich City',
    'Bournemouth': 'AFC Bournemouth',
    'Tottenham': 'Tottenham Hotspur',
    'QPR': 'Queens Park Rangers'
}

print("\nStandardizing team names...")
df_v4.replace({'team_home': name_mapping, 'team_away': name_mapping}, inplace=True)


print("Re-merging the data...")
df_master = pd.merge(
    df_v4,
    df_sb_matches,
    on=['date', 'team_home', 'team_away'],
    how='inner'
)

print(f"\nSuccessfully merged {len(df_master)} matches.")
display(df_master.head())

--- Team Names from football-data.co.uk ---
['Arsenal', 'Aston Villa', 'Bournemouth', 'Chelsea', 'Crystal Palace', 'Everton', 'Leicester', 'Liverpool', 'Man City', 'Man United', 'Newcastle', 'Norwich', 'Southampton', 'Stoke', 'Sunderland', 'Swansea', 'Tottenham', 'Watford', 'West Brom', 'West Ham']

--- Team Names from StatsBomb ---
['AFC Bournemouth', 'Arsenal', 'Aston Villa', 'Chelsea', 'Crystal Palace', 'Everton', 'Leicester City', 'Liverpool', 'Manchester City', 'Manchester United', 'Newcastle United', 'Norwich City', 'Southampton', 'Stoke City', 'Sunderland', 'Swansea City', 'Tottenham Hotspur', 'Watford', 'West Bromwich Albion', 'West Ham United']

Standardizing team names...
Re-merging the data...

Successfully merged 342 matches.


Unnamed: 0,date,team_home,team_away,goals_home,goals_away,psh,psd,psa,match_id
0,2015-08-08,AFC Bournemouth,Aston Villa,0,1,1.95,3.65,4.27,3754128
1,2015-08-08,Chelsea,Swansea City,2,2,1.39,4.92,10.39,3754078
2,2015-08-08,Everton,Watford,2,2,1.7,3.95,5.62,3754300
3,2015-08-08,Leicester City,Sunderland,4,2,1.99,3.48,4.34,3754237
4,2015-08-08,Manchester United,Tottenham Hotspur,1,0,1.65,4.09,5.9,3754097


4: Engineer Tactical Features for Full Season

In [5]:
feature_data = []

for _, match in tqdm(df_master.iterrows(), total=df_master.shape[0], desc="Processing Matches"):
    try:
        match_id = match['match_id']
        home_team = match['team_home']
        away_team = match['team_away']
        flow = Flow.statsbomb.events(match_id).cache()

        # a) Shot Stats
        shot_stats = (
            flow.filter(where_equals("type.name", "Shot"))
            .group_by("team.name")
            .summary({"total_shots": ("count", "id"), "total_xg": ("sum", "shot.statsbomb_xg")})
            .collect()
        )
        df_shot_stats = pd.DataFrame(shot_stats)

        # b) Possession Stats
        team_possession = (
            flow.filter(lambda r: get_field(r, "duration") is not None)
            .group_by("team.name")
            .summary({"possession_duration": ("sum", "duration")})
            .collect()
        )
        df_possession = pd.DataFrame(team_possession)
        total_duration = df_possession['possession_duration'].sum()
        df_possession['possession_pct'] = (df_possession['possession_duration'] / total_duration) * 100 if total_duration > 0 else 0

        # c) Organize features into a single dictionary
        match_features = {'match_id': match_id}
        for team_name in [home_team, away_team]:
            prefix = 'home' if team_name == home_team else 'away'
            
            team_shots = df_shot_stats[df_shot_stats['team.name'] == team_name]
            team_poss = df_possession[df_possession['team.name'] == team_name]
            
            match_features[f'{prefix}_shots'] = team_shots['total_shots'].iloc[0] if not team_shots.empty else 0
            match_features[f'{prefix}_xg'] = team_shots['total_xg'].iloc[0] if not team_shots.empty else 0
            match_features[f'{prefix}_possession'] = team_poss['possession_pct'].iloc[0] if not team_poss.empty else 0
        
        feature_data.append(match_features)

    except Exception as e:
        print(f"\nSkipping match_id {match.get('match_id', 'N/A')} due to an error: {e}")
        continue

df_features = pd.DataFrame(feature_data)

df_v4_final = pd.merge(df_master, df_features, on='match_id', how='inner')


print(f"\nFeature engineering complete. Enriched dataset with {df_v4_final.shape[0]} matches.")
display(df_v4_final.head())

Processing Matches:   0%|          | 0/342 [00:00<?, ?it/s]


Feature engineering complete. Enriched dataset with 342 matches.


Unnamed: 0,date,team_home,team_away,goals_home,goals_away,psh,psd,psa,match_id,home_shots,home_xg,home_possession,away_shots,away_xg,away_possession
0,2015-08-08,AFC Bournemouth,Aston Villa,0,1,1.95,3.65,4.27,3754128,12,1.249248,55.115652,7,0.651935,44.884348
1,2015-08-08,Chelsea,Swansea City,2,2,1.39,4.92,10.39,3754078,10,0.486011,50.728541,16,2.471848,49.271459
2,2015-08-08,Everton,Watford,2,2,1.7,3.95,5.62,3754300,11,0.7807,61.354711,11,0.619914,38.645289
3,2015-08-08,Leicester City,Sunderland,4,2,1.99,3.48,4.34,3754237,19,2.261113,45.987571,9,1.36772,54.012429
4,2015-08-08,Manchester United,Tottenham Hotspur,1,0,1.65,4.09,5.9,3754097,9,0.525077,50.59303,9,0.584646,49.40697


#### 6: Prepare Data for Machine Learning 

In [9]:
from sklearn.model_selection import train_test_split

# 1. Convert Odds to Probabilities
print("Converting Pinnacle odds to implied probabilities...")
def remove_overround(row):
    odds = [row["psh"], row["psd"], row["psa"]]
    probs = pb.implied.power(odds)["implied_probabilities"]
    return pd.Series(probs)

df_v4_final[['prob_h', 'prob_d', 'prob_a']] = df_v4_final.apply(remove_overround, axis=1)


# 2. Define the Target Variable (y)
def get_result(row):
    if row['goals_home'] > row['goals_away']:
        return 1  # Home Win
    elif row['goals_home'] < row['goals_away']:
        return 2  # Away Win
    else:
        return 0  # Draw
df_v4_final['result'] = df_v4_final.apply(get_result, axis=1)


# 3. Define the Features (X)
features = [
    'home_shots', 'away_shots',
    'home_xg', 'away_xg',
    'home_possession', 'away_possession',
    'prob_h', 'prob_d', 'prob_a'
]
X = df_v4_final[features]
y = df_v4_final['result']

# 4. Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nData successfully prepared and split for V4 training.")
print(f"Training set size: {len(X_train)} matches")
print(f"Testing set size: {len(X_test)} matches")

Converting Pinnacle odds to implied probabilities...

Data successfully prepared and split for V4 training.
Training set size: 273 matches
Testing set size: 69 matches


### 7: Train and Evaluate the V4 XGBoost Model

In [11]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# 1. Train the V4 Model
print("\nTraining the V4 XGBoost model...")
v4_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    seed=42
)
v4_model.fit(X_train, y_train)
print("Model training complete.")

# 2. Evaluate the V4 Model
print("\nEvaluating V4 model performance on the unseen test set...")
y_pred = v4_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n--- V4 Model Accuracy: {accuracy:.2%} ---")
print("\nClassification Report:")
target_names = ['Draw', 'Home Win', 'Away Win']
print(classification_report(y_test, y_pred, target_names=target_names))


Training the V4 XGBoost model...
Model training complete.

Evaluating V4 model performance on the unseen test set...

--- V4 Model Accuracy: 55.07% ---

Classification Report:
              precision    recall  f1-score   support

        Draw       0.39      0.35      0.37        20
    Home Win       0.68      0.70      0.69        27
    Away Win       0.52      0.55      0.53        22

    accuracy                           0.55        69
   macro avg       0.53      0.53      0.53        69
weighted avg       0.54      0.55      0.55        69



In [12]:
from sklearn.model_selection import train_test_split

# 1. Convert Odds to Probabilities
print("Converting Pinnacle odds to implied probabilities...")
def remove_overround(row):
    odds = [row["psh"], row["psd"], row["psa"]]
    probs = pb.implied.power(odds)["implied_probabilities"]
    return pd.Series(probs)

df_v4_final[['prob_h', 'prob_d', 'prob_a']] = df_v4_final.apply(remove_overround, axis=1)


# 2. Define the Target Variable (y)
def get_result(row):
    if row['goals_home'] > row['goals_away']: return 1  # Home Win
    elif row['goals_home'] < row['goals_away']: return 2  # Away Win
    else: return 0  # Draw
df_v4_final['result'] = df_v4_final.apply(get_result, axis=1)


# 3. Define the Features (X)
features = [
    'home_shots', 'away_shots', 'home_xg', 'away_xg',
    'home_possession', 'away_possession',
    'prob_h', 'prob_d', 'prob_a'
]
X = df_v4_final[features]
y = df_v4_final['result']

# 4. Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nData successfully prepared and split for V4 training.")
print(f"Training set size: {len(X_train)} matches")
print(f"Testing set size: {len(X_test)} matches")

Converting Pinnacle odds to implied probabilities...

Data successfully prepared and split for V4 training.
Training set size: 273 matches
Testing set size: 69 matches


In [13]:
import xgboost as xgb
from sklearn.metrics import log_loss, accuracy_score, classification_report

# 1. Train the V4 Model
print("\nTraining the V4 XGBoost model...")
v4_model = xgb.XGBClassifier(objective='multi:softprob', num_class=3, seed=42)
v4_model.fit(X_train, y_train)
print("Model training complete.")

# 2. Evaluate the V4 Model
print("\nEvaluating V4 model performance on the unseen test set...")
y_pred_proba = v4_model.predict_proba(X_test)
y_pred_class = v4_model.predict(X_test) 
loss = log_loss(y_test, y_pred_proba)
print(f"\n--- V4 Model Log Loss: {loss:.4f} ---")


# --- SECONDARY METRIC: Accuracy ---
accuracy = accuracy_score(y_test, y_pred_class)
print(f"V4 Model Accuracy: {accuracy:.2%}\n")


# --- DETAILED REPORT ---
print("Classification Report:")
target_names = ['Draw', 'Home Win', 'Away Win']
print(classification_report(y_test, y_pred_class, target_names=target_names))


Training the V4 XGBoost model...
Model training complete.

Evaluating V4 model performance on the unseen test set...

--- V4 Model Log Loss: 1.3037 ---
V4 Model Accuracy: 55.07%

Classification Report:
              precision    recall  f1-score   support

        Draw       0.39      0.35      0.37        20
    Home Win       0.68      0.70      0.69        27
    Away Win       0.52      0.55      0.53        22

    accuracy                           0.55        69
   macro avg       0.53      0.53      0.53        69
weighted avg       0.54      0.55      0.55        69

