Feature engineering for National League matches.

In [None]:

import pandas as pd
import numpy as np

# Read the 2023/24 match statistics; the literal string 'N/A' is parsed as missing
csv_path = '../../england-national-league-matches-2023-to-2024-stats.csv'
df = pd.read_csv(csv_path, na_values=['N/A'])

# Columns the feature calculations below need as numbers
numeric_cols = [
    'team_a_xg', 'team_b_xg',
    'home_ppg', 'away_ppg',
    'odds_ft_home_team_win', 'odds_ft_draw', 'odds_ft_away_team_win',
    'odds_ft_over15', 'odds_ft_over25', 'odds_ft_over35', 'odds_ft_over45',
    'odds_btts_yes', 'odds_btts_no',
    'home_team_goal_count', 'away_team_goal_count',
]

# Coerce every listed column to a numeric dtype (unparseable entries become NaN),
# then replace all remaining NaN with 0.
# NOTE(review): the zero-fill also covers the goal-count and xG columns, so a row
# with a missing score becomes indistinguishable from a 0-0 result - confirm
# that this is intended.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

# Difference features: first-listed side minus second-listed side
# NOTE(review): presumably team_a is the home side and team_b the away side,
# matching home_ppg/away_ppg - verify against the data dictionary.
xg_diff = df['team_a_xg'].sub(df['team_b_xg'])      # expected-goals gap
ppg_diff = df['home_ppg'].sub(df['away_ppg'])       # points-per-game gap

df['xg_diff'] = xg_diff
df['ppg_diff'] = ppg_diff

# Convert full-time result odds to implied probabilities
result_odds = ['odds_ft_home_team_win', 'odds_ft_draw', 'odds_ft_away_team_win']
# Odds of 0 cannot be inverted; mark them as missing to avoid division by zero
for col in result_odds:
    df[col] = df[col].replace({0: np.nan})

implied = 1 / df[result_odds]
# Normalize so the three outcomes sum to 1, which removes the bookmaker margin.
# min_count forces the row total to NaN unless all three odds are present: the
# default sum skips NaN, which would rescale the remaining outcomes over a
# partial total and produce plausible-looking but wrong probabilities.
implied_sum = implied.sum(axis=1, min_count=len(result_odds))
for col in result_odds:
    df[col.replace('odds_', 'prob_')] = implied[col] / implied_sum

# Goal-line and both-teams-to-score markets
other_odds = ['odds_ft_over15', 'odds_ft_over25', 'odds_ft_over35', 'odds_ft_over45',
              'odds_btts_yes', 'odds_btts_no']
for col in other_odds:
    # Same zero-odds guard as above
    df[col] = df[col].replace({0: np.nan})
    prob_col = col.replace('odds_', 'prob_')
    # Plain inverse odds: no normalization is applied to these markets, so the
    # resulting values still include the bookmaker margin.
    df[prob_col] = 1 / df[col]

# Points each side took from the match (3 for a win, 1 for a draw, 0 for a loss);
# these feed the form calculations.
goals_home = df['home_team_goal_count']
goals_away = df['away_team_goal_count']

home_win = goals_home > goals_away
away_win = goals_home < goals_away
drawn = goals_home == goals_away

# Win is checked first, then draw; anything else scores 0
df['home_points'] = np.where(home_win, 3, np.where(drawn, 1, 0))
df['away_points'] = np.where(away_win, 3, np.where(drawn, 1, 0))

# Order matches by time so that "previous matches" means earlier rows
df = df.sort_values('timestamp')

# Form window: number of prior matches averaged
N = 5


def _lagged_mean(series, window=N):
    """Mean of up to `window` preceding values, excluding the current row."""
    # shift() drops the current match from its own average; min_periods=1 lets
    # the average start from a single earlier match (the first match gets NaN).
    return series.shift().rolling(window, min_periods=1).mean()


# (output column, grouping column, value column)
# Each feature is grouped by one venue column only, so the home_* features
# average a team's earlier home rows and the away_* features its earlier away rows.
form_specs = [
    ('home_points_last5', 'home_team_name', 'home_points'),          # points form
    ('away_points_last5', 'away_team_name', 'away_points'),
    ('home_goals_last5', 'home_team_name', 'home_team_goal_count'),  # goals form
    ('away_goals_last5', 'away_team_name', 'away_team_goal_count'),
]
for out_col, team_col, value_col in form_specs:
    df[out_col] = df.groupby(team_col)[value_col].transform(_lagged_mean)

# Engineered feature columns, in display order
feature_cols = [
    # strength gaps
    'xg_diff',
    'ppg_diff',
    # match-result probabilities
    'prob_ft_home_team_win',
    'prob_ft_draw',
    'prob_ft_away_team_win',
    # goal-line and both-teams-to-score probabilities
    'prob_ft_over15',
    'prob_ft_over25',
    'prob_ft_over35',
    'prob_ft_over45',
    'prob_btts_yes',
    'prob_btts_no',
    # recent form
    'home_points_last5',
    'away_points_last5',
    'home_goals_last5',
    'away_goals_last5',
]

# Preview the first rows of the engineered features
df.loc[:, feature_cols].head()
