### Import Dataset
- Reads the dataset `game_statistics.csv` into a pandas DataFrame for analysis.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Folder locations used by this notebook (input and output both live in the processed-data folder)
READ_FOLDER_PATH = '../data/processed/'
WRITE_FOLDER_PATH = '../data/processed/'

# Never truncate long cell contents when a DataFrame is displayed
pd.set_option("display.max_colwidth", None)

data = pd.read_csv(READ_FOLDER_PATH+'game_statistics.csv')

# Number of rows loaded
print(len(data))

941009


### Calculate League Averages
- Calculates league averages for: total_corners, home_corners and away_corners

In [2]:
def calculate_league_averages(df):
    """
    Calculate average full-time corner statistics for each league.

    The corner columns are read as running totals: the value on a game's last
    row is taken as that game's full-time count.

    Parameters
    ----------
    df : pandas.DataFrame
        Game data with the columns 'id_odsp', 'league', 'country',
        'home_corners' and 'away_corners'. When a 'time' column is present the
        rows are ordered by it first, so the "last row" of a game is the
        chronologically last one (the same convention process_game_data uses).
        Without a 'time' column the rows are used in the order given.

    Returns
    -------
    dict
        Maps (country, league) to a dict with the keys 'avg_corners',
        'avg_home_corners' and 'avg_away_corners' (mean per game).
    """
    # Order rows chronologically so that iloc[-1] below really is the full-time row.
    # mergesort is stable: rows sharing the same minute keep their original order.
    if 'time' in df.columns:
        df = df.sort_values(by='time', kind='mergesort')

    # Collect each game's full-time corner counts under its (country, league) key
    league_stats = {}
    for _, game_data in df.groupby('id_odsp'):
        key = (game_data['country'].iloc[0], game_data['league'].iloc[0])

        home_corners_ft = game_data['home_corners'].iloc[-1]
        away_corners_ft = game_data['away_corners'].iloc[-1]

        stats = league_stats.setdefault(key, {'corners': [], 'home_corners': [], 'away_corners': []})
        stats['corners'].append(home_corners_ft + away_corners_ft)
        stats['home_corners'].append(home_corners_ft)
        stats['away_corners'].append(away_corners_ft)

    # A key is only created together with its first game, so every list holds at
    # least one value and the means below cannot divide by zero.
    return {
        key: {
            'avg_corners': sum(stats['corners']) / len(stats['corners']),
            'avg_home_corners': sum(stats['home_corners']) / len(stats['home_corners']),
            'avg_away_corners': sum(stats['away_corners']) / len(stats['away_corners']),
        }
        for key, stats in league_stats.items()
    }

### Process Game Data
- Groups match event data by game ID to get aggregated stats: up to and including the 80th minute, at full time, and in rolling windows (70-75 and 75-80 minutes).
- **Target variable** = whether a corner was won after 80 minutes

In [3]:
def _corner_rate_vs_league(corners_pre80, league_avg):
    """
    Compare a game's corner pace over the first 80 minutes with a league's pace over 90.

    Returns NaN when the league average is missing (None) or zero, because the
    ratio is undefined in both cases.
    """
    if league_avg is None or league_avg == 0:
        return float('nan')
    return (corners_pre80 / 80) / (league_avg / 90)


def process_game_data(df, league_averages):
    """
    Aggregate per-row match data into one row of features per game.

    The statistic columns are read as running totals: the last row of a slice
    gives the total reached by the end of that slice.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'id_odsp', 'time', 'date', 'season', 'league', 'country',
        'ht', 'at', 'odd_h', 'odd_d', 'odd_a', 'score' (a 'home-away' string)
        and the home_/away_ statistic columns listed in `stats_columns` below.
    league_averages : dict
        Output of calculate_league_averages: (country, league) ->
        {'avg_corners', 'avg_home_corners', 'avg_away_corners'}.
        A league missing from this dict gets None averages and NaN rates.

    Returns
    -------
    pandas.DataFrame
        One row per game with metadata, stats up to minute 80 (inclusive),
        full-time stats, 70-75 / 75-80 window changes, corner counts after
        minute 80, the targets, and league / goal-difference features.
        'target' is 1 when either team won a corner after minute 80.
        'target2' is 1 when the team trailing by exactly one goal at minute 80
        won a corner afterwards, and None when no team trails by exactly one.

    Raises
    ------
    IndexError
        If a game has no rows at or before minute 80.
    """
    stats_columns = ['home_shots', 'away_shots', 'home_shots_on_target', 'away_shots_on_target',
                     'home_shots_off_target', 'away_shots_off_target', 'home_yellow_cards', 'away_yellow_cards',
                     'home_sending_off', 'away_sending_off', 'home_fouls', 'away_fouls',
                     'home_corners', 'away_corners']
    # Rolling windows as half-open minute ranges [start, end)
    window_bounds = {'70_75': (70, 75), '75_80': (75, 80)}
    # Countries that get a one-hot league column
    league_countries = ('england', 'spain', 'germany', 'italy', 'france')

    games = []
    for game_id, game_data in df.groupby('id_odsp'):
        # Stable sort: rows recorded in the same minute keep their original order,
        # so iloc[-1] on the running totals always picks the latest row.
        game_data = game_data.sort_values(by='time', kind='mergesort')

        country = game_data['country'].iloc[0]
        league = game_data['league'].iloc[0]

        # Everything up to and including minute 80
        pre_80_data = game_data[game_data['time'] <= 80]

        # Score at 80 minutes and at full time
        score_pre_80 = pre_80_data['score'].iloc[-1]
        home_score_pre_80, away_score_pre_80 = map(int, score_pre_80.split('-'))
        score_ft = game_data['score'].iloc[-1]

        # Game state at 80 minutes: 'h' home leads, 'a' away leads, 'd' draw
        goal_diff_80 = home_score_pre_80 - away_score_pre_80
        if goal_diff_80 > 0:
            game_state_80 = 'h'
        elif goal_diff_80 < 0:
            game_state_80 = 'a'
        else:
            game_state_80 = 'd'
        game_state_encoded = {
            f'game_state_80_{state}': 1 if game_state_80 == state else 0
            for state in ('h', 'a', 'd')
        }

        # Running totals at minute 80 and at full time
        stats_pre_80 = {f'{stat}_pre80': pre_80_data[stat].iloc[-1] for stat in stats_columns}
        stats_ft = {f'{stat}_ft': game_data[stat].iloc[-1] for stat in stats_columns}
        stats_ft['total_corners_ft'] = stats_ft['home_corners_ft'] + stats_ft['away_corners_ft']

        # Change of each running total inside the rolling windows.
        # NOTE(review): the change is measured from the window's first row to its
        # last row, so whatever the first row itself recorded is not counted, and a
        # window with a single row yields 0 - confirm this is intended.
        window_stats = {}
        for window_name, (start, end) in window_bounds.items():
            window_data = game_data[(game_data['time'] >= start) & (game_data['time'] < end)]
            for stat in stats_columns:
                if len(window_data) > 1:
                    stat_change = window_data[stat].iloc[-1] - window_data[stat].iloc[0]
                else:
                    stat_change = 0
                window_stats[f'{stat}_{window_name}'] = stat_change

        # Corners won between minute 80 and full time
        team_corner_diffs = {
            'home_corner_diff': stats_ft['home_corners_ft'] - stats_pre_80['home_corners_pre80'],
            'away_corner_diff': stats_ft['away_corners_ft'] - stats_pre_80['away_corners_pre80'],
        }
        corner_diff = team_corner_diffs['home_corner_diff'] + team_corner_diffs['away_corner_diff']

        # Target: did either team get a corner after 80?
        target = int(corner_diff > 0)

        # Second target: did the team trailing by exactly one goal get a corner after 80?
        trailing_team = None
        trailing_team_corners = None
        target2 = None
        if home_score_pre_80 == away_score_pre_80 + 1:    # away is trailing by 1 goal
            trailing_team = 'away'
        elif away_score_pre_80 == home_score_pre_80 + 1:  # home is trailing by 1 goal
            trailing_team = 'home'
        if trailing_team is not None:
            trailing_team_corners = team_corner_diffs[f'{trailing_team}_corner_diff']
            target2 = int(trailing_team_corners > 0)

        # League features: one-hot country flags, league averages, and the game's
        # corner pace relative to the league's expected pace
        averages = league_averages.get((country, league), {})
        avg_corners = averages.get('avg_corners')
        avg_home_corners = averages.get('avg_home_corners')
        avg_away_corners = averages.get('avg_away_corners')
        home_corners_pre80 = stats_pre_80['home_corners_pre80']
        away_corners_pre80 = stats_pre_80['away_corners_pre80']

        league_data = {
            f'league_{name}': 1 if country.lower() == name else 0
            for name in league_countries
        }
        league_data.update({
            'league_avg_corners': avg_corners,
            'league_avg_home_corners': avg_home_corners,
            'league_avg_away_corners': avg_away_corners,
            'corner_rate_vs_avg': _corner_rate_vs_league(home_corners_pre80 + away_corners_pre80, avg_corners),
            'home_corner_rate_vs_avg': _corner_rate_vs_league(home_corners_pre80, avg_home_corners),
            'away_corner_rate_vs_avg': _corner_rate_vs_league(away_corners_pre80, avg_away_corners),
        })

        # Features to capture goal difference effects
        goal_diff_features = {
            'goal_diff_80': goal_diff_80,
            'abs_goal_diff_80': abs(goal_diff_80),
            'close_game_80': 1 if abs(goal_diff_80) <= 1 else 0,
            'goal_diff_squared_80': goal_diff_80**2
        }

        # Key order here defines the column order of the returned frame
        games.append({
            'id_odsp': game_id,
            'date': game_data['date'].iloc[0],
            'season': game_data['season'].iloc[0],
            'league': league,
            'country': country,
            'home_team': game_data['ht'].iloc[0],
            'away_team': game_data['at'].iloc[0],
            'odd_h': game_data['odd_h'].iloc[0],
            'odd_d': game_data['odd_d'].iloc[0],
            'odd_a': game_data['odd_a'].iloc[0],
            'score_pre_80': score_pre_80,
            'score_ft': score_ft,
            'game_state_80': game_state_80,
            'goal_diff_80': goal_diff_80,
            **stats_pre_80,
            **stats_ft,
            **window_stats,
            **team_corner_diffs,
            'corner_diff': corner_diff,
            'target': target,
            'trailing_team': trailing_team,
            'trailing_team_corners': trailing_team_corners,
            'target2': target2,
            **game_state_encoded,
            **league_data,
            **goal_diff_features
        })

    # Convert to DataFrame
    return pd.DataFrame(games)

# League-level corner averages are computed first because the per-game features use them
league_averages = calculate_league_averages(data)

# One row per game, ordered by date then game id, with a clean 0..n-1 index
aggregated_data = (
    process_game_data(data, league_averages)
    .sort_values(by=['date', 'id_odsp'])
    .reset_index(drop=True)
)
aggregated_data

Unnamed: 0,id_odsp,date,season,league,country,home_team,away_team,odd_h,odd_d,odd_a,...,league_france,league_avg_corners,league_avg_home_corners,league_avg_away_corners,corner_rate_vs_avg,home_corner_rate_vs_avg,away_corner_rate_vs_avg,abs_goal_diff_80,close_game_80,goal_diff_squared_80
0,UFot0hit/,2011-08-05,2012,D1,germany,Borussia Dortmund,Hamburg SV,1.56,4.41,7.42,...,0,9.501866,5.307836,4.194030,1.539171,1.483656,1.609431,2,0,4
1,Aw5DflLH/,2011-08-06,2012,D1,germany,FC Augsburg,SC Freiburg,2.36,3.60,3.40,...,0,9.501866,5.307836,4.194030,0.828785,0.847803,0.804715,1,1,1
2,CzPV312a/,2011-08-06,2012,F1,france,Paris Saint-Germain,Lorient,1.55,4.50,9.40,...,1,9.479769,5.359345,4.120424,1.068064,0.839655,1.365151,1,1,1
3,GUOdmtII/,2011-08-06,2012,F1,france,Caen,Valenciennes,2.50,3.40,3.45,...,1,9.479769,5.359345,4.120424,1.186738,0.419827,2.184241,1,1,1
4,M7PhlM2C/,2011-08-06,2012,F1,france,Brest,Evian Thonon Gaillard,2.29,3.25,3.85,...,1,9.479769,5.359345,4.120424,1.780107,1.679310,1.911211,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9069,vJy048Er/,2017-01-22,2017,I1,italy,Empoli,Udinese,2.90,3.18,2.96,...,0,10.287572,5.747110,4.540462,0.874842,0.978753,0.743316,0,1,0
9070,xAkY8l6R/,2017-01-22,2017,I1,italy,Genoa,Crotone,1.97,4.35,8.00,...,0,10.287572,5.747110,4.540462,1.093552,0.978753,1.238861,0,1,0
9071,xSU9scI9/,2017-01-22,2017,E0,england,Chelsea,Hull,1.19,8.50,20.00,...,0,10.576597,5.874519,4.702079,1.595504,1.532040,1.674791,1,1,1
9072,xY7uZwOI/,2017-01-22,2017,F1,france,AS Monaco,Lorient,1.32,6.24,11.50,...,1,9.479769,5.359345,4.120424,0.949390,0.629741,1.365151,4,0,16


### Construct Engineered Features

In [4]:
def calc_home_urgency(df):
    """
    Compute the home team's urgency to attack at 80 minutes for every row of `df`.

    - Home losing by 1 or 2 goals: inverse of the home odds divided by the
      absolute goal difference.
    - Match drawn: inverse of the home odds divided by the sum of the inverse
      home and away odds.
    - Otherwise: 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'goal_diff_80', 'odd_h' and 'odd_a'. Rows are processed
        by position, so any index (not only 0..n-1) is accepted.

    Returns
    -------
    list
        One urgency value per row, in row order, rounded to 3 decimals.
    """
    # Positional iteration over the raw arrays: no dependence on the index labels
    goal_diffs = df["goal_diff_80"].to_numpy()
    home_odds = df["odd_h"].to_numpy()
    away_odds = df["odd_a"].to_numpy()

    urgency = []
    for goal_diff, odd_h, odd_a in zip(goal_diffs, home_odds, away_odds):
        u = 0  # leading, or losing by more than 2 goals
        if -2 <= goal_diff < 0:  # home team losing by 1 or 2
            u = (1 / odd_h) / abs(goal_diff)
        elif goal_diff == 0:  # drawing
            u = (1 / odd_h) / ((1 / odd_h) + (1 / odd_a))
        urgency.append(round(u, 3))
    return urgency

def calc_away_urgency(df):
    """
    Compute the away team's urgency to attack at 80 minutes for every row of `df`.

    - Away losing by 1 or 2 goals (goal_diff_80 of 1 or 2): inverse of the away
      odds divided by the absolute goal difference.
    - Match drawn: inverse of the away odds divided by the sum of the inverse
      home and away odds.
    - Otherwise: 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'goal_diff_80', 'odd_h' and 'odd_a'. Rows are processed
        by position, so any index (not only 0..n-1) is accepted.

    Returns
    -------
    list
        One urgency value per row, in row order, rounded to 3 decimals.
    """
    # Positional iteration over the raw arrays: no dependence on the index labels
    goal_diffs = df["goal_diff_80"].to_numpy()
    home_odds = df["odd_h"].to_numpy()
    away_odds = df["odd_a"].to_numpy()

    urgency = []
    for goal_diff, odd_h, odd_a in zip(goal_diffs, home_odds, away_odds):
        u = 0  # leading, or losing by more than 2 goals
        if 0 < goal_diff <= 2:  # away team losing by 1 or 2
            u = 1 / odd_a / abs(goal_diff)
        elif goal_diff == 0:  # drawing
            u = (1 / odd_a) / ((1 / odd_h) + (1 / odd_a))
        urgency.append(round(u, 3))
    return urgency

def construct_features(df):
    """
    Add the engineered features to the per-game aggregates.

    Features are computed in the order they are listed below; later features
    (ratios, momentum, attack intensity) read columns created by earlier ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-game aggregates as produced by process_game_data. The frame is not
        modified: the features are added to a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with one extra column per entry of `feature_operations`,
        with numeric columns rounded to 3 decimals.
    """

    def safe_ratio(numerator, denominator):
        # Division by zero gives +/-inf (x/0) or NaN (0/0) in pandas; report both as 0
        return (numerator / denominator).replace([np.inf, -np.inf], np.nan).fillna(0)

    # Maps each feature name to the function that computes it from the frame built so far
    feature_operations = {
        "total_shots_pre_80": lambda df: df["home_shots_pre80"] + df["away_shots_pre80"],
        "total_fouls_pre_80": lambda df: df["home_fouls_pre80"] + df["away_fouls_pre80"],
        "total_yellow_cards_pre_80": lambda df: df["home_yellow_cards_pre80"] + df["away_yellow_cards_pre80"],
        "total_sending_off_pre_80": lambda df: df["home_sending_off_pre80"] + df["away_sending_off_pre80"],
        "total_corners_pre_80": lambda df: df["home_corners_pre80"] + df["away_corners_pre80"],
        "total_corners_70_75": lambda df: df["home_corners_70_75"] + df["away_corners_70_75"],
        "total_shots_70_75": lambda df: df["home_shots_70_75"] + df["away_shots_70_75"],
        "total_fouls_70_75": lambda df: df["home_fouls_70_75"] + df["away_fouls_70_75"],
        "total_corners_75_80": lambda df: df["home_corners_75_80"] + df["away_corners_75_80"],
        "total_shots_75_80": lambda df: df["home_shots_75_80"] + df["away_shots_75_80"],
        "total_fouls_75_80": lambda df: df["home_fouls_75_80"] + df["away_fouls_75_80"],

        "odds_ratio": lambda df: (df["odd_h"] / df["odd_a"]),

        # Ratios: 0 when the denominator is 0
        "shot_to_corner_ratio_pre_80": lambda df: safe_ratio(df["total_shots_pre_80"], df["total_corners_pre_80"]),
        "team_aggression_score_pre_80": lambda df: safe_ratio(
            df["total_fouls_pre_80"] + df["total_yellow_cards_pre_80"], df["total_shots_pre_80"]
        ),

        "home_urgency_to_attack": lambda df: calc_home_urgency(df),
        "away_urgency_to_attack": lambda df: calc_away_urgency(df),

        # Change in shots + corners from the 70-75 window to the 75-80 window, scaled by urgency
        "home_momentum_to_attack": lambda df: (
            (df["home_shots_75_80"] - df["home_shots_70_75"]) +
            (df["home_corners_75_80"] - df["home_corners_70_75"])
        ) * df["home_urgency_to_attack"],

        "away_momentum_to_attack": lambda df: (
            (df["away_shots_75_80"] - df["away_shots_70_75"]) +
            (df["away_corners_75_80"] - df["away_corners_70_75"])
        ) * df["away_urgency_to_attack"],

        # Non-zero only for games with exactly one goal between the teams at 80 minutes
        "attack_intensity": lambda df: (df["goal_diff_80"].abs() == 1) * (df["total_shots_75_80"] + df["total_corners_75_80"]),
        "defensive_pressure": lambda df: df["total_fouls_75_80"] - df["total_fouls_70_75"],

        # NOTE(review): rows whose trailing_team is not 'home' (including rows with no
        # trailing team at all) get the away-odds value - confirm that is intended.
        "trailing_team_probability": lambda df: df.apply(lambda row: 1/row['odd_h'] * 100 if row['trailing_team'] == 'home' else 1/row['odd_a'] * 100, axis=1)
    }

    # Work on a copy so the caller's frame does not silently gain the feature columns
    df = df.copy()
    for feature, operation in feature_operations.items():
        df[feature] = operation(df)
    return df.round(3)

# Build the engineered feature columns from the per-game aggregates and display the result
aggregated_full = construct_features(aggregated_data)
aggregated_full

Unnamed: 0,id_odsp,date,season,league,country,home_team,away_team,odd_h,odd_d,odd_a,...,odds_ratio,shot_to_corner_ratio_pre_80,team_aggression_score_pre_80,home_urgency_to_attack,away_urgency_to_attack,home_momentum_to_attack,away_momentum_to_attack,attack_intensity,defensive_pressure,trailing_team_probability
0,UFot0hit/,2011-08-05,2012,D1,germany,Borussia Dortmund,Hamburg SV,1.56,4.41,7.42,...,0.210,1.692,1.273,0.000,0.067,0.000,0.067,0,-2,13.477
1,Aw5DflLH/,2011-08-06,2012,D1,germany,FC Augsburg,SC Freiburg,2.36,3.60,3.40,...,0.694,2.429,2.294,0.424,0.000,-0.424,-0.000,0,-3,42.373
2,CzPV312a/,2011-08-06,2012,F1,france,Paris Saint-Germain,Lorient,1.55,4.50,9.40,...,0.165,2.222,1.250,0.645,0.000,1.935,-0.000,3,0,64.516
3,GUOdmtII/,2011-08-06,2012,F1,france,Caen,Valenciennes,2.50,3.40,3.45,...,0.725,1.900,1.000,0.000,0.290,0.000,0.000,0,-2,28.986
4,M7PhlM2C/,2011-08-06,2012,F1,france,Brest,Evian Thonon Gaillard,2.29,3.25,3.85,...,0.595,2.000,0.767,0.627,0.373,2.508,0.000,0,-1,25.974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9069,vJy048Er/,2017-01-22,2017,I1,italy,Empoli,Udinese,2.90,3.18,2.96,...,0.980,1.250,3.800,0.505,0.495,-0.505,0.000,0,0,33.784
9070,xAkY8l6R/,2017-01-22,2017,I1,italy,Genoa,Crotone,1.97,4.35,8.00,...,0.246,2.100,1.143,0.802,0.198,-0.802,-0.396,0,0,12.500
9071,xSU9scI9/,2017-01-22,2017,E0,england,Chelsea,Hull,1.19,8.50,20.00,...,0.060,1.000,1.533,0.000,0.050,0.000,0.050,1,-2,5.000
9072,xY7uZwOI/,2017-01-22,2017,F1,france,AS Monaco,Lorient,1.32,6.24,11.50,...,0.115,2.375,1.000,0.000,0.000,0.000,0.000,0,0,8.696


### Save Processed Data
- Saves the per-game aggregates to `aggregated_data.csv` and the dataset with engineered features to `aggregated_full.csv`.  
- A snippet of the last 500 rows of each is saved under `snippets/` as `aggregated_data_snippet.csv` and `aggregated_full_snippet.csv`.  

In [5]:
print(aggregated_full.shape)

# (frame, full output path, path of the 500-row tail snippet)
outputs = [
    (aggregated_data, WRITE_FOLDER_PATH+'aggregated_data.csv', WRITE_FOLDER_PATH+'snippets/aggregated_data_snippet.csv'),
    (aggregated_full, WRITE_FOLDER_PATH+'aggregated_full.csv', WRITE_FOLDER_PATH+'snippets/aggregated_full_snippet.csv'),
]
for frame, full_path, snippet_path in outputs:
    frame.to_csv(full_path, index=False)
    frame.tail(500).to_csv(snippet_path, index=False)

(9074, 116)
