### Import Dataset
- Reads the dataset `game_statistics.csv` into a pandas DataFrame for analysis.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Folders the notebook reads its input from and writes its results to
READ_FOLDER_PATH = '../data/processed/'
WRITE_FOLDER_PATH = '../data/processed/'

# Never truncate wide cell contents when a frame is displayed
pd.set_option("display.max_colwidth", None)

data = pd.read_csv(f'{READ_FOLDER_PATH}game_statistics.csv')

# Row count of the loaded dataset
print(len(data))

941009


### Calculate League Averages
- Calculates league averages for: total_corners, home_corners and away_corners

In [8]:
def calculate_league_averages(df):
    """
    Calculate average full-time corner counts for each league.

    Every game (one ``id_odsp`` group) contributes the home and away corner
    values found on its last row; those values are then averaged per
    ``(country, league)`` pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``id_odsp``, ``league``, ``country``,
        ``home_corners`` and ``away_corners``. The corner columns are treated
        as running totals, i.e. the last row of a game is taken as its
        full-time value. When a ``time`` column is present the rows are
        ordered by it first; otherwise the existing row order is trusted.

    Returns
    -------
    dict
        Maps ``(country, league)`` to a dict with the keys ``avg_corners``,
        ``avg_home_corners`` and ``avg_away_corners``. Empty when ``df`` has
        no rows.
    """
    # "Last row" only means "full time" if events are in chronological order.
    # A stable sort keeps the incoming order for events sharing a minute, so
    # already-sorted input gives exactly the same result as before.
    if 'time' in df.columns:
        df = df.sort_values(by='time', kind='stable')

    league_stats = {}
    for _, game_data in df.groupby('id_odsp'):
        key = (game_data['country'].iloc[0], game_data['league'].iloc[0])

        # Full-time values are read from the game's final row
        home_corners_ft = game_data['home_corners'].iloc[-1]
        away_corners_ft = game_data['away_corners'].iloc[-1]

        per_league = league_stats.setdefault(
            key, {'corners': [], 'home_corners': [], 'away_corners': []}
        )
        per_league['corners'].append(home_corners_ft + away_corners_ft)
        per_league['home_corners'].append(home_corners_ft)
        per_league['away_corners'].append(away_corners_ft)

    # Each list holds at least one game (a key is only created when a game is
    # appended), so the divisions below cannot hit an empty list.
    return {
        key: {
            'avg_corners': sum(stats['corners']) / len(stats['corners']),
            'avg_home_corners': sum(stats['home_corners']) / len(stats['home_corners']),
            'avg_away_corners': sum(stats['away_corners']) / len(stats['away_corners']),
        }
        for key, stats in league_stats.items()
    }

### Process Game Data
- Keeps only games in which one team is trailing by exactly 1 goal at the 80th minute; all other games are skipped
- Groups match event data by game ID to get aggregated stats, including up to the 80th minute, at full time, and in rolling windows (70-75 and 75-80 minutes).
- **Target variable** = whether the trailing team won a corner after 80 minutes

In [None]:
# Per-team statistics tracked for every game; column names are
# '<home|away>_<stat>' in the input and '<trailing|leading>_team_<stat>_...'
# in the output.
_GAME_STAT_NAMES = ('shots', 'shots_on_target', 'shots_off_target', 'yellow_cards',
                    'sending_off', 'fouls', 'corners')


def _last_row_stats(frame, trailing_team, leading_team, suffix):
    """Read every tracked stat from the last row of *frame* for both sides."""
    stats = {}
    for stat in _GAME_STAT_NAMES:
        stats[f'trailing_team_{stat}_{suffix}'] = frame[f'{trailing_team}_{stat}'].iloc[-1]
        stats[f'leading_team_{stat}_{suffix}'] = frame[f'{leading_team}_{stat}'].iloc[-1]
    return stats


def _window_change_stats(game_data, trailing_team, leading_team):
    """Return the change of every tracked stat inside the 70-75 and 75-80 windows."""
    minute = game_data['time']
    windows = {
        '70_75': game_data[(minute >= 70) & (minute < 75)],
        '75_80': game_data[(minute >= 75) & (minute < 80)],
    }

    window_stats = {}
    for window_name, window_data in windows.items():
        for stat in _GAME_STAT_NAMES:
            trailing_col = f'{trailing_team}_{stat}'
            leading_col = f'{leading_team}_{stat}'

            # NOTE(review): the change is last row minus first row of the
            # window, so a window holding a single row always yields 0 and the
            # first row's own event is not counted. Presumably intended as
            # "activity inside the window" - confirm before relying on it.
            if len(window_data) > 1:
                trailing_change = window_data[trailing_col].iloc[-1] - window_data[trailing_col].iloc[0]
                leading_change = window_data[leading_col].iloc[-1] - window_data[leading_col].iloc[0]
            else:
                trailing_change = 0
                leading_change = 0

            window_stats[f'trailing_team_{stat}_{window_name}'] = trailing_change
            window_stats[f'leading_team_{stat}_{window_name}'] = leading_change
    return window_stats


def _league_features(country, league, league_averages, stats_pre_80, trailing_team, leading_team):
    """Build the league one-hot flags, league averages and corner-rate ratios."""
    # A league without an entry yields NaN features instead of a TypeError
    # from dividing None by 90.
    nan = float('nan')
    averages = league_averages.get((country, league), {})
    avg_corners = averages.get('avg_corners', nan)
    avg_home_corners = averages.get('avg_home_corners', nan)
    avg_away_corners = averages.get('avg_away_corners', nan)

    # Compare each side with the league average for the venue it plays at
    trailing_avg = avg_home_corners if trailing_team == 'home' else avg_away_corners
    leading_avg = avg_home_corners if leading_team == 'home' else avg_away_corners

    trailing_corners = stats_pre_80['trailing_team_corners_pre80']
    leading_corners = stats_pre_80['leading_team_corners_pre80']
    country_lower = country.lower()

    return {
        # One-hot encodings for leagues (keyed on the country name)
        'league_england': 1 if country_lower == 'england' else 0,
        'league_spain': 1 if country_lower == 'spain' else 0,
        'league_germany': 1 if country_lower == 'germany' else 0,
        'league_italy': 1 if country_lower == 'italy' else 0,
        'league_france': 1 if country_lower == 'france' else 0,

        # League average stats
        'league_avg_corners': avg_corners,
        'league_avg_home_corners': avg_home_corners,
        'league_avg_away_corners': avg_away_corners,

        # Game pace vs. league pace: corners per minute over the first 80
        # minutes divided by the league's corners per minute over 90 minutes
        'corner_rate_vs_avg': ((trailing_corners + leading_corners) / 80) / (avg_corners / 90),
        'trailing_team_corner_rate_vs_avg': (trailing_corners / 80) / (trailing_avg / 90),
        'leading_team_corner_rate_vs_avg': (leading_corners / 80) / (leading_avg / 90),
    }


def process_game_data(df, league_averages):
    """
    Aggregate event-level rows into one row per game with a one-goal margin at 80'.

    For every game (``id_odsp`` group) the events are ordered by ``time`` and
    the score on the last event at or before minute 80 decides which side is
    trailing. Games where the margin at that point is not exactly one goal,
    and games with no event at or before minute 80, are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Event rows with the columns ``id_odsp``, ``date``, ``season``,
        ``league``, ``country``, ``ht``, ``at``, ``odd_h``, ``odd_d``,
        ``odd_a``, ``time``, ``score`` (formatted ``"<home>-<away>"``) and a
        ``home_<stat>`` / ``away_<stat>`` column for each of shots,
        shots_on_target, shots_off_target, yellow_cards, sending_off, fouls
        and corners. The stat columns are treated as running totals.
    league_averages : dict
        Maps ``(country, league)`` to a dict with ``avg_corners``,
        ``avg_home_corners`` and ``avg_away_corners``. Leagues without an
        entry produce NaN for the league-based features.

    Returns
    -------
    pandas.DataFrame
        One row per kept game: metadata, the score at 80' and at full time,
        trailing/leading stats at 80' (``*_pre80``) and at full time
        (``*_ft``), stat changes in the 70-75 and 75-80 windows, corner
        differences after 80', ``target`` and the league features.
    """
    games = []

    for game_id, game_data in df.groupby('id_odsp'):
        # Chronological order; the stable sort keeps the incoming order of
        # events that share a minute so "last row" is deterministic.
        game_data = game_data.sort_values(by='time', kind='stable')

        pre_80_data = game_data[game_data['time'] <= 80]
        if pre_80_data.empty:
            # Nothing recorded by minute 80: the score at 80' cannot be read
            continue

        score_pre_80 = pre_80_data['score'].iloc[-1]
        home_score_pre_80, away_score_pre_80 = map(int, score_pre_80.split('-'))
        goal_diff_80 = home_score_pre_80 - away_score_pre_80

        # Only games where one side trails by exactly one goal are kept
        if goal_diff_80 == 1:
            trailing_team, leading_team = 'away', 'home'
        elif goal_diff_80 == -1:
            trailing_team, leading_team = 'home', 'away'
        else:
            continue
        trailing_is_home = 1 if trailing_team == 'home' else 0

        country = game_data['country'].iloc[0]
        league = game_data['league'].iloc[0]

        stats_pre_80 = _last_row_stats(pre_80_data, trailing_team, leading_team, 'pre80')
        stats_ft = _last_row_stats(game_data, trailing_team, leading_team, 'ft')
        window_stats = _window_change_stats(game_data, trailing_team, leading_team)

        # Corners won between minute 80 and full time
        team_corner_diffs = {
            'trailing_team_corner_diff':
                stats_ft['trailing_team_corners_ft'] - stats_pre_80['trailing_team_corners_pre80'],
            'leading_team_corner_diff':
                stats_ft['leading_team_corners_ft'] - stats_pre_80['leading_team_corners_pre80'],
        }
        corner_diff = (team_corner_diffs['trailing_team_corner_diff']
                       + team_corner_diffs['leading_team_corner_diff'])

        # Target: did the trailing team win at least one corner after 80'?
        target = int(team_corner_diffs['trailing_team_corner_diff'] > 0)

        league_data = _league_features(
            country, league, league_averages, stats_pre_80, trailing_team, leading_team
        )

        games.append({
            'id_odsp': game_id,
            'date': game_data['date'].iloc[0],
            'season': game_data['season'].iloc[0],
            'league': league,
            'country': country,
            'home_team': game_data['ht'].iloc[0],
            'away_team': game_data['at'].iloc[0],
            'odd_h': game_data['odd_h'].iloc[0],
            'odd_d': game_data['odd_d'].iloc[0],
            'odd_a': game_data['odd_a'].iloc[0],
            'score_pre_80': score_pre_80,
            'score_ft': game_data['score'].iloc[-1],
            'trailing_is_home': trailing_is_home,
            'trailing_team': trailing_team,
            'leading_team': leading_team,
            'goal_diff_80': goal_diff_80,
            **stats_pre_80,
            **stats_ft,
            **window_stats,
            **team_corner_diffs,
            'corner_diff': corner_diff,
            'target': target,
            'trailing_team_corners': team_corner_diffs['trailing_team_corner_diff'],
            **league_data,
        })

    # Convert to DataFrame
    return pd.DataFrame(games)

# League baselines are computed first because the per-game processing needs them
league_averages = calculate_league_averages(data)

# Aggregate per game, then order by date and game id with a fresh 0..n-1 index
aggregated_data = (
    process_game_data(data, league_averages)
    .sort_values(by=['date', 'id_odsp'])
    .reset_index(drop=True)
)
aggregated_data


Unnamed: 0,id_odsp,date,season,league,country,home_team,away_team,odd_h,odd_d,odd_a,...,league_spain,league_germany,league_italy,league_france,league_avg_corners,league_avg_home_corners,league_avg_away_corners,corner_rate_vs_avg,trailing_team_corner_rate_vs_avg,leading_team_corner_rate_vs_avg
0,Aw5DflLH/,2011-08-06,2012,D1,germany,FC Augsburg,SC Freiburg,2.36,3.60,3.40,...,0,1,0,0,9.501866,5.307836,4.194030,0.828785,0.847803,0.804715
1,CzPV312a/,2011-08-06,2012,F1,france,Paris Saint-Germain,Lorient,1.55,4.50,9.40,...,0,0,0,1,9.479769,5.359345,4.120424,1.068064,0.839655,1.365151
2,GUOdmtII/,2011-08-06,2012,F1,france,Caen,Valenciennes,2.50,3.40,3.45,...,0,0,0,1,9.479769,5.359345,4.120424,1.186738,2.184241,0.419827
3,Wn69eU5B/,2011-08-06,2012,D1,germany,FC Cologne,VfL Wolfsburg,3.00,3.80,2.54,...,0,1,0,0,9.501866,5.307836,4.194030,1.183978,1.483656,0.804715
4,bkjpaC6n/,2011-08-06,2012,D1,germany,Werder Bremen,Kaiserslautern,1.83,4.20,4.80,...,0,1,0,0,9.501866,5.307836,4.194030,1.183978,1.072954,1.271705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3634,r5V5rw33/,2017-01-22,2017,E0,england,Arsenal,Burnley,1.23,7.65,17.75,...,0,0,0,0,10.576597,5.874519,4.702079,1.382770,0.957024,1.723545
3635,r5m8MY4G/,2017-01-22,2017,D1,germany,Bayer Leverkusen,Hertha Berlin,1.82,3.80,5.70,...,0,1,0,0,9.501866,5.307836,4.194030,1.065580,0.804715,1.271705
3636,trUaUcuk/,2017-01-22,2017,SP1,spain,Osasuna,Sevilla,6.80,4.42,1.58,...,1,0,0,0,10.495782,5.981638,4.514144,0.964673,0.564227,1.495300
3637,xSU9scI9/,2017-01-22,2017,E0,england,Chelsea,Hull,1.19,8.50,20.00,...,0,0,0,0,10.576597,5.874519,4.702079,1.595504,1.674791,1.532040


### Construct Engineered Features

In [10]:
def calc_trailing_urgency(df):
    """
    Compute the trailing team's urgency score for every row.

    The score is the inverse of the trailing team's odds: ``odd_h`` is used
    when ``trailing_team`` is ``"home"``, ``odd_a`` otherwise. Values are
    rounded to three decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``trailing_team``, ``odd_h`` and ``odd_a``. Any
        index is accepted; rows are processed in their positional order.

    Returns
    -------
    list of float
        One urgency value per row of ``df``, in row order.
    """
    # Pick each row's odds column without relying on a 0..n-1 index
    trailing_is_home = df["trailing_team"] == "home"
    trailing_team_odds = df["odd_h"].where(trailing_is_home, df["odd_a"])
    return (1 / trailing_team_odds).round(3).tolist()

def calc_leading_urgency(df):
    """
    Compute the leading team's urgency score for every row.

    The score is the inverse of the leading team's odds: ``odd_h`` is used
    when ``leading_team`` is ``"home"``, ``odd_a`` otherwise. Values are
    rounded to three decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``leading_team``, ``odd_h`` and ``odd_a``. Any
        index is accepted; rows are processed in their positional order.

    Returns
    -------
    list of float
        One urgency value per row of ``df``, in row order.
    """
    # Pick each row's odds column without relying on a 0..n-1 index
    leading_is_home = df["leading_team"] == "home"
    leading_team_odds = df["odd_h"].where(leading_is_home, df["odd_a"])
    return (1 / leading_team_odds).round(3).tolist()

def construct_features(df):
    """
    Add the engineered feature columns to ``df`` and return a rounded copy.

    The new columns are written into ``df`` itself (the caller's frame is
    modified in place, as before); the returned frame is ``df.round(3)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-game frame holding ``odd_h``, ``odd_a``, ``trailing_team``,
        ``leading_team``, ``goal_diff_80`` and the
        ``trailing_team_*`` / ``leading_team_*`` stat columns with the
        ``pre80``, ``70_75`` and ``75_80`` suffixes used below.

    Returns
    -------
    pandas.DataFrame
        ``df`` with all feature columns added, rounded to three decimals.
    """
    sides = ("trailing_team", "leading_team")
    trailing_is_home = df["trailing_team"] == "home"

    # **Total game stats**: trailing + leading for each (feature, source suffix)
    totals = [
        ("total_shots_pre_80", "shots_pre80"),
        ("total_fouls_pre_80", "fouls_pre80"),
        ("total_yellow_cards_pre_80", "yellow_cards_pre80"),
        ("total_sending_off_pre_80", "sending_off_pre80"),
        ("total_corners_pre_80", "corners_pre80"),
        ("total_corners_70_75", "corners_70_75"),
        ("total_shots_70_75", "shots_70_75"),
        ("total_fouls_70_75", "fouls_70_75"),
        ("total_corners_75_80", "corners_75_80"),
        ("total_shots_75_80", "shots_75_80"),
        ("total_fouls_75_80", "fouls_75_80"),
    ]
    for feature, suffix in totals:
        df[feature] = df[f"trailing_team_{suffix}"] + df[f"leading_team_{suffix}"]

    # Trailing team's odds relative to the leading team's odds
    df["odds_ratio"] = (df["odd_h"] / df["odd_a"]).where(
        trailing_is_home, df["odd_a"] / df["odd_h"]
    )

    # Must be added before the momentum features, which multiply by them
    df["trailing_team_urgency_to_attack"] = calc_trailing_urgency(df)
    df["leading_team_urgency_to_attack"] = calc_leading_urgency(df)

    # Change in shots + corners from the 70-75 to the 75-80 window, scaled by urgency
    for side in sides:
        df[f"{side}_momentum_to_attack"] = (
            (df[f"{side}_shots_75_80"] - df[f"{side}_shots_70_75"])
            + (df[f"{side}_corners_75_80"] - df[f"{side}_corners_70_75"])
        ) * df[f"{side}_urgency_to_attack"]

    # Shots + corners in the 75-80 window, zeroed unless the margin is one goal
    one_goal_margin = df["goal_diff_80"].abs() == 1
    for side in sides:
        df[f"{side}_attack_intensity"] = one_goal_margin * (
            df[f"{side}_shots_75_80"] + df[f"{side}_corners_75_80"]
        )

    # Change in fouls from the 70-75 to the 75-80 window
    for side in sides:
        df[f"{side}_defensive_pressure"] = df[f"{side}_fouls_75_80"] - df[f"{side}_fouls_70_75"]

    # NOTE(review): fillna(0) only covers 0/0; a non-zero numerator over a
    # zero denominator still yields inf in the two ratio features below.
    for side in sides:
        df[f"{side}_shot_to_corner_ratio_pre_80"] = (
            df[f"{side}_shots_pre80"] / df[f"{side}_corners_pre80"]
        ).fillna(0)

    # fillna is applied to the whole ratio; previously it bound to the
    # denominator only, so undefined (0/0) scores were left as NaN.
    for side in sides:
        df[f"{side}_aggression_score_pre_80"] = (
            (df[f"{side}_fouls_pre80"] + df[f"{side}_yellow_cards_pre80"])
            / df[f"{side}_shots_pre80"]
        ).fillna(0)

    # Inverse of the trailing team's odds, expressed as a percentage
    df["trailing_team_probability"] = (1 / df["odd_h"] * 100).where(
        trailing_is_home, 1 / df["odd_a"] * 100
    )

    return df.round(3)

# Construct features based on the updated dataset
aggregated_full = construct_features(aggregated_data)
aggregated_full

Unnamed: 0,id_odsp,date,season,league,country,home_team,away_team,odd_h,odd_d,odd_a,...,leading_team_momentum_to_attack,trailing_team_attack_intensity,leading_team_attack_intensity,trailing_team_defensive_pressure,leading_team_defensive_pressure,trailing_team_shot_to_corner_ratio_pre_80,leading_team_shot_to_corner_ratio_pre_80,trailing_team_aggression_score_pre_80,leading_team_aggression_score_pre_80,trailing_team_probability
0,Aw5DflLH/,2011-08-06,2012,D1,germany,FC Augsburg,SC Freiburg,2.36,3.60,3.40,...,-0.294,0,0,1,-4,2.250,2.667,1.556,3.125,42.373
1,CzPV312a/,2011-08-06,2012,F1,france,Paris Saint-Germain,Lorient,1.55,4.50,9.40,...,-0.106,3,0,0,0,2.750,1.800,1.727,0.667,64.516
2,GUOdmtII/,2011-08-06,2012,F1,france,Caen,Valenciennes,2.50,3.40,3.45,...,0.000,0,0,0,-2,1.250,4.500,1.000,1.000,28.986
3,Wn69eU5B/,2011-08-06,2012,D1,germany,FC Cologne,VfL Wolfsburg,3.00,3.80,2.54,...,-0.788,2,0,-1,0,1.000,4.667,1.857,1.286,33.333
4,bkjpaC6n/,2011-08-06,2012,D1,germany,Werder Bremen,Kaiserslautern,1.83,4.20,4.80,...,1.092,1,4,-1,0,1.500,2.833,4.167,0.647,20.833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3634,r5V5rw33/,2017-01-22,2017,E0,england,Arsenal,Burnley,1.23,7.65,17.75,...,0.813,0,1,-1,-1,2.500,2.333,1.100,0.381,5.634
3635,r5m8MY4G/,2017-01-22,2017,D1,germany,Bayer Leverkusen,Hertha Berlin,1.82,3.80,5.70,...,0.000,0,0,1,1,2.000,2.000,2.500,0.917,17.544
3636,trUaUcuk/,2017-01-22,2017,SP1,spain,Osasuna,Sevilla,6.80,4.42,1.58,...,0.633,0,1,1,0,4.000,1.833,1.083,0.909,14.706
3637,xSU9scI9/,2017-01-22,2017,E0,england,Chelsea,Hull,1.19,8.50,20.00,...,0.000,1,0,0,-2,1.143,0.875,1.500,1.571,5.000


### Save Processed Data
- Saves the processed dataset with all statistics to `trailing_data.csv`.  
- A snippet of the last 500 rows is saved as `snippets/trailing_data_snippet.csv`.  

In [11]:
# Final (rows, columns) size of the dataset about to be written
print(aggregated_full.shape)

# Full dataset first, then its last 500 rows as a small sample file
aggregated_full.to_csv(f'{WRITE_FOLDER_PATH}trailing_data.csv', index=False)
aggregated_full.iloc[-500:].to_csv(f'{WRITE_FOLDER_PATH}snippets/trailing_data_snippet.csv', index=False)

(3639, 113)
