### Import Dataset
- Reads the dataset `aggregated_full.csv` into a pandas DataFrame for analysis.

In [19]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Constants: folders the notebook reads its input from and writes its output to
READ_FOLDER_PATH = '../data/processed/'
WRITE_FOLDER_PATH = '../data/processed/'

# Never truncate long cell values when a DataFrame is rendered
pd.set_option("display.max_colwidth", None)

# Load the aggregated match-level dataset.
# NOTE(review): the rendered output shows an 'Unnamed: 0' column, which looks like
# an index that was written to the CSV upstream - confirm whether it should be kept.
data = pd.read_csv(f"{READ_FOLDER_PATH}aggregated_full.csv")

# (rows, columns) as a quick sanity check, then the frame itself as cell output
print(data.shape)
data

(9074, 109)


Unnamed: 0,id_odsp,date,season,league,country,home_team,away_team,odd_h,odd_d,odd_a,...,home_games,home_corners_for_avg,home_corners_against_avg,home_post80_corners_for_avg,home_post80_corners_against_avg,away_games,away_corners_for_avg,away_corners_against_avg,away_post80_corners_for_avg,away_post80_corners_against_avg
0,UFot0hit/,2011-08-05,2012,D1,germany,Borussia Dortmund,Hamburg SV,1.56,4.41,7.42,...,0.0,5.307836,4.194030,0.717662,0.584577,0.0,4.194030,5.307836,0.584577,0.717662
1,Aw5DflLH/,2011-08-06,2012,D1,germany,FC Augsburg,SC Freiburg,2.36,3.60,3.40,...,0.0,5.307836,4.194030,0.717662,0.584577,0.0,4.194030,5.307836,0.584577,0.717662
2,CzPV312a/,2011-08-06,2012,F1,france,Paris Saint-Germain,Lorient,1.55,4.50,9.40,...,0.0,5.359345,4.120424,0.767341,0.625241,0.0,4.120424,5.359345,0.625241,0.767341
3,GUOdmtII/,2011-08-06,2012,F1,france,Caen,Valenciennes,2.50,3.40,3.45,...,0.0,5.359345,4.120424,0.767341,0.625241,0.0,4.120424,5.359345,0.625241,0.767341
4,M7PhlM2C/,2011-08-06,2012,F1,france,Brest,Evian Thonon Gaillard,2.29,3.25,3.85,...,0.0,5.359345,4.120424,0.767341,0.625241,0.0,4.120424,5.359345,0.625241,0.767341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9069,vJy048Er/,2017-01-22,2017,I1,italy,Empoli,Udinese,2.90,3.18,2.96,...,10.0,4.431555,4.140042,0.893589,0.971624,9.0,4.454046,5.974711,0.968786,0.482948
9070,xAkY8l6R/,2017-01-22,2017,I1,italy,Genoa,Crotone,1.97,4.35,8.00,...,10.0,3.886101,4.321860,1.257225,0.517078,10.0,3.685497,9.613374,0.880715,1.166316
9071,xSU9scI9/,2017-01-22,2017,E0,england,Chelsea,Hull,1.19,8.50,20.00,...,10.0,5.806774,4.154734,0.624116,0.701869,10.0,5.427462,7.443138,0.701869,0.533207
9072,xY7uZwOI/,2017-01-22,2017,F1,france,AS Monaco,Lorient,1.32,6.24,11.50,...,10.0,5.487213,4.738220,0.706122,0.875022,10.0,5.374584,5.578122,1.238658,0.797031


### Construct Track1-specific Features

In [20]:
### Construct Engineered Features
def calc_home_urgency(df):
    """
    Computes the urgency for the home team to attack at the 80th minute.

    Rules (one value per row of ``df``):
    - Home team trailing by 1 or 2 goals (``-2 <= goal_diff_80 < 0``):
      urgency = (1 / odd_h) / |goal_diff_80|
    - Match level (``goal_diff_80 == 0``):
      urgency = (1 / odd_h) / ((1 / odd_h) + (1 / odd_a))
    - Anything else (home team leading, or trailing by more than 2): urgency = 0

    Parameters:
        df: DataFrame with the columns "goal_diff_80", "odd_h" and "odd_a".

    Returns:
        list of floats rounded to 3 decimals, in the row order of ``df``.

    The computation is positional, so it works for any index (e.g. a filtered
    or re-sorted frame), not only for a default 0..n-1 RangeIndex.
    """
    goal_diff = df["goal_diff_80"].to_numpy(dtype=float)
    inv_home_odds = 1.0 / df["odd_h"].to_numpy(dtype=float)
    inv_away_odds = 1.0 / df["odd_a"].to_numpy(dtype=float)

    trailing = (goal_diff < 0) & (goal_diff >= -2)  # home team losing by 1 or 2
    level = goal_diff == 0                          # match drawn

    urgency = np.zeros(len(df), dtype=float)        # default: no urgency
    # Losing: inverse home odds, scaled down by the size of the deficit
    urgency[trailing] = inv_home_odds[trailing] / np.abs(goal_diff[trailing])
    # Drawing: home share of the summed inverse odds of both teams
    urgency[level] = inv_home_odds[level] / (inv_home_odds[level] + inv_away_odds[level])

    return np.round(urgency, 3).tolist()

def calc_away_urgency(df):
    """
    Computes the urgency for the away team to attack at the 80th minute.

    Rules (one value per row of ``df``):
    - Away team trailing by 1 or 2 goals (``0 < goal_diff_80 <= 2``):
      urgency = (1 / odd_a) / |goal_diff_80|
    - Match level (``goal_diff_80 == 0``):
      urgency = (1 / odd_a) / ((1 / odd_h) + (1 / odd_a))
    - Anything else (away team leading, or trailing by more than 2): urgency = 0

    Parameters:
        df: DataFrame with the columns "goal_diff_80", "odd_h" and "odd_a".

    Returns:
        list of floats rounded to 3 decimals, in the row order of ``df``.

    The computation is positional, so it works for any index (e.g. a filtered
    or re-sorted frame), not only for a default 0..n-1 RangeIndex.
    """
    goal_diff = df["goal_diff_80"].to_numpy(dtype=float)
    inv_home_odds = 1.0 / df["odd_h"].to_numpy(dtype=float)
    inv_away_odds = 1.0 / df["odd_a"].to_numpy(dtype=float)

    trailing = (goal_diff > 0) & (goal_diff <= 2)  # away team losing by 1 or 2
    level = goal_diff == 0                         # match drawn

    urgency = np.zeros(len(df), dtype=float)       # default: no urgency
    # Losing: inverse away odds, scaled down by the size of the deficit
    urgency[trailing] = inv_away_odds[trailing] / np.abs(goal_diff[trailing])
    # Drawing: away share of the summed inverse odds of both teams
    urgency[level] = inv_away_odds[level] / (inv_home_odds[level] + inv_away_odds[level])

    return np.round(urgency, 3).tolist()

def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator; undefined results (NaN, +/-inf) become 0."""
    ratio = numerator / denominator
    # x / 0 yields +/-inf and 0 / 0 yields NaN - fillna alone would miss the infs
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


def construct_features(df):
    """
    Constructs the Track1 engineered features.

    Every feature listed in ``feature_operations`` is computed (there is no
    config-based selection) and added as a column, in declaration order.
    The input frame is left untouched: the features are added to a copy.

    Parameters:
        df: DataFrame holding the raw columns referenced below
            (goal_diff_80, odd_h, odd_a, the *_pre_80, *_70_75 and *_75_80
            statistics, and trailing_team).

    Returns:
        A new DataFrame with the original columns plus the engineered
        features, with all numeric values rounded to 3 decimals.
    """
    df = df.copy()  # do not mutate the caller's frame

    # Dictionary mapping each feature name to the operation that computes it.
    # Order matters: the momentum features read the urgency columns created
    # just before them.
    feature_operations = {
        # One-hot encoded features to capture game state:
        "game_state_80_h": lambda df: (df["goal_diff_80"] > 0).astype(int),
        "game_state_80_a": lambda df: (df["goal_diff_80"] < 0).astype(int),
        "game_state_80_d": lambda df: (df["goal_diff_80"] == 0).astype(int),

        # Features to capture goal difference effects:
        "goal_diff_80": lambda df: df["goal_diff_80"],
        "abs_goal_diff_80": lambda df: df["goal_diff_80"].abs(),
        "close_game_80": lambda df: (df["goal_diff_80"].abs() <= 1).astype(int),
        "goal_diff_squared_80": lambda df: df["goal_diff_80"] ** 2,

        # Special indicators:
        "odds_ratio": lambda df: df["odd_h"] / df["odd_a"],

        # Ratios are set to 0 where the denominator is 0 (avoid division by zero)
        "shot_to_corner_ratio_pre_80": lambda df: _safe_ratio(
            df["total_shots_pre_80"], df["total_corners_pre_80"]
        ),
        "team_aggression_score_pre_80": lambda df: _safe_ratio(
            df["total_fouls_pre_80"] + df["total_yellow_cards_pre_80"],
            df["total_shots_pre_80"],
        ),

        "home_urgency_to_attack": calc_home_urgency,
        "away_urgency_to_attack": calc_away_urgency,

        # Change in shots + corners between 70-75 and 75-80, weighted by urgency
        "home_momentum_to_attack": lambda df: (
            (df["home_shots_75_80"] - df["home_shots_70_75"])
            + (df["home_corners_75_80"] - df["home_corners_70_75"])
        ) * df["home_urgency_to_attack"],

        "away_momentum_to_attack": lambda df: (
            (df["away_shots_75_80"] - df["away_shots_70_75"])
            + (df["away_corners_75_80"] - df["away_corners_70_75"])
        ) * df["away_urgency_to_attack"],

        # Shots + corners in 75-80, counted only when the margin is exactly one goal
        "attack_intensity": lambda df: (df["goal_diff_80"].abs() == 1)
        * (df["total_shots_75_80"] + df["total_corners_75_80"]),
        "defensive_pressure": lambda df: df["total_fouls_75_80"] - df["total_fouls_70_75"],

        # Inverse odds (as a percentage) of the trailing team.
        # NOTE(review): every value of trailing_team other than 'home' falls back
        # to the away odds - including level games, if those are encoded in that
        # column. Confirm this is intended.
        "trailing_team_probability": lambda df: np.where(
            df["trailing_team"] == "home",
            1 / df["odd_h"] * 100,
            1 / df["odd_a"] * 100,
        ),
    }

    # Apply every feature, in declaration order
    for feature, operation in feature_operations.items():
        df[feature] = operation(df)

    return df.round(3)

def construct_target(df):
    """
    Adds the binary classification target.

    ``target`` is 1 where ``corner_diff`` is strictly positive and 0 otherwise.

    Parameters:
        df: DataFrame with a "corner_diff" column.

    Returns:
        A new DataFrame with the extra "target" column; the input frame is
        not modified.
    """
    return df.assign(target=(df["corner_diff"] > 0).astype(int))

# Build the Track1 dataset: engineered features first, then the target column
track1_dataset = construct_target(construct_features(data))

# Show the resulting frame as the cell output
track1_dataset

Unnamed: 0,id_odsp,date,season,league,country,home_team,away_team,odd_h,odd_d,odd_a,...,shot_to_corner_ratio_pre_80,team_aggression_score_pre_80,home_urgency_to_attack,away_urgency_to_attack,home_momentum_to_attack,away_momentum_to_attack,attack_intensity,defensive_pressure,trailing_team_probability,target
0,UFot0hit/,2011-08-05,2012,D1,germany,Borussia Dortmund,Hamburg SV,1.56,4.41,7.42,...,1.692,1.273,0.000,0.067,0.000,0.067,0,-2,13.477,1
1,Aw5DflLH/,2011-08-06,2012,D1,germany,FC Augsburg,SC Freiburg,2.36,3.60,3.40,...,2.429,2.294,0.424,0.000,-0.424,-0.000,0,-3,42.373,1
2,CzPV312a/,2011-08-06,2012,F1,france,Paris Saint-Germain,Lorient,1.55,4.50,9.40,...,2.222,1.250,0.645,0.000,1.935,-0.000,3,0,64.516,1
3,GUOdmtII/,2011-08-06,2012,F1,france,Caen,Valenciennes,2.50,3.40,3.45,...,1.900,1.000,0.000,0.290,0.000,0.000,0,-2,28.986,1
4,M7PhlM2C/,2011-08-06,2012,F1,france,Brest,Evian Thonon Gaillard,2.29,3.25,3.85,...,2.000,0.767,0.627,0.373,2.508,0.000,0,-1,25.974,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9069,vJy048Er/,2017-01-22,2017,I1,italy,Empoli,Udinese,2.90,3.18,2.96,...,1.250,3.800,0.505,0.495,-0.505,0.000,0,0,33.784,1
9070,xAkY8l6R/,2017-01-22,2017,I1,italy,Genoa,Crotone,1.97,4.35,8.00,...,2.100,1.143,0.802,0.198,-0.802,-0.396,0,0,12.500,1
9071,xSU9scI9/,2017-01-22,2017,E0,england,Chelsea,Hull,1.19,8.50,20.00,...,1.000,1.533,0.000,0.050,0.000,0.050,1,-2,5.000,1
9072,xY7uZwOI/,2017-01-22,2017,F1,france,AS Monaco,Lorient,1.32,6.24,11.50,...,2.375,1.000,0.000,0.000,0.000,0.000,0,0,8.696,0


### Save Processed Data
- Saves the processed dataset with all statistics to `track1_dataset.csv`.  
- A snippet of the last 500 rows is saved as `snippets/track1_dataset_snippet.csv`.  

In [21]:
# Final (rows, columns) of the dataset about to be written
print(track1_dataset.shape)

# Full dataset
track1_dataset.to_csv(f"{WRITE_FOLDER_PATH}track1_dataset.csv", index=False)

# Lightweight snippet: the last 500 rows only
track1_dataset.tail(500).to_csv(
    f"{WRITE_FOLDER_PATH}snippets/track1_dataset_snippet.csv",
    index=False,
)

(9074, 126)
