### Import Dataset
- Reads the dataset `aggregated_data.csv` into a pandas DataFrame for analysis.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Folder constants: where processed data is read from and written to
READ_FOLDER_PATH = '../data/processed/'
WRITE_FOLDER_PATH = '../data/processed/'

# Never truncate the contents of wide columns when frames are displayed
pd.set_option("display.max_colwidth", None)

# Load the aggregated dataset and report its (rows, columns)
data = pd.read_csv(f"{READ_FOLDER_PATH}aggregated_data.csv")
print(data.shape)

(9074, 109)


### Construct Track2-specific Features

- Filters the input DataFrame to only games where a team is trailing by exactly 1 goal (`goal_diff_80` of ±1), as required for Track 2.
- Transforms the dataset from a home/away perspective to a trailing/leading team perspective.

In [2]:
def transform_home_away_to_trailing_leading(df):
    """
    Keep one-goal games and re-express home/away stats as trailing/leading stats.

    Steps:
      1. Keep only rows where ``goal_diff_80`` is +1 or -1.
      2. For every column named ``home_<stat>`` that has a matching
         ``away_<stat>`` column (the team-name columns are excluded), create
         ``trailing_team_<stat>`` and ``leading_team_<stat>``. Rows where
         ``is_home_trailing == 1`` take the home value as "trailing" and the
         away value as "leading"; all other rows take the opposite.
         The original home/away pair is then removed.
      3. Drop a fixed list of rolling-average columns that are not needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``goal_diff_80``, ``is_home_trailing`` and the paired
        ``home_*`` / ``away_*`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified). Untouched columns keep
        their original order; the new trailing/leading columns are appended
        in alphabetical order of the source ``home_*`` column.

    Raises
    ------
    KeyError
        If a required column is missing, or if any column in the final drop
        list does not exist after the transformation.
    """
    home_prefix, away_prefix = 'home_', 'away_'
    team_name_cols = {'home_team', 'away_team'}

    # Filter: only games where a team is trailing by 1 goal
    one_goal_games = df[df['goal_diff_80'].abs() == 1]

    # The side selector is identical for every column pair, so compute it once.
    home_is_trailing = (one_goal_games['is_home_trailing'] == 1).to_numpy()

    # Match strictly on the 'home_' PREFIX. A substring match would also pick
    # up names such as 'is_home_trailing', and slicing off the first five
    # characters of those yields a meaningless base name.
    home_cols = sorted(
        col for col in one_goal_games.columns
        if isinstance(col, str)
        and col.startswith(home_prefix)
        and col not in team_name_cols
    )

    new_columns = {}
    paired_originals = []
    for home_col in home_cols:
        base_name = home_col[len(home_prefix):]
        away_col = f'{away_prefix}{base_name}'

        # Skip home columns that have no away counterpart
        if away_col not in one_goal_games.columns:
            continue

        home_values = one_goal_games[home_col]
        away_values = one_goal_games[away_col]
        new_columns[f'trailing_team_{base_name}'] = np.where(
            home_is_trailing, home_values, away_values
        )
        new_columns[f'leading_team_{base_name}'] = np.where(
            home_is_trailing, away_values, home_values
        )
        paired_originals.extend([home_col, away_col])

    # Attach every new column in a single concat rather than inserting and
    # dropping columns one at a time inside the loop.
    transformed_df = pd.concat(
        [
            one_goal_games.drop(columns=paired_originals),
            pd.DataFrame(new_columns, index=one_goal_games.index),
        ],
        axis=1,
    )

    # Finally, drop unnecessary rolling-average columns
    columns_to_drop = [
        'trailing_team_games', 'leading_team_games',
        'leading_team_corners_for_avg', 'trailing_team_corners_against_avg',
        'leading_team_post80_corners_for_avg', 'trailing_team_post80_corners_against_avg'
    ]
    return transformed_df.drop(columns=columns_to_drop)
    
# Build the trailing/leading-perspective frame from the loaded `data` frame
transformed_df = transform_home_away_to_trailing_leading(data)
# Bare last expression so the notebook renders the resulting frame
transformed_df

Unnamed: 0,id_odsp,date,season,league,country,home_team,away_team,odd_h,odd_d,odd_a,...,trailing_team_shots_pre80,leading_team_shots_pre80,trailing_team_yellow_cards_70_75,leading_team_yellow_cards_70_75,trailing_team_yellow_cards_75_80,leading_team_yellow_cards_75_80,trailing_team_yellow_cards_ft,leading_team_yellow_cards_ft,trailing_team_yellow_cards_pre80,leading_team_yellow_cards_pre80
1,Aw5DflLH/,2011-08-06,2012,D1,germany,FC Augsburg,SC Freiburg,2.36,3.60,3.40,...,9,8,0,0,0,0,2,4,2,2
2,CzPV312a/,2011-08-06,2012,F1,france,Paris Saint-Germain,Lorient,1.55,4.50,9.40,...,11,9,0,0,0,0,1,3,1,1
3,GUOdmtII/,2011-08-06,2012,F1,france,Caen,Valenciennes,2.50,3.40,3.45,...,10,9,0,0,0,0,1,2,1,1
8,Wn69eU5B/,2011-08-06,2012,D1,germany,FC Cologne,VfL Wolfsburg,3.00,3.80,2.54,...,7,14,0,0,0,1,2,5,2,3
9,bkjpaC6n/,2011-08-06,2012,D1,germany,Werder Bremen,Kaiserslautern,1.83,4.20,4.80,...,6,17,1,1,0,0,2,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9066,r5V5rw33/,2017-01-22,2017,E0,england,Arsenal,Burnley,1.23,7.65,17.75,...,10,21,1,0,0,0,3,1,2,0
9067,r5m8MY4G/,2017-01-22,2017,D1,germany,Bayer Leverkusen,Hertha Berlin,1.82,3.80,5.70,...,6,12,0,0,0,0,2,0,2,0
9068,trUaUcuk/,2017-01-22,2017,SP1,spain,Osasuna,Sevilla,6.80,4.42,1.58,...,12,11,0,0,0,0,5,3,2,3
9071,xSU9scI9/,2017-01-22,2017,E0,england,Chelsea,Hull,1.19,8.50,20.00,...,8,7,0,0,0,0,3,1,3,1


In [3]:
def _rounded_inverse_odds(df, home_flag):
    """
    Return ``round(1 / odds, 3)`` for every row of ``df`` as a plain list.

    ``odd_h`` is used on rows where ``is_home_trailing == home_flag``;
    ``odd_a`` is used on every other row.
    """
    use_home_odds = (df["is_home_trailing"] == home_flag).to_numpy()
    selected_odds = np.where(use_home_odds, df["odd_h"], df["odd_a"])
    # .tolist() yields built-in numbers, so each value is rounded with
    # Python's round(); an odds value of 0 raises ZeroDivisionError.
    return [round(1 / odds, 3) for odds in selected_odds.tolist()]


def calc_trailing_urgency(df):
    """
    Urgency of the trailing team: the inverse of that team's odds.

    The trailing team's odds are ``odd_h`` when ``is_home_trailing == 1``
    and ``odd_a`` otherwise.

    Returns
    -------
    list of float
        One value per row of ``df``, rounded to 3 decimals, in row order.
    """
    return _rounded_inverse_odds(df, home_flag=1)


def calc_leading_urgency(df):
    """
    Urgency of the leading team: the inverse of that team's odds.

    The leading team's odds are ``odd_h`` when ``is_home_trailing == 0``
    and ``odd_a`` otherwise.

    Returns
    -------
    list of float
        One value per row of ``df``, rounded to 3 decimals, in row order.
    """
    return _rounded_inverse_odds(df, home_flag=0)

def _ratio_or_zero(numerator, denominator):
    """Element-wise ``numerator / denominator`` with NaN results replaced by 0."""
    # NOTE(review): x / 0 with x != 0 yields +/-inf, which fillna leaves in
    # place - confirm whether downstream steps expect or handle inf values.
    return (numerator / denominator).fillna(0)


def _trailing_odds(df):
    """Odds of the trailing side: ``odd_h`` where ``is_home_trailing == 1``, else ``odd_a``."""
    return df["odd_h"].where(df["is_home_trailing"] == 1, df["odd_a"])


def _leading_odds(df):
    """Odds of the leading side: ``odd_a`` where ``is_home_trailing == 1``, else ``odd_h``."""
    return df["odd_a"].where(df["is_home_trailing"] == 1, df["odd_h"])


def construct_features(df):
    """
    Add the engineered features to a trailing/leading-perspective frame.

    Each entry of ``feature_operations`` maps a new column name to a function
    that computes it from the frame. Entries are applied in dictionary order,
    which matters: the two ``*_momentum_to_attack`` features read the
    ``*_urgency_to_attack`` columns created just before them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``odd_h``, ``odd_a``, ``is_home_trailing``,
        ``goal_diff_80`` and the ``trailing_team_*`` / ``leading_team_*``
        shots, corners, fouls and yellow-card columns referenced below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the feature columns appended and every numeric
        column rounded to 3 decimals. The input ``df`` is left unchanged.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # Dictionary mapping each feature name to the operation that computes it
    feature_operations = {
        "odds_ratio": lambda d: _trailing_odds(d) / _leading_odds(d),

        "trailing_team_urgency_to_attack": lambda d: calc_trailing_urgency(d),
        "leading_team_urgency_to_attack": lambda d: calc_leading_urgency(d),

        "trailing_team_momentum_to_attack": lambda d: (
            (d["trailing_team_shots_75_80"] - d["trailing_team_shots_70_75"]) +
            (d["trailing_team_corners_75_80"] - d["trailing_team_corners_70_75"])
        ) * d["trailing_team_urgency_to_attack"],

        "leading_team_momentum_to_attack": lambda d: (
            (d["leading_team_shots_75_80"] - d["leading_team_shots_70_75"]) +
            (d["leading_team_corners_75_80"] - d["leading_team_corners_70_75"])
        ) * d["leading_team_urgency_to_attack"],

        "trailing_team_attack_intensity": lambda d: (d["goal_diff_80"].abs() == 1) * (
            d["trailing_team_shots_75_80"] + d["trailing_team_corners_75_80"]
        ),
        "leading_team_attack_intensity": lambda d: (d["goal_diff_80"].abs() == 1) * (
            d["leading_team_shots_75_80"] + d["leading_team_corners_75_80"]
        ),

        "trailing_team_defensive_pressure": lambda d: (
            d["trailing_team_fouls_75_80"] - d["trailing_team_fouls_70_75"]
        ),
        "leading_team_defensive_pressure": lambda d: (
            d["leading_team_fouls_75_80"] - d["leading_team_fouls_70_75"]
        ),

        "trailing_team_shot_to_corner_ratio_pre_80": lambda d: _ratio_or_zero(
            d["trailing_team_shots_pre80"], d["trailing_team_corners_pre80"]
        ),
        "leading_team_shot_to_corner_ratio_pre_80": lambda d: _ratio_or_zero(
            d["leading_team_shots_pre80"], d["leading_team_corners_pre80"]
        ),

        # The NaN fill is applied to the whole ratio (as for the two features
        # above), not only to the denominator.
        "trailing_team_aggression_score_pre_80": lambda d: _ratio_or_zero(
            d["trailing_team_fouls_pre80"] + d["trailing_team_yellow_cards_pre80"],
            d["trailing_team_shots_pre80"],
        ),
        "leading_team_aggression_score_pre_80": lambda d: _ratio_or_zero(
            d["leading_team_fouls_pre80"] + d["leading_team_yellow_cards_pre80"],
            d["leading_team_shots_pre80"],
        ),

        "trailing_team_probability": lambda d: 1 / _trailing_odds(d) * 100,
    }

    # NOTE: It was realised after reviewing that trailing_team_probability is
    # essentially the same metric as trailing_team_urgency_to_attack (except
    # that trailing_team_probability is expressed as a percentage). This came
    # from copying the code over from Track 1 without adjusting how
    # calc_trailing_urgency and calc_leading_urgency compute their metrics.
    # Interestingly, removing one of them significantly weakens the model's
    # results. So, although using two related features is typically bad
    # practice (and may risk overfitting), both are kept based on those
    # results and for consistency with the report (it was noticed too late
    # to change).

    # Apply the feature transformations in order
    for feature, operation in feature_operations.items():
        df[feature] = operation(df)

    return df.round(3)

def construct_target(df):
    """
    Add the binary ``target`` column to ``df`` and return the same frame.

    ``target`` is 1 on rows where ``trailing_team_corners`` is greater than 0
    and 0 otherwise. The column is written onto ``df`` in place.
    """
    has_corner = df['trailing_team_corners'].gt(0)
    df['target'] = has_corner.astype(int)
    return df

# Construct the engineered features from the trailing/leading-perspective frame
track2_dataset = construct_features(transformed_df)
# Add the binary `target` column (derived from `trailing_team_corners`)
track2_dataset = construct_target(track2_dataset)
# Bare last expression so the notebook renders the final frame
track2_dataset

Unnamed: 0,id_odsp,date,season,league,country,home_team,away_team,odd_h,odd_d,odd_a,...,trailing_team_attack_intensity,leading_team_attack_intensity,trailing_team_defensive_pressure,leading_team_defensive_pressure,trailing_team_shot_to_corner_ratio_pre_80,leading_team_shot_to_corner_ratio_pre_80,trailing_team_aggression_score_pre_80,leading_team_aggression_score_pre_80,trailing_team_probability,target
1,Aw5DflLH/,2011-08-06,2012,D1,germany,FC Augsburg,SC Freiburg,2.36,3.60,3.40,...,0,0,1,-4,2.250,2.667,1.556,3.125,42.373,1
2,CzPV312a/,2011-08-06,2012,F1,france,Paris Saint-Germain,Lorient,1.55,4.50,9.40,...,3,0,0,0,2.750,1.800,1.727,0.667,64.516,1
3,GUOdmtII/,2011-08-06,2012,F1,france,Caen,Valenciennes,2.50,3.40,3.45,...,0,0,0,-2,1.250,4.500,1.000,1.000,28.986,1
8,Wn69eU5B/,2011-08-06,2012,D1,germany,FC Cologne,VfL Wolfsburg,3.00,3.80,2.54,...,2,0,-1,0,1.000,4.667,1.857,1.286,33.333,0
9,bkjpaC6n/,2011-08-06,2012,D1,germany,Werder Bremen,Kaiserslautern,1.83,4.20,4.80,...,1,4,-1,0,1.500,2.833,4.167,0.647,20.833,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9066,r5V5rw33/,2017-01-22,2017,E0,england,Arsenal,Burnley,1.23,7.65,17.75,...,0,1,-1,-1,2.500,2.333,1.100,0.381,5.634,0
9067,r5m8MY4G/,2017-01-22,2017,D1,germany,Bayer Leverkusen,Hertha Berlin,1.82,3.80,5.70,...,0,0,1,1,2.000,2.000,2.500,0.917,17.544,0
9068,trUaUcuk/,2017-01-22,2017,SP1,spain,Osasuna,Sevilla,6.80,4.42,1.58,...,0,1,1,0,4.000,1.833,1.083,0.909,14.706,0
9071,xSU9scI9/,2017-01-22,2017,E0,england,Chelsea,Hull,1.19,8.50,20.00,...,1,0,0,-2,1.143,0.875,1.500,1.571,5.000,0


### Save Processed Data
- Saves the processed dataset with all statistics to `track2_dataset.csv`.  
- A snippet of the last 500 rows (`tail(500)`) is saved as `snippets/track2_dataset_snippet.csv`.  

In [4]:
# Report the final (rows, columns) before writing to disk
print(track2_dataset.shape)

# Write the full dataset, then its last 500 rows as a snippet; the index is not written
track2_dataset.to_csv(f"{WRITE_FOLDER_PATH}track2_dataset.csv", index=False)
track2_dataset.tail(500).to_csv(
    f"{WRITE_FOLDER_PATH}snippets/track2_dataset_snippet.csv",
    index=False,
)

(3639, 118)
