In [None]:
### This notebook is used to clean and process the raw data. 
### First steps are oriented towards ensuring that the data is usable, whilst the later cells delve into feature engeneering.
### WARNING: Some feature engeneering functions are not yet optimized (one ccould consider to dramatically speed up the generation of features by implementing 
###          parallelizable versions, but for now the chosen method of splitting the fearture generation into seperate moduls is pragmatic) so feature generation
###          might take very long if run on a single machine. 
### Note: file paths need to be modified if used in a different file structure 

In [None]:
### Clean data and bring it into usable form

import pandas as pd
import os # We'll use this library to handle file paths robustly


def load_and_clean_data(start_year,end_year):
    print("Starting the data loading process...")

    # Define the years and the path to your data
    target_years = range(start_year, end_year + 1)

    data_directory = '../tennis_data/ATP_data/'

    # Loop through the years and load each file
    # Create an empty list to hold the DataFrame for each year
    yearly_dfs = []

    for year in target_years:
        # Construct the full path to the file for the current year
        file_path = os.path.join(data_directory, f'{year}.csv')
        
        try:
            # Read the CSV file into a temporary DataFrame
            temp_df = pd.read_csv(file_path)
            # Add the loaded DataFrame to our list
            yearly_dfs.append(temp_df)
            print(f"Successfully loaded {year}.csv")
            
        except FileNotFoundError:
            # If a file for a specific year doesn't exist, print a warning and continue
            print(f"Warning: File for {year}.csv not found at {file_path}. Skipping.")

    # Combine all yearly DataFrames into one
    if yearly_dfs:
        # pd.concat is the function that stacks all the DataFrames in our list together
        full_df = pd.concat(yearly_dfs, ignore_index=True)

        print("\n✅ All files have been loaded and combined successfully!")
        print(f"The DataFrame has {full_df.shape[0]} rows (matches) and {full_df.shape[1]} columns.")
        
        # Convert the 'tourney_date' column
        # '%Y' corresponds to the 4-digit year.
        # '%m' corresponds to the 2-digit month.
        # '%d' corresponds to the 2-digit day.
        full_df['tourney_date'] = pd.to_datetime(full_df['tourney_date'], format='%Y%m%d')

        # Clean and Convert Statistical Columns to Numeric

        print("Converting all statistical columns to a numeric data type...")

        # Create a list of all the columns that should contain numbers
        numeric_cols = [
            'winner_rank', 'loser_rank', 'winner_age', 'loser_age',
            'w_ace', 'l_ace', 'w_df', 'l_df', 'w_svpt', 'l_svpt',
            'w_1stIn', 'l_1stIn', 'w_1stWon', 'l_1stWon', 'w_2ndWon', 'l_2ndWon',
            'w_bpSaved', 'l_bpSaved', 'w_bpFaced', 'l_bpFaced',
            'winner_ht', 'loser_ht', 'draw_size'
        ]

        # Loop through each column in our list
        for col in numeric_cols:
            # Convert the column to a numeric type.
            # The key is errors='coerce', which will replace any value that
            # cannot be converted to a number with NaN
            full_df[col] = pd.to_numeric(full_df[col], errors='coerce')

        print("✅ Statistical columns successfully converted!")

        # As a final cleaning step, we can drop any rows that are missing crucial data
        # that would make feature calculation impossible later on.
        print("\nDropping rows with missing essential data (like rank or stats)...")
        full_df.dropna(subset=numeric_cols, inplace=True)
        full_df.dropna(subset=['surface'], inplace=True)
        

        #Also drop the carpet matches since they are irrelevant for the modern game.
        full_df = full_df[full_df['surface'] != 'Carpet'].copy()

        print(f"The cleaned DataFrame now has {full_df.shape[0]} rows.")

        # Display the first few rows to verify everything looks correct
        print("\nHere's a preview of the combined data:")
        pd.set_option('display.max_columns', None)
        display(full_df.head())
    
    else:
        print("\n❌ No data files were found. Please check your 'data_directory' path.")


    return full_df

master_df = load_and_clean_data(2000,2025)
training_df = load_and_clean_data(2001,2023)
test_df = load_and_clean_data(2024,2025)
df_2025 = load_and_clean_data(2025,2025)

# We make sure to chronologically order the matches:
master_df.sort_values(by=['tourney_date','tourney_id','match_num'], inplace=True)
training_df.sort_values(by=['tourney_date','tourney_id','match_num'], inplace=True)
test_df.sort_values(by=['tourney_date','tourney_id','match_num'], inplace=True)
df_2025.sort_values(by=['tourney_date','tourney_id','match_num'], inplace=True)

Starting the data loading process...
Successfully loaded 2000.csv
Successfully loaded 2001.csv
Successfully loaded 2002.csv
Successfully loaded 2003.csv
Successfully loaded 2004.csv
Successfully loaded 2005.csv
Successfully loaded 2006.csv
Successfully loaded 2007.csv
Successfully loaded 2008.csv
Successfully loaded 2009.csv
Successfully loaded 2010.csv
Successfully loaded 2011.csv
Successfully loaded 2012.csv
Successfully loaded 2013.csv
Successfully loaded 2014.csv
Successfully loaded 2015.csv
Successfully loaded 2016.csv
Successfully loaded 2017.csv
Successfully loaded 2018.csv
Successfully loaded 2019.csv
Successfully loaded 2020.csv
Successfully loaded 2021.csv
Successfully loaded 2022.csv
Successfully loaded 2023.csv
Successfully loaded 2024.csv
Successfully loaded 2025.csv

✅ All files have been loaded and combined successfully!
The DataFrame has 77198 rows (matches) and 49 columns.
Converting all statistical columns to a numeric data type...
✅ Statistical columns successfully c

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,loser_rank_points,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,2000-7308,Adelaide,Hard,32.0,A,2000-01-03,1.0,E113,1.0,,Thomas Enqvist,R,190.0,SWE,25.791,4.0,,C487,,,Arnaud Clement,R,173.0,FRA,22.026,56.0,,6-3 6-4,3.0,R32,76.0,6.0,0.0,66.0,29.0,23.0,23.0,10.0,2.0,2.0,1.0,3.0,59.0,37.0,25.0,13.0,9.0,2.0,4.0
1,2000-7308,Adelaide,Hard,32.0,A,2000-01-03,2.0,F324,,,Roger Federer,R,185.0,SUI,18.385,64.0,,K260,,,Jens Knippschild,R,190.0,GER,24.862,91.0,,6-1 6-4,3.0,R32,45.0,6.0,3.0,46.0,28.0,24.0,12.0,9.0,0.0,0.0,3.0,5.0,42.0,15.0,13.0,12.0,8.0,0.0,3.0
2,2000-7308,Adelaide,Hard,32.0,A,2000-01-03,3.0,G352,,,Jan-Michael Gambill,R,190.0,USA,22.565,58.0,,A202,,,Wayne Arthurs,L,190.0,AUS,28.778,105.0,,3-6 7-6(5) 6-4,3.0,R32,115.0,8.0,3.0,81.0,40.0,35.0,28.0,15.0,0.0,1.0,26.0,2.0,103.0,59.0,49.0,22.0,16.0,4.0,5.0
3,2000-7308,Adelaide,Hard,32.0,A,2000-01-03,4.0,G379,7.0,,Sebastien Grosjean,R,175.0,FRA,21.58,27.0,,I052,,,Andrew Ilie,R,180.0,AUS,23.691,54.0,,6-2 6-1,3.0,R32,65.0,4.0,2.0,66.0,35.0,28.0,14.0,7.0,4.0,4.0,0.0,3.0,49.0,22.0,12.0,8.0,8.0,1.0,6.0
4,2000-7308,Adelaide,Hard,32.0,A,2000-01-03,5.0,N250,3.0,,Magnus Norman,R,188.0,SWE,23.576,15.0,,D270,,WC,Scott Draper,L,178.0,AUS,25.561,154.0,,6-4 6-4,3.0,R32,68.0,6.0,2.0,52.0,32.0,26.0,12.0,10.0,0.0,1.0,4.0,2.0,73.0,40.0,25.0,16.0,10.0,7.0,10.0


Starting the data loading process...
Successfully loaded 2001.csv
Successfully loaded 2002.csv
Successfully loaded 2003.csv
Successfully loaded 2004.csv
Successfully loaded 2005.csv
Successfully loaded 2006.csv
Successfully loaded 2007.csv
Successfully loaded 2008.csv
Successfully loaded 2009.csv
Successfully loaded 2010.csv
Successfully loaded 2011.csv
Successfully loaded 2012.csv
Successfully loaded 2013.csv
Successfully loaded 2014.csv
Successfully loaded 2015.csv
Successfully loaded 2016.csv
Successfully loaded 2017.csv
Successfully loaded 2018.csv
Successfully loaded 2019.csv
Successfully loaded 2020.csv
Successfully loaded 2021.csv
Successfully loaded 2022.csv
Successfully loaded 2023.csv

✅ All files have been loaded and combined successfully!
The DataFrame has 68551 rows (matches) and 49 columns.
Converting all statistical columns to a numeric data type...
✅ Statistical columns successfully converted!

Dropping rows with missing essential data (like rank or stats)...
The cleane

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,loser_rank_points,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,2001-7308,Adelaide,Hard,32.0,A,2001-01-01,1.0,H432,1.0,,Lleyton Hewitt,R,180.0,AUS,19.852,7.0,,A202,,,Wayne Arthurs,L,190.0,AUS,29.793,83.0,,6-4 6-3,3,R32,83.0,3.0,2.0,60.0,31.0,23.0,18.0,9.0,1.0,1.0,8.0,5.0,65.0,32.0,26.0,13.0,10.0,5.0,8.0
1,2001-7308,Adelaide,Hard,32.0,A,2001-01-01,2.0,P436,,Q,Bjorn Phau,R,175.0,GER,21.246,208.0,,W136,,WC,Todd Woodbridge,R,178.0,AUS,29.752,187.0,,7-5 6-1,3,R32,81.0,5.0,5.0,69.0,34.0,24.0,22.0,9.0,7.0,7.0,1.0,3.0,66.0,35.0,21.0,16.0,10.0,0.0,4.0
2,2001-7308,Adelaide,Hard,32.0,A,2001-01-01,3.0,M680,,Q,Xavier Malisse,R,185.0,BEL,20.454,127.0,,W212,,,Chris Woodruff,R,188.0,USA,27.997,67.0,,2-6 6-3 6-2,3,R32,96.0,5.0,2.0,79.0,41.0,27.0,23.0,13.0,9.0,12.0,5.0,6.0,70.0,43.0,30.0,12.0,12.0,4.0,8.0
3,2001-7308,Adelaide,Hard,32.0,A,2001-01-01,4.0,H355,5.0,,Tommy Haas,R,188.0,GER,22.749,23.0,,S765,,WC,Luke Smith,R,188.0,AUS,24.186,485.0,,6-3 6-3,3,R32,64.0,2.0,1.0,48.0,30.0,25.0,12.0,9.0,1.0,1.0,3.0,5.0,50.0,27.0,15.0,13.0,9.0,3.0,6.0
4,2001-7308,Adelaide,Hard,32.0,A,2001-01-01,5.0,S331,,,Jason Stoltenberg,R,185.0,AUS,30.746,66.0,,G379,4.0,,Sebastien Grosjean,R,175.0,FRA,22.595,19.0,,6-3 6-2,3,R32,74.0,3.0,3.0,59.0,26.0,21.0,17.0,9.0,2.0,3.0,0.0,3.0,65.0,35.0,26.0,8.0,8.0,8.0,12.0


Starting the data loading process...
Successfully loaded 2024.csv
Successfully loaded 2025.csv

✅ All files have been loaded and combined successfully!
The DataFrame has 5269 rows (matches) and 49 columns.
Converting all statistical columns to a numeric data type...
✅ Statistical columns successfully converted!

Dropping rows with missing essential data (like rank or stats)...
The cleaned DataFrame now has 4894 rows.

Here's a preview of the combined data:


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,loser_rank_points,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,2024-339,Brisbane,Hard,32.0,A,2024-01-01,270.0,D875,2.0,,Grigor Dimitrov,R,191.0,BUL,32.6,14.0,2570.0,MC10,,,Andy Murray,R,191.0,GBR,36.6,42.0,1050.0,4-6 7-5 6-2,3.0,R32,147.0,13.0,2.0,82.0,51.0,42.0,16.0,15.0,3.0,5.0,4.0,0.0,86.0,51.0,32.0,21.0,15.0,4.0,8.0
1,2024-339,Brisbane,Hard,32.0,A,2024-01-01,271.0,AE14,,,Daniel Altmaier,R,188.0,GER,25.3,56.0,891.0,TC01,,Q,Li Tu,R,183.0,AUS,27.5,225.0,280.0,7-6(5) 7-6(4),3.0,R32,121.0,7.0,1.0,81.0,55.0,42.0,17.0,12.0,0.0,0.0,3.0,2.0,83.0,46.0,38.0,22.0,12.0,2.0,2.0
3,2024-339,Brisbane,Hard,32.0,A,2024-01-01,273.0,M0FH,,Q,Tomas Machac,R,185.0,CZE,23.2,78.0,722.0,EA24,7.0,,Tomas Martin Etcheverry,R,196.0,ARG,24.4,30.0,1375.0,6-7(5) 7-5 7-6(1),3.0,R32,189.0,5.0,1.0,111.0,70.0,57.0,21.0,18.0,4.0,6.0,6.0,2.0,116.0,82.0,59.0,17.0,18.0,5.0,8.0
4,2024-339,Brisbane,Hard,32.0,A,2024-01-01,274.0,HH26,4.0,,Ugo Humbert,L,188.0,FRA,25.5,20.0,1765.0,M0QL,,Q,Alex Michelsen,R,193.0,USA,19.3,97.0,653.0,6-4 6-4,3.0,R32,95.0,4.0,1.0,53.0,29.0,24.0,19.0,10.0,2.0,2.0,5.0,5.0,72.0,38.0,24.0,18.0,10.0,8.0,10.0
5,2024-339,Brisbane,Hard,32.0,A,2024-01-01,275.0,TC61,,,Jordan Thompson,R,183.0,AUS,29.7,55.0,902.0,V832,,,Aleksandar Vukic,R,188.0,AUS,27.7,62.0,835.0,6-3 6-2,3.0,R32,96.0,7.0,1.0,46.0,28.0,23.0,11.0,9.0,0.0,1.0,3.0,0.0,54.0,32.0,18.0,10.0,8.0,4.0,8.0


Starting the data loading process...
Successfully loaded 2025.csv

✅ All files have been loaded and combined successfully!
The DataFrame has 2193 rows (matches) and 49 columns.
Converting all statistical columns to a numeric data type...
✅ Statistical columns successfully converted!

Dropping rows with missing essential data (like rank or stats)...
The cleaned DataFrame now has 2041 rows.

Here's a preview of the combined data:


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,loser_rank_points,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced
0,2025-9900,United Cup,Hard,18.0,A,2024-12-29,1.0,CD85,,,Pablo Carreno Busta,R,188.0,ESP,33.467,196.0,292.0,S0H2,,,Alexander Shevchenko,R,188.0,KAZ,24.082,78.0,715.0,6-2 6-1,3.0,RR,64.0,3.0,0.0,40.0,27.0,21.0,9.0,8.0,1.0,2.0,3.0,4.0,42.0,24.0,14.0,4.0,7.0,2.0,7.0
1,2025-9900,United Cup,Hard,18.0,A,2024-12-29,2.0,Z371,,,Zhizhen Zhang,R,193.0,CHN,28.203,45.0,1155.0,MJ08,,,Thiago Monteiro,L,183.0,BRA,30.582,109.0,566.0,6-3 6-0,3.0,RR,54.0,10.0,0.0,39.0,30.0,26.0,7.0,8.0,0.0,0.0,3.0,2.0,35.0,22.0,13.0,3.0,7.0,1.0,5.0
2,2025-9900,United Cup,Hard,18.0,A,2024-12-29,3.0,TE51,,,Stefanos Tsitsipas,R,193.0,GRE,26.382,11.0,3165.0,CD85,,,Pablo Carreno Busta,R,188.0,ESP,33.467,196.0,292.0,6-4 4-6 6-3,3.0,RR,128.0,13.0,6.0,84.0,49.0,40.0,19.0,15.0,1.0,3.0,2.0,3.0,92.0,70.0,48.0,9.0,14.0,6.0,9.0
3,2025-9900,United Cup,Hard,18.0,A,2024-12-29,4.0,CG80,,,Borna Coric,R,188.0,CRO,28.123,90.0,639.0,AG37,,,Felix Auger-Aliassime,R,193.0,CAN,24.392,29.0,1635.0,0-6 6-4 6-4,3.0,RR,139.0,1.0,6.0,81.0,40.0,26.0,17.0,13.0,5.0,10.0,13.0,3.0,90.0,67.0,43.0,11.0,13.0,8.0,12.0
4,2025-9900,United Cup,Hard,18.0,A,2024-12-29,5.0,HH26,,,Ugo Humbert,L,188.0,FRA,26.511,14.0,2765.0,S0LA,,,Dominic Stricker,L,183.0,SUI,22.371,299.0,173.0,6-3 7-5,3.0,RR,72.0,10.0,2.0,51.0,34.0,30.0,10.0,11.0,0.0,1.0,10.0,1.0,56.0,34.0,27.0,10.0,10.0,1.0,4.0


In [None]:
### Since our dataset only gives us the date of the first monday of the tournament, in the following cell
#   we try to approximate the actual match date by using usual tournament structures and player number

In [None]:
### Approximate match date by using tournament type, draw size and round

import math
import pandas as pd
import numpy as np
import re

# mapping for the men's level codes
TOURNEY_LENGTH_BY_LEVEL_MEN = {
    'G': 14,   # Grand Slam
    'M': 9,    # Masters 1000 typical (special-case: draw_size==96)
    'A': None, # ambiguous: pick by draw_size fallback
    'C': 7,    # Challenger
    'S': 7,    # Satellite / ITF
    'F': 8,    # Finals / season-ending
    'D': 7     # Davis Cup (flag as special)
}

def infer_tourney_length_men(draw_size, tourney_level):
    """
    Deterministic mapping using men-level codes.
    Returns (tourney_length_days, uncertain_flag)
    """
    uncertain = False

    if pd.isna(tourney_level) or str(tourney_level).strip() == '':
        # no level provided -> fallback to draw-size rules but mark uncertain
        uncertain = True
        if not pd.isna(draw_size):
            ds = int(draw_size)
            if ds >= 128:
                return 14, uncertain
            if ds >= 96:
                return 13, uncertain
            if ds >= 64:
                return 9, uncertain
            return 7, uncertain
        return 7, uncertain

    lvl = str(tourney_level).strip().upper()

    # special case: masters with 96 draw (Indian Wells, Miami)
    if (not pd.isna(draw_size)) and int(draw_size) == 96:
        return 13, False

    if lvl in TOURNEY_LENGTH_BY_LEVEL_MEN:
        val = TOURNEY_LENGTH_BY_LEVEL_MEN[lvl]
        if val is None:
            # 'A' ambiguous: choose by draw size if available, else default 7 but mark uncertain
            if not pd.isna(draw_size):
                ds = int(draw_size)
                if ds >= 64:
                    return 9, False
                return 7, False
            # ambiguous, default and mark uncertain
            return 7, True
        # normal case
        # For Davis Cup ('D') we return 7 but caller can treat it specially via a flag
        return val, False

    # fallback: draw-size-based heuristic
    uncertain = True
    if not pd.isna(draw_size):
        ds = int(draw_size)
        if ds >= 128:
            return 14, uncertain
        if ds >= 96:
            return 13, uncertain
        if ds >= 64:
            return 9, uncertain
        return 7, uncertain
    return 7, uncertain


def round_to_offset_biased(round_label, draw_size, tourney_length):
    """
    Determine an offset (0..tourney_length-1) for a round label.
    Bias later rounds toward the end (non-linear).
    """
    if pd.isna(round_label) or str(round_label).strip() == '':
        return 0
    rl = str(round_label).upper().strip()

    # named rounds
    if rl in ('F', 'FINAL'):
        return tourney_length - 1
    if rl in ('SF', 'SEMI', 'SEMI-FINAL', 'SEMI_FINAL'):
        return max(tourney_length - 3, 0)
    if rl in ('QF', 'QUARTER', 'QUARTER-FINAL'):
        return max(tourney_length - 6, 0)


    # Try R<number> pattern like R128, R64, R32, R16
    early_frac=0.2
    m = re.search(r'R(\d+)', rl)
    if m:
        try:
            match_size = int(m.group(1))
            # compute number of rounds R = log2(draw_size) if possible
            if not pd.isna(draw_size):
                try:
                    R = int(round(math.log2(int(draw_size))))
                except Exception:
                    R = None
            else:
                R = None

            if R is None or R <= 1:
                # fallback small offset near beginning (draw/round ambiguous)
                return int(round((tourney_length - 1) * early_frac))

            # compute round_index and frac in [0,1]
            round_index = int(R - math.log2(match_size) + 1)
            frac = float((round_index - 1) / max(R - 1, 1))
            # clip to [0,1] to avoid negative or >1 values
            frac = max(0.0, min(1.0, frac))
            # bias transform that keeps frac in [0,1]
            frac_bias = frac ** 1.15
            frac_bias = max(0.0, min(1.0, frac_bias))
            offset = int(round(frac_bias * (tourney_length - 1)))
            return offset
        except Exception:
            # anything unexpected -> safe fallback
            return int(round((tourney_length - 1) * early_frac))

    # pattern '1R','2R' etc.
    m2 = re.search(r'(\d+)R', rl)
    if m2:
        rn = int(m2.group(1))
        if not pd.isna(draw_size):
            try:
                match_size = int(draw_size) // (2 ** (rn - 1))
                return round_to_offset_biased(f"R{match_size}", draw_size, tourney_length)
            except Exception:
                pass

    # unknown round -> small offset
    return int((tourney_length - 1) * 0.2)


def compute_approx_match_date_men(df,
                                  monday_col='tourney_date',
                                  draw_col='draw_size',
                                  round_col='round',
                                  level_col='tourney_level'):
    """
    Input: df must contain tourney monday, draw_size (optional), round (string), tourney_level (your codes).
    Output: df with columns:
      - approx_match_date (datetime)
      - tourney_length_days (int)
      - approx_offset_days (int)
      - approx_date_uncertain (bool)
    """
    df = df.copy()
    df[monday_col] = pd.to_datetime(df[monday_col])
    approx_dates, lengths, offsets, unc_flags = [], [], [], []

    for _, row in df.iterrows():
        monday = row[monday_col]
        draw = row.get(draw_col, np.nan)
        rlabel = row.get(round_col, None)
        level = row.get(level_col, None)

        tlen, uncertain = infer_tourney_length_men(draw, level)
        off = round_to_offset_biased(rlabel, draw, tlen)
        approx = monday + pd.Timedelta(days=int(off))

        approx_dates.append(approx)
        lengths.append(int(tlen))
        offsets.append(int(off))
        unc_flags.append(bool(uncertain or (level == 'D')))  # mark Davis Cup as uncertain/special

    out = df.copy()
    out['tourney_length_days'] = lengths
    out['approx_offset_days'] = offsets
    out['approx_match_date'] = pd.to_datetime(approx_dates)
    out['approx_date_uncertain'] = unc_flags
    return out


master_df = compute_approx_match_date_men(master_df)
training_df = compute_approx_match_date_men(training_df)
test_df = compute_approx_match_date_men(test_df)
df_2025 = compute_approx_match_date_men(df_2025)

<class 'pandas.core.frame.DataFrame'>
Index: 2041 entries, 56 to 2192
Data columns (total 57 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   tourney_id              2041 non-null   object        
 1   tourney_name            2041 non-null   object        
 2   surface                 2041 non-null   object        
 3   draw_size               2041 non-null   float64       
 4   tourney_level           2041 non-null   object        
 5   tourney_date            2041 non-null   datetime64[ns]
 6   match_num               1901 non-null   float64       
 7   winner_id               2041 non-null   object        
 8   winner_seed             888 non-null    float64       
 9   winner_entry            302 non-null    object        
 10  winner_name             2041 non-null   object        
 11  winner_hand             2041 non-null   object        
 12  winner_ht               2041 non-null   float64     

None

In [None]:
### To compare betting odds we use a different data set which we indepently load into the notebook

In [None]:
### Import betting odds
file_path = '../tennis_data/tennis-data.co.uk/2025_with_betting_odds.csv'

betting_odds_df = pd.read_csv(file_path, sep=';', skiprows=1)
betting_odds_df['row_idx'] = 0

# Map betting odds file to existing 2025.csv data file
for idx, match in betting_odds_df.iterrows():
        winner_name = match['Winner'].split(' ')[:-1][0]
        loser_name = match['Loser'].split(' ')[:-1][0]
        location = match['Location']

        mask = (df_2025['winner_name'].str.contains(winner_name, case=False) & df_2025['loser_name'].str.contains(loser_name, case=False) & df_2025['tourney_name'].str.contains(location, case=False))
        relevant_matches = df_2025[mask]
        try:
                relevant_match_idx = relevant_matches.index[0]
                betting_odds_df.loc[idx, 'row_idx'] = relevant_match_idx
        except:
                continue

# Get betting odds for matches in 2025.csv
df_2025['avg_odds_winner'] = 0.0
df_2025['avg_odds_winner_exists'] = False
df_2025['avg_odds_loser'] = 0.0
df_2025['avg_odds_loser_exists'] = False


for idx, match in df_2025.iterrows():
        try:
                winner_odds = betting_odds_df.loc[betting_odds_df['row_idx'] == idx, 'AvgW'].item()
                # Convert to decimals with point as seperator
                decomposition = winner_odds.split(',')
                if len(decomposition)==1:
                        decomposition.append('0')
                winner_odds = float(decomposition[0] + '.' + decomposition[1])
                df_2025.loc[idx, 'avg_odds_winner'] = winner_odds
                df_2025.loc[idx, 'avg_odds_winner_exists'] = True
                loser_odds = betting_odds_df.loc[betting_odds_df['row_idx'] == idx, 'AvgL'].item()
                # Convert to decimals with point as seperator
                decomposition = loser_odds.split(',')
                if len(decomposition)==1:
                        decomposition.append('0')
                loser_odds = float(decomposition[0] + '.' + decomposition[1])
                df_2025.loc[idx, 'avg_odds_loser'] = loser_odds
                df_2025.loc[idx, 'avg_odds_loser_exists'] = True

        except ValueError:
                continue

df_2025 = df_2025.loc[(df_2025['avg_odds_winner_exists'] == True) & (df_2025['avg_odds_loser_exists'] == True)]

df_2025.to_csv('2025_raw_data_with_betting_odds.csv', index=False)
print("✅ Saved '2025_raw_data_with_betting_odds.csv'")


Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,Loser,WRank,LRank,WPts,LPts,W1,L1,W2,L2,W3,L3,W4,L4,W5,L5,Wsets,Lsets,Comment,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL,row_idx
0,1,Brisbane,Brisbane International,12/29/24,ATP250,Outdoor,Hard,1st Round,3,Vukic A.,Goffin D.,68,52.0,778,1037.0,6.0,2.0,6.0,3.0,,,,,,,2.0,0.0,Completed,2,18,208,183,214,183,203,178,25
1,1,Brisbane,Brisbane International,12/30/24,ATP250,Outdoor,Hard,1st Round,3,Michelsen A.,O Connell C.,41,64.0,1245,795.0,6.0,4.0,4.0,6.0,7.0,6.0,,,,,2.0,1.0,Completed,144,275,148,285,148,29,143,274,26
2,1,Brisbane,Brisbane International,12/30/24,ATP250,Outdoor,Hard,1st Round,3,Bonzi B.,Tabilo A.,75,23.0,730,1943.0,6.0,7.0,7.0,6.0,6.0,4.0,,,,,2.0,1.0,Completed,167,22,173,222,173,225,167,218,27
3,1,Brisbane,Brisbane International,12/30/24,ATP250,Outdoor,Hard,1st Round,3,Nishioka Y.,Rinderknech A.,69,59.0,776,927.0,6.0,3.0,6.0,4.0,,,,,,,2.0,0.0,Completed,153,25,164,239,165,25,159,236,28
4,1,Brisbane,Brisbane International,12/30/24,ATP250,Outdoor,Hard,1st Round,3,Thompson J.,Berrettini M.,26,34.0,1745,1380.0,3.0,6.0,6.0,3.0,6.0,4.0,,,,,2.0,1.0,Completed,263,15,247,16,263,16,248,154,29


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,winner_rank_points,loser_id,loser_seed,loser_entry,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,loser_rank_points,score,best_of,round,minutes,w_ace,w_df,w_svpt,w_1stIn,w_1stWon,w_2ndWon,w_SvGms,w_bpSaved,w_bpFaced,l_ace,l_df,l_svpt,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,tourney_length_days,approx_offset_days,approx_match_date,approx_date_uncertain,avg_odds_winner,avg_odds_winner_exists,avg_odds_loser,avg_odds_loser_exists
56,2025-336,Hong Kong,Hard,28.0,A,2024-12-29,1.0,KI95,,,Miomir Kecmanovic,R,183.0,SRB,25.331,54.0,1021.0,D0FJ,,,Luciano Darderi,R,183.0,ITA,22.872,44.0,1198.0,6-3 6-3,3.0,R32,67.0,9.0,1.0,57.0,39.0,32.0,8.0,9.0,2.0,2.0,9.0,1.0,53.0,35.0,22.0,10.0,9.0,6.0,9.0,7,0,2024-12-29,False,1.32,True,3.27,True
57,2025-336,Hong Kong,Hard,28.0,A,2024-12-29,2.0,M0CI,,,Fabian Marozsan,R,193.0,HUN,25.227,58.0,935.0,SX50,,,Roman Safiullin,R,185.0,RUS,27.395,60.0,923.0,6-1 7-6(9),3.0,R32,93.0,6.0,3.0,70.0,41.0,30.0,14.0,10.0,1.0,5.0,4.0,3.0,72.0,47.0,29.0,8.0,9.0,10.0,16.0,7,0,2024-12-29,False,2.54,True,1.53,True
58,2025-336,Hong Kong,Hard,28.0,A,2024-12-29,3.0,SU87,,,Lorenzo Sonego,R,191.0,ITA,29.637,53.0,1026.0,N0AE,,,Brandon Nakashima,R,188.0,USA,23.406,38.0,1335.0,7-6(4) 6-3,3.0,R32,90.0,5.0,1.0,61.0,38.0,34.0,17.0,11.0,0.0,0.0,6.0,0.0,56.0,39.0,33.0,10.0,10.0,1.0,2.0,7,0,2024-12-29,False,2.43,True,1.54,True
59,2025-336,Hong Kong,Hard,28.0,A,2024-12-29,4.0,N771,,,Cameron Norrie,L,188.0,GBR,29.352,49.0,1119.0,T0HA,,,Learner Tien,L,180.0,USA,19.075,122.0,493.0,6-3 7-5,3.0,R32,98.0,11.0,1.0,78.0,60.0,38.0,8.0,11.0,5.0,7.0,6.0,5.0,69.0,42.0,27.0,11.0,10.0,5.0,9.0,7,0,2024-12-29,False,1.63,True,2.22,True
60,2025-336,Hong Kong,Hard,28.0,A,2024-12-29,5.0,BU13,,,Zizou Bergs,R,185.0,BEL,25.574,71.0,768.0,M0HT,,,Alejandro Moro Canas,R,183.0,ESP,24.06,169.0,345.0,6-4 6-4,3.0,R32,91.0,11.0,1.0,54.0,38.0,31.0,11.0,10.0,1.0,2.0,2.0,3.0,78.0,48.0,31.0,12.0,10.0,6.0,9.0,7,0,2024-12-29,False,1.46,True,2.66,True


✅ Saved '2025_raw_data_with_betting_odds.csv'


In [None]:
### Now we start with the feature generation
### The first idea is to implement an ELO feature which tries to capture player strength in a more optimal way than the ATP rank.
### An ELO rating system rates players on the assumption that their performance is a normally distributed random variable with mean at their "actual" paying strenght.

In [17]:
from tqdm import tqdm

### Write ELO functions to generate player ELO's

# ELO update function using logistic distribution
def update_elo(elo_winner, elo_loser, k_factor=20):
    """
    Updates ELO ratings for a winner and loser.
    """
    expected_win = 1 / (1 + 10**((elo_loser - elo_winner) / 400))
    
    # Calculate the change in ELO
    change_in_elo = k_factor * (1 - expected_win)
    
    # Update ratings
    new_elo_winner = elo_winner + change_in_elo
    new_elo_loser = elo_loser - change_in_elo
    
    return new_elo_winner, new_elo_loser

# Calculate ELO for every player and every match

# Make sure data frame is sorted. This is very important for ELO calculation
master_df['approx_match_date'] = pd.to_datetime(master_df['approx_match_date'])
master_df.sort_values(by='approx_match_date', inplace=True)

def calculate_ELO_for_df(df):

    # Initialize a dictionary to store the current ELO of each player
    elo_ratings = {}
    elo_ratings_Clay = {}
    elo_ratings_Grass = {}
    elo_ratings_Hard = {}
    STARTING_ELO = 1500

    # Lists to store the calculated pre-match ELO ratings
    winner_elos = []
    winner_elos_Clay = []
    winner_elos_Grass = []
    winner_elos_Hard = []
    loser_elos = []
    loser_elos_Clay = []
    loser_elos_Grass = []
    loser_elos_Hard = []

    # Add (surface -  relevent ELO dictionary) dictionary
    elo_dictionaries = {
        'Clay': elo_ratings_Clay, 'Grass': elo_ratings_Grass, 'Hard': elo_ratings_Hard
    } 
    
    print("Calculating ELO ratings for all matches...")
    # Loop through every match in chronological order
    for index, match in tqdm(df.iterrows()):
        winner_name = match['winner_name']
        loser_name = match['loser_name']
        match_surface = match['surface']
        
        # Look-up Step 
        # Get the current ELO for both players. If a player is new, assign the starting ELO.
        winner_pre_match_elo = elo_ratings.get(winner_name, STARTING_ELO)
        loser_pre_match_elo = elo_ratings.get(loser_name, STARTING_ELO)

        winner_pre_match_elo_Clay = elo_ratings_Clay.get(winner_name, STARTING_ELO)
        loser_pre_match_elo_Clay = elo_ratings_Clay.get(loser_name, STARTING_ELO)

        winner_pre_match_elo_Grass = elo_ratings_Grass.get(winner_name, STARTING_ELO)
        loser_pre_match_elo_Grass = elo_ratings_Grass.get(loser_name, STARTING_ELO)

        winner_pre_match_elo_Hard = elo_ratings_Hard.get(winner_name, STARTING_ELO)
        loser_pre_match_elo_Hard = elo_ratings_Hard.get(loser_name, STARTING_ELO)

        
        # Store these pre-match ratings as our features for this row
        winner_elos.append(winner_pre_match_elo)
        loser_elos.append(loser_pre_match_elo)

        winner_elos_Clay.append(winner_pre_match_elo_Clay)
        loser_elos_Clay.append(loser_pre_match_elo_Clay)

        winner_elos_Grass.append(winner_pre_match_elo_Grass)
        loser_elos_Grass.append(loser_pre_match_elo_Grass)

        winner_elos_Hard.append(winner_pre_match_elo_Hard)
        loser_elos_Hard.append(loser_pre_match_elo_Hard)

        # Add (surface -  relevent prematch ELO) dictionary
        pre_match_elos = {
        'Clay': [winner_pre_match_elo_Clay,loser_pre_match_elo_Clay], 'Grass': [winner_pre_match_elo_Grass,loser_pre_match_elo_Grass], 
        'Hard': [winner_pre_match_elo_Hard,loser_pre_match_elo_Hard]
        }

        
        # Update Step
        # Calculate the new ELO ratings after the match
        new_winner_elo, new_loser_elo = update_elo(winner_pre_match_elo, loser_pre_match_elo)
        new_winner_elo_surface, new_loser_elo_surface = update_elo(pre_match_elos[match_surface][0], pre_match_elos[match_surface][1])
        
        # Save the new ratings back to our dictionary for the next match
        elo_ratings[winner_name] = new_winner_elo
        elo_ratings[loser_name] = new_loser_elo
        elo_dictionaries[match_surface][winner_name] = new_winner_elo_surface
        elo_dictionaries[match_surface][loser_name] = new_loser_elo_surface


    return winner_elos, loser_elos, winner_elos_Clay, loser_elos_Clay, winner_elos_Grass, loser_elos_Grass, winner_elos_Hard, loser_elos_Hard

ELOs = calculate_ELO_for_df(master_df)

# Add general ELO column to data frame
master_df['winner_ELO'] = ELOs[0]
master_df['loser_ELO'] = ELOs[1]
# Add ELO's for specific surfaces
master_df['winner_ELO_clay'] = ELOs[2]
master_df['loser_ELO_clay'] = ELOs[3]

master_df['winner_ELO_grass'] = ELOs[4]
master_df['loser_ELO_grass'] = ELOs[5]

master_df['winner_ELO_hard'] = ELOs[6]
master_df['loser_ELO_hard'] = ELOs[7]

# Split master_df into training_df and test_df

# Define the date where your test set begins
test_start_date = pd.to_datetime('2024-01-01')
# Define the date where training set begins
training_start_date = pd.to_datetime('2001-01-01')

# Create ELO-enriched training and test sets
training_df = master_df[(master_df['tourney_date'] >= training_start_date) & (master_df['tourney_date'] < test_start_date)].copy()
test_df = master_df[master_df['tourney_date'] >= test_start_date].copy()

print(f"\nFinal training set with ELO has {len(training_df)} matches.")
print(f"Final test set with ELO has {len(test_df)} matches.")

Calculating ELO ratings for all matches...


67730it [00:01, 49610.32it/s]


Final training set with ELO has 59987 matches.
Final test set with ELO has 4894 matches.





In [None]:
### Implement function that finds recent matches of a specific player in a given timewindow and which will be used to calculate rolling averages.

In [8]:
from datetime import timedelta

def get_recent_matches(player_name, cutoff_date, df, time_window_days):
    # ---
    # Outputs recent matches for a specific player within a specified timeframe ending at a specified cutoff date
    # ---

    player_matches = df[
        (df['winner_name'] == player_name) |
        (df['loser_name'] == player_name)
    ]

    start_date = cutoff_date - timedelta(days=time_window_days)
    recent_matches = player_matches[
        (player_matches['approx_match_date'] >= start_date) &
        (player_matches['approx_match_date'] < cutoff_date)
    ].copy()


    return recent_matches

In [None]:
### Here the main feature generation of player statistics starts

In [18]:
### This cell is mainly for player stat calculations

import numpy as np

def get_stats_from_df(filtered_df,player_name): 
        # ---
        # Takes as input a dataframe that must be already filtered for the specific players matches
        # Outputs player specific stats which are specified in the dictionary
        # ---
        
        # Handle edge case of empty data frame
        if filtered_df.empty:
            return pd.Series({
                'win_pc': 0.0,
                'matches_played': 0,
                'ace_ratio': 0.0,
                'df_ratio': 0.0,
                'ace_vs_df_ratio': 0.0,
                '1st_serve_in_pc': 0.0,
                '1st_serve_win_pc': 0.0,
                '2nd_serve_win_pc': 0.0,
                'return_win_pc': 0.0,
                'bp_save_pc': 0.0,
                'bp_conversion_pc': 0.0,
                'tiebreak_win_pc': 0.0,
                'win_pc_vs_top10': 0.0
            })

        df = filtered_df.copy()     #Work on copy to avoid warnings related to pandas slices

        # Get stats for player and create new columns to store
        is_win = (df['winner_name'] == player_name).values
        df['aces'] = np.where(is_win, df['w_ace'], df['l_ace'])
        df['dfs'] = np.where(is_win, df['w_df'], df['l_df'])
        df['svpt'] = np.where(is_win, df['w_svpt'], df['l_svpt'])
        df['first_in'] = np.where(is_win, df['w_1stIn'], df['l_1stIn'])
        df['first_won'] = np.where(is_win, df['w_1stWon'], df['l_1stWon'])
        df['second_won'] = np.where(is_win, df['w_2ndWon'], df['l_2ndWon'])
        df['bp_saved'] = np.where(is_win, df['w_bpSaved'], df['l_bpSaved'])
        df['bp_faced'] = np.where(is_win, df['w_bpFaced'], df['l_bpFaced'])
        df['opp_svpt'] = np.where(is_win, df['l_svpt'], df['w_svpt'])
        df['opp_svpts_won'] = np.where(is_win, 
                                    df['l_1stWon'] + df['l_2ndWon'], 
                                    df['w_1stWon'] + df['w_2ndWon'])
        df['return_pts_won'] = df['opp_svpt'] - df['opp_svpts_won']
        df['break_opportunities'] = np.where(is_win, df['l_bpFaced'], df['w_bpFaced'])
        df['bp_won'] = np.where(is_win, df['l_bpFaced'] - df['l_bpSaved'], df['w_bpFaced'] - df['w_bpSaved'])

        # Calculate the aggregated metrics
        # TIE-BREAK WIN PC
        # Filter for matches that included a tiebreak
        tiebreak_matches = df[df['score'].str.contains('7-6|6-7', na=False)]
        
        if not tiebreak_matches.empty:
            # Check who won in those specific tiebreak matches
            tb_wins = (tiebreak_matches['winner_name'] == player_name).sum()
            tiebreak_win_pc = tb_wins / len(tiebreak_matches)
        else:
            tiebreak_win_pc = 0.0 # No tiebreaks played

        #WIN PC VS TOP10
        # Create new column with opponent rank to filter for top ten matches
        df.loc[:, 'opponent_rank'] = np.where(is_win, df['loser_rank'], df['winner_rank'])
        top_10_matches = df[df['opponent_rank'] <= 10]
        
        if not top_10_matches.empty:
            top_10_wins = np.nansum((top_10_matches['winner_name'] == player_name))
            win_pc_vs_top10 = top_10_wins / top_10_matches.shape[0]
        else:
            win_pc_vs_top10 = 0.0 # No matches against top 10 in this period

        # Per match ratios
        df['ace_ratio'] = np.where(df['svpt'] > 0, df['aces'] / df['svpt'], 0)
        df['df_ratio'] = np.where(df['svpt'] > 0, df['dfs'] / df['svpt'], 0)
        df['ace_vs_df_ratio'] = df['aces']/(df['dfs'] + 1)
        df['1st_serve_in_pc'] = np.where(df['svpt'] > 0, df['first_in']/df['svpt'], 0)
        df['1st_serve_win_pc'] = np.where(df['first_in'] > 0, df['first_won'] / df['first_in'], 0)
        df['2nd_serve_win_pc'] = np.where((df['svpt'] - df['first_in']) > 0, df['second_won'] / (df['svpt'] - df['first_in']), 0)
        df['return_win_pc'] = np.where(df['opp_svpt'] > 0, df['return_pts_won'] / df['opp_svpt'], 0)
        df['bp_save_pc'] = np.where(df['bp_faced'] > 0, df['bp_saved'] / df['bp_faced'], 1)
        df['bp_conversion_pc'] = np.where(df['break_opportunities'] > 0, df['bp_won'] / df['break_opportunities'], 0 )


        stats = {
            'win_pc': is_win.mean(),
            'matches_played': len(df),
            'ace_ratio': df['ace_ratio'].mean(),
            'df_ratio': df['df_ratio'].mean(),
            'ace_vs_df_ratio': df['ace_vs_df_ratio'].mean(),
            '1st_serve_in_pc': df['1st_serve_in_pc'].mean(), 
            '1st_serve_win_pc': df['1st_serve_win_pc'].mean(),
            '2nd_serve_win_pc': df['2nd_serve_win_pc'].mean(),
            'return_win_pc': df['return_win_pc'].mean(),
            'bp_save_pc': df['bp_save_pc'].mean(),
            'bp_conversion_pc': df['bp_conversion_pc'].mean(),
            'tiebreak_win_pc': tiebreak_win_pc,
            'win_pc_vs_top10': win_pc_vs_top10
        }

        return pd.Series(stats)


def get_overall_form(player_name, cutoff_date, df, time_window_days):
     # --- 
     # Calculates player stats for given data frame over specified time window on all surfaces
     # ---
     
     recent_matches = get_recent_matches(player_name,cutoff_date,df,time_window_days)

     return get_stats_from_df(recent_matches,player_name)

def get_surface_form(player_name, cutoff_date, match_surface, df, time_window_days):
     # --- 
     # Calculates player stats for given data frame over specified time window on a specific surface
     # ---
    
    recent_matches = get_recent_matches(player_name, cutoff_date, df, time_window_days)
    surface_matches = recent_matches[recent_matches['surface'] == match_surface].copy()

    return get_stats_from_df(surface_matches, player_name)


def _get_fatigue_stats_for_player(player_name, cutoff_date, df):
    """
    Private helper function to calculate all fatigue and rust stats for one player.
    """
    # 1. Filter for the longest time window ONCE
    matches_last_30d = get_recent_matches(player_name, cutoff_date, df, time_window_days = 30)
    
    # 2. Find smaller windows from the already-filtered data
    window_14d = cutoff_date - timedelta(days=14)
    matches_last_14d = matches_last_30d[matches_last_30d['approx_match_date'] >= window_14d]
    
    window_7d = cutoff_date - timedelta(days=7)
    matches_last_7d = matches_last_14d[matches_last_14d['approx_match_date'] >= window_7d]
    
    # 3. Calculate "days since last match" (rust feature)
    if matches_last_30d.empty:
        days_since_last_match = 90 # Assign a default large value
    else:
        last_match_date = matches_last_30d['approx_match_date'].max()
        days_since_last_match = (cutoff_date - last_match_date).days
        
    # 4. Return all stats in a dictionary
    return {
        'matches_last_7d': len(matches_last_7d),
        'minutes_on_court_last_7d': matches_last_7d['minutes'].sum(),
        'matches_last_14d': len(matches_last_14d),
        'minutes_on_court_last_14d': matches_last_14d['minutes'].sum(),
        'matches_last_30d': len(matches_last_30d),
        'minutes_on_court_last_30d': matches_last_30d['minutes'].sum(),
        'days_since_last_match': days_since_last_match
    }

In [19]:
### Feature generation

from tqdm import tqdm

def create_static_features(df_to_process):
    # ---
    # Outputs a data frame with all static and head to head features
    # ---

    feature_rows = []

    for index, match in tqdm(df_to_process.iterrows()):
        # Alphabetical assignment for P1/P2 and getting static features
        if match['winner_name'] < match['loser_name']:
            p1_name, p2_name = match['winner_name'], match['loser_name']
            p1_rank, p2_rank = match['winner_rank'], match['loser_rank']
            p1_age, p2_age = match['winner_age'], match['loser_age']
            p1_height, p2_height = match['winner_ht'], match['loser_ht']
            target = 1
        else:
            p1_name, p2_name = match['loser_name'], match['winner_name']
            p1_rank, p2_rank = match['loser_rank'], match['winner_rank']
            p1_age, p2_age = match['loser_age'], match['winner_age']
            p1_height, p2_height = match['loser_ht'], match['winner_ht']
            target = 0


        feature_rows.append({
            'p1_rank': p1_rank, 'p2_rank': p2_rank, 'rank_diff': p1_rank - p2_rank,
            'p1_age': p1_age, 'p2_age': p2_age, 'age_diff': p1_age - p2_age,
            'p1_height': p1_height, 'p2_height': p2_height, 'height_diff': p1_height - p2_height, 
            'surface': match['surface'], 'round': match['round'],
            'target': target
        })
    
    feature_rows = pd.DataFrame(feature_rows)

    # 1. Define all possible categories for 'surface' and 'round'.
    #    Make sure these lists contain every category your model was trained on.
    all_surfaces = ['Clay', 'Grass', 'Hard']
    all_rounds = ['BR', 'F', 'QF','R128','R16', 'R32', 'R64', 'RR', 'SF'] # Example for Grand Slams

    # 2. Convert the columns to a categorical type using the full list of categories.
    #    This tells pandas about all possible categories that should exist.
    feature_rows['surface'] = pd.Categorical(feature_rows['surface'], categories=all_surfaces)
    feature_rows['round'] = pd.Categorical(feature_rows['round'], categories=all_rounds)

    # Convert surface and round info to binary data
    feature_rows = pd.get_dummies(feature_rows, columns=['surface', 'round'], prefix=['surface', 'round'])
    
    return feature_rows

def create_h2h_features(df_to_process, historical_df):
    
    feature_rows = []
    surfaces = ['Clay', 'Grass', 'Hard']

    for index, match in tqdm(df_to_process.iterrows()):
        # Alphabetical assignment for P1/P2 and getting static features
        if match['winner_name'] < match['loser_name']:
            p1_name, p2_name = match['winner_name'], match['loser_name']
        else: 
             p1_name, p2_name = match['loser_name'], match['winner_name']
        
        # Get H2H features
        h2h_matches = historical_df[
            ((historical_df['winner_name'] == p1_name) & (historical_df['loser_name'] == p2_name)) |
            ((historical_df['winner_name'] == p2_name) & (historical_df['loser_name'] == p1_name))
        ]
        h2h_matches_before = h2h_matches[h2h_matches['approx_match_date'] < match['approx_match_date']]
        
        #Calculate basic stats
        def h2h_stats(h2h_matches_before):
            p1_h2h_wins = h2h_matches_before[h2h_matches_before['winner_name'] == p1_name].shape[0]
            h2h_matches_played = h2h_matches_before.shape[0]
            p2_h2h_wins = h2h_matches_played - p1_h2h_wins
            p1_h2h_win_pc = p1_h2h_wins/h2h_matches_played if h2h_matches_played != 0 else 0.5
            p2_h2h_win_pc = p2_h2h_wins/h2h_matches_played if h2h_matches_played != 0 else 0.5
            diff_h2h_win_pc = p1_h2h_win_pc - p2_h2h_win_pc

            return [p1_h2h_wins, p2_h2h_wins, h2h_matches_played, p1_h2h_win_pc, p2_h2h_win_pc, diff_h2h_win_pc]

        general_stats = h2h_stats(h2h_matches_before)

        # Add stats to dictionary
        features = {
            'p1_h2h_wins': general_stats[0], 'p2_h2h_wins': general_stats[1], 'h2h_matches_played': general_stats[2],
            'p1_h2h_win_pc': general_stats[3], 'p2_h2h_win_pc': general_stats[4], 'diff_h2h_win_pc': general_stats[5]
        }

        # Loop through all surfaces to create same stats
        features_surface = {}
        for surface in surfaces:
            h2h_matches_surface = h2h_matches_before[h2h_matches_before['surface'] == surface]
            

            for idx,stat in enumerate(features.keys()):
                features_surface[f'{stat}_{surface}'] = h2h_stats(h2h_matches_surface)[idx]
            
        features.update(features_surface)

        feature_rows.append(features)

    return pd.DataFrame(feature_rows)
        

def create_general_dynamic_features(df_to_process, historical_df):
    # ---
    # Outputs a data frame with all dynamic player features over all surfaces and different time windows
    # ---

    feature_rows = []
    time_windows = [90, 180, 360]

    for index, match in tqdm(df_to_process.iterrows()):
        # Alphabetical assignment for P1/P2
        if match['winner_name'] < match['loser_name']:
            p1_name, p2_name = match['winner_name'], match['loser_name']
        else:
            p1_name, p2_name = match['loser_name'], match['winner_name']

        # Getting rolling features for time frame and add it to dictionary
        row_features = {}
        for window in time_windows:
            p1_form = get_overall_form(p1_name, match['approx_match_date'], historical_df, window)
            p2_form = get_overall_form(p2_name, match['approx_match_date'], historical_df, window)

            # Calculate differences in player stats
            for stat, val in p1_form.items(): row_features[f'p1_{stat}_{window}d'] = val
            for stat, val in p2_form.items(): row_features[f'p2_{stat}_{window}d'] = val
            for stat in p1_form.index:
                row_features[f'diff_{stat}_{window}d'] = p1_form[stat] - p2_form[stat]

        feature_rows.append(row_features)
    return pd.DataFrame(feature_rows)


def create_surface_dynamic_features(df_to_process, historical_df):
    # ---
    # Outputs a data frame with all dynamic player features for the specific surface of the match and different time windows
    # ---

    feature_rows = []
    time_windows = [90, 180, 360]
    surfaces_to_calculate = ['Hard', 'Clay', 'Grass']

    for index, match in tqdm(df_to_process.iterrows()):
        # Alphabetical assignment for P1/P2
        if match['winner_name'] < match['loser_name']:
            p1_name, p2_name = match['winner_name'], match['loser_name']
        else:
            p1_name, p2_name = match['loser_name'], match['winner_name']

        row_features = {}

        for window in time_windows:
            # Inner loop to calculate stats for each surface
            for surface in surfaces_to_calculate:
                p1_form = get_surface_form(p1_name, match['approx_match_date'], surface, historical_df, window)
                p2_form = get_surface_form(p2_name, match['approx_match_date'], surface, historical_df, window)

                # Add a descriptive suffix, e.g., '_Hard'
                p1_form = p1_form.add_suffix(f'_{surface}')
                p2_form = p2_form.add_suffix(f'_{surface}')

                # Add the stats to our main feature dictionary for the row
                for stat, val in p1_form.items():
                    row_features[f'p1_{stat}_{window}d'] = val
                for stat, val in p2_form.items():
                    row_features[f'p2_{stat}_{window}d'] = val
                for stat in p1_form.index:
                    if stat in p2_form.index:
                        row_features[f'diff_{stat}_{window}d'] = p1_form[stat] - p2_form[stat]

        feature_rows.append(row_features)
    return pd.DataFrame(feature_rows)


def create_fatigue_features(df_to_process, historical_df):
    # --- 
    # Outputs a dataframe with fatigue features, i.e. number of matches in past 14 days and number of matches in last 30 days
    # ---

    feature_rows = []

    for index, match in tqdm(df_to_process.iterrows()):
        # Alphabetical assignment for P1/P2
        if match['winner_name'] < match['loser_name']:
            p1_name, p2_name = match['winner_name'], match['loser_name']
        else:
            p1_name, p2_name = match['loser_name'], match['winner_name']

        # Call the helper function once for each player
        p1_stats = _get_fatigue_stats_for_player(p1_name, match['approx_match_date'], historical_df)
        p2_stats = _get_fatigue_stats_for_player(p2_name, match['approx_match_date'], historical_df)
        
        # Assemble the final feature row
        feature_row = {}
        for stat_name, p1_val in p1_stats.items():
            p2_val = p2_stats[stat_name]
            feature_row[f'p1_{stat_name}'] = p1_val
            feature_row[f'p2_{stat_name}'] = p2_val
            feature_row[f'diff_{stat_name}'] = p1_val - p2_val
        
        feature_rows.append(feature_row)

    return pd.DataFrame(feature_rows)
    

def create_ELO_features(df_to_process):
    feature_rows = []

    for index, match in tqdm(df_to_process.iterrows()):
           # Alphabetical assignment for P1/P2
            if match['winner_name'] < match['loser_name']:
                p1_name, p2_name = match['winner_name'], match['loser_name']
                p1_ELO, p2_ELO = match['winner_ELO'], match['loser_ELO']
                p1_ELO_clay, p2_ELO_clay = match['winner_ELO_clay'], match['loser_ELO_clay']
                p1_ELO_grass, p2_ELO_grass = match['winner_ELO_grass'], match['loser_ELO_grass']
                p1_ELO_hard, p2_ELO_hard = match['winner_ELO_hard'], match['loser_ELO_hard']
            else:
                p1_name, p2_name = match['loser_name'], match['winner_name']
                p1_ELO, p2_ELO = match['loser_ELO'], match['winner_ELO']
                p1_ELO_clay, p2_ELO_clay = match['loser_ELO_clay'], match['winner_ELO_clay']
                p1_ELO_grass, p2_ELO_grass = match['loser_ELO_grass'], match['winner_ELO_grass']
                p1_ELO_hard, p2_ELO_hard = match['loser_ELO_hard'], match['winner_ELO_hard']


            feature_rows.append({
                'p1_ELO': p1_ELO, 'p2_ELO': p2_ELO, 'diff_ELO': p1_ELO - p2_ELO,
                'p1_ELO_clay': p1_ELO_clay, 'p2_ELO_clay': p2_ELO_clay, 'diff_ELO_clay': p1_ELO_clay - p2_ELO_clay,
                'p1_ELO_grass': p1_ELO_grass, 'p2_ELO_grass': p2_ELO_grass, 'diff_ELO_grass': p1_ELO_grass - p2_ELO_grass,
                'p1_ELO_hard': p1_ELO_hard, 'p2_ELO_hard': p2_ELO_hard, 'diff_ELO_hard': p1_ELO_hard - p2_ELO_hard
            })

    return pd.DataFrame(feature_rows)


In [None]:
### Having implemented all feature generation functions the below cells run them to actually calculate the feature sets.
### Runtime is very long for some sets!

In [23]:
### Execute and save STATIC FEATURES

print("\nGenerating static features for TRAINING data...")
static_train_df = create_static_features(training_df)
static_train_df.to_csv('static_features_train.csv', index=False)
print("✅ Saved 'static_features_train.csv'")

print("\nGenerating static features for TEST data...")
static_test_df = create_static_features(test_df)
static_test_df.to_csv('static_features_test.csv', index=False)
print("✅ Saved 'static_features_test.csv'")


Generating static features for TRAINING data...


59987it [00:01, 33208.43it/s]


✅ Saved 'static_features_train.csv'

Generating static features for TEST data...


4894it [00:00, 37350.81it/s]

✅ Saved 'static_features_test.csv'





In [20]:
### Execute and save H2H FEATURES

print("\nGenerating H2H features for TRAINING data...")
static_train_df = create_h2h_features(training_df, master_df)
static_train_df.to_csv('h2h_features_train.csv', index=False)
print("✅ Saved 'h2h_features_train.csv'")

print("\nGenerating H2H features for TEST data...")
static_test_df = create_h2h_features(test_df, master_df)
static_test_df.to_csv('h2h_features_test.csv', index=False)
print("✅ Saved 'h2h_features_test.csv'")


Generating H2H features for TRAINING data...


59987it [12:18, 81.19it/s]


✅ Saved 'h2h_features_train.csv'

Generating H2H features for TEST data...


4894it [01:00, 81.34it/s]

✅ Saved 'h2h_features_test.csv'





In [21]:
### Execute and save GENERAL DYNAMIC FEATURES

print("\nGenerating OVERALL rolling features for TRAINING data...")
rolling_train_df = create_general_dynamic_features(training_df, training_df)
rolling_train_df.to_csv('rolling_features_train_2_0.csv', index=False)
print("✅ Saved 'rolling_features_train_2_0.csv'")

print("\nGenerating OVERALL rolling features for TEST data...")
rolling_test_df = create_general_dynamic_features(test_df, master_df)
rolling_test_df.to_csv('rolling_features_test_2_0.csv', index=False)
print("✅ Saved 'rolling_features_test_2_0.csv'")


Generating OVERALL rolling features for TRAINING data...


59987it [48:01, 20.82it/s]


✅ Saved 'rolling_features_train_2_0.csv'

Generating OVERALL rolling features for TEST data...


4894it [04:21, 18.70it/s]


✅ Saved 'rolling_features_test_2_0.csv'


In [22]:
### Execute and save SURFACE DYNAMIC FEATURES

print("\nGenerating SURFACE-SPECIFIC features for TRAINING data...")
surface_train_df = create_surface_dynamic_features(training_df, training_df)
surface_train_df.to_csv('surface_features_train_2_0.csv', index=False)
print("✅ Saved 'surface_features_train_2_0.csv'")

print("\nGenerating SURFACE-SPECIFIC features for TEST data...")
surface_test_df = create_surface_dynamic_features(test_df, master_df)
surface_test_df.to_csv('surface_features_test_2_0.csv', index=False)
print("✅ Saved 'surface_features_test_2_0.csv'")


Generating SURFACE-SPECIFIC features for TRAINING data...


59987it [2:10:06,  7.68it/s]


✅ Saved 'surface_features_train_2_0.csv'

Generating SURFACE-SPECIFIC features for TEST data...


4894it [11:57,  6.82it/s]


✅ Saved 'surface_features_test_2_0.csv'


In [24]:
### Execute and save FATIGUE FEATURES

print("\nGenerating FATIGUE features for TRAINING data...")
fatigue_train_df = create_fatigue_features(training_df, training_df)
fatigue_train_df.to_csv('fatigue_features_train.csv', index=False)
print("✅ Saved 'fatigue_features_train.csv'")

print("\nGenerating FATIGUE features for TEST data...")
fatigue_test_df = create_fatigue_features(test_df, master_df)
fatigue_test_df.to_csv('fatigue_features_test.csv', index=False)
print("✅ Saved 'fatigue_features_test.csv'")


Generating FATIGUE features for TRAINING data...


59987it [09:50, 101.62it/s]


✅ Saved 'fatigue_features_train.csv'

Generating FATIGUE features for TEST data...


4894it [00:56, 87.05it/s]

✅ Saved 'fatigue_features_test.csv'





In [25]:
### Execute and save ELO FEATURES

print("\nGenerating ELO features for TRAINING data...")
elo_train_df = create_ELO_features(training_df)
elo_train_df.to_csv('elo_features_train.csv', index=False)
print("✅ Saved 'elo_features_train.csv'")

print("\nGenerating ELO features for TEST data...")
elo_test_df = create_ELO_features(test_df)
elo_test_df.to_csv('elo_features_test.csv', index=False)
print("✅ Saved 'elo_features_test.csv'")


Generating ELO features for TRAINING data...


59987it [00:01, 34942.88it/s]


✅ Saved 'elo_features_train.csv'

Generating ELO features for TEST data...


4894it [00:00, 37573.28it/s]

✅ Saved 'elo_features_test.csv'



