In [None]:
### Goal of this notebook is to apply and test the trained XGBoost model on the US Open 2025

In [None]:
import requests
from typing import Dict
from datetime import timedelta
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
### Fetch draw data from the US Open website

In [None]:
def fetch_json(feed_url: str, timeout: int = 10) -> Dict:
    r = requests.get(feed_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
    r.raise_for_status()
    return r.json()

# JSON URL for draw data
ms_feed = "https://www.usopen.org/en_US/scores/feeds/2025/draws/MS.json"

draw_data = fetch_json(ms_feed)

In [None]:
start_date = '2025-08-17'
start_date = pd.to_datetime(start_date)

def create_df_from_draw_data_json(draw_data):

    # Access the relevant data within the JSON
    # Assume the match data is under a key called 'matches'

    try:
        match_data = draw_data['matches']
        
        # Create a pandas DataFrame
        df = pd.DataFrame(match_data)
        
        # Display the first 5 rows of the DataFrame
        print("DataFrame created successfully!")
        pd.set_option('display.max_columns', None)
        display(df.info())
        display(df[df['eventDay'].isnull() == True])


    except KeyError:
        print("The key 'matches' was not found. Please inspect the available keys:")
        print(draw_data.keys())
    except TypeError:
        print("The JSON data might not be a dictionary. Let's see what it is:")
        print(type(draw_data))
        print(draw_data)

    df = df[df['status'] != 'Walkover']

    return df

draw_df = create_df_from_draw_data_json(draw_data)

def fetch_matches_for_round(draw_df, round):
    round_matches = draw_df[draw_df['roundNameShort'] == round]

    return round_matches



# possible round short hands as input: 'R1', 'R2', 'R3', 'R4', 'QF', 'SF', 'F'
def get_match_stats_for_input_df(df, round):
    round_matches = fetch_matches_for_round(df,round)

    # Reset the index of the filtered DataFrame to ensure it starts from 0
    # drop=True prevents the old index from being added as a new column
    round_matches = round_matches.reset_index(drop=True)

    input_df = pd.DataFrame({})

    input_df['winner_name'] = [round_matches['team1'][i]['firstNameA'] + ' ' + round_matches['team1'][i]['lastNameA'] for i in range(len(round_matches['team1']))]
    input_df['loser_name'] = [round_matches['team2'][i]['firstNameA'] + ' ' + round_matches['team2'][i]['lastNameA'] for i in range(len(round_matches['team1']))]
    input_df['approx_match_date'] = [start_date + timedelta(days=round_matches['eventDay'][i]) for i in range(len(round_matches['team1']))]
    input_df['round'] = [round for i in range(len(round_matches['roundNameShort']))]
    input_df['surface'] = ['Hard' for i in range(len(round_matches['roundNameShort']))]

    return input_df


input_df = get_match_stats_for_input_df(draw_df, 'R1')
display(input_df.head())


DataFrame created successfully!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127 entries, 0 to 126
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   match_id        127 non-null    object 
 1   eventName       127 non-null    object 
 2   shortEventName  127 non-null    object 
 3   eventCode       127 non-null    object 
 4   courtName       127 non-null    object 
 5   shortCourtName  127 non-null    object 
 6   courtId         127 non-null    object 
 7   roundCode       127 non-null    object 
 8   roundName       127 non-null    object 
 9   roundNameShort  127 non-null    object 
 10  eventDay        126 non-null    float64
 11  duration        126 non-null    object 
 12  statsLevel      126 non-null    object 
 13  status          127 non-null    object 
 14  statusCode      127 non-null    object 
 15  winner          126 non-null    object 
 16  epoch           126 non-null    float64
 17  tea

None

Unnamed: 0,match_id,eventName,shortEventName,eventCode,courtName,shortCourtName,courtId,roundCode,roundName,roundNameShort,eventDay,duration,statsLevel,status,statusCode,winner,epoch,team1,team2,flags,scores
71,1208,Men's Singles,Men's Singles,MS,,,,2,Round 2,R2,,0:00,S,Walkover,F,1,1756316000000.0,"{'firstNameA': 'Zizou', 'lastNameA': 'Bergs', ...","{'firstNameA': 'Jack', 'lastNameA': 'Draper', ...",{'upset': False},"{'setsWon': [1, 0, 0, 0, 0, 0], 'setDurations'..."


Unnamed: 0,winner_name,loser_name,approx_match_date,round,surface
0,Jannik Sinner,Vit Kopriva,2025-08-26,R1,Hard
1,Alexei Popyrin,Emil Ruusuvuori,2025-08-26,R1,Hard
2,Valentin Royer,Yunchaokete Bu,2025-08-26,R1,Hard
3,Marton Fucsovics,Denis Shapovalov,2025-08-26,R1,Hard
4,Alexander Bublik,Marin Cilic,2025-08-26,R1,Hard


In [None]:
# Extract winners

R1_matches = fetch_matches_for_round(draw_df,round='R1')
R2_matches = fetch_matches_for_round(draw_df,round='R2')
R3_matches = fetch_matches_for_round(draw_df,round='R3')
R4_matches = fetch_matches_for_round(draw_df, round='R4')
QF_matches = fetch_matches_for_round(draw_df, round='QF')
SF_matches = fetch_matches_for_round(draw_df, round='SF')
F_match = fetch_matches_for_round(draw_df, round='F')

def get_winnners_of_round(round_matches):   
    

    winner_list = []

    for id, match in tqdm(round_matches.iterrows()):
        if match['winner'] == None:
            continue
        if int(match['winner']) == 1:
            winner_list.append(round_matches['team1'][id]['firstNameA'] + ' ' + round_matches['team1'][id]['lastNameA'])
        elif int(match['winner']) == 2:
            winner_list.append(round_matches['team2'][id]['firstNameA'] + ' ' + round_matches['team2'][id]['lastNameA'])


    return winner_list


winner_list_R1 = get_winnners_of_round(R1_matches)
winner_list_R2 = get_winnners_of_round(R2_matches)
winner_list_R3 = get_winnners_of_round(R3_matches)
winner_list_R4 = get_winnners_of_round(R4_matches)
winner_list_QF = get_winnners_of_round(QF_matches)
winner_list_SF = get_winnners_of_round(SF_matches)
winner_list_F = get_winnners_of_round(F_match)


Unnamed: 0,match_id,eventName,shortEventName,eventCode,courtName,shortCourtName,courtId,roundCode,roundName,roundNameShort,eventDay,duration,statsLevel,status,statusCode,winner,epoch,team1,team2,flags,scores
124,1601,Men's Singles,Men's Singles,MS,Arthur Ashe Stadium,Arthur Ashe Stadium,AA,S,Semi-Finals,SF,19.0,3:21,H,Completed,D,1,1757127000000.0,"{'firstNameA': 'Jannik', 'lastNameA': 'Sinner'...","{'firstNameA': 'Felix', 'lastNameA': 'Auger-Al...",{'upset': False},"{'setsWon': [1, 1, 2, 1, 1, 0], 'setDurations'..."
125,1602,Men's Singles,Men's Singles,MS,Arthur Ashe Stadium,Arthur Ashe Stadium,AA,S,Semi-Finals,SF,19.0,2:23,H,Completed,D,2,1757108000000.0,"{'firstNameA': 'Novak', 'lastNameA': 'Djokovic...","{'firstNameA': 'Carlos', 'lastNameA': 'Alcaraz...",{'upset': False},"{'setsWon': [2, 2, 2, 2, 0, 0], 'setDurations'..."


64it [00:00, 15252.88it/s]
31it [00:00, 30723.87it/s]
16it [00:00, 26051.58it/s]
8it [00:00, 16946.68it/s]
4it [00:00, 13283.62it/s]
2it [00:00, 6605.20it/s]
1it [00:00, 5152.71it/s]

['Jannik Sinner', 'Alexander Bublik', 'Lorenzo Musetti', 'Jaume Munar', 'Felix Auger-Aliassime', 'Andrey Rublev', 'Leandro Riedi', 'Alex de Minaur', 'Novak Djokovic', 'Jan-Lennard Struff', 'Tomas Machac', 'Taylor Fritz', 'Adrian Mannarino', 'Jiri Lehecka', 'Arthur Rinderknech', 'Carlos Alcaraz']
['Jannik Sinner', 'Lorenzo Musetti', 'Felix Auger-Aliassime', 'Alex de Minaur', 'Novak Djokovic', 'Taylor Fritz', 'Jiri Lehecka', 'Carlos Alcaraz']
['Jannik Sinner', 'Felix Auger-Aliassime', 'Novak Djokovic', 'Carlos Alcaraz']
['Jannik Sinner', 'Carlos Alcaraz']
[]





In [None]:
### We define a data cleaning function

In [None]:
def clean_data(df):
    # Clean and Convert Statistical Columns to Numeric

    print("Converting all statistical columns to a numeric data type...")

    # Create a list of all the columns that should contain numbers
    numeric_cols = [
                'winner_rank', 'loser_rank', 'winner_age', 'loser_age',
                'w_ace', 'l_ace', 'w_df', 'l_df', 'w_svpt', 'l_svpt',
                'w_1stIn', 'l_1stIn', 'w_1stWon', 'l_1stWon', 'w_2ndWon', 'l_2ndWon',
                'w_bpSaved', 'l_bpSaved', 'w_bpFaced', 'l_bpFaced',
                'winner_ht', 'loser_ht', 'draw_size'
            ]

    # Loop through each column in our list
    for col in numeric_cols:
        # Convert the column to a numeric type.
        # The key is errors='coerce', which will replace any value that
        # cannot be converted to a number with NaN (Not a Number).
        df[col] = pd.to_numeric(df[col], errors='coerce')

    print("✅ Statistical columns successfully converted!")

    # As a final cleaning step, we can drop any rows that are missing crucial data
    # that would make feature calculation impossible later on.
    print("\nDropping rows with missing essential data (like rank or stats)...")
    df.dropna(subset=numeric_cols, inplace=True)
    df.dropna(subset=['surface'], inplace=True)

    print(f"The cleaned DataFrame now has {df.shape[0]} rows.")

    return df

In [None]:
# We create a master_df with all the available information before the match starts

In [None]:
### Prepare historical dataframe

# Here we import the functions from our data_prep notebook
from data_prep_refactored import load_and_clean_data, compute_approx_match_date_men, calculate_ELO_for_df

def get_full_master_df(start_year, end_year, enable_update = True):

    master_df = load_and_clean_data(start_year,end_year)
    # We make sure to chronologically order the matches:
    master_df.sort_values(by=['tourney_date','tourney_id','match_num'], inplace=True)

    if enable_update == True:
        data_directory = '../tennis_data/ATP_data/'
        filename = 'us_open_update.csv' 

        us_open_update_df = pd.read_csv(data_directory + filename)
        us_open_update_df['tourney_date'] = pd.to_datetime(us_open_update_df['tourney_date'], format='%Y%m%d')

        master_df = pd.concat([master_df,us_open_update_df], axis = 0)

    ELOs = calculate_ELO_for_df(master_df)

    # Add general ELO column to data frame
    master_df['winner_ELO'] = ELOs[0]
    master_df['loser_ELO'] = ELOs[1]
    # Add ELO's for specific surfaces
    master_df['winner_ELO_clay'] = ELOs[2]
    master_df['loser_ELO_clay'] = ELOs[3]

    master_df['winner_ELO_grass'] = ELOs[4]
    master_df['loser_ELO_grass'] = ELOs[5]

    master_df['winner_ELO_hard'] = ELOs[6]
    master_df['loser_ELO_hard'] = ELOs[7]

    master_df = compute_approx_match_date_men(master_df)

    return master_df

In [None]:
static_stats_winner = ['winner_rank', 'winner_age', 'winner_ht', 'winner_ELO', 'winner_ELO_clay', 'winner_ELO_grass', 'winner_ELO_hard']
static_stats_loser = ['loser_rank', 'loser_age', 'loser_ht', 'loser_ELO', 'loser_ELO_clay', 'loser_ELO_grass', 'loser_ELO_hard']

# Assume 'master_df' is full, chronologically-sorted DataFrame
# and it has been loaded and prepared.

def get_most_recent_match(player_name, df):
    """
    Finds the most recent match for a specific player from a chronological DataFrame.

    Args:
        player_name (str): The name of the player.
        df (pd.DataFrame): The DataFrame to search within (must be sorted by date).

    Returns:
        pd.Series: A Series representing the row of the most recent match, or None if not found.
    """
    # Filter for all matches involving the player
    player_matches = df[
        (df['winner_name'] == player_name) |
        (df['loser_name'] == player_name)
    ]

    # Check if the player was found
    if player_matches.empty:
        print(f"No matches found for {player_name}.")
        return None

    # The last row of the filtered DataFrame is the most recent match
    most_recent_match = player_matches.iloc[-1]
    
    return most_recent_match

    
def get_player_stats_for_input_df(input_df, master_df):
    static_features_list = []

    # Get historical stats for players
    for idx, match in tqdm(input_df.iterrows()):
        
        static_feature_row = {}

        p1_name = match['winner_name']
        p2_name = match['loser_name']

        if p1_name == 'Botic van De Zandschulp':
            p1_name = 'Botic van de Zandschulp'
        elif p2_name == 'Botic van De Zandschulp':
            p2_name = 'Botic van de Zandschulp'

        p1_last_match = get_most_recent_match(p1_name, master_df)
        p2_last_match = get_most_recent_match(p2_name, master_df)

        if p1_last_match is not None:
            if p1_name == p1_last_match['winner_name']:
                for stat in static_stats_winner:
                    static_feature_row[stat] = p1_last_match[stat]
            else: 
                for id, stat in enumerate(static_stats_loser):
                    static_feature_row[static_stats_winner[id]] = p1_last_match[stat]

        else:
        # Handle the case where the player is new and has no history
            print(f"Player {p1_name} has no match history. Using default values.")
        # You might want to fill in default stats for a new player here

        if p2_last_match is not None:
            if p2_name == p2_last_match['loser_name']:
                for stat in static_stats_loser:
                    static_feature_row[stat] = p2_last_match[stat]
            else: 
                for id, stat in enumerate(static_stats_winner):
                    static_feature_row[static_stats_loser[id]] = p2_last_match[stat]

        else:
        # Handle the case where the player is new and has no history
            print(f"Player {p1_name} has no match history. Using default values.")
        # You might want to fill in default stats for a new player here


        static_features_list.append(static_feature_row)


    static_features_df = pd.DataFrame(static_features_list)
    input_df = pd.concat([input_df,static_features_df], axis = 1)

    return input_df

master_df = get_full_master_df(2000,2025, enable_update=False)
input_df = get_player_stats_for_input_df(input_df,master_df)

display(input_df)



Starting the data loading process...
Successfully loaded 2000.csv
Successfully loaded 2001.csv
Successfully loaded 2002.csv
Successfully loaded 2003.csv
Successfully loaded 2004.csv
Successfully loaded 2005.csv
Successfully loaded 2006.csv
Successfully loaded 2007.csv
Successfully loaded 2008.csv
Successfully loaded 2009.csv
Successfully loaded 2010.csv
Successfully loaded 2011.csv
Successfully loaded 2012.csv
Successfully loaded 2013.csv
Successfully loaded 2014.csv
Successfully loaded 2015.csv
Successfully loaded 2016.csv
Successfully loaded 2017.csv
Successfully loaded 2018.csv
Successfully loaded 2019.csv
Successfully loaded 2020.csv
Successfully loaded 2021.csv
Successfully loaded 2022.csv
Successfully loaded 2023.csv
Successfully loaded 2024.csv
Successfully loaded 2025.csv

✅ All files have been loaded and combined successfully!
The DataFrame has 77198 rows (matches) and 49 columns.
Converting all statistical columns to a numeric data type...
✅ Statistical columns successfully c

67730it [00:01, 49907.30it/s]
64it [00:00, 90.15it/s]


Unnamed: 0,winner_name,loser_name,approx_match_date,round,surface,winner_rank,winner_age,winner_ht,winner_ELO,winner_ELO_clay,winner_ELO_grass,winner_ELO_hard,loser_rank,loser_age,loser_ht,loser_ELO,loser_ELO_clay,loser_ELO_grass,loser_ELO_hard
0,Jannik Sinner,Vit Kopriva,2025-08-26,R1,Hard,1.0,23.975,191.0,2139.524617,1820.081151,1696.151289,2111.278476,87.0,28.180,178.0,1494.611881,1504.305411,1489.978317,1479.969705
1,Alexei Popyrin,Emil Ruusuvuori,2025-08-26,R1,Hard,26.0,26.007,196.0,1666.926765,1567.568907,1451.562601,1636.248472,71.0,25.300,188.0,1651.420307,1475.921231,1507.483880,1653.218875
2,Valentin Royer,Yunchaokete Bu,2025-08-26,R1,Hard,111.0,24.192,188.0,1498.491439,1488.414692,1503.845640,1501.579267,76.0,23.580,185.0,1536.151403,1454.506118,1479.763801,1567.136481
3,Marton Fucsovics,Denis Shapovalov,2025-08-26,R1,Hard,94.0,33.520,188.0,1658.962835,1596.270019,1555.290896,1568.151935,29.0,26.313,185.0,1668.783528,1557.797305,1512.819998,1707.125561
4,Alexander Bublik,Marin Cilic,2025-08-26,R1,Hard,30.0,28.093,196.0,1723.225814,1606.160250,1643.560110,1546.455533,83.0,36.753,198.0,1690.720756,1646.498661,1759.601622,1645.145221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,Alexander Shevchenko,Alejandro Davidovich Fokina,2025-08-24,R1,Hard,110.0,24.641,188.0,1545.083072,1521.653373,1473.690988,1521.198830,19.0,26.174,180.0,1735.052313,1608.483601,1510.865670,1682.078036
60,Luciano Darderi,Rinky Hijikata,2025-08-24,R1,Hard,34.0,23.510,183.0,1632.528966,1673.433269,1480.230576,1408.119209,87.0,24.348,178.0,1515.310025,1455.827610,1492.828410,1539.209289
61,Stefan Dostanic,Eliot Spizzirri,2025-08-24,R1,Hard,412.0,23.790,188.0,1510.448136,1500.000000,1500.000000,1510.044208,134.0,23.231,185.0,1503.373327,1500.000000,1500.339190,1501.455869
62,Mattia Bellucci,Juncheng Shang,2025-08-25,R1,Hard,63.0,24.210,175.0,1528.171287,1479.531851,1496.323726,1520.888557,109.0,20.509,180.0,1583.451769,1474.154798,1513.220136,1563.671675


In [None]:
### Now we import the trained XGBoost model (used data is from 2001 to 2023)

In [43]:
import joblib

# Define the filename
model_filename = 'tennis_predictor_model.joblib'

# Load the pre-trained model from the file
print(f"Loading trained model from {model_filename}...")
trained_model = joblib.load(model_filename)
print("✅ Model loaded successfully.")

Loading trained model from tennis_predictor_model.joblib...
✅ Model loaded successfully.


In [None]:
### Now we have to generate the relevant features for the upcoming matches which we can then input into our model

In [None]:
from data_prep_refactored import create_static_features, create_h2h_features, create_general_dynamic_features, create_surface_dynamic_features, create_fatigue_features, create_ELO_features

def create_features_for_matches(matches_df, master_df):
    """
    Takes a DataFrame of upcoming matches and generates the full feature set.
    Each row in matches_df should contain 'winner_name' and 'loser_name' just because it matches the format of the training data and other info like 'round'.
    """
    #  Create static features (based on most recent player stats)
    #  This part requires modifying your 'get_most_recent_match' logic
    #  to handle the new format.
    static_features = create_static_features(matches_df).drop('target', axis=1)

    # Create all other features
    h2h_features = create_h2h_features(matches_df, master_df)
    general_dynamic_features = create_general_dynamic_features(matches_df, master_df)
    surface_features = create_surface_dynamic_features(matches_df, master_df)
    fatigue_features = create_fatigue_features(matches_df, master_df)
    elo_features = create_ELO_features(matches_df)

    # Combine all features into one DataFrame
    all_features = pd.concat([
        static_features, h2h_features, general_dynamic_features,
        surface_features, fatigue_features, elo_features
    ], axis=1)

    return all_features

# Helper function to get the latest stats for a single player
def get_player_latest_stats(player_name, master_df):
    """
    Finds the most recent stats for a player from the master DataFrame.
    Returns a dictionary of their stats with generic keys (e.g., 'rank', 'age').
    """
    last_match = get_most_recent_match(player_name, master_df)
    
    if last_match is None:
        # Handle new players with no history - return default/NaN values
        print(f"Player {player_name} has no history, using default stats.")
        return {
            'rank': 150.0, 'age': 25.0, 'ht': 185.0, 'ELO': 1500,
            'ELO_clay': 1500, 'ELO_grass': 1500, 'ELO_hard': 1500
        }

    stats = {}
    if player_name == last_match['winner_name']:
        # Player was the winner in their last recorded match
        stats['rank'] = last_match['winner_rank']
        stats['age'] = last_match['winner_age']
        stats['ht'] = last_match['winner_ht']
        stats['ELO'] = last_match['winner_ELO']
        stats['ELO_clay'] = last_match['winner_ELO_clay']
        stats['ELO_grass'] = last_match['winner_ELO_grass']
        stats['ELO_hard'] = last_match['winner_ELO_hard']
    else:
        # Player was the loser in their last recorded match
        stats['rank'] = last_match['loser_rank']
        stats['age'] = last_match['loser_age']
        stats['ht'] = last_match['loser_ht']
        stats['ELO'] = last_match['loser_ELO']
        stats['ELO_clay'] = last_match['loser_ELO_clay']
        stats['ELO_grass'] = last_match['loser_ELO_grass']
        stats['ELO_hard'] = last_match['loser_ELO_hard']
        
    return stats

In [None]:
### Now we're all set to predict the winner of the current round.

In [None]:
### Predict winners of current round based on latest results

master_df = get_full_master_df(2000,2025, enable_update=True)

round_name = 'F'

R3_input_df = get_match_stats_for_input_df(draw_df, round_name)

R3_input_df = get_player_stats_for_input_df(R3_input_df, master_df)

print(f'--- Predicting {round_name} ---')

#  Create features for the current round's matches
features = create_features_for_matches(R3_input_df, master_df)

#  Align features and predict probabilities
expected_features = trained_model.get_booster().feature_names
aligned_features = features.reindex(columns=expected_features, fill_value=0)
win_probabilities = trained_model.predict_proba(aligned_features)

predicted_round_winners = []
for idx, match in R3_input_df.iterrows():
    p1_name = min(match['winner_name'], match['loser_name']) # Assuming p1 is alphabetical
    p2_name = max(match['winner_name'], match['loser_name'])
        
    p1_win_prob = win_probabilities[idx][1]
    p2_win_prob = win_probabilities[idx][0]

    if p1_win_prob >= 0.5:
        match_winner_name = p1_name
        match_loser_name = p2_name
        winner_prob = p1_win_prob
    else:
        match_winner_name = p2_name
        match_loser_name = p1_name
        winner_prob = p2_win_prob
    
    predicted_round_winners.append(match_winner_name)
    print(f'{p1_name} vs. {p2_name}    WINNER: {match_winner_name} with prob {winner_prob}.')


# Compare with actual reslt

actual_round_winners = winner_list_SF

correctness_counter = 0
number_matches_already_played = len(actual_round_winners)

for winner in actual_round_winners:
    if winner in predicted_round_winners:
        correctness_counter += 1

acc = 0
if number_matches_already_played != 0:
    acc = correctness_counter/number_matches_already_played
else: 
    acc = 1

print(f'The accuracy for {round_name} is {acc}')




Starting the data loading process...
Successfully loaded 2000.csv
Successfully loaded 2001.csv
Successfully loaded 2002.csv
Successfully loaded 2003.csv
Successfully loaded 2004.csv
Successfully loaded 2005.csv
Successfully loaded 2006.csv
Successfully loaded 2007.csv
Successfully loaded 2008.csv
Successfully loaded 2009.csv
Successfully loaded 2010.csv
Successfully loaded 2011.csv
Successfully loaded 2012.csv
Successfully loaded 2013.csv
Successfully loaded 2014.csv
Successfully loaded 2015.csv
Successfully loaded 2016.csv
Successfully loaded 2017.csv
Successfully loaded 2018.csv
Successfully loaded 2019.csv
Successfully loaded 2020.csv
Successfully loaded 2021.csv
Successfully loaded 2022.csv
Successfully loaded 2023.csv
Successfully loaded 2024.csv
Successfully loaded 2025.csv

✅ All files have been loaded and combined successfully!
The DataFrame has 77198 rows (matches) and 49 columns.
Converting all statistical columns to a numeric data type...
✅ Statistical columns successfully c

67854it [00:01, 52685.52it/s]
1it [00:00, 76.74it/s]


--- Predicting F ---


1it [00:00, 5127.51it/s]
1it [00:00, 69.64it/s]
1it [00:00, 16.56it/s]
1it [00:00,  5.89it/s]
1it [00:00, 66.87it/s]
1it [00:00, 4766.25it/s]


Carlos Alcaraz vs. Jannik Sinner    WINNER: Jannik Sinner with prob 0.5745657086372375.
The accuracy for F is 0.5


In [None]:
### We can also predict the whole tournament from the start:

In [None]:
### Predict whole tournament from the start

# Load model and historical data
trained_model = joblib.load('tennis_predictor_model.joblib')

master_df = get_full_master_df(2000,2025, enable_update=False)

# Create the initial DataFrame for Round 1 matches from the JSON feed
# This is the current 'input_df'
current_round_df = input_df 

# This list will store the winners of each round
predicted_winners = []

tournament_predictions = {} # Dictionary to store results for each round

# --- SIMULATION LOOP ---
rounds = ['R128', 'R64', 'R32', 'R16', 'QF', 'SF', 'F']
for i, round_name in enumerate(rounds):
    print(f'--- Predicting {round_name} ---')

    # Store the current round's matchups before predicting
    tournament_predictions[round_name] = {'matches': current_round_df.copy()}

    # Create features for the current round's matches
    features = create_features_for_matches(current_round_df, master_df)

    # Align features and predict probabilities
    expected_features = trained_model.get_booster().feature_names
    aligned_features = features.reindex(columns=expected_features, fill_value=0)
    win_probabilities = trained_model.predict_proba(aligned_features)

    # Determine winners and store them
    round_winners = []
    for idx, match in current_round_df.iterrows():
        p1_name = min(match['winner_name'], match['loser_name']) # Assuming p1 is alphabetical
        p2_name = max(match['winner_name'], match['loser_name'])
        
        p1_win_prob = win_probabilities[idx][1]

        if p1_win_prob >= 0.5:
            round_winners.append(p1_name)
        else:
            round_winners.append(p2_name)
    
    predicted_winners.append(round_winners)

    # Store the list of winners for the current round
    tournament_predictions[round_name]['winners'] = round_winners
    
    # Check if we have a final tournament winner
    if len(round_winners) == 1:
        print(f' Predicted Tournament Winner: {round_winners[0]}')
        break

    # Prepare the DataFrame for the NEXT round
    next_round_matches = []
    # Pair up winners sequentially
    for j in range(0, len(round_winners), 2):
        p1_name = round_winners[j]
        p2_name = round_winners[j+1]

        # Fetch latest stats for both players
        p1_stats = get_player_latest_stats(p1_name, master_df)
        p2_stats = get_player_latest_stats(p2_name, master_df)

        # Create the detailed dictionary for the new match.
        # The 'winner_' and 'loser_' prefixes are placeholders that the
        # feature functions expect as input.
        next_match = {
            'winner_name': p1_name,
            'loser_name': p2_name,
            'round': rounds[i+1], # e.g., 'R64'
            'surface': 'Hard',    # Constant for US Open
            
            # Player 1 stats with 'winner_' prefix
            'winner_rank': p1_stats['rank'],
            'winner_age': p1_stats['age'],
            'winner_ht': p1_stats['ht'],
            'winner_ELO': p1_stats['ELO'],
            'winner_ELO_clay': p1_stats['ELO_clay'],
            'winner_ELO_grass': p1_stats['ELO_grass'],
            'winner_ELO_hard': p1_stats['ELO_hard'],
            
            # Player 2 stats with 'loser_' prefix
            'loser_rank': p2_stats['rank'],
            'loser_age': p2_stats['age'],
            'loser_ht': p2_stats['ht'],
            'loser_ELO': p2_stats['ELO'],
            'loser_ELO_clay': p2_stats['ELO_clay'],
            'loser_ELO_grass': p2_stats['ELO_grass'],
            'loser_ELO_hard': p2_stats['ELO_hard'],
            
            # Add a placeholder date for fatigue calculations
            'approx_match_date': pd.to_datetime('2025-08-25') + pd.Timedelta(days=i+2)
        }
        next_round_matches.append(next_match)

    # Overwrite the current round DataFrame with the newly generated one
    current_round_df = pd.DataFrame(next_round_matches)

def visualize_tournament_tree(predictions):
    """
    Prints a text-based visualization of the predicted tournament bracket.
    """
    for round_name, round_data in predictions.items():
        print(f"""
---------------------------------
|         {round_name.upper()}         |
---------------------------------""")
        
        matches = round_data['matches']
        winners = round_data['winners']
        
        for i in range(len(matches)):
            # Note: We use the original 'winner_name' and 'loser_name' columns
            # from the input DataFrame, which act as p1 and p2 placeholders.
            p1 = matches.iloc[i]['winner_name']
            p2 = matches.iloc[i]['loser_name']
            winner = winners[i]
            
            # Highlight the winner using an asterisk
            if p1 == winner:
                p1_display = f"*{p1}"
                p2_display = p2
            else:
                p1_display = p1
                p2_display = f"*{p2}"
            
            print(f"Match {i+1}: {p1_display:<25} vs. {p2_display:<25}")
            
    print("\n* = Predicted Winner")


visualize_tournament_tree(tournament_predictions)

Starting the data loading process...
Successfully loaded 2000.csv
Successfully loaded 2001.csv
Successfully loaded 2002.csv
Successfully loaded 2003.csv
Successfully loaded 2004.csv
Successfully loaded 2005.csv
Successfully loaded 2006.csv
Successfully loaded 2007.csv
Successfully loaded 2008.csv
Successfully loaded 2009.csv
Successfully loaded 2010.csv
Successfully loaded 2011.csv
Successfully loaded 2012.csv
Successfully loaded 2013.csv
Successfully loaded 2014.csv
Successfully loaded 2015.csv
Successfully loaded 2016.csv
Successfully loaded 2017.csv
Successfully loaded 2018.csv
Successfully loaded 2019.csv
Successfully loaded 2020.csv
Successfully loaded 2021.csv
Successfully loaded 2022.csv
Successfully loaded 2023.csv
Successfully loaded 2024.csv
Successfully loaded 2025.csv

✅ All files have been loaded and combined successfully!
The DataFrame has 77198 rows (matches) and 49 columns.
Converting all statistical columns to a numeric data type...
✅ Statistical columns successfully c

67730it [00:01, 53641.53it/s]


--- Predicting R128 ---


64it [00:00, 29398.25it/s]
64it [00:00, 77.68it/s]
64it [00:03, 17.68it/s]
64it [00:10,  6.21it/s]
64it [00:00, 77.69it/s]
64it [00:00, 30504.03it/s]


--- Predicting R64 ---


32it [00:00, 26379.27it/s]
32it [00:00, 78.73it/s]
32it [00:01, 17.50it/s]
32it [00:05,  6.01it/s]
32it [00:00, 81.52it/s]
32it [00:00, 28544.82it/s]


--- Predicting R32 ---


16it [00:00, 26940.53it/s]
16it [00:00, 82.12it/s]
16it [00:00, 17.40it/s]
16it [00:02,  5.83it/s]
16it [00:00, 80.67it/s]
16it [00:00, 20225.70it/s]


--- Predicting R16 ---


8it [00:00, 15835.03it/s]
8it [00:00, 77.27it/s]
8it [00:00, 16.49it/s]
8it [00:01,  5.45it/s]
8it [00:00, 67.00it/s]
8it [00:00, 15932.78it/s]


--- Predicting QF ---


4it [00:00, 7560.71it/s]
4it [00:00, 73.64it/s]
4it [00:00, 12.45it/s]
4it [00:00,  5.86it/s]
4it [00:00, 74.72it/s]
4it [00:00, 11147.65it/s]


--- Predicting SF ---


2it [00:00, 8962.19it/s]
2it [00:00, 73.25it/s]
2it [00:00, 17.08it/s]
2it [00:00,  6.04it/s]
2it [00:00, 76.14it/s]
2it [00:00, 7810.62it/s]


--- Predicting F ---


1it [00:00, 4629.47it/s]
1it [00:00, 74.61it/s]
1it [00:00, 17.62it/s]
1it [00:00,  5.88it/s]
1it [00:00, 78.69it/s]
1it [00:00, 4854.52it/s]


🏆 Predicted Tournament Winner: Jannik Sinner

---------------------------------
|         R128         |
---------------------------------
Match 1: *Jannik Sinner            vs. Vit Kopriva              
Match 2: *Alexei Popyrin           vs. Emil Ruusuvuori          
Match 3: Valentin Royer            vs. *Yunchaokete Bu          
Match 4: *Marton Fucsovics         vs. Denis Shapovalov         
Match 5: Alexander Bublik          vs. *Marin Cilic             
Match 6: *Lorenzo Sonego           vs. Tristan Schoolkate       
Match 7: *Nuno Borges              vs. Brandon Holt             
Match 8: Elmer Moller              vs. *Tommy Paul              
Match 9: *Lorenzo Musetti          vs. Giovanni Mpetshi Perricard
Match 10: *Quentin Halys            vs. David Goffin             
Match 11: *Jenson Brooksby          vs. Aleksandar Vukic         
Match 12: Francesco Passaro         vs. *Flavio Cobolli          
Match 13: *Gabriel Diallo           vs. Damir Dzumhur            
Match 14: *