# Testing tournament code, to be moved into tournament.py


In [1]:
import pandas as pd
import requests
from io import StringIO

In [2]:
import requests
import pandas as pd
from io import StringIO
import sys

class GetTennisData():
    """
    Class to create dataframe from a dataset on github.
    """
    def __init__(self):
        """
        Initialize the GetTennisData class
        """
        self.base_url = "https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{}.csv"

    def get_data(self, year_lower = 2000, year_upper = 2025):
        """
        Reads data from github url and creates dataframe for each url.

        Args:
            year_lower (int): The lower bound for the years you want data for. Default set to 2000
            year_upper (int): The upper bound (Exclusive) for the years you want data for. Default set to 2025

        Returns:
            Final dataframe across every github url for given years.

        Raises:
            Exception: Years must be integers
            ValueError: Year must be in specified range of possible data.
        """

        if (type(year_lower) != int or type(year_upper) != int):
            raise Exception("This is not of type int (integer), you must input an int for years")

        try:
            if not (1968 <= year_lower <= 2024) or not (1969 <= year_upper <= 2025):
                raise ValueError("Year must be between 1968 and 2024 for the lower year, and between 1969 and 2025 for the upper year")
        except ValueError as e:
            print(e)
            sys.exit(1)

        df_list = []
        for year in range(year_lower, year_upper):  

            url = self.base_url.format(year)

            response = requests.get(url)

            df = pd.read_csv(StringIO(response.text))

            df['Year'] = year

            df_list.append(df)

        final_df = pd.concat(df_list)

        # Saved only columns we deemed relevant for analysis.
        final_df = final_df[['tourney_name', 'surface', 'draw_size', 'tourney_level', 'best_of', 
                   'winner_name', 'winner_age', 'loser_name', 'loser_age', 'Year']]
        
        final_df = final_df[final_df['surface'] != 'Carpet']
        
        final_df = final_df.dropna()

        file_path = f'../data/tennis_data.csv'

        final_df.to_csv(file_path, index=0)

In [37]:
tennis_data = GetTennisData()
tennis_data.get_data(year_lower = 2000, year_upper = 2024)

In [76]:
# Find rows with missing winner_ht or loser_ht
missing_ht = data[data['winner_ht'].isnull() | data['loser_ht'].isnull()]

missing_ht[missing_ht['tourney_name'] == 'Roland Garros']


Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_age,winner_ht,loser_name,loser_age,loser_ht,Year


In [317]:
data = pd.read_csv('../data/tennis_data.csv')

In [567]:
rg01 = pd.read_csv('../data/tournament_results_Roland_Garros_head_to_head_0.1.csv', index_col=0)

In [568]:
rg01['Champion'].sort_values()

Thiago Seyboth Wild    0.00
Constant Lestienne     0.00
David Goffin           0.00
Pedro Martinez         0.00
Juncheng Shang         0.00
                       ... 
Casper Ruud            0.08
Stefanos Tsitsipas     0.08
Alexander Zverev       0.12
Carlos Alcaraz         0.14
Novak Djokovic         0.36
Name: Champion, Length: 128, dtype: float64

In [597]:
win_per = pd.read_csv('../data/win_percentage.csv', index_col='Player_Name')
games_played = pd.read_csv('../data/games_played_opponents.csv', index_col='Player_Name')

In [651]:
odds = pd.read_csv('../data/2023_Australian_Open_Prob.csv', index_col=0)

model = pd.read_csv('../data/tournament_results_Australian_Open.csv', index_col = 0)

In [652]:
k_list = [0.05, 0.1, 0.5]

In [653]:
import numpy as np
def RMSE(true, pred):
    return np.sqrt(np.mean(np.square(true-pred)))

def Linf(true, pred):
    return np.max(np.abs(true-pred))

def L1(true, pred):
    return np.mean(np.absolute(true-pred))

In [654]:
csv_dict_k = {}
csv_dict_k['original'] = model
if k_list is not None:
    for k in k_list:
        csv_dict_k[k] = pd.read_csv(f'../data/tournament_results_Australian_Open_head_to_head_{k}.csv', index_col=0)

In [655]:
csv_dict_k[0.05]

Unnamed: 0,Round_64,Round_32,Round_16,Round_8,Round_4,Round_2,Runner_up,Champion
Rafael Nadal,0.9228,0.7334,0.6594,0.5278,0.3784,0.2702,0.0982,0.0982
Mackenzie Mcdonald,0.4910,0.0766,0.0356,0.0046,0.0010,0.0000,0.0000,0.0000
Dalibor Svrcina,0.6836,0.1654,0.0276,0.0064,0.0012,0.0004,0.0000,0.0000
Yoshihito Nishioka,0.6476,0.5168,0.0996,0.0186,0.0044,0.0010,0.0002,0.0002
Karen Khachanov,0.9326,0.8610,0.5814,0.2116,0.0936,0.0346,0.0066,0.0066
...,...,...,...,...,...,...,...,...
Joao Sousa,0.1972,0.0682,0.0114,0.0008,0.0000,0.0000,0.0000,0.0000
Alexander Bublik,0.5798,0.1602,0.0382,0.0202,0.0084,0.0002,0.0000,0.0000
Jan Lennard Struff,0.1086,0.0332,0.0078,0.0004,0.0000,0.0000,0.0000,0.0000
Christopher Oconnell,0.2690,0.0386,0.0048,0.0014,0.0002,0.0000,0.0000,0.0000


In [656]:
odds_comparison = odds[['normalized_winning_probability']]
odds_comparison = odds_comparison.dropna()

In [657]:
csv_dict_k.items()

dict_items([('original',                       Round_64  Round_32  Round_16  Round_8  Round_4  Round_2  \
Rafael Nadal            0.7886    0.6150    0.5264   0.3664   0.2432   0.1558   
Mackenzie Mcdonald      0.3216    0.0606    0.0264   0.0060   0.0024   0.0006   
Dalibor Svrcina         0.5082    0.1740    0.0290   0.0062   0.0016   0.0000   
Yoshihito Nishioka      0.4470    0.2822    0.0686   0.0232   0.0076   0.0024   
Karen Khachanov         0.8290    0.6750    0.3742   0.1890   0.0966   0.0474   
...                        ...       ...       ...      ...      ...      ...   
Joao Sousa              0.2300    0.0820    0.0152   0.0026   0.0006   0.0000   
Alexander Bublik        0.5834    0.2710    0.1112   0.0548   0.0192   0.0040   
Jan Lennard Struff      0.1858    0.0528    0.0100   0.0020   0.0004   0.0000   
Christopher Oconnell    0.2722    0.0402    0.0114   0.0040   0.0002   0.0000   
Tomas Machac            0.1622    0.0700    0.0204   0.0064   0.0010   0.0000   

  

In [660]:
odds_comparison

Unnamed: 0_level_0,normalized_winning_probability,Champion_original,Champion_0.05,Champion_0.1,Champion_0.5
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Novak Djokovic,0.348616,0.3798,0.5792,0.5624,0.4950
Stefanos Tsitsipas,0.042609,0.0428,0.0272,0.0314,0.0294
Karen Khachanov,0.006328,0.0172,0.0066,0.0056,0.0088
Tommy Paul,0.002123,0.0038,0.0020,0.0026,0.0060
Sebastian Korda,0.018798,0.0050,0.0012,0.0020,0.0042
...,...,...,...,...,...
Tomas Machac,0.001276,0.0000,0.0000,0.0000,0.0000
Vasek Pospisil,0.001276,0.0000,0.0000,0.0000,0.0000
Yannick Hanfmann,0.001276,0.0000,0.0000,0.0000,0.0000
Yu Hsiou Hsu,0.001276,0.0000,0.0000,0.0000,0.0000


In [659]:
actual = odds_comparison['normalized_winning_probability']
for model_name, model_df in csv_dict_k.items():
    # Extract the 'Champion' column corresponding to the current model.
    champion_column = model_df[f'Champion']
    
    # Ensure the indices match and join the 'Champion' column to the 'odds_comparison' DataFrame
    odds_comparison[f'Champion_{model_name}'] = champion_column

In [661]:
# Loop over models in csv_dict_k and compute the metrics
for model_name, model_df in csv_dict_k.items():
    # Get the champion column for the model
    champion_column = odds_comparison[f'Champion_{model_name}']
        
    # Compute metrics
    rmse_value = RMSE(actual, champion_column)
    linf_value = Linf(actual, champion_column)
    l1_value = L1(actual, champion_column)

    # Print results for the current model
    print(f"Model: {model_name}")
    print(f"  RMSE: {rmse_value}")
    print(f"  Linf: {linf_value}")
    print(f"  L1: {l1_value}")
    print('-' * 40)  # Separator for better readability


Model: original
  RMSE: 0.00943225887634533
  Linf: 0.08026803556018709
  L1: 0.00358643232111009
----------------------------------------
Model: 0.05
  RMSE: 0.02316381059986612
  Linf: 0.23058353467429493
  L1: 0.005617728717089069
----------------------------------------
Model: 0.1
  RMSE: 0.021480354305378653
  Linf: 0.2137835346742949
  L1: 0.0053055298670316746
----------------------------------------
Model: 0.5
  RMSE: 0.015172168753552128
  Linf: 0.14638353467429488
  L1: 0.0042610974837843565
----------------------------------------


In [607]:
odds_comparison

Unnamed: 0,normalized_winning_probability,Champion,Champion_original,Champion_0.05,Champion_0.1,Champion_0.5
Carlos Alcaraz,0.146204,0.1122,0.1122,0.1260,0.1258,0.1120
Novak Djokovic,0.418675,0.3912,0.3912,0.5560,0.5372,0.4646
Daniil Medvedev,0.031329,0.0464,0.0464,0.0634,0.0574,0.0450
Jannik Sinner,0.034627,0.0370,0.0370,0.0494,0.0444,0.0304
Holger Rune,0.016047,0.0322,0.0322,0.0264,0.0250,0.0290
...,...,...,...,...,...,...
Tomas Martin Etcheverry,0.001313,0.0000,0.0000,0.0000,0.0000,0.0000
Yannick Hanfmann,0.001313,0.0000,0.0000,0.0000,0.0000,0.0000
Yibing Wu,0.001313,0.0000,0.0000,0.0000,0.0002,0.0004
Yosuke Watanuki,0.001313,0.0002,0.0002,0.0000,0.0000,0.0000


In [571]:
odds_comparison = odds[['normalized_winning_probability']].join(model['Champion'], how='inner')
odds_comparison = odds_comparison.dropna()

actual = odds_comparison['normalized_winning_probability']

preds = odds_comparison['Champion']

In [572]:
odds_comparison

Unnamed: 0,normalized_winning_probability,Champion
Carlos Alcaraz,0.146204,0.1122
Novak Djokovic,0.418675,0.3912
Daniil Medvedev,0.031329,0.0464
Jannik Sinner,0.034627,0.0370
Holger Rune,0.016047,0.0322
...,...,...
Tomas Martin Etcheverry,0.001313,0.0000
Yannick Hanfmann,0.001313,0.0000
Yibing Wu,0.001313,0.0000
Yosuke Watanuki,0.001313,0.0002


In [521]:
games_played_per_player = games_played.sum(axis=1)

# Step 3: Calculate the average number of games played across all players
average_games_played = games_played_per_player.mean()
average_games_played

47.33678756476684

In [416]:
games_played['Roger Federer'].sum()

396.0

In [386]:
win_per['Roger Federer']['Novak Djokovic']

0.35

In [678]:
import math
math.exp(-0.015 * (30-25))**5

0.6872892787909721

In [688]:
import math

# Sigmoid function to calculate the adjustment factor based on games played
def sigmoid(x, x_0=10, k=0.3):
    """
    Sigmoid function to adjust weight based on the number of games played.
    x_0 is the point where the sigmoid function reaches 0.5 (midpoint).
    k controls the steepness of the sigmoid curve.
    """
    return 1 / (1 + math.exp(-k * (x - x_0)))

# Function to calculate adjusted win probability
def adjusted_win_probability(P_A, P_head_to_head, games_played, x_0=10, k=0.3):
    """
    Calculate the adjusted win probability for Player A based on the sigmoid-weighted head-to-head record.
    
    Parameters:
    - P_A: Base win probability of Player A (e.g., from ELO ratings)
    - P_head_to_head: Historical head-to-head win percentage of Player A
    - games_played: Number of games played between Player A and Player B
    - x_0: Point where sigmoid reaches 0.5 (default = 20 games)
    - k: Steepness factor for the sigmoid function (default = 0.2)
    
    Returns:
    - Adjusted win probability for Player A
    """
    # Calculate the sigmoid adjustment factor based on games played
    adjustment_factor = 0.5 / (1 + math.exp(-k * (games_played - 10)))
    print(adjustment_factor) 

        # Calculate the adjusted win probability
    P_A_adjusted = P_A + adjustment_factor * (P_head_to_head - 0.5)
        
        # Ensure the adjusted probability stays within the valid range [0, 1]
    P_A_adjusted = max(0, min(1, P_A_adjusted))

    return P_A_adjusted

# Example usage
P_A = 0.76  # Base win probability for Player A (e.g., from ELO)
P_head_to_head = 0.8  # Hist
games_played = 20  # Number of games played between Player A and Player B

# Calculate adjusted win probability for Player A
adjusted_prob = adjusted_win_probability(P_A, P_head_to_head, games_played, k = 0.5)

print(f"Adjusted win probability for Player A: {adjusted_prob:.4f}")


0.49665357453785763
Adjusted win probability for Player A: 0.9090


In [None]:
    def adjusted_win_probability_by_games(self, P_A, games_A, games_B, rate=0.1):
        """
        Calculates the adjusted win probability based on specificially game played experience

        Args:
            P_A (float): Probability player A beats player B
            games_A (int): Number of games played by player A
            games_B (int): Number of games played by player B
            rate (float): Rate parameter to adjust winning probability.

        Returns:
            Adjusted winning probability for player A
        """
        total_games = games_A + games_B

        if total_games == 0:
            return P_A  # If no games have been played, just return the base probability
        
        proportion = games_A / total_games
        
        # Calculate the adjustment based on the proportion of games played
        adjustment = rate * (proportion - 0.5)
        
        # Adjust the win probability
        adjusted_P_A = P_A + adjustment
        
        # Ensure the adjusted probability stays within the valid range [0, 1]
        adjusted_P_A = max(0, min(1, adjusted_P_A))
        
        return adjusted_P_A

In [551]:
import math

def adjusted_win_probability(P_A, P_head_to_head, games_played, k=0.1):
    """
    Calculate the adjusted win probability for Player A based on head-to-head win percentage
    and the number of games played between the two players.
    
    Parameters:
    - P_A: Base win probability of Player A (e.g., from ELO ratings)
    - P_head_to_head: Historical win percentage of Player A against Player B
    - games_played: Number of games played between Player A and Player B
    - k: The scaling factor for the number of games played (default = 0.1)
    
    Returns:
    - Adjusted win probability for Player A
    """

    adjustment_factor = 0.5 / (1 + math.exp(-k * (games_played - 10))) 

    # Calculate the adjusted win probability
    P_A_adjusted = P_A + adjustment_factor * (P_head_to_head - 0.5)
    
    # Ensure the adjusted probability stays within the valid range [0, 1]
    P_A_adjusted = max(0, min(1, P_A_adjusted))
    
    return P_A_adjusted

# Example usage
P_A = 0.76  # Base win probability for Player A (e.g., from ELO)
P_head_to_head = 0.9  # Historical win percentage of Player A against Player B
games_played = 30  # Number of games played between Player A and Player B

# Calculate adjusted win probability for Player A
adjusted_prob = adjusted_win_probability(P_A, P_head_to_head, games_played)

print(f"Adjusted win probability for Player A: {adjusted_prob:.4f}")


Adjusted win probability for Player A: 0.9362


In [542]:
1 / (1 + math.exp(-0.6 * (20 - 20))) 

0.5

In [440]:
games_played_A = 50
games_played_B = 20

In [553]:
def adjusted_win_probability_by_games(P_A, games_A, games_B, rate=0.1):
    """
    Calculate the adjusted win probability for Player A based on the number of games played by both players.
    
    Parameters:
    - P_A: Base win probability of Player A (e.g., from ELO ratings)
    - games_A: Number of games played by Player A
    - games_B: Number of games played by Player B
    - r: The small adjustment rate factor (default = 0.01)
    
    Returns:
    - Adjusted win probability for Player A
    """
    # Calculate the relative experience factor
    total_games = games_A + games_B
    if total_games == 0:
        return P_A  # If no games have been played, just return the base probability
    
    proportion = games_A / total_games
    print(proportion)
    
    # Calculate the adjustment based on the relative factor
    adjustment = rate * (proportion - 0.5)
    print(adjustment)
    
    # Adjust the win probability
    adjusted_P_A = P_A + adjustment
    
    # Ensure the adjusted probability stays within the valid range [0, 1]
    adjusted_P_A = max(0, min(1, adjusted_P_A))
    
    return adjusted_P_A

# Example usage
P_A = 0.76  # Base win probability for Player A (e.g., from ELO)
games_A = 50  # Number of games Player A has played
games_B = 10  # Number of games Player B has played

# Calculate adjusted win probability for Player A
adjusted_prob = adjusted_win_probability_by_games(P_A, games_A, games_B)

print(f"Adjusted win probability for Player A: {adjusted_prob:.4f}")


0.8333333333333334
0.03333333333333334
Adjusted win probability for Player A: 0.7933


$P(A_adjusted} = P(A) \cdot$

In [438]:
P_A

0.76

In [437]:
P_A * (1+0.002)**games_played

1.1333339497295256

In [434]:
games_played

20

In [432]:
sigmoid(100, games_played.sum(axis=1).mean(), 0.001)

0.7216504560280248

In [361]:
win_per

Unnamed: 0_level_0,James Duckworth,Roger Federer,Blaz Kavcic,Dudi Sela,Fabio Fognini,Alexandar Lazov,Dimitar Kutrovsky,Aleksandr Nedovyesov,Pierre Hugues Herbert,Marc Gicquel,...,Conor Gannon,Alvaro Guillen Meza,Lucky Candra Kurniawan,Ignacio Buse,Jose Flores,Freddy Murray,Walid Ahouda,Digvijaypratap Singh,Daniel Azar,Mustapha El Natour
Player_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Jarkko Nieminen,0.5,0.0,0.0,1.00,0.0,1.0,1.0,1.0,1.000000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
James Duckworth,0.0,0.0,1.0,1.00,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Marinko Matosevic,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Julien Benneteau,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,1.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sam Querrey,0.5,0.0,0.0,0.75,0.0,0.0,0.0,0.0,0.666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Daniel Azar,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fajing Sun,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alvaro Guillen Meza,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chris Rodesch,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [406]:
games_played

20

In [345]:
data[(data['winner_name'] == 'Roger Federer') & (data['loser_name'] == 'Novak Djokovic')]

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_age,loser_name,loser_age,Year
703,Dubai,Hard,32,A,3,Roger Federer,32.5,Novak Djokovic,26.7,2014
1129,Monte Carlo Masters,Clay,56,M,3,Roger Federer,32.6,Novak Djokovic,26.8,2014
2683,Shanghai Masters,Hard,56,M,3,Roger Federer,33.1,Novak Djokovic,27.3,2014
3503,Dubai,Hard,32,A,3,Roger Federer,33.5,Novak Djokovic,27.7,2015
5156,Cincinnati Masters,Hard,56,M,3,Roger Federer,34.0,Novak Djokovic,28.2,2015
5824,Tour Finals,Hard,8,F,3,Roger Federer,34.2,Novak Djokovic,28.4,2015
17171,Tour Finals,Hard,8,F,3,Roger Federer,38.2,Novak Djokovic,32.4,2019


In [348]:
data[(data['winner_name'] == 'Novak Djokovic') & (data['loser_name'] == 'Roger Federer')]

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_age,loser_name,loser_age,Year
827,Indian Wells Masters,Hard,96,M,3,Novak Djokovic,26.7,Roger Federer,32.5,2014
1820,Wimbledon,Grass,128,G,5,Novak Djokovic,27.0,Roger Federer,32.8,2014
2890,Tour Finals,Hard,8,F,3,Novak Djokovic,27.4,Roger Federer,33.2,2014
3729,Indian Wells Masters,Hard,96,M,3,Novak Djokovic,27.7,Roger Federer,33.5,2015
4198,Rome Masters,Clay,56,M,3,Novak Djokovic,27.9,Roger Federer,33.7,2015
4669,Wimbledon,Grass,128,G,5,Novak Djokovic,28.1,Roger Federer,33.8,2015
5330,US Open,Hard,128,G,5,Novak Djokovic,28.2,Roger Federer,34.0,2015
5833,Tour Finals,Hard,8,F,3,Novak Djokovic,28.4,Roger Federer,34.2,2015
6100,Australian Open,Hard,128,G,5,Novak Djokovic,28.6,Roger Federer,34.4,2016
13886,Cincinnati Masters,Hard,64,M,3,Novak Djokovic,31.2,Roger Federer,37.0,2018


In [344]:
win_per.loc['Roger Federer']['Novak Djokovic']

0.35

In [335]:
win_per.loc['Andy Murray'].sum()

438.0

In [355]:
df = data

age_bins = range(19, 40, 1)  # Create bins from 22 to 35
age_labels = [f'{i}-{i+1}' for i in age_bins[:-1]]

df['winner_age_group'] = pd.cut(df['winner_age'], bins=age_bins, labels=age_labels, right=False)
df['loser_age_group'] = pd.cut(df['loser_age'], bins=age_bins, labels=age_labels, right=False)

# Step 3: Calculate the winning percentage by age group
# Count the number of wins per age group
win_counts = df['winner_age_group'].value_counts().sort_index()

# Count the total number of matches per age group (winners + losers)
total_counts = pd.concat([
    df['winner_age_group'].value_counts(),
    df['loser_age_group'].value_counts()
], axis=1, keys=['wins', 'losses']).fillna(0)

# Step 4: Calculate winning percentages
total_counts['total'] = total_counts['wins'] + total_counts['losses']
total_counts['win_percentage'] = (total_counts['wins'] / total_counts['total']) * 100

# Step 5: Output the result
print(total_counts[['win_percentage']].sort_index())

       win_percentage
19-20       48.528302
20-21       50.904393
21-22       50.699588
22-23       51.664402
23-24       50.459770
24-25       51.008493
25-26       51.703455
26-27       49.275980
27-28       49.293528
28-29       49.651163
29-30       50.698324
30-31       48.869518
31-32       49.188157
32-33       50.981845
33-34       50.373832
34-35       50.644028
35-36       50.039841
36-37       48.421053
37-38       51.168224
38-39       47.023810


In [349]:
data

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_age,loser_name,loser_age,Year
0,Brisbane,Hard,28,A,3,Jarkko Nieminen,32.4,James Duckworth,21.9,2014
1,Brisbane,Hard,28,A,3,Marinko Matosevic,28.3,Julien Benneteau,32.0,2014
2,Brisbane,Hard,28,A,3,Sam Querrey,26.2,Dmitry Tursunov,31.0,2014
3,Brisbane,Hard,28,A,3,Sam Groth,26.1,Ryan Harrison,21.6,2014
4,Brisbane,Hard,28,A,3,Nicolas Mahut,31.9,Igor Sijsling,26.3,2014
...,...,...,...,...,...,...,...,...,...,...
27403,Davis Cup WG2 R1: SLO vs LUX,Clay,4,D,3,Blaz Rola,32.9,Alex Knaff,25.7,2023
27404,Davis Cup WG2 R1: SLO vs LUX,Clay,4,D,3,Chris Rodesch,22.1,Bor Artnak,19.2,2023
27405,Davis Cup WG2 R1: URU vs EGY,Clay,4,D,3,Mohamed Safwat,32.9,Ignacio Carou,24.1,2023
27406,Davis Cup WG2 R1: URU vs EGY,Clay,4,D,3,Karim Mohamed Maamoun,32.4,Franco Roncadelli,23.5,2023


In [281]:
games_played_df

Unnamed: 0,Peter Luczak,Carsten Ball,Richard Gasquet,Tomas Berdych,Radek Stepanek,Thiemo De Bakker,Thomaz Bellucci,Feliciano Lopez,Fernando Gonzalez,Marin Cilic,...,Conor Gannon,Alvaro Guillen Meza,Lucky Candra Kurniawan,Ignacio Buse,Jose Flores,Freddy Murray,Walid Ahouda,Digvijaypratap Singh,Daniel Azar,Mustapha El Natour
Andy Roddick,1.0,1.0,2.0,6.0,2.0,3.0,1.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Peter Luczak,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Carsten Ball,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mischa Zverev,0.0,1.0,4.0,3.0,0.0,0.0,1.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Richard Gasquet,0.0,0.0,0.0,15.0,2.0,2.0,4.0,4.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Daniel Azar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Fajing Sun,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alvaro Guillen Meza,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chris Rodesch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [278]:
matches_data

Unnamed: 0,Peter Luczak,Carsten Ball,Richard Gasquet,Tomas Berdych,Radek Stepanek,Thiemo De Bakker,Thomaz Bellucci,Feliciano Lopez,Fernando Gonzalez,Marin Cilic,...,Conor Gannon,Alvaro Guillen Meza,Lucky Candra Kurniawan,Ignacio Buse,Jose Flores,Freddy Murray,Walid Ahouda,Digvijaypratap Singh,Daniel Azar,Mustapha El Natour
Andy Roddick,1.0,1.0,2.0,6.0,2.0,3.0,1.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Peter Luczak,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Carsten Ball,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mischa Zverev,0.0,1.0,4.0,3.0,0.0,0.0,1.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Richard Gasquet,0.0,0.0,0.0,15.0,2.0,2.0,4.0,4.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Daniel Azar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Fajing Sun,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alvaro Guillen Meza,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chris Rodesch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [272]:
data

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_age,loser_name,loser_age,Year
0,Brisbane,Hard,28,A,3,Jarkko Nieminen,32.4,James Duckworth,21.9,2014
1,Brisbane,Hard,28,A,3,Marinko Matosevic,28.3,Julien Benneteau,32.0,2014
2,Brisbane,Hard,28,A,3,Sam Querrey,26.2,Dmitry Tursunov,31.0,2014
3,Brisbane,Hard,28,A,3,Sam Groth,26.1,Ryan Harrison,21.6,2014
4,Brisbane,Hard,28,A,3,Nicolas Mahut,31.9,Igor Sijsling,26.3,2014
...,...,...,...,...,...,...,...,...,...,...
27403,Davis Cup WG2 R1: SLO vs LUX,Clay,4,D,3,Blaz Rola,32.9,Alex Knaff,25.7,2023
27404,Davis Cup WG2 R1: SLO vs LUX,Clay,4,D,3,Chris Rodesch,22.1,Bor Artnak,19.2,2023
27405,Davis Cup WG2 R1: URU vs EGY,Clay,4,D,3,Mohamed Safwat,32.9,Ignacio Carou,24.1,2023
27406,Davis Cup WG2 R1: URU vs EGY,Clay,4,D,3,Karim Mohamed Maamoun,32.4,Franco Roncadelli,23.5,2023


$f(t) = e^{-\lambda \times t}$

In [330]:
import math

In [333]:
[0.95 * math.exp(-0.0075 * (37-25)) ** i for i in range(5)]

[0.95,
 0.8682346260076668,
 0.7935067008407084,
 0.7252105196200105,
 0.6627925097674795]

In [327]:
import math

def decay_factor(year_difference, decay_rate=0.3):
    """
    Calculate the decay factor based on the year difference from the present.
    
    :param year_difference: The difference in years from the present.
    :param decay_rate: The rate at which the weight decays (higher value means faster decay).
    :return: The decay factor for the given year difference.
    """
    #weight = 1 - decay_rate * abs(year_difference)
    return math.exp(-decay_rate * abs(year_difference))

# Example usage
current_year = 2023
match_year = 2014

# Calculate the year difference from the current year
year_difference = match_year - current_year

# Get the decay factor for this match
weight = decay_factor(year_difference)

print(f"Decay weight for match in {match_year}: {weight:.4f}")

Decay weight for match in 2014: 0.0672


In [224]:
data

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_age,loser_name,loser_age,Year
0,Brisbane,Hard,32,A,3,Andy Roddick,27.3,Peter Luczak,30.3,2010
1,Brisbane,Hard,32,A,3,Carsten Ball,22.5,Mischa Zverev,22.3,2010
2,Brisbane,Hard,32,A,3,Richard Gasquet,23.5,Jarkko Nieminen,28.4,2010
3,Brisbane,Hard,32,A,3,Matthew Ebden,22.1,Jurgen Melzer,28.6,2010
4,Brisbane,Hard,32,A,3,Tomas Berdych,24.2,Nick Lindahl,21.4,2010
...,...,...,...,...,...,...,...,...,...,...
39361,Davis Cup WG2 R1: SLO vs LUX,Clay,4,D,3,Blaz Rola,32.9,Alex Knaff,25.7,2023
39362,Davis Cup WG2 R1: SLO vs LUX,Clay,4,D,3,Chris Rodesch,22.1,Bor Artnak,19.2,2023
39363,Davis Cup WG2 R1: URU vs EGY,Clay,4,D,3,Mohamed Safwat,32.9,Ignacio Carou,24.1,2023
39364,Davis Cup WG2 R1: URU vs EGY,Clay,4,D,3,Karim Mohamed Maamoun,32.4,Franco Roncadelli,23.5,2023


In [366]:
import pandas as pd
import itertools


games_played = {}
wins = {}

    # Iterate through the matches to populate the games_played and wins dictionaries
for _, row in data.iterrows():
    winner = row['winner_name']
    loser = row['loser_name']
    
        # Initialize game count for winner and loser if not already in the dictionary
    if winner not in games_played:
        games_played[winner] = {}
        wins[winner] = {}
            
    if loser not in games_played:
        games_played[loser] = {}
        wins[loser] = {}
            
        # Update games played and wins for both winner and loser against each other
    if loser not in games_played[winner]:
        games_played[winner][loser] = 0
        wins[winner][loser] = 0
    if winner not in games_played[loser]:
        games_played[loser][winner] = 0
        wins[loser][winner] = 0
    
    games_played[winner][loser] += 1
    games_played[loser][winner] += 1
    wins[winner][loser] += 1

    # Calculate win percentages
win_percentages = {}
for player in games_played:
    win_percentages[player] = {}
    for opponent in games_played[player]:
        total_games = games_played[player][opponent]
        total_wins = wins[player][opponent]
        win_percentage = total_wins / total_games if total_games > 0 else 0
        win_percentages[player][opponent] = win_percentage

win_percentage_df = pd.DataFrame(win_percentages).T.fillna(0)

    # Create a DataFrame for games played
games_played_df = pd.DataFrame(games_played).T.fillna(0)



In [310]:
games_played_df.index.name = 'Player_Name'

In [376]:
win_percentages['Roger Federer']['Novak Djokovic']

0.35

In [383]:
pd.DataFrame(win_percentages).fillna(0)['Roger Federer']['Novak Djokovic']

0.35

In [372]:
games_played_df.loc['Andy Murray'].sum()

438.0

In [373]:
data[(data['winner_name'] == 'Andy Murray') | (data['loser_name'] == 'Andy Murray')]

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_age,loser_name,loser_age,Year,winner_age_group,loser_age_group
65,Doha,Hard,32,A,3,Andy Murray,26.6,Mousa Shanan Zayed,19.8,2014,26-27,19-20
75,Doha,Hard,32,A,3,Florian Mayer,30.2,Andy Murray,26.6,2014,30-31,26-27
155,Australian Open,Hard,128,G,5,Andy Murray,26.6,Go Soeda,29.3,2014,26-27,29-30
211,Australian Open,Hard,128,G,5,Andy Murray,26.6,Vincent Millot,27.9,2014,26-27,27-28
239,Australian Open,Hard,128,G,5,Andy Murray,26.6,Feliciano Lopez,32.3,2014,26-27,32-33
...,...,...,...,...,...,...,...,...,...,...,...,...
26954,Shanghai Masters,Hard,128,M,3,Roman Safiullin,26.1,Andy Murray,36.3,2023,26-27,36-37
27051,Basel,Hard,32,A,3,Tomas Martin Etcheverry,24.2,Andy Murray,36.4,2023,24-25,36-37
27060,Basel,Hard,32,A,3,Andy Murray,36.4,Yannick Hanfmann,31.9,2023,36-37,31-32
27145,Paris Masters,Hard,64,M,3,Alex De Minaur,24.6,Andy Murray,36.4,2023,24-25,36-37


In [268]:
wins

{'Andy Roddick': {'Peter Luczak': 1,
  'Carsten Ball': 1,
  'Richard Gasquet': 1,
  'Tomas Berdych': 3,
  'Radek Stepanek': 2,
  'Thiemo De Bakker': 3,
  'Thomaz Bellucci': 1,
  'Feliciano Lopez': 2,
  'Fernando Gonzalez': 1,
  'Marin Cilic': 0,
  'Ryler Deheart': 1,
  'Leonardo Mayer': 1,
  'Sam Querrey': 3,
  'Fernando Verdasco': 1,
  'James Blake': 3,
  'Yen Hsun Lu': 3,
  'Jurgen Melzer': 1,
  'Tommy Robredo': 1,
  'Robin Soderling': 2,
  'Ivan Ljubicic': 0,
  'Igor Andreev': 1,
  'Sergiy Stakhovsky': 2,
  'Benjamin Becker': 1,
  'Nicolas Almagro': 2,
  'Rafael Nadal': 1,
  'Jarkko Nieminen': 2,
  'Blaz Kavcic': 1,
  'Teymuraz Gabashvili': 0,
  'Igor Kunitsyn': 2,
  'Dudi Sela': 0,
  'Rajeev Ram': 2,
  'Michael Llodra': 1,
  'Philipp Kohlschreiber': 2,
  'Xavier Malisse': 1,
  'Mardy Fish': 0,
  'Grega Zemlja': 1,
  'Gilles Simon': 0,
  'Novak Djokovic': 1,
  'Stephane Robert': 1,
  'Janko Tipsarevic': 1,
  'Tatsuma Ito': 1,
  'Jeremy Chardy': 2,
  'Gael Monfils': 0,
  'Guillermo G

In [263]:
matches_data = pd.DataFrame(games_played).T.fillna(0)

In [270]:
pd.DataFrame(win_percentages).T.fillna(0)

Unnamed: 0,Peter Luczak,Carsten Ball,Richard Gasquet,Tomas Berdych,Radek Stepanek,Thiemo De Bakker,Thomaz Bellucci,Feliciano Lopez,Fernando Gonzalez,Marin Cilic,...,Conor Gannon,Alvaro Guillen Meza,Lucky Candra Kurniawan,Ignacio Buse,Jose Flores,Freddy Murray,Walid Ahouda,Digvijaypratap Singh,Daniel Azar,Mustapha El Natour
Andy Roddick,1.0,1.0,0.50,0.5,1.0,1.0,1.0,0.666667,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Peter Luczak,0.0,0.0,0.00,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Carsten Ball,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mischa Zverev,0.0,0.0,0.25,0.0,0.0,0.0,1.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Richard Gasquet,0.0,0.0,0.00,0.4,0.5,1.0,1.0,0.750000,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Daniel Azar,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fajing Sun,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alvaro Guillen Meza,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chris Rodesch,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [266]:
matches_data.loc['Andy Roddick'].sum()

155.0

In [235]:
data[(data['winner_name'] == 'Andy Roddick') & (data['loser_name'] == 'Thiemo De Bakker')]

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_age,loser_name,loser_age,Year
179,Australian Open,Hard,128,G,5,Andy Roddick,27.3,Thiemo De Bakker,21.3,2010
837,Indian Wells Masters,Hard,96,M,3,Andy Roddick,27.5,Thiemo De Bakker,21.4,2010
2260,Cincinnati Masters,Hard,56,M,3,Andy Roddick,27.9,Thiemo De Bakker,21.9,2010


In [233]:
data[(data['winner_name'] == 'Feliciano Lopez') & (data['loser_name'] == 'Andy Roddick')]

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_age,loser_name,loser_age,Year
4755,Wimbledon,Grass,128,G,5,Feliciano Lopez,29.7,Andy Roddick,28.8,2011


In [238]:
games_played_df

Unnamed: 0,Peter Luczak,Carsten Ball,Richard Gasquet,Tomas Berdych,Radek Stepanek,Thiemo De Bakker,Thomaz Bellucci,Feliciano Lopez,Fernando Gonzalez,Marin Cilic,...,Conor Gannon,Alvaro Guillen Meza,Lucky Candra Kurniawan,Ignacio Buse,Jose Flores,Freddy Murray,Walid Ahouda,Digvijaypratap Singh,Daniel Azar,Mustapha El Natour
Andy Roddick,1.0,1.0,2.0,6.0,2.0,3.0,1.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Peter Luczak,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Carsten Ball,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mischa Zverev,0.0,1.0,4.0,3.0,0.0,0.0,1.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Richard Gasquet,0.0,0.0,0.0,15.0,2.0,2.0,4.0,4.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Daniel Azar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Fajing Sun,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alvaro Guillen Meza,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chris Rodesch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [243]:
win_percentage_df.loc['Feliciano Lopez']['Andy Roddick']

0.3333333333333333

In [226]:
win_percentage_df

Unnamed: 0,Peter Luczak,Carsten Ball,Richard Gasquet,Tomas Berdych,Radek Stepanek,Thiemo De Bakker,Thomaz Bellucci,Feliciano Lopez,Fernando Gonzalez,Marin Cilic,...,Conor Gannon,Alvaro Guillen Meza,Lucky Candra Kurniawan,Ignacio Buse,Jose Flores,Freddy Murray,Walid Ahouda,Digvijaypratap Singh,Daniel Azar,Mustapha El Natour
Andy Roddick,1.0,1.0,0.50,0.5,1.0,1.0,1.0,0.666667,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Peter Luczak,0.0,0.0,0.00,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Carsten Ball,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mischa Zverev,0.0,0.0,0.25,0.0,0.0,0.0,1.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Richard Gasquet,0.0,0.0,0.00,0.4,0.5,1.0,1.0,0.750000,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Daniel Azar,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fajing Sun,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alvaro Guillen Meza,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chris Rodesch,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
for _, row in data.iterrows():
    winner = row['winner_name']
    loser = row['loser_name']

    

In [6]:
data[(data['Year'] == 2023) & (data['tourney_name'] == 'Roland Garros')]

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_age,loser_name,loser_age,Year
25777,Roland Garros,Clay,128,G,5,Carlos Alcaraz,20.0,Flavio Cobolli,21.0,2023
25778,Roland Garros,Clay,128,G,5,Taro Daniel,30.3,Christopher Oconnell,28.9,2023
25779,Roland Garros,Clay,128,G,5,Matteo Arnaldi,22.2,Daniel Elahi Galan,26.9,2023
25780,Roland Garros,Clay,128,G,5,Denis Shapovalov,24.1,Brandon Nakashima,21.8,2023
25781,Roland Garros,Clay,128,G,5,Lorenzo Musetti,21.2,Mikael Ymer,24.7,2023
...,...,...,...,...,...,...,...,...,...,...
25899,Roland Garros,Clay,128,G,5,Casper Ruud,24.4,Holger Rune,20.0,2023
25900,Roland Garros,Clay,128,G,5,Alexander Zverev,26.1,Tomas Martin Etcheverry,23.8,2023
25901,Roland Garros,Clay,128,G,5,Novak Djokovic,36.0,Carlos Alcaraz,20.0,2023
25902,Roland Garros,Clay,128,G,5,Casper Ruud,24.4,Alexander Zverev,26.1,2023


In [257]:
def logistic( x):
    """
    Creates logistic function used for ELO calculation.

    Args:
        x (float): number input for the log function

    Returns:
            Final calculation of log function with given number
    """
    return 1 / (1 + 10**(-x))

In [262]:
logistic((1500-1300)/400) + logistic((1300-1500)/400)

1.0

In [261]:
logistic((1300-1500)/400)

0.2402530733520421

In [138]:
data

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_age,loser_name,loser_age,Year
0,Brisbane,Hard,32,A,3,Andy Roddick,27.3,Peter Luczak,30.3,2010
1,Brisbane,Hard,32,A,3,Carsten Ball,22.5,Mischa Zverev,22.3,2010
2,Brisbane,Hard,32,A,3,Richard Gasquet,23.5,Jarkko Nieminen,28.4,2010
3,Brisbane,Hard,32,A,3,Matthew Ebden,22.1,Jurgen Melzer,28.6,2010
4,Brisbane,Hard,32,A,3,Tomas Berdych,24.2,Nick Lindahl,21.4,2010
...,...,...,...,...,...,...,...,...,...,...
39361,Davis Cup WG2 R1: SLO vs LUX,Clay,4,D,3,Blaz Rola,32.9,Alex Knaff,25.7,2023
39362,Davis Cup WG2 R1: SLO vs LUX,Clay,4,D,3,Chris Rodesch,22.1,Bor Artnak,19.2,2023
39363,Davis Cup WG2 R1: URU vs EGY,Clay,4,D,3,Mohamed Safwat,32.9,Ignacio Carou,24.1,2023
39364,Davis Cup WG2 R1: URU vs EGY,Clay,4,D,3,Karim Mohamed Maamoun,32.4,Franco Roncadelli,23.5,2023


In [None]:
X_train_new = train_over.drop('Target', axis = 1)
y_train_new = train_over['Target']

In [273]:
#tennis_data = pd.read_csv('tennis_data.csv')
player_elos = pd.read_csv('../data/player_elos.csv', index_col='Player_Name')

In [274]:
player_elos

Unnamed: 0_level_0,Hard_ELO,Clay_ELO,Grass_ELO,Player_age
Player_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tristan Lamasine,1493.387818,1493.576646,1492.922151,29.8
Sam Riffice,1485.198991,1488.159193,1488.159193,24.4
Mattia Bellucci,1500.000000,1500.000000,1500.000000,21.6
Yannick Maden,1470.762620,1476.230963,1473.139722,34.2
Andreas Haider Maurer,1437.895391,1437.229610,1441.228491,38.9
...,...,...,...,...
Mehluli Don Ayanda Sibanda,1492.730124,1492.998100,1493.656989,24.0
Nikoloz Basilashvili,1454.103588,1458.841098,1467.911592,30.9
Nik Razborsek,1498.373420,1498.374707,1497.969671,34.4
Yanki Erel,1496.909013,1496.136266,1496.909013,22.9


In [125]:
data[data['winner_name'] == 'Carlos Alcaraz']

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_age,loser_name,loser_age,Year
29871,ATP Rio de Janeiro,Clay,32,A,3,Carlos Alcaraz,16.7,Albert Ramos,32.0,2020
31082,Vienna,Hard,32,A,3,Carlos Alcaraz,18.4,Daniel Evans,31.4,2021
31095,Vienna,Hard,32,A,3,Carlos Alcaraz,18.4,Andy Murray,34.4,2021
31101,Vienna,Hard,32,A,3,Carlos Alcaraz,18.4,Matteo Berrettini,25.5,2021
31139,Paris Masters,Hard,64,M,3,Carlos Alcaraz,18.4,Pierre Hugues Herbert,30.6,2021
...,...,...,...,...,...,...,...,...,...,...
38806,Beijing,Hard,32,A,3,Carlos Alcaraz,20.3,Yannick Hanfmann,31.8,2023
38837,Shanghai Masters,Hard,128,M,3,Carlos Alcaraz,20.4,Daniel Evans,33.3,2023
38853,Shanghai Masters,Hard,128,M,3,Carlos Alcaraz,20.4,Gregoire Barrere,29.6,2023
39182,Tour Finals,Hard,8,A,3,Carlos Alcaraz,20.5,Daniil Medvedev,27.7,2023


In [181]:
from scipy.stats import norm

In [194]:
mean = 25
std_dev = 25

y = norm.pdf(37, mean, std_dev)
y_peak = norm.pdf(mean, mean, std_dev)
y_normalized = y / y_peak

In [195]:
y_normalized

0.8911878885041845

In [216]:
import math

def age_decay_factor(age, decay_rate=0.005):
    """
    Calculate the exponential decay factor based on age.
    
    :param age: The player's age.
    :param decay_rate: The rate at which the decay occurs after age 25.
    :return: The decay factor for the given age.
    """
    # For ages 25 or younger, the factor is 1.
    if age <= 25:
        return 1.0
    # For ages above 25, apply the exponential decay.
    else:
        return math.exp(-decay_rate * (age - 25))

# Example usage
ages = [20, 25, 30, 35, 40]

# Calculate the decay factors for each age
for age in ages:
    factor = age_decay_factor(age)
    print(f"Decay factor for age {age}: {factor:.4f}")



Decay factor for age 20: 1.0000
Decay factor for age 25: 1.0000
Decay factor for age 30: 0.9753
Decay factor for age 35: 0.9512
Decay factor for age 40: 0.9277


In [220]:
player_elos.loc['Casper Ruud']

Hard_ELO      1773.280335
Clay_ELO      1822.241403
Grass_ELO     1739.887825
Player_age      24.300000
Name: Casper Ruud, dtype: float64

In [205]:
player_elos

Unnamed: 0_level_0,Hard_ELO,Clay_ELO,Grass_ELO,Player_age
Player_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nikola Cacic,1499.994673,1499.993341,1499.994673,32.6
Marcel Granollers,1453.471439,1462.688900,1466.152219,37.0
Thanasi Kokkinakis,1506.573337,1501.327748,1507.718537,26.9
Chuhan Wang,1499.807405,1499.845924,1499.845924,32.0
Jesse Huta Galung,1496.210037,1496.896130,1496.937675,38.3
...,...,...,...,...
Liam Caruana,1493.551790,1494.841432,1494.841432,25.7
Salvador Andres Bolanos,1499.754087,1499.754067,1499.781402,24.8
Juan Carlos Saez,1500.488380,1500.610475,1500.488380,32.1
Cayetano March,1497.697743,1498.158194,1498.158194,25.6


In [94]:
player_elos

Unnamed: 0_level_0,Hard_ELO,Clay_ELO,Grass_ELO,Player_age
Player_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ryan Harrison,1544.432214,1439.728247,1434.772381,31.6
Hugo Grenier,1477.609055,1480.897086,1494.095614,27.0
Vladyslav Manafov,1496.178196,1498.089098,1498.089098,30.3
Donald Young,1488.633000,1400.474508,1499.710461,36.4
Salvatore Caruso,1424.361147,1396.579264,1404.963998,31.0
...,...,...,...,...
Gustavo Ramirez,1497.195775,1494.391549,1497.195775,37.2
Yshai Oliel,1459.977304,1477.148882,1479.042062,23.6
Skyler Butts,1505.000000,1502.500000,1502.500000,29.3
Niels Desein,1448.624653,1468.085753,1455.632607,36.0


In [None]:
data

In [381]:
player_elos

Unnamed: 0_level_0,Hard_ELO,Clay_ELO,Grass_ELO,Player_age,Games_played
Player_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Jan Kroslak,1495.320324,1496.982564,1500.000000,50.3,19
Michael Agwi,1500.000000,1500.000000,1500.000000,20.4,2
Holger Rune,1757.131745,1992.396285,1889.735991,20.7,187
Lamine Ouahab,1493.250216,1497.407999,1500.000000,39.1,38
Micke Kontinen,1498.269066,1497.203684,1500.000000,33.2,9
...,...,...,...,...,...
Miguel Gallardo Valles,1500.684270,1495.065691,1500.000000,43.9,20
Karim Mohamed Maamoun,1499.603797,1501.132900,1500.000000,33.4,13
Phillip King,1487.447047,1500.000000,1500.000000,43.1,9
Victor Ionita,1500.000000,1495.197985,1500.000000,43.3,4


In [377]:
player_elos.loc['Abdelhak Hameurlaine']

Hard_ELO        1500.000000
Clay_ELO        1499.013606
Grass_ELO       1500.000000
Player_age        53.000000
Games_played       4.000000
Name: Abdelhak Hameurlaine, dtype: float64

In [371]:
games_played_winner = pd.DataFrame(tennis_data['winner_name'].value_counts())
games_played_loser = pd.DataFrame(tennis_data['loser_name'].value_counts())
games_played = pd.concat([games_played_winner, games_played_loser])
games_played.groupby(games_played.index).sum()['count']

Abdelhak Hameurlaine     4
Abdul Mumin Babalola     2
Abdulhameed Alshatti     1
Abdullah Maqdas         15
Abedallah Shelbayh      14
                        ..
Zhizhen Zhang           84
Zibusiso Ncube           1
Zizou Bergs             33
Zsombor Piros           14
Zura Tkemaladze          2
Name: count, Length: 2075, dtype: int64

Unnamed: 0,count
Abdelhak Hameurlaine,4
Abdul Mumin Babalola,2
Abdulhameed Alshatti,1
Abdullah Maqdas,15
Abedallah Shelbayh,14
...,...
Zhizhen Zhang,84
Zibusiso Ncube,1
Zizou Bergs,33
Zsombor Piros,14


In [338]:
df_sorted = tennis_data.sort_values(by = 'Year', ascending=False)

        # Create a dataframe of each of the winners last ages in the dataset, keeping only the first in drop duplicates.
winner_ages = df_sorted[['winner_name', 'winner_age', 'Year']].drop_duplicates('winner_name', keep='first')

        # Renames columns for proper naming.
winner_ages.rename(columns={'winner_name': 'Player_name', 'winner_age': 'most_recent_age'}, inplace=True)
winner_ages['Result'] = 'Winner'

        # Create a dataframe of each of the losers last ages in the dataset, keeping only the first in drop duplicates.
loser_ages = df_sorted[['loser_name', 'loser_age', 'Year']].drop_duplicates('loser_name',keep='first')
        
        # Renames the columns similar to winners_ages for similar naming convention.
loser_ages.rename(columns={'loser_name': 'Player_name', 'loser_age': 'most_recent_age'}, inplace=True)
loser_ages['Result'] = 'Loser'

recent_ages = pd.concat([winner_ages, loser_ages])

        # Use pivot to create new dataframe using Player_name as the index, the column being if they won or lost a match, and the values
        # being their most recent age.
recent_ages = recent_ages.pivot(index ='Player_name', columns = 'Result', values = 'most_recent_age').reset_index()

recent_ages

Result,Player_name,Loser,Winner
0,Abdelhak Hameurlaine,35.0,34.3
1,Abdul Mumin Babalola,22.2,
2,Abdulhameed Alshatti,18.6,
3,Abdullah Maqdas,29.8,28.2
4,Abedallah Shelbayh,20.2,19.4
...,...,...,...
2070,Zhizhen Zhang,27.3,27.3
2071,Zibusiso Ncube,24.6,
2072,Zizou Bergs,24.6,24.6
2073,Zsombor Piros,24.0,23.2


In [343]:
player_elos['Player_age'].sort_values()

Player_Name
Darwin Blanch           16.5
Joao Fonseca            17.4
Luca Preda              17.8
Martin Landaluce        18.2
Jakub Mensik            18.4
                        ... 
Mark Woodforde          58.3
Richey Reneberg         58.3
Gianluca Pozzi          58.8
Ronald Agenor           59.7
Rafael Avalos Brenes    62.6
Name: Player_age, Length: 2075, dtype: float64

In [215]:
tennis_data

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_hand,winner_age,loser_name,loser_hand,loser_age,winner_rank,winner_rank_points,loser_rank,loser_rank_points,Year
0,Auckland,Hard,32,A,3,Tommy Haas,R,21.7,Jeff Tarango,L,31.1,11.0,1612.0,63.0,595.0,2000
1,Auckland,Hard,32,A,3,Juan Balcells,R,24.5,Franco Squillari,L,24.3,211.0,157.0,49.0,723.0,2000
2,Auckland,Hard,32,A,3,Alberto Martin,R,21.3,Alberto Berasategui,R,26.5,48.0,726.0,59.0,649.0,2000
3,Auckland,Hard,32,A,3,Juan Carlos Ferrero,R,19.9,Roger Federer,R,18.4,45.0,768.0,61.0,616.0,2000
4,Auckland,Hard,32,A,3,Michael Sell,R,27.3,Nicolas Escude,R,23.7,167.0,219.0,34.0,873.0,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69511,Davis Cup WG2 PO: URU vs MDA,Clay,4,D,3,Radu Albot,R,34.2,Joaquin Aguilar Cardozo,U,18.8,136.0,489.0,1109.0,8.0,2024
69512,Davis Cup WG2 PO: URU vs MDA,Clay,4,D,3,Radu Albot,R,34.2,Franco Roncadelli,L,23.9,136.0,489.0,616.0,55.0,2024
69513,Davis Cup WG2 PO: URU vs MDA,Clay,4,D,3,Joaquin Aguilar Cardozo,U,18.8,Ilya Snitari,U,21.8,1109.0,8.0,740.0,34.0,2024
69514,Davis Cup WG2 PO: VIE vs RSA,Hard,4,D,3,Nam Hoang Ly,R,26.9,Philip Henning,R,23.2,554.0,67.0,748.0,32.0,2024


In [351]:
import pandas as pd

# Load the dataset
df = tennis_data

# Sort the DataFrame by Year to ensure we get the most recent matches first
df_sorted = df.sort_values(by='Year', ascending=False)

# Get the most recent ages for winners
winner_ages = df_sorted[['winner_name', 'winner_age', 'Year']].drop_duplicates('winner_name', keep='first')
winner_ages.rename(columns={'winner_name': 'Player Name', 'winner_age': 'Most Recent Age'}, inplace=True)
winner_ages['Role'] = 'Winner'

# Get the most recent ages for losers
loser_ages = df_sorted[['loser_name', 'loser_age', 'Year']].drop_duplicates('loser_name', keep='first')
loser_ages.rename(columns={'loser_name': 'Player Name', 'loser_age': 'Most Recent Age'}, inplace=True)
loser_ages['Role'] = 'Loser'

# Combine both DataFrames
recent_ages = pd.concat([winner_ages, loser_ages], ignore_index=True)

# Pivot to have Player Names and ages in a single row
recent_ages_pivot = recent_ages.pivot(index='Player Name', columns='Role', values='Most Recent Age').reset_index()

# Fill NaN values with 0 or a placeholder if needed
recent_ages_pivot.fillna(0, inplace=True)

# Get the most recent year for each player
recent_years = pd.concat([winner_ages[['Player Name', 'Year']], loser_ages[['Player Name', 'Year']]])
#recent_years = recent_years.drop_duplicates(subset='Player Name', keep='first')
recent_years



Unnamed: 0,Player Name,Year
69515,Nam Hoang Ly,2024
68593,Alexei Popyrin,2024
68585,Jakub Mensik,2024
68586,Gael Monfils,2024
68588,Karen Khachanov,2024
...,...,...
1548,Jonas Froberg,2000
1545,Johan Ortegren,2000
1700,Javier Sanchez,2000
1638,Karsten Braasch,2000


In [319]:
recent_ages_pivot

Unnamed: 0,Player Name,Loser,Winner,Year,Current Winner Age,Current Loser Age,Max Current Age
0,Abdelhak Hameurlaine,35.0,34.3,2006,52.3,53.0,53.0
1,Abdul Mumin Babalola,22.2,0.0,2007,17.0,39.2,39.2
2,Abdulhameed Alshatti,18.6,0.0,2014,10.0,28.6,28.6
3,Abdullah Maqdas,29.8,28.2,2015,37.2,38.8,38.8
4,Abedallah Shelbayh,20.2,19.4,2023,20.4,21.2,21.2
...,...,...,...,...,...,...,...
2070,Zhizhen Zhang,27.3,27.3,2024,27.3,27.3,27.3
2071,Zibusiso Ncube,24.6,0.0,2003,21.0,45.6,45.6
2072,Zizou Bergs,24.6,24.6,2024,24.6,24.6,24.6
2073,Zsombor Piros,24.0,23.2,2023,24.2,25.0,25.0


In [289]:
tennis_data

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_hand,winner_age,loser_name,loser_hand,loser_age,winner_rank,winner_rank_points,loser_rank,loser_rank_points,Year
0,Auckland,Hard,32,A,3,Tommy Haas,R,21.7,Jeff Tarango,L,31.1,11.0,1612.0,63.0,595.0,2000
1,Auckland,Hard,32,A,3,Juan Balcells,R,24.5,Franco Squillari,L,24.3,211.0,157.0,49.0,723.0,2000
2,Auckland,Hard,32,A,3,Alberto Martin,R,21.3,Alberto Berasategui,R,26.5,48.0,726.0,59.0,649.0,2000
3,Auckland,Hard,32,A,3,Juan Carlos Ferrero,R,19.9,Roger Federer,R,18.4,45.0,768.0,61.0,616.0,2000
4,Auckland,Hard,32,A,3,Michael Sell,R,27.3,Nicolas Escude,R,23.7,167.0,219.0,34.0,873.0,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69511,Davis Cup WG2 PO: URU vs MDA,Clay,4,D,3,Radu Albot,R,34.2,Joaquin Aguilar Cardozo,U,18.8,136.0,489.0,1109.0,8.0,2024
69512,Davis Cup WG2 PO: URU vs MDA,Clay,4,D,3,Radu Albot,R,34.2,Franco Roncadelli,L,23.9,136.0,489.0,616.0,55.0,2024
69513,Davis Cup WG2 PO: URU vs MDA,Clay,4,D,3,Joaquin Aguilar Cardozo,U,18.8,Ilya Snitari,U,21.8,1109.0,8.0,740.0,34.0,2024
69514,Davis Cup WG2 PO: VIE vs RSA,Hard,4,D,3,Nam Hoang Ly,R,26.9,Philip Henning,R,23.2,554.0,67.0,748.0,32.0,2024


In [296]:
df_sorted = tennis_data.sort_values(by='Year', ascending=False)

        # Create a dataframe of each of the winners last ages in the dataset, keeping only the first in drop duplicates.
winner_ages = df_sorted[['winner_name', 'winner_age', 'Year']].drop_duplicates('winner_name', keep='first')

        # Renames columns for proper naming.
winner_ages.rename(columns={'winner_name': 'player_name', 'winner_age': 'most_recent_age'}, inplace=True)
winner_ages['Role'] = 'Winner'

        # Create a dataframe of each of the losers last ages in the dataset, keeping only the first in drop duplicates.
loser_ages = df_sorted[['loser_name', 'loser_age', 'Year']].drop_duplicates('loser_name', keep='first')
        
        # Renames the columns similar to winners_ages for similar naming convention.
loser_ages.rename(columns={'loser_name': 'player_name', 'loser_age': 'most_recent_age'}, inplace=True)
loser_ages['Role'] = 'Loser'

        # Combines the dataframe into one, with both the recent winners and loser ages.
recent_ages = pd.concat([winner_ages, loser_ages], ignore_index=True)

        # 
recent_ages_pivot = recent_ages.pivot(index='player_name', columns='Role', values='most_recent_age').reset_index()

recent_ages_pivot.fillna(0, inplace=True)

recent_years = pd.concat([winner_ages[['player_name', 'Year']], loser_ages[['player_name', 'Year']]], ignore_index=True)
recent_years = recent_years.drop_duplicates(subset='player_name', keep='first')


In [175]:
len(tennis_data[(tennis_data['tourney_name'] == 'Australian Open') & (tennis_data['Year'] == 2024)])

99

In [316]:
import pandas as pd

# Load the dataset
df = tennis_data

# Sort the DataFrame by Year to ensure we get the most recent matches first
df_sorted = df.sort_values(by='Year', ascending=False)

# Get the most recent ages for winners
winner_ages = df_sorted[['winner_name', 'winner_age', 'Year']].drop_duplicates('winner_name', keep='first')
winner_ages.rename(columns={'winner_name': 'Player Name', 'winner_age': 'Most Recent Age'}, inplace=True)
winner_ages['Role'] = 'Winner'

# Get the most recent ages for losers
loser_ages = df_sorted[['loser_name', 'loser_age', 'Year']].drop_duplicates('loser_name', keep='first')
loser_ages.rename(columns={'loser_name': 'Player Name', 'loser_age': 'Most Recent Age'}, inplace=True)
loser_ages['Role'] = 'Loser'

# Combine both DataFrames
recent_ages = pd.concat([winner_ages, loser_ages], ignore_index=True)

# Pivot to have Player Names and ages in a single row
recent_ages_pivot = recent_ages.pivot(index='Player Name', columns='Role', values='Most Recent Age').reset_index()

# Fill NaN values with 0 or a placeholder if needed
recent_ages_pivot.fillna(0, inplace=True)

# Get the most recent year for each player
recent_years = pd.concat([winner_ages[['Player Name', 'Year']], loser_ages[['Player Name', 'Year']]], ignore_index=True)
recent_years = recent_years.drop_duplicates(subset='Player Name', keep='first')

# Merge to get the year with ages
recent_ages_pivot = recent_ages_pivot.merge(recent_years, on='Player Name')

recent_ages_pivot

Unnamed: 0,Player Name,Loser,Winner,Year
0,Abdelhak Hameurlaine,35.0,34.3,2006
1,Abdul Mumin Babalola,22.2,0.0,2007
2,Abdulhameed Alshatti,18.6,0.0,2014
3,Abdullah Maqdas,29.8,28.2,2015
4,Abedallah Shelbayh,20.2,19.4,2023
...,...,...,...,...
2070,Zhizhen Zhang,27.3,27.3,2024
2071,Zibusiso Ncube,24.6,0.0,2003
2072,Zizou Bergs,24.6,24.6,2024
2073,Zsombor Piros,24.0,23.2,2023


In [311]:
loser_ages['']

Unnamed: 0,Player Name,most_recent_age,Year,Role
69515,Kris Van Wyk,27.3,2024,Loser
68593,Hugo Grenier,27.9,2024,Loser
68585,Andrey Rublev,26.3,2024,Loser
68586,Ugo Humbert,25.6,2024,Loser
68587,Alexander Bublik,26.6,2024,Loser
...,...,...,...,...
1548,Jonas Froberg,19.6,2000,Loser
1545,Johan Ortegren,22.6,2000,Loser
1700,Javier Sanchez,31.9,2000,Loser
1638,Karsten Braasch,32.9,2000,Loser


In [290]:
player_elos

Unnamed: 0_level_0,Hard_ELO,Clay_ELO,Grass_ELO,Player_age
Player_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sho Shimabukuro,1374.452411,1500.000000,1415.158534,27.2
Alejandro Davidovich Fokina,1780.705249,1690.746251,1575.393870,24.7
Talal Ouahabi,1500.000000,1497.739338,1500.000000,46.1
Tomas Tenconi,1500.000000,1496.685945,1500.000000,45.7
Marco Mirnegg,1500.000000,1498.243968,1500.000000,41.8
...,...,...,...,...
Luis Diego Chavez Villalpando,1500.000000,1500.653475,1500.000000,28.2
Emanuel Couto,1499.638268,1498.815605,1500.000000,50.6
Tiago Sousa,1500.000000,1498.448850,1500.000000,45.4
Andre Agassi,1687.831253,1539.392831,1530.758660,53.8


In [293]:
import pandas as pd

# Load the dataset
df = pd.read_csv('tennis_data.csv')

# Sort the DataFrame by Year to ensure we get the most recent matches first
df_sorted = df.sort_values(by='Year', ascending=False)

# Get the most recent ages for winners
winner_ages = df_sorted[['winner_name', 'winner_age', 'Year']].drop_duplicates('winner_name', keep='first')
winner_ages.rename(columns={'winner_name': 'Player Name', 'winner_age': 'Most Recent Age'}, inplace=True)
winner_ages['Role'] = 'Winner'

# Get the most recent ages for losers
loser_ages = df_sorted[['loser_name', 'loser_age', 'Year']].drop_duplicates('loser_name', keep='first')
loser_ages.rename(columns={'loser_name': 'Player Name', 'loser_age': 'Most Recent Age'}, inplace=True)
loser_ages['Role'] = 'Loser'

# Combine both DataFrames
recent_ages = pd.concat([winner_ages, loser_ages], ignore_index=True)

# Pivot to have Player Names and ages in a single row
recent_ages_pivot = recent_ages.pivot(index='Player Name', columns='Role', values='Most Recent Age').reset_index()

# Fill NaN values with 0 or a placeholder if needed
recent_ages_pivot.fillna(0, inplace=True)

# Get the most recent year for each player
recent_years = pd.concat([winner_ages[['Player Name', 'Year']], loser_ages[['Player Name', 'Year']]], ignore_index=True)
recent_years = recent_years.drop_duplicates(subset='Player Name', keep='first')

# Merge to get the year with ages
recent_ages_pivot = recent_ages_pivot.merge(recent_years, on='Player Name')

# Calculate the current age based on the most recent year
current_year = 2023  # or use datetime.datetime.now().year for dynamic current year
recent_ages_pivot['Max Current Age'] = recent_ages_pivot[['Winner', 'Loser']].max(axis=1) + (current_year - recent_ages_pivot['Year'])

# Count occurrences (games played) for each player in the dataset
games_played_winner = df['winner_name'].value_counts().reset_index()
games_played_winner.columns = ['Player Name', 'Games Played']
games_played_loser = df['loser_name'].value_counts().reset_index()
games_played_loser.columns = ['Player Name', 'Games Played']

# Combine games played for both winners and losers
games_played = pd.concat([games_played_winner, games_played_loser], ignore_index=True)
games_played = games_played.groupby('Player Name').sum().reset_index()

# Merge games played with the ages DataFrame
final_df = recent_ages_pivot[['Player Name', 'Max Current Age']].merge(games_played, on='Player Name', how='left')

# Set Player Name as the index
final_df.set_index('Player Name', inplace=True)

# Print the resulting DataFrame
print(final_df[['Max Current Age', 'Games Played']])


                      Max Current Age  Games Played
Player Name                                        
Abdelhak Hameurlaine             52.0             4
Abdul Mumin Babalola             38.2             2
Abdulhameed Alshatti             27.6             1
Abdullah Maqdas                  37.8            15
Abedallah Shelbayh               20.2            14
...                               ...           ...
Zhizhen Zhang                    26.3            84
Zibusiso Ncube                   44.6             1
Zizou Bergs                      23.6            33
Zsombor Piros                    24.0            14
Zura Tkemaladze                  22.6             2

[2075 rows x 2 columns]


In [295]:
final_df['Max Current Age'].sort_values()

Player Name
Darwin Blanch           15.5
Joao Fonseca            16.4
Luca Preda              16.8
Martin Landaluce        17.2
Jakub Mensik            17.4
                        ... 
Richey Reneberg         57.3
Paul Haarhuis           57.3
Gianluca Pozzi          57.8
Ronald Agenor           58.7
Rafael Avalos Brenes    61.6
Name: Max Current Age, Length: 2075, dtype: float64

In [264]:
import pandas as pd

# Assuming player_elos and recent_ages_pivot are already defined and player_elos is indexed by player_name

# Ensure recent_ages_pivot is set up for joining
max_age_series = recent_ages_pivot.set_index('Player Name')['Max Current Age']

# Join the Max Current Age to player_elos
player_elos = player_elos.join(max_age_series, how='left', rsuffix='_Max_Age')


In [265]:
player_elos

Unnamed: 0_level_0,Hard_ELO,Clay_ELO,Grass_ELO,Max Current Age,Max Current Age_Max_Age,Max Current Age_Max_Age
Player_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Febi Widhiyanto,1501.718611,1500.000000,1500.000000,42.9,42.9,42.9
Lauri Kiiski,1500.000000,1499.448513,1500.000000,42.7,42.7,42.7
Jean Claude Scherrer,1498.222592,1500.000000,1496.606843,45.1,45.1,45.1
Aslan Karatsev,1823.866932,1670.048054,1520.608844,29.6,29.6,29.6
Karan Rastogi,1494.571049,1498.801116,1500.000000,36.3,36.3,36.3
...,...,...,...,...,...,...
Albert Montanes,1432.781343,1539.354472,1477.929455,42.4,42.4,42.4
Tomas Tenconi,1500.000000,1496.685945,1500.000000,44.7,44.7,44.7
Andreas Haider Maurer,1467.193610,1468.219603,1481.229438,39.1,39.1,39.1
Gvidas Sabeckis,1499.297675,1499.297625,1500.000000,38.9,38.9,38.9


In [None]:
import pandas as pd

# Assuming final_df and elo_df are already defined
# Make sure elo_df has 'Player Name' as its index

# Join the two DataFrames on their indices (player names)
combined_df = final_df.join(player_elos, how='left')

# Print the resulting combined DataFrame
print(combined_df)


In [252]:
combined_df

Unnamed: 0_level_0,Max Current Age,Games Played,Hard_ELO,Clay_ELO,Grass_ELO
Player Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdelhak Hameurlaine,52.0,4,1500.000000,1499.013606,1500.000000
Abdul Mumin Babalola,38.2,2,1498.801900,1500.000000,1500.000000
Abdulhameed Alshatti,27.6,1,1499.050000,1500.000000,1500.000000
Abdullah Maqdas,37.8,15,1499.037636,1500.995503,1500.000000
Abedallah Shelbayh,20.2,14,1474.214171,1489.753049,1385.713953
...,...,...,...,...,...
Zhizhen Zhang,26.3,84,1772.127728,1678.443025,1402.399456
Zibusiso Ncube,44.6,1,1499.586096,1500.000000,1500.000000
Zizou Bergs,23.6,33,1238.572061,1508.288841,1469.286456
Zsombor Piros,24.0,14,1485.794678,1488.298390,1500.000000


In [245]:
final_df['Max Current Age']

Player Name
Abdelhak Hameurlaine    52.0
Abdul Mumin Babalola    38.2
Abdulhameed Alshatti    27.6
Abdullah Maqdas         37.8
Abedallah Shelbayh      20.2
                        ... 
Zhizhen Zhang           26.3
Zibusiso Ncube          44.6
Zizou Bergs             23.6
Zsombor Piros           24.0
Zura Tkemaladze         22.6
Name: Max Current Age, Length: 2075, dtype: float64

In [241]:
player_elos

Unnamed: 0_level_0,Hard_ELO,Clay_ELO,Grass_ELO
Player_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Febi Widhiyanto,1501.718611,1500.000000,1500.000000
Lauri Kiiski,1500.000000,1499.448513,1500.000000
Jean Claude Scherrer,1498.222592,1500.000000,1496.606843
Aslan Karatsev,1823.866932,1670.048054,1520.608844
Karan Rastogi,1494.571049,1498.801116,1500.000000
...,...,...,...
Albert Montanes,1432.781343,1539.354472,1477.929455
Tomas Tenconi,1500.000000,1496.685945,1500.000000
Andreas Haider Maurer,1467.193610,1468.219603,1481.229438
Gvidas Sabeckis,1499.297675,1499.297625,1500.000000


In [205]:
df_sorted[['winner_name', 'winner_age', 'Year']].drop_duplicates('winner_name', keep='first')

Unnamed: 0,winner_name,winner_age,Year
69515,Nam Hoang Ly,26.9,2024
68593,Alexei Popyrin,24.5,2024
68585,Jakub Mensik,18.4,2024
68586,Gael Monfils,37.4,2024
68588,Karen Khachanov,27.7,2024
...,...,...,...
1624,Karsten Braasch,32.9,2000
1695,Andrei Cherkasov,29.4,2000
2229,Petr Kralert,20.8,2000
1994,Marcelo Charpentier,26.8,2000


In [None]:
import pandas as pd

# Load the dataset
df = tennis_data

# Sort the DataFrame by Year to ensure we get the most recent matches first
df_sorted = df.sort_values(by='Year', ascending=False)

# Get the most recent ages for winners
winner_ages = df_sorted[['winner_name', 'winner_age', 'Year']].drop_duplicates('winner_name', keep='first')
winner_ages.rename(columns={'winner_name': 'Player Name', 'winner_age': 'Most Recent Age'}, inplace=True)
winner_ages['Role'] = 'Winner'

# Get the most recent ages for losers
loser_ages = df_sorted[['loser_name', 'loser_age', 'Year']].drop_duplicates('loser_name', keep='first')
loser_ages.rename(columns={'loser_name': 'Player Name', 'loser_age': 'Most Recent Age'}, inplace=True)
loser_ages['Role'] = 'Loser'

# Combine both DataFrames
recent_ages = pd.concat([winner_ages, loser_ages], ignore_index=True)

# Pivot to have Player Names and ages in a single row
recent_ages_pivot = recent_ages.pivot(index='Player Name', columns='Role', values='Most Recent Age').reset_index()

# Fill NaN values with 0 or a placeholder if needed
recent_ages_pivot.fillna(0, inplace=True)

# Get the most recent year for each player
recent_years = pd.concat([winner_ages[['Player Name', 'Year']], loser_ages[['Player Name', 'Year']]], ignore_index=True)
recent_years = recent_years.drop_duplicates(subset='Player Name', keep='first')

# Merge to get the year with ages
recent_ages_pivot = recent_ages_pivot.merge(recent_years, on='Player Name')

# Calculate the current age based on the most recent year
current_year = 2023  # or use datetime.datetime.now().year for dynamic current year
recent_ages_pivot['Max Current Age'] = recent_ages_pivot[['Winner', 'Loser']].max(axis=1) + (current_year - recent_ages_pivot['Year'])

# Count occurrences (games played) for each player in the dataset
games_played_winner = df['winner_name'].value_counts().reset_index()
games_played_winner.columns = ['Player Name', 'Games Played']
games_played_loser = df['loser_name'].value_counts().reset_index()
games_played_loser.columns = ['Player Name', 'Games Played']

# Combine games played for both winners and losers
games_played = pd.concat([games_played_winner, games_played_loser], ignore_index=True)
games_played = games_played.groupby('Player Name').sum().reset_index()

# Merge games played with the ages DataFrame
final_df = recent_ages_pivot[['Player Name', 'Max Current Age']].merge(games_played, on='Player Name', how='left')

# Set Player Name as the index
final_df.set_index('Player Name', inplace=True)

# Print the resulting DataFrame
print(final_df[['Max Current Age', 'Games Played']])


In [226]:
final_df

Unnamed: 0_level_0,Max Current Age,Games Played
Player Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Abdelhak Hameurlaine,52.0,4
Abdul Mumin Babalola,38.2,2
Abdulhameed Alshatti,27.6,1
Abdullah Maqdas,37.8,15
Abedallah Shelbayh,20.2,14
...,...,...
Zhizhen Zhang,26.3,84
Zibusiso Ncube,44.6,1
Zizou Bergs,23.6,33
Zsombor Piros,24.0,14


In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('tennis_data.csv')

# Sort the DataFrame by Year to ensure we get the most recent matches first
df_sorted = df.sort_values(by='Year', ascending=False)

# Get the most recent ages for winners
winner_ages = df_sorted[['winner_name', 'winner_age', 'Year']].drop_duplicates('winner_name', keep='first')
winner_ages.rename(columns={'winner_name': 'Player Name', 'winner_age': 'Most Recent Age'}, inplace=True)
winner_ages['Role'] = 'Winner'

# Get the most recent ages for losers
loser_ages = df_sorted[['loser_name', 'loser_age', 'Year']].drop_duplicates('loser_name', keep='first')
loser_ages.rename(columns={'loser_name': 'Player Name', 'loser_age': 'Most Recent Age'}, inplace=True)
loser_ages['Role'] = 'Loser'

# Combine both DataFrames
recent_ages = pd.concat([winner_ages, loser_ages], ignore_index=True)

# Pivot to have Player Names and ages in a single row
recent_ages_pivot = recent_ages.pivot(index='Player Name', columns='Role', values='Most Recent Age').reset_index()

# Fill NaN values with 0 or a placeholder if needed
recent_ages_pivot.fillna(0, inplace=True)

# Get the most recent year for each player
recent_years = pd.concat([winner_ages[['Player Name', 'Year']], loser_ages[['Player Name', 'Year']]], ignore_index=True)
recent_years = recent_years.drop_duplicates(subset='Player Name', keep='first')

# Merge to get the year with ages
recent_ages_pivot = recent_ages_pivot.merge(recent_years, on='Player Name')

# Calculate the current age based on the most recent year
current_year = 2023  # or use datetime.datetime.now().year for dynamic current year
recent_ages_pivot['Current Winner Age'] = recent_ages_pivot['Winner'] + (current_year - recent_ages_pivot['Year'])
recent_ages_pivot['Current Loser Age'] = recent_ages_pivot['Loser'] + (current_year - recent_ages_pivot['Year'])

# Calculate the maximum current age for each player
recent_ages_pivot['Max Current Age'] = recent_ages_pivot[['Current Winner Age', 'Current Loser Age']].max(axis=1)

# Set Player Name as the index
recent_ages_pivot.set_index('Player Name', inplace=True)

# Print the resulting DataFrame
print(recent_ages_pivot[['Current Winner Age', 'Current Loser Age', 'Max Current Age']])


In [235]:
recent_ages_pivot

Unnamed: 0_level_0,Loser,Winner,Year,Current Winner Age,Current Loser Age,Max Current Age
Player Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abdelhak Hameurlaine,35.0,34.3,2006,51.3,52.0,52.0
Abdul Mumin Babalola,22.2,0.0,2007,16.0,38.2,38.2
Abdulhameed Alshatti,18.6,0.0,2014,9.0,27.6,27.6
Abdullah Maqdas,29.8,28.2,2015,36.2,37.8,37.8
Abedallah Shelbayh,20.2,19.4,2023,19.4,20.2,20.2
...,...,...,...,...,...,...
Zhizhen Zhang,27.3,27.3,2024,26.3,26.3,26.3
Zibusiso Ncube,24.6,0.0,2003,20.0,44.6,44.6
Zizou Bergs,24.6,24.6,2024,23.6,23.6,23.6
Zsombor Piros,24.0,23.2,2023,23.2,24.0,24.0


In [203]:
287+255

542

In [201]:
player_elos["Age"] = final_df['Max Current Age']
player_elos["Games_Played"] = final_df['Games Played']

In [198]:
player_elos['Age'] = recent_ages_pivot['Max Current Age']

In [147]:
player_elos[(player_elos['Grass_ELO'] == 1500) & (player_elos['Hard_ELO'] == 1500) & (player_elos['Clay_ELO'] == 1500)]

Unnamed: 0_level_0,Hard_ELO,Clay_ELO,Grass_ELO
Player_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Zura Tkemaladze,1500.0,1500.0,1500.0
Michael Sell,1500.0,1500.0,1500.0


In [79]:
recent_ages_pivot['Max Age'].sort_values()

659     16.5
887     17.2
43      17.6
773     17.7
589     17.9
        ... 
1167    39.8
1326    39.9
407     41.5
578     41.8
1291    43.8
Name: Max Age, Length: 1423, dtype: float64

In [68]:
tennis_data

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_hand,winner_ht,winner_age,loser_name,loser_hand,loser_ht,loser_age,winner_rank,winner_rank_points,loser_rank,loser_rank_points,Year
0,Auckland,Hard,32,A,3,Tommy Haas,R,188.0,21.7,Jeff Tarango,L,180.0,31.1,11.0,1612.0,63.0,595.0,2000
1,Auckland,Hard,32,A,3,Juan Balcells,R,190.0,24.5,Franco Squillari,L,183.0,24.3,211.0,157.0,49.0,723.0,2000
2,Auckland,Hard,32,A,3,Alberto Martin,R,175.0,21.3,Alberto Berasategui,R,173.0,26.5,48.0,726.0,59.0,649.0,2000
3,Auckland,Hard,32,A,3,Juan Carlos Ferrero,R,183.0,19.9,Roger Federer,R,185.0,18.4,45.0,768.0,61.0,616.0,2000
4,Auckland,Hard,32,A,3,Michael Sell,R,180.0,27.3,Nicolas Escude,R,185.0,23.7,167.0,219.0,34.0,873.0,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66165,Davis Cup WG2 R1: SLO vs LUX,Clay,4,D,3,Blaz Rola,L,193.0,32.9,Chris Rodesch,R,198.0,22.1,457.0,94.0,609.0,54.0,2023
66166,Davis Cup WG2 R1: SLO vs LUX,Clay,4,D,3,Blaz Rola,L,193.0,32.9,Alex Knaff,R,193.0,25.7,457.0,94.0,486.0,85.0,2023
66167,Davis Cup WG2 R1: SLO vs LUX,Clay,4,D,3,Chris Rodesch,R,198.0,22.1,Bor Artnak,R,193.0,19.2,609.0,54.0,616.0,54.0,2023
66168,Davis Cup WG2 R1: URU vs EGY,Clay,4,D,3,Mohamed Safwat,R,180.0,32.9,Ignacio Carou,R,178.0,24.1,463.0,93.0,828.0,24.0,2023


In [5]:
player_elos.mean(axis=1).sort_values().tail(10)

Player_Name
Rafael Nadal          1815.058659
Stefanos Tsitsipas    1850.552545
Holger Rune           1878.303056
Grigor Dimitrov       1920.709157
Andrey Rublev         1923.380562
Alexander Zverev      1925.199319
Daniil Medvedev       1964.241269
Jannik Sinner         2006.830614
Carlos Alcaraz        2173.114820
Novak Djokovic        2379.110656
dtype: float64

In [6]:
player_elos['Grass_ELO'].sort_values().tail(10)

Player_Name
Alexander Bublik       1840.118497
Matteo Berrettini      1846.138773
Roman Safiullin        1871.839246
Holger Rune            1889.906946
Christopher Eubanks    1918.294123
Andrey Rublev          1956.159546
Jannik Sinner          2027.974551
Daniil Medvedev        2044.097968
Novak Djokovic         2171.019127
Carlos Alcaraz         2406.120227
Name: Grass_ELO, dtype: float64

In [385]:
url = "https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2024.csv"
response = requests.get(url)

In [434]:
aus = tennis_data[(tennis_data['Year'] == 2024) & (tennis_data['tourney_name'] == 'Australian Open')]

In [386]:
df = pd.read_csv(StringIO(response.text))

In [388]:
australian_open = df[df['tourney_name'] == 'Australian Open']
australian_open = australian_open[['tourney_name', 'surface', 'draw_size', 'tourney_level', 'winner_name', 'winner_hand', 'winner_ht', 'winner_age', 'loser_name', 'loser_hand', 'loser_ht', 'loser_age', 'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points']]


In [453]:
australian_open.head(11)

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,winner_name,winner_hand,winner_ht,winner_age,loser_name,loser_hand,loser_ht,loser_age,winner_rank,winner_rank_points,loser_rank,loser_rank_points
137,Australian Open,Hard,128,G,Novak Djokovic,R,188.0,36.6,Dino Prizmic,U,,18.4,1.0,11055.0,187.0,321.0
138,Australian Open,Hard,128,G,Alexei Popyrin,R,196.0,24.4,Marc Polmans,R,188.0,26.7,43.0,1052.0,154.0,406.0
139,Australian Open,Hard,128,G,Gael Monfils,R,193.0,37.3,Yannick Hanfmann,R,193.0,32.1,76.0,737.0,52.0,929.0
140,Australian Open,Hard,128,G,Tomas Martin Etcheverry,R,196.0,24.4,Andy Murray,R,190.0,36.6,32.0,1375.0,44.0,1050.0
141,Australian Open,Hard,128,G,Adrian Mannarino,L,183.0,35.5,Stan Wawrinka,R,183.0,38.8,19.0,1765.0,56.0,907.0
142,Australian Open,Hard,128,G,Jaume Munar,R,183.0,26.6,Alexander Shevchenko,R,,23.1,82.0,706.0,48.0,975.0
143,Australian Open,Hard,128,G,Christopher Oconnell,R,183.0,29.6,Cristian Garin,R,185.0,27.6,68.0,814.0,86.0,680.0
144,Australian Open,Hard,128,G,Ben Shelton,L,,21.2,Roberto Bautista Agut,R,183.0,35.7,16.0,2225.0,72.0,776.0
145,Australian Open,Hard,128,G,Taylor Fritz,R,193.0,26.2,Facundo Diaz Acosta,L,183.0,23.0,12.0,2840.0,90.0,661.0
146,Australian Open,Hard,128,G,Hugo Gaston,L,173.0,23.3,Roberto Carballes Baena,R,183.0,30.8,97.0,626.0,63.0,846.0


In [454]:
aus.head(11)

Unnamed: 0,tourney_name,surface,draw_size,tourney_level,best_of,winner_name,winner_hand,winner_age,loser_name,loser_hand,loser_age,winner_rank,winner_rank_points,loser_rank,loser_rank_points,Year
68263,Australian Open,Hard,128,G,5,Novak Djokovic,R,36.6,Dino Prizmic,U,18.4,1.0,11055.0,187.0,321.0,2024
68264,Australian Open,Hard,128,G,5,Alexei Popyrin,R,24.4,Marc Polmans,R,26.7,43.0,1052.0,154.0,406.0,2024
68265,Australian Open,Hard,128,G,5,Gael Monfils,R,37.3,Yannick Hanfmann,R,32.1,76.0,737.0,52.0,929.0,2024
68266,Australian Open,Hard,128,G,5,Tomas Martin Etcheverry,R,24.4,Andy Murray,R,36.6,32.0,1375.0,44.0,1050.0,2024
68267,Australian Open,Hard,128,G,5,Adrian Mannarino,L,35.5,Stan Wawrinka,R,38.8,19.0,1765.0,56.0,907.0,2024
68268,Australian Open,Hard,128,G,5,Jaume Munar,R,26.6,Alexander Shevchenko,R,23.1,82.0,706.0,48.0,975.0,2024
68269,Australian Open,Hard,128,G,5,Christopher Oconnell,R,29.6,Cristian Garin,R,27.6,68.0,814.0,86.0,680.0,2024
68270,Australian Open,Hard,128,G,5,Ben Shelton,L,21.2,Roberto Bautista Agut,R,35.7,16.0,2225.0,72.0,776.0,2024
68271,Australian Open,Hard,128,G,5,Taylor Fritz,R,26.2,Facundo Diaz Acosta,L,23.0,12.0,2840.0,90.0,661.0,2024
68272,Australian Open,Hard,128,G,5,Hugo Gaston,L,23.3,Roberto Carballes Baena,R,30.8,97.0,626.0,63.0,846.0,2024


In [436]:
aus_open_first_round = aus.head(64).apply(lambda row: [row['winner_name'], row['loser_name']], axis=1).tolist()

In [418]:
aus_open_players = [name for sublist in aus_open_first_round for name in sublist]
missing_players = set(aus_open_players) - set(player_elos.index)

In [437]:
missing_players

set()

In [117]:
percentile05 = player_elos.quantile(0.05)
new_elos = list(percentile05)
missing_df= pd.DataFrame([new_elos, new_elos], index=list(missing_players), columns=player_elos.columns)
player_elos = pd.concat([player_elos, missing_df])

In [438]:
australian_open_elos = player_elos.loc[aus_open_players]

In [439]:
import numpy as np
import pandas as pd

def convert_elo_to_pair_probabilities(elo_df, surface='Grass', S=600):
    """
    Converts ELO scores from a DataFrame to a matrix of pairwise winning probabilities.

    Args:
    elo_df: A DataFrame containing ELO scores with player names as index and surfaces as columns.
    surface: The surface being played on ('Grass', 'Clay', 'Hard').
    S: The scaling factor for ELO scores.

    Returns:
    A DataFrame of pairwise winning probabilities with player names as both index and columns.
    """
    
    # Check if the surface column exists in the DataFrame, if not tell user it doesn't exist
    if f"{surface}_ELO" not in elo_df.columns:
        raise ValueError(f"Surface '{surface}' not found in DataFrame columns. Make sure the spelling is correct and capitalize the first letter.")

    # Extract ELO scores for the specified surface
    elo_scores = elo_df[f"{surface}_ELO"].values
    
    # Create an empty matrix to store the probabilities
    probabilities = np.zeros((len(elo_scores), len(elo_scores)))

    # Calculate the probability of each player winning against each other player
    for i in range(len(elo_scores)):
        probabilities[i, i] = 0.5  # Each player has a 50% chance against themselves
        
        for j in range(i + 1, len(elo_scores)):
            # Calculate the expected score difference
            expected_score_diff = (elo_scores[i] - elo_scores[j]) / S
            
            # Calculate the probability of player i winning
            probability_i = 1 / (1 + 10**(-expected_score_diff))
            
            # Set the probabilities for both i vs j and j vs i
            probabilities[i, j] = probability_i
            probabilities[j, i] = 1 - probability_i

    # Create a DataFrame for better visualization
    probability_df = pd.DataFrame(probabilities, index=elo_df.index, columns=elo_df.index)
    
    return probability_df

In [440]:
P = convert_elo_to_pair_probabilities(australian_open_elos, surface='Hard')

In [442]:
def matchups_gen(winners):
    match = []
    for i in range(0, len(winners), 2):
        match.append([winners[i], winners[i+1]])
    return match

# Simulates a round of the tournament with the matchups for the round, the results matrix,
# and the round number.
def simulate_round(matchups, results, round):
    winners = []
    for i,j in matchups:
        win_i = bool(np.random.uniform() < P.loc[i,j])
        if win_i:
            results.loc[j,round] += 1
            winners.append(i)
        else:
            results.loc[i,round] += 1
            winners.append(j)
    return winners

final_matrix = np.zeros((128, 8)) 

aus_open_players = [name for sublist in aus_open_first_round for name in sublist]

trials = 1000

# This is whats used for the tournament simulation. The end_matrix stores the matrix value for
# where each team ended, adding it to final_matrix.
for _ in range(trials):
    matchup_current = aus_open_first_round
    end_matrix = pd.DataFrame(np.zeros((128, 8)), index=aus_open_players[:128])
    for round in range(0, 6):
        winners = simulate_round(matchup_current, end_matrix, round)
        matchup_current = matchups_gen(winners)
    round += 1
    final_winner = simulate_round(matchup_current, end_matrix, round)
    end_matrix.loc[final_winner,7] += 1
    final_matrix = final_matrix + end_matrix


matrix_W = final_matrix/trials

# Creates the dataframe that I submit.
column_names = ["Round_64", "Round_32", "Round_16", "Round_8", "Round_4", "Round_2", "Runner_up", "Champion"]
W_data = pd.DataFrame(matrix_W, columns=column_names)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

### Probability to win tournament, without Yiming match calculations as of today

In [429]:
matrix_W[7].sort_values().tail(10)

Andrey Rublev       0.006
Hubert Hurkacz      0.007
Ben Shelton         0.013
Carlos Alcaraz      0.031
Alexander Zverev    0.041
Daniil Medvedev     0.049
Grigor Dimitrov     0.063
Jannik Sinner       0.272
Novak Djokovic      0.487
Novak Djokovic      0.487
Name: 7, dtype: float64

In [7]:
import numpy as np

def logistic(x):
    return 1 / (1 + 10**(-x))

def compute_prob_using_ELO(R_A, R_B):
    return logistic((R_A-R_B)/800)

#calculating the decayed winning probability in each set due to age factor
#we use Hill function to model the decay of winning probability in the last set due to age factor
#and we let the decay to be linear among 5 sets
def compute_prob_in_sets(winning_prob, age, age_threshold1, age_threshold2):
    if age <= age_threshold1:
        return [winning_prob for i in range(5)]
    else:
        return [winning_prob * (1 - (age - age_threshold1)**2 / ((age_threshold2 - age_threshold1)**2 + (age - age_threshold1)**2) * i/5) for i in range(5)]

def simulating_game(player_1, player_1_elo, player_1_age, player_2, player_2_elo, player_2_age):
    set_winner = []
    
    winning_prob_1 = compute_prob_using_ELO(player_1_elo, player_2_elo)
    winning_prob_2 = compute_prob_using_ELO(player_2_elo, player_1_elo)
    
    age_threshold1 = 27 #age from which endurance starts to fall
    age_threshold2 = 34 #age at which winning probability decay 50% in the last set
    
    winning_prob_1_in_sets = compute_prob_in_sets(winning_prob_1, player_1_age, age_threshold1, age_threshold2)
    winning_prob_2_in_sets = compute_prob_in_sets(winning_prob_2, player_2_age, age_threshold1, age_threshold2)
        
    for i in range(5):
        winning_prob_1_inthisset = winning_prob_1_in_sets[i] / (winning_prob_1_in_sets[i] + winning_prob_2_in_sets[i])
        if bool(np.random.uniform() < winning_prob_1_inthisset):
            set_winner.append(player_1)
        else:
            set_winner.append(player_2)
            
    if set_winner.count(player_1) >= 3:
        return player_1
    elif set_winner.count(player_2) >= 3:
        return player_2

In [10]:
player_elos['Grass_ELO'].sort_values().tail(10)

Player_Name
Alexander Bublik       1840.118497
Matteo Berrettini      1846.138773
Roman Safiullin        1871.839246
Holger Rune            1889.906946
Christopher Eubanks    1918.294123
Andrey Rublev          1956.159546
Jannik Sinner          2027.974551
Daniil Medvedev        2044.097968
Novak Djokovic         2171.019127
Carlos Alcaraz         2406.120227
Name: Grass_ELO, dtype: float64

In [64]:
simulating_game('Carlos Alcaraz', 2406.120227, 23, 'Novak Djokovic', 2171.019127, 37)

'Carlos Alcaraz'