### Import necessary libraries

In [1]:
import numpy as np  
import pandas as pd  
import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D  
from sklearn.preprocessing import OneHotEncoder  

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

import import_ipynb
import Tennis_Analysis_Tools as tennis_tools

importing Jupyter notebook from Tennis_Analysis_Tools.ipynb


### Function for preprocessing dataset

In [2]:
def preprocess_dataset(betting_data_dfs, unranked_rank=100000):
    """
    Preprocess the betting dataset to prepare it for analysis.

    This function:
    - Filters the dataset to include only matches marked as 'Completed'.
    - Converts 'WRank' and 'LRank' to numeric, turning non-numeric entries into NaN.
    - Fills NaN ranks with `unranked_rank`, representing unranked players.
    - Strips leading/trailing whitespace from the 'Winner' and 'Loser' names.
    - Creates 'higher_rank_won': 1 when WRank < LRank, otherwise 0 (equal ranks,
      e.g. two unranked players, therefore yield 0).
    - Keeps only the essential columns for analysis.

    The input DataFrame is never modified; the result is a new, independent
    DataFrame with a fresh 0..n-1 index, so callers can safely add columns to it.

    Parameters:
        betting_data_dfs (DataFrame): The dataset containing betting data. Must have the
            columns 'Comment', 'Date', 'Winner', 'Loser', 'WRank' and 'LRank'.
        unranked_rank (float): Rank assigned to players with a missing or non-numeric
            rank. Defaults to 100000.

    Returns:
        DataFrame: Columns 'Date', 'Winner', 'Loser', 'WRank', 'LRank', 'higher_rank_won'
        (in that order) for the completed matches.
    """

    # Select completed matches and materialise them as a new frame (reset_index returns a
    # new object), instead of assigning columns onto a filtered slice of the input.
    is_completed = betting_data_dfs['Comment'].isin(['Completed'])
    completed = betting_data_dfs.loc[is_completed].reset_index(drop=True)

    def _clean_rank(column):
        # Non-numeric entries become NaN, then NaN becomes the unranked placeholder.
        numeric = pd.to_numeric(completed[column], errors='coerce')
        return numeric.fillna(unranked_rank).astype(float)

    def _clean_name(value):
        # Ratings are keyed by the exact name string, so "Nadal R. " and "Nadal R." would
        # be tracked as two different players. Non-string values are passed through.
        # NOTE(review): stray whitespace is the presumed cause of name variants in the
        # raw files -- confirm against the source data.
        return value.strip() if isinstance(value, str) else value

    winner_rank = _clean_rank('WRank')
    loser_rank = _clean_rank('LRank')

    # Build the result from scratch; dict order fixes the output column order.
    betting_data_filtered = pd.DataFrame({
        'Date': completed['Date'],
        'Winner': completed['Winner'].map(_clean_name),
        'Loser': completed['Loser'].map(_clean_name),
        'WRank': winner_rank,
        'LRank': loser_rank,
        # A numerically smaller rank is the better (higher) rank.
        'higher_rank_won': (winner_rank < loser_rank).astype(int),
    })

    return betting_data_filtered

### Functions related to ELO

In [3]:
def calculate_k(m_i, delta=100, nu=5, sigma=0.1):
    """
    Return the dynamic K-factor ``delta / (m_i + nu) ** sigma``.

    Parameters:
        m_i: Match count fed into the formula; a larger count gives a smaller K
            whenever sigma is positive.
        delta: Numerator of the formula (scales K overall). Defaults to 100.
        nu: Offset added to the match count before exponentiation. Defaults to 5.
        sigma: Exponent applied to the offset match count. Defaults to 0.1.

    Returns:
        The K-factor as computed by the formula above.
    """
    damping = (m_i + nu) ** sigma
    return delta / damping

In [4]:
def update_elo(winner_elo, loser_elo, games_played_winner, games_played_loser, delta, nu, sigma):
    """
    Compute post-match Elo ratings for the winner and loser of a single match.

    Each player gets their own K-factor from calculate_k, based on their own
    games-played count, so the two rating changes are generally not symmetric.

    Parameters:
        winner_elo, loser_elo: Pre-match ratings of the winner and the loser.
        games_played_winner, games_played_loser: Match counts passed to calculate_k
            for the winner and the loser respectively.
        delta, nu, sigma: K-factor parameters, forwarded unchanged to calculate_k.

    Returns:
        tuple: (winner_elo, new winner rating, loser_elo, new loser rating,
        pre-match probability that the winner would win).
    """
    # Probability the pre-match ratings gave to the eventual winner (logistic, base 10,
    # scale 400)
    expected_winner = 1 / (1 + 10 ** ((loser_elo - winner_elo) / 400))

    # Player-specific K-factors
    k_for_winner = calculate_k(games_played_winner, delta, nu, sigma)
    k_for_loser = calculate_k(games_played_loser, delta, nu, sigma)

    # The winner gains K * (1 - expected); the loser's change K * (expected - 1) is <= 0
    new_winner_elo = winner_elo + k_for_winner * (1 - expected_winner)
    new_loser_elo = loser_elo + k_for_loser * (expected_winner - 1)

    return (winner_elo, new_winner_elo, loser_elo, new_loser_elo, expected_winner)

In [5]:
def calculate_elo_with_varied_k(betting_data_dfs_preprocessed, delta, nu, sigma):
    """
    Walk the matches in row order, update Elo ratings with a dynamic K-factor, and record
    the per-match ratings and predictions on the input DataFrame.

    Every player starts at 1500 with zero games played. For each match, both players'
    game counts are incremented and the ratings are updated through update_elo.

    The DataFrame is modified IN PLACE. These float columns are added (or overwritten),
    in this order:
        winner_elo_before_varied_k, winner_elo_after_varied_k,
        loser_elo_before_varied_k, loser_elo_after_varied_k,
        prob_winner_varied_k      -- pre-match probability given to the actual winner,
        match_outcome_varied_k    -- the Elo *prediction*: 1.0 if the higher-ranked player
                                     was given more than a 50% chance, else 0.0,
        prob_high_ranked_varied_k -- pre-match probability given to the higher-ranked player.

    When 'higher_rank_won' is not 1 the loser is treated as the higher-ranked player;
    this includes matches where both ranks are equal.

    Parameters:
        betting_data_dfs_preprocessed (DataFrame): Needs the columns 'Winner', 'Loser' and
            'higher_rank_won'. Rows are processed in their existing order, so they should
            already be chronological.
        delta, nu, sigma: K-factor parameters, forwarded to update_elo.

    Returns:
        None. Results are written to the columns listed above.
    """
    matches = betting_data_dfs_preprocessed

    # Initialize Elo scores and games-played counters for every player seen in the data
    players_ids = pd.concat([matches['Winner'], matches['Loser']]).unique()
    elo_scores_varied_k = {player: 1500 for player in players_ids}
    games_played = {player: 0 for player in players_ids}

    # Per-row results are collected here and written once after the loop. Assigning whole
    # columns positionally avoids one label-based .at write per cell, which is slow and
    # would hit the wrong rows if the index were not unique.
    recorded = {
        'winner_elo_before_varied_k': [],
        'winner_elo_after_varied_k': [],
        'loser_elo_before_varied_k': [],
        'loser_elo_after_varied_k': [],
        'prob_winner_varied_k': [],
        'match_outcome_varied_k': [],
        'prob_high_ranked_varied_k': [],
    }

    for winner_id, loser_id, higher_rank_won in zip(
            matches['Winner'], matches['Loser'], matches['higher_rank_won']):
        # NOTE(review): the counters are incremented before the K-factor is computed, so
        # K reflects the current match as already played -- confirm this is intended.
        games_played[winner_id] += 1
        games_played[loser_id] += 1

        winner_elo_before = elo_scores_varied_k[winner_id]
        loser_elo_before = elo_scores_varied_k[loser_id]

        # update_elo already returns the pre-match win probability of the winner, so it
        # is reused here rather than recomputed with a duplicate formula.
        _, winner_elo_after, _, loser_elo_after, prob_winner = update_elo(
            winner_elo_before, loser_elo_before,
            games_played[winner_id], games_played[loser_id],
            delta, nu, sigma)

        elo_scores_varied_k[winner_id], elo_scores_varied_k[loser_id] = winner_elo_after, loser_elo_after

        # Re-express the probability from the higher-ranked player's point of view
        if higher_rank_won == 1:
            prob_high_ranked = prob_winner
        else:
            prob_high_ranked = 1 - prob_winner

        recorded['winner_elo_before_varied_k'].append(winner_elo_before)
        recorded['winner_elo_after_varied_k'].append(winner_elo_after)
        recorded['loser_elo_before_varied_k'].append(loser_elo_before)
        recorded['loser_elo_after_varied_k'].append(loser_elo_after)
        recorded['prob_winner_varied_k'].append(prob_winner)
        recorded['match_outcome_varied_k'].append(int(prob_high_ranked > 0.5))
        recorded['prob_high_ranked_varied_k'].append(prob_high_ranked)

    # Write back as plain float arrays: positional (no index alignment), and float dtype
    # for every column, matching what the previous cell-by-cell writes produced.
    for column_name, values in recorded.items():
        matches[column_name] = pd.Series(values, dtype=float).to_numpy()



### Loading the tennis betting data

In [6]:
# Per-season container: maps each year to the DataFrame loaded for that year
betting_data_dfs = dict()

In [7]:
# Load one Excel workbook per season, 2005 through 2019 inclusive
for current_year in range(2005, 2020):
    # Seasons before 2013 use the .xls extension; 2013 onwards use .xlsx
    if current_year < 2013:
        file_extension = 'xls'
    else:
        file_extension = 'xlsx'

    # NOTE(review): absolute, machine-specific location -- the notebook only runs where
    # this directory exists
    file_path = f"/Users/harishthota/Desktop/UOA Project/Betting_Odds_Tennis/{current_year}.{file_extension}"

    # Key each season's DataFrame by its year
    betting_data_dfs[current_year] = pd.read_excel(file_path)

In [8]:
# Spot-check the most recent season (2019) to confirm the workbook loaded as expected
betting_data_dfs[2019]

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,Lsets,Comment,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL
0,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Dimitrov G.,...,0.0,Completed,1.36,3.00,1.36,3.37,1.42,3.60,1.35,3.18
1,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Raonic M.,...,0.0,Completed,1.18,4.50,1.23,4.68,1.27,4.84,1.22,4.26
2,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Kecmanovic M.,...,0.0,Completed,1.57,2.25,1.67,2.32,1.71,2.40,1.63,2.28
3,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Millman J.,...,1.0,Completed,1.40,2.75,1.41,3.13,1.45,3.20,1.40,2.95
4,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Uchiyama Y.,...,0.0,Completed,2.62,1.44,2.73,1.51,3.26,1.53,2.69,1.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2605,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,...,1.0,Completed,1.44,2.75,1.39,3.26,1.48,3.30,1.41,2.93
2606,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,...,0.0,Completed,1.90,1.90,2.14,1.79,2.24,2.06,1.92,1.90
2607,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,...,0.0,Completed,3.50,1.30,3.75,1.33,3.75,1.40,3.39,1.33
2608,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,...,0.0,Completed,1.80,2.00,1.84,2.10,1.87,2.20,1.78,2.06


In [9]:
# Combine the per-season DataFrames into a single DataFrame with a fresh 0..n-1 index.
# Seasons are concatenated in ascending year order so rows stay chronological across
# files, whatever order the dictionary was filled in.
# This cell rebinds `betting_data_dfs` from a dict to a DataFrame; the isinstance guard
# makes it safe to re-run, because a second run would otherwise fail on `.values()`.
if isinstance(betting_data_dfs, dict):
    betting_data_dfs = pd.concat(
        [betting_data_dfs[year] for year in sorted(betting_data_dfs)],
        ignore_index=True,
    )

### Analyzing the tennis betting data

In [10]:
# Summarize columns, non-null counts, and dtypes; a non-null count below the row count
# marks a column with missing values
betting_data_dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40390 entries, 0 to 40389
Data columns (total 48 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         40390 non-null  int64         
 1   Location    40390 non-null  object        
 2   Tournament  40390 non-null  object        
 3   Date        40390 non-null  datetime64[ns]
 4   Series      40390 non-null  object        
 5   Court       40390 non-null  object        
 6   Surface     40390 non-null  object        
 7   Round       40390 non-null  object        
 8   Best of     40390 non-null  int64         
 9   Winner      40390 non-null  object        
 10  Loser       40390 non-null  object        
 11  WRank       40375 non-null  float64       
 12  LRank       40303 non-null  float64       
 13  WPts        38701 non-null  float64       
 14  LPts        38631 non-null  float64       
 15  W1          40155 non-null  float64       
 16  L1          40157 non-

In [11]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
betting_data_dfs.describe()

Unnamed: 0,ATP,Best of,WRank,LRank,WPts,LPts,W1,L1,W4,L4,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
count,40390.0,40390.0,40375.0,40303.0,38701.0,38631.0,40155.0,40157.0,3647.0,3647.0,...,10671.0,10671.0,28131.0,28142.0,15572.0,15579.0,25354.0,25354.0,25354.0,25354.0
mean,32.974944,3.378311,57.801536,90.38486,1828.537195,1054.728379,5.801992,4.075155,5.783384,3.865643,...,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.99861,7.847864,1.838168,3.547658
std,18.006138,0.783274,72.735132,115.423997,2278.996487,1212.422674,1.232787,1.841617,1.262227,1.903181,...,0.996238,3.646316,1.031691,3.075889,1.004273,3.27251,1.582432,376.24683,1.089277,3.22777
min,1.0,3.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.01,1.02,1.0,1.0,1.0,1.01,1.01,1.01,1.01,1.01
25%,19.0,3.0,16.0,34.0,645.0,502.0,6.0,3.0,6.0,2.0,...,1.24,1.75,1.25,1.73,1.22,1.73,1.3,1.84,1.25,1.74
50%,33.0,3.0,40.0,64.0,1010.0,745.0,6.0,4.0,6.0,4.0,...,1.5,2.5,1.5,2.5,1.5,2.63,1.58,2.75,1.51,2.53
75%,49.0,3.0,75.0,102.0,1890.0,1150.0,6.0,6.0,6.0,6.0,...,2.03,3.85,2.0,4.0,2.0,4.0,2.21,4.47,2.07,3.91
max,67.0,5.0,1890.0,2159.0,16950.0,16950.0,7.0,7.0,7.0,7.0,...,18.0,60.0,26.0,51.0,19.0,81.0,76.0,42586.0,23.45,36.44


In [12]:
# List every column name in the combined DataFrame
betting_data_dfs.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'CBW', 'CBL', 'EXW', 'EXL', 'IWW',
       'IWL', 'PSW', 'PSL', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL', 'MaxW',
       'MaxL', 'AvgW', 'AvgL'],
      dtype='object')

In [13]:
### Preprocessing the betting data and inspecting the result

In [14]:
# Apply preprocessing: keep completed matches only, clean the rank columns, add
# 'higher_rank_won', and drop every non-essential column
betting_data_dfs_preprocessed = preprocess_dataset(betting_data_dfs) 

# Display the preprocessed DataFrame to verify the applied transformations
betting_data_dfs_preprocessed

Unnamed: 0,Date,Winner,Loser,WRank,LRank,higher_rank_won
0,2005-01-03,Saulnier C.,Baccanello P.,53.0,324.0,1
1,2005-01-03,Enqvist T.,Sluiter R.,72.0,82.0,1
2,2005-01-03,Melzer J.,Berdych T.,39.0,45.0,1
3,2005-01-03,Rochus O.,Dupuis A.,66.0,79.0,1
4,2005-01-03,Mayer F.,Arthurs W.,35.0,101.0,1
...,...,...,...,...,...,...
38842,2019-11-15,Nadal R.,Tsitsipas S.,1.0,6.0,1
38843,2019-11-15,Zverev A.,Medvedev D.,7.0,4.0,0
38844,2019-11-16,Tsitsipas S.,Federer R.,6.0,3.0,0
38845,2019-11-16,Thiem D.,Zverev A.,5.0,7.0,1


In [15]:
# Check the preprocessed frame's dtypes and non-null counts (ranks were filled, so no
# column should report missing values)
betting_data_dfs_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38847 entries, 0 to 38846
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Date             38847 non-null  datetime64[ns]
 1   Winner           38847 non-null  object        
 2   Loser            38847 non-null  object        
 3   WRank            38847 non-null  float64       
 4   LRank            38847 non-null  float64       
 5   higher_rank_won  38847 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 1.8+ MB


### Evaluating Elo Rating Predictions for Top-Ranked Tennis Players

In [16]:
# Calculate Elo ratings with the current set of parameters.
# The function returns nothing: it adds the *_varied_k columns to the DataFrame in place.
calculate_elo_with_varied_k(betting_data_dfs_preprocessed, delta = 195, nu = 30, sigma = 0.45)

# Filter the data for validation based on date, considering matches after December 31, 2018.
# .copy() gives the validation set its own data, so the column assignment below writes to
# an independent frame rather than to a slice of the full dataset.
betting_data_df_validation_preprocessed = betting_data_dfs_preprocessed.loc[
    betting_data_dfs_preprocessed['Date'] > '2018-12-31'
].copy()

# Ensure 'match_outcome_varied_k' is of integer type for consistency in calculations
# (the Elo routine stores it as float)
betting_data_df_validation_preprocessed["match_outcome_varied_k"] = (
    betting_data_df_validation_preprocessed["match_outcome_varied_k"].astype(int)
)

# Validation subsets: matches in which at least one of the two players is ranked in the
# top 50 / top 100
betting_data_df_validation_top_50 = betting_data_df_validation_preprocessed[
    (betting_data_df_validation_preprocessed['WRank'] <= 50)
    | (betting_data_df_validation_preprocessed['LRank'] <= 50)
]
betting_data_df_validation_top_100 = betting_data_df_validation_preprocessed[
    (betting_data_df_validation_preprocessed['WRank'] <= 100)
    | (betting_data_df_validation_preprocessed['LRank'] <= 100)
]

In [17]:
# Inspect the validation rows together with the Elo columns computed in the previous cell
betting_data_df_validation_preprocessed

Unnamed: 0,Date,Winner,Loser,WRank,LRank,higher_rank_won,winner_elo_before_varied_k,winner_elo_after_varied_k,loser_elo_before_varied_k,loser_elo_after_varied_k,prob_winner_varied_k,match_outcome_varied_k,prob_high_ranked_varied_k
36322,2019-01-01,Kudla D.,Fritz T.,63.0,49.0,0,1533.076770,1546.293803,1628.818784,1615.212270,0.365601,1,0.634399
36323,2019-01-01,Chardy J.,Struff J.L.,40.0,57.0,1,1639.334645,1645.421376,1650.227395,1641.403305,0.484329,0,0.484329
36324,2019-01-01,Murray A.,Duckworth J.,240.0,234.0,0,2017.974656,2018.334079,1454.575944,1453.627529,0.962427,0,0.037573
36325,2019-01-01,Kyrgios N.,Harrison R.,35.0,62.0,1,1864.517559,1867.725480,1604.897654,1602.071753,0.816752,1,0.816752
36326,2019-01-01,Tsonga J.W.,Kokkinakis T.,239.0,146.0,0,1840.585047,1842.342498,1553.412821,1549.234433,0.839309,0,0.160691
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38842,2019-11-15,Nadal R.,Tsitsipas S.,1.0,6.0,1,1499.705061,1522.328029,1541.288639,1518.665671,0.440441,0,0.440441
38843,2019-11-15,Zverev A.,Medvedev D.,7.0,4.0,0,1500.294939,1518.101933,1458.711361,1440.904366,0.559559,0,0.440441
38844,2019-11-16,Tsitsipas S.,Federer R.,6.0,3.0,0,1518.665671,1541.876910,1576.074646,1554.253138,0.418126,1,0.581874
38845,2019-11-16,Thiem D.,Zverev A.,5.0,7.0,1,1525.251539,1544.786392,1518.101933,1498.567080,0.510288,1,0.510288


In [18]:
# Score the Elo predictions on each validation subset. The three columns are passed
# positionally, in this order: actual result for the higher-ranked player, predicted
# result, and predicted win probability of the higher-ranked player.
accuracy_top50, calibration_top50, log_loss_top50 = tennis_tools.evaluate_predictions(
    *(
        betting_data_df_validation_top_50[column]
        for column in ("higher_rank_won", "match_outcome_varied_k", "prob_high_ranked_varied_k")
    )
)
accuracy_top100, calibration_top100, log_loss_top100 = tennis_tools.evaluate_predictions(
    *(
        betting_data_df_validation_top_100[column]
        for column in ("higher_rank_won", "match_outcome_varied_k", "prob_high_ranked_varied_k")
    )
)


### Metrics for the 538 model on matches involving a top-50 player

In [19]:
# Report the top-50 validation metrics; \033[1m ... \033[0m are ANSI escape codes that
# switch bold text on and off around each label
print("\033[1mAccuracy for Top 50 players:\033[0m", accuracy_top50)
print("\033[1mCalibration error for Top 50 players:\033[0m", calibration_top50)
print("\033[1mLog loss for Top 50 players:\033[0m", log_loss_top50)

[1mAccuracy for Top 50 players:[0m 0.6529
[1mCalibration error for Top 50 players:[0m 1.0521
[1mLog loss for Top 50 players:[0m 0.6104


### Metrics for the 538 model on matches involving a top-100 player

In [20]:
# Report the top-100 validation metrics; \033[1m ... \033[0m are ANSI escape codes that
# switch bold text on and off around each label
print("\033[1mAccuracy for Top 100 players:\033[0m", accuracy_top100)
print("\033[1mCalibration error for Top 100 players:\033[0m", calibration_top100)
print("\033[1mLog loss for Top 100 players:\033[0m", log_loss_top100)

[1mAccuracy for Top 100 players:[0m 0.6342
[1mCalibration error for Top 100 players:[0m 1.0387
[1mLog loss for Top 100 players:[0m 0.6292
