In [1]:
# Enable Jupyter Notebook modules import
import import_ipynb

# For numerical operations and arrays
import numpy as np

# For data manipulation and analysis
import pandas as pd

# Custom tools for tennis data analysis
import Tennis_Analysis_Tools as tennis_tools

importing Jupyter notebook from Tennis_Analysis_Tools.ipynb


In [2]:
# Show the directory the kernel is currently running in
import os
os.getcwd()

'/Users/harishthota/Desktop/UOA Project/Tennis_Project/Top50 vs Top100'

In [3]:
def preprocess_dataset(betting_data_dfs):
    """
    Reduce a raw season of betting data to the columns used by the consensus model.

    Ranks are coerced to floats, a ``higher_rank_won`` indicator is derived from
    them, and the odds columns are coerced to numeric values.

    Args:
    betting_data_dfs (DataFrame): Raw match data containing at least the columns
        'Date', 'Winner', 'Loser', 'WRank', 'LRank', 'B365W', 'B365L', 'PSW', 'PSL'.

    Returns:
    DataFrame: A new DataFrame with the columns 'Date', 'Winner', 'Loser', 'WRank',
        'LRank', 'higher_rank_won', 'B365W', 'B365L', 'PSW', 'PSL' (in that order).
        The input DataFrame is left unmodified.
    """
    # Define columns to be used
    match_columns = ['Date', 'Winner', 'Loser', 'WRank', 'LRank']
    rank_columns = ['WRank', 'LRank']
    odds_columns = ['B365W', 'B365L', 'PSW', 'PSL']

    # Rank assigned to players without a usable ranking ('NR', blanks, NaN)
    unranked_rank = 100000

    # Work on an explicit copy: the caller's frame stays untouched and later
    # column assignments never write into a view (no SettingWithCopyWarning)
    preprocessed = betting_data_dfs[match_columns + odds_columns].copy()

    # Coercion turns 'NR' and any other non-numeric entry into NaN, which is then
    # replaced by the sentinel rank, so no separate 'NR' replacement is required
    for column in rank_columns:
        numeric_rank = pd.to_numeric(preprocessed[column], errors='coerce')
        preprocessed[column] = numeric_rank.fillna(unranked_rank).astype(float)

    # 1 if the better-ranked (numerically lower) player won, 0 otherwise;
    # inserted right after the rank columns to keep the established column order
    higher_rank_won = (preprocessed['WRank'] < preprocessed['LRank']).astype(int)
    preprocessed.insert(len(match_columns), 'higher_rank_won', higher_rank_won)

    # Convert odds columns to numeric, coercing errors to NaN
    preprocessed[odds_columns] = preprocessed[odds_columns].apply(pd.to_numeric, errors='coerce')

    # Return the preprocessed dataframe
    return preprocessed

In [4]:
def calculate_implied_probabilities(odds_w, odds_l):
    """
    Convert a pair of decimal betting odds into normalized implied probabilities.

    Args:
    odds_w (float): Decimal odds offered on the eventual winner.
    odds_l (float): Decimal odds offered on the eventual loser.

    Returns:
    tuple: (probability for the winner, probability for the loser), normalized so
        the two sum to 1. When either quote is missing or not strictly positive the
        pair (NaN, NaN) is returned, so the quote is skipped by NaN-aware
        aggregations instead of being treated as a near-zero probability once the
        logit transform clips it.
    """
    # Missing quotes are checked first so None/NaN never reach the numeric comparison
    if pd.isna(odds_w) or pd.isna(odds_l) or odds_w <= 0 or odds_l <= 0:
        return np.nan, np.nan

    # Calculate the probabilities from the betting odds
    implied_prob_w = 1 / odds_w  # probability for the winner from betting odds
    implied_prob_l = 1 / odds_l  # probability for the loser from betting odds

    # Calculate the overround (the sum of the probabilities of the winner and loser)
    overround = implied_prob_w + implied_prob_l

    # Normalize the probabilities so they sum to 1 (to correct for the overround)
    normalized_prob_w = implied_prob_w / overround
    normalized_prob_l = implied_prob_l / overround

    # Return the normalized probabilities
    return normalized_prob_w, normalized_prob_l

In [5]:
def logit(p, eps=1e-6):
    """
    Log-odds transform of a probability.

    The input is first clamped to the closed interval [eps, 1 - eps] so that
    probabilities of exactly 0 or 1 give a finite result.
    """
    bounded = np.clip(p, eps, 1 - eps)
    odds = bounded / (1 - bounded)
    return np.log(odds)

def inverse_logit(y):
    """
    Inverse logit (sigmoid) of a scalar log-odds value.

    Args:
    y (float): Log-odds value; may be NaN.

    Returns:
    float: The probability in [0, 1] corresponding to ``y``, or NaN if ``y`` is NaN.
    """
    # Return NaN if input is NaN
    if pd.isna(y):
        return np.nan

    # Branch on the sign so np.exp only ever receives a non-positive argument;
    # exp(y) / (1 + exp(y)) would evaluate to inf / inf = NaN for large positive y
    if y >= 0:
        return 1 / (1 + np.exp(-y))

    exp_y = np.exp(y)
    return exp_y / (1 + exp_y)

In [6]:
def preprocess_bookmakers_data(betting_data_df, bookmakers):
    """
    Add implied probabilities and logit probabilities for each bookmaker.

    For every bookmaker prefix ``X`` the columns ``XW`` and ``XL`` are read and the
    columns ``X_prob_w``, ``X_prob_l`` and ``X_logit_prob_w`` are added.

    Args:
    betting_data_df (DataFrame): The DataFrame containing the betting data.
    bookmakers (list): A list of bookmaker column prefixes to process.

    Returns:
    DataFrame: A copy of the input with the added probability and logit
        probability columns. The input DataFrame is left unmodified.
    """
    # Copy first: callers pass filtered subsets of a larger frame, and writing new
    # columns into such a subset triggers SettingWithCopyWarning
    result = betting_data_df.copy()

    for bookmaker in bookmakers:
        # Calculate win and lose probabilities match by match; collecting the pairs
        # in a list (instead of unpacking zip(*...)) also works for an empty frame
        probability_pairs = [
            calculate_implied_probabilities(odds_w, odds_l)
            for odds_w, odds_l in zip(result[f'{bookmaker}W'], result[f'{bookmaker}L'])
        ]

        result[f'{bookmaker}_prob_w'] = pd.Series(
            [pair[0] for pair in probability_pairs], index=result.index, dtype=float)
        result[f'{bookmaker}_prob_l'] = pd.Series(
            [pair[1] for pair in probability_pairs], index=result.index, dtype=float)

        # Calculate logit probabilities for the win probabilities
        result[f'{bookmaker}_logit_prob_w'] = result[f'{bookmaker}_prob_w'].apply(logit)

    return result

In [7]:
def calculate_predictions_and_probabilities(betting_data_df):
    """
    Express the consensus probabilities from the higher-ranked player's point of view.

    ``consensus_prob_w`` is the probability assigned to the actual winner. When the
    higher-ranked player won it is used as is; otherwise its complement is taken, so
    the resulting probability always refers to the higher-ranked player winning.

    Args:
    betting_data_df (DataFrame): The DataFrame containing the columns
        'consensus_prob_w' and 'higher_rank_won'.

    Returns:
    DataFrame: A copy of the input with two added columns:
        'predictions' (1 if the probability of the higher-ranked player winning
        exceeds 0.5, else 0) and 'consensus_probabilities' (that probability).
        The input DataFrame is left unmodified.
    """
    # Copy so that a filtered subset passed by the caller is never written into
    result = betting_data_df.copy()

    winner_prob = result['consensus_prob_w']

    # Keep the winner's probability where the higher-ranked player won,
    # otherwise switch to the loser's (i.e. the higher-ranked player's) probability
    higher_rank_prob = winner_prob.where(result['higher_rank_won'] == 1, 1 - winner_prob)

    # NaN probabilities compare False against the threshold and therefore predict 0
    result['predictions'] = (higher_rank_prob > 0.5).astype(int)
    result['consensus_probabilities'] = higher_rank_prob

    return result

In [8]:
def calculate_consensus_probabilities(betting_data_df, bookmakers):
    """
    Calculate the consensus logit probabilities and convert them to normal probabilities.

    Args:
    betting_data_df (DataFrame): The DataFrame containing a
        ``<bookmaker>_logit_prob_w`` column for each bookmaker.
    bookmakers (list): A list of bookmaker column prefixes to include in the consensus.

    Returns:
    DataFrame: A copy of the input with the added columns 'consensus_logit_prob_w'
        and 'consensus_prob_w'. The input DataFrame is left unmodified.
    """
    # Copy so that a filtered subset passed by the caller is never written into
    result = betting_data_df.copy()

    logit_columns = [f'{bookmaker}_logit_prob_w' for bookmaker in bookmakers]

    # Consensus = row-wise mean of the bookmakers' logits; NaN logits are skipped,
    # so a row is averaged over the bookmakers that actually have a value
    result['consensus_logit_prob_w'] = result[logit_columns].mean(axis=1, skipna=True)

    # Convert the consensus logit probabilities back to probabilities
    result['consensus_prob_w'] = result['consensus_logit_prob_w'].apply(inverse_logit)

    return result

In [9]:
# Load the betting data for the year 2019 from an Excel file.
# The folder defaults to the original location but can be redirected with the
# TENNIS_ODDS_DIR environment variable so the notebook also runs on other machines.
betting_odds_dir = os.environ.get(
    "TENNIS_ODDS_DIR",
    "/Users/harishthota/Desktop/UOA Project/Betting_Odds_Tennis",
)
betting_data_df_2019 = pd.read_excel(os.path.join(betting_odds_dir, "2019.xlsx"))

In [10]:
betting_data_df_2019

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,Lsets,Comment,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL
0,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Dimitrov G.,...,0.0,Completed,1.36,3.00,1.36,3.37,1.42,3.60,1.35,3.18
1,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Raonic M.,...,0.0,Completed,1.18,4.50,1.23,4.68,1.27,4.84,1.22,4.26
2,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Kecmanovic M.,...,0.0,Completed,1.57,2.25,1.67,2.32,1.71,2.40,1.63,2.28
3,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Millman J.,...,1.0,Completed,1.40,2.75,1.41,3.13,1.45,3.20,1.40,2.95
4,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Uchiyama Y.,...,0.0,Completed,2.62,1.44,2.73,1.51,3.26,1.53,2.69,1.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2605,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,...,1.0,Completed,1.44,2.75,1.39,3.26,1.48,3.30,1.41,2.93
2606,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,...,0.0,Completed,1.90,1.90,2.14,1.79,2.24,2.06,1.92,1.90
2607,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,...,0.0,Completed,3.50,1.30,3.75,1.33,3.75,1.40,3.39,1.33
2608,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,...,0.0,Completed,1.80,2.00,1.84,2.10,1.87,2.20,1.78,2.06


In [11]:
# Generate descriptive statistics for numerical columns in the DataFrame
betting_data_df_2019.describe()

Unnamed: 0,ATP,Best of,WRank,LRank,WPts,LPts,W1,L1,W2,L2,...,Wsets,Lsets,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL
count,2610.0,2610.0,2606.0,2597.0,2607.0,2597.0,2589.0,2589.0,2576.0,2576.0,...,2589.0,2589.0,2597.0,2597.0,2599.0,2599.0,2609.0,2609.0,2609.0,2609.0
mean,33.218774,3.388506,57.129317,77.825568,1782.094745,1156.386985,5.826574,4.202008,5.834239,4.036879,...,2.167246,0.431054,1.844926,3.159365,1.933342,3.474467,1.998605,3.678862,1.867348,3.139747
std,18.226778,0.7914,57.173452,80.276616,2055.16526,1194.664111,1.201032,1.822849,1.213551,1.830008,...,0.435385,0.56527,0.893305,3.067155,1.027109,3.597585,1.096063,4.221048,0.910771,2.668966
min,1.0,3.0,1.0,1.0,17.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.002,1.07,1.005,1.07,1.01,1.08,1.01,1.06
25%,19.0,3.0,19.0,34.0,716.0,598.0,6.0,3.0,6.0,3.0,...,2.0,0.0,1.3,1.66,1.33,1.76,1.36,1.82,1.31,1.72
50%,32.0,3.0,45.0,60.0,1022.0,875.0,6.0,4.0,6.0,4.0,...,2.0,0.0,1.57,2.3,1.64,2.44,1.67,2.51,1.6,2.34
75%,50.0,3.0,76.0,95.0,1825.0,1246.0,6.0,6.0,6.0,6.0,...,2.0,1.0,2.1,3.4,2.185,3.625,2.27,3.78,2.12,3.43
max,66.0,5.0,503.0,1491.0,12415.0,12355.0,7.0,7.0,7.0,7.0,...,3.0,2.0,9.0,41.0,11.73,37.8,12.22,67.0,9.64,28.49


In [12]:
# Retrieve and display the column names of 2019 DataFrame
betting_data_df_2019.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'PSW', 'PSL', 'MaxW', 'MaxL',
       'AvgW', 'AvgL'],
      dtype='object')

## Preprocess 2019 Tennis Betting Data for Top 50 and Top 100 Players

In [13]:
# Apply preprocessing steps to the validation dataset
betting_data_validation_preprocessed = preprocess_dataset(betting_data_df_2019)

In [14]:
# Impute missing numeric values with the column mean. Ranks were already filled
# during preprocessing, so in practice this only affects the odds columns.
means = betting_data_validation_preprocessed.select_dtypes(include=[np.number]).mean()

# Assign the result instead of filling in place: the frame comes from a column
# selection, and in-place writes on it can raise SettingWithCopyWarning
betting_data_validation_preprocessed = betting_data_validation_preprocessed.fillna(means)

In [15]:
# Fill missing values in the 2019 dataset
# NOTE(review): this repeats the fillna call of the previous cell with the same
# `means`; after that cell has run, this call has nothing left to fill.
betting_data_validation_preprocessed.fillna(means, inplace=True)

In [16]:
betting_data_validation_preprocessed.head()

Unnamed: 0,Date,Winner,Loser,WRank,LRank,higher_rank_won,B365W,B365L,PSW,PSL
0,2018-12-31,Dimitrov G.,Nishioka Y.,19.0,75.0,1,1.36,3.0,1.36,3.37
1,2018-12-31,Raonic M.,Bedene A.,18.0,67.0,1,1.18,4.5,1.23,4.68
2,2018-12-31,Kecmanovic M.,Mayer L.,131.0,56.0,0,1.57,2.25,1.67,2.32
3,2018-12-31,Millman J.,Sandgren T.,38.0,61.0,1,1.4,2.75,1.41,3.13
4,2018-12-31,Uchiyama Y.,Humbert U.,185.0,102.0,0,2.62,1.44,2.73,1.51


In [17]:
betting_data_validation_preprocessed.columns

Index(['Date', 'Winner', 'Loser', 'WRank', 'LRank', 'higher_rank_won', 'B365W',
       'B365L', 'PSW', 'PSL'],
      dtype='object')

In [18]:
# Build the top-50 and top-100 player collections from the preprocessed matches.
# NOTE(review): precompute_top_players is defined in Tennis_Analysis_Tools; its
# result is looked up by year below (top_50_players[2019]) — confirm the exact
# return type and how the cutoff is applied against that notebook.
top_50_players = tennis_tools.precompute_top_players(betting_data_validation_preprocessed, 50)
top_100_players = tennis_tools.precompute_top_players(betting_data_validation_preprocessed, 100)

In [19]:
# Flag matches in which BOTH players belong to the 2019 top-50 / top-100 collection
for flag_name, ranked_players in (
    ('Top50', top_50_players[2019]),
    ('Top100', top_100_players[2019]),
):
    betting_data_validation_preprocessed[flag_name] = betting_data_validation_preprocessed.apply(
        # Bind the collection as a default so each lambda keeps its own reference
        lambda match, pool=ranked_players: match['Winner'] in pool and match['Loser'] in pool,
        axis=1,
    )

In [20]:
# Split out the matches played between two top-50 / two top-100 players.
# The boolean flag columns are used directly as masks, and .copy() makes each
# subset an independent frame so later column additions do not write into a view.
betting_data_df_2019_top_50 = betting_data_validation_preprocessed[
    betting_data_validation_preprocessed['Top50']
].copy()
betting_data_df_2019_top_100 = betting_data_validation_preprocessed[
    betting_data_validation_preprocessed['Top100']
].copy()

## BCM (Bookmaker Consensus Model) for Top 50 Players

In [21]:
# List of bookmakers in 2019
bookmakers = ['B365', 'PS']

In [22]:
betting_data_validation_preprocessed_top50 = preprocess_bookmakers_data(betting_data_df_2019_top_50, bookmakers)

In [23]:
betting_data_validation_preprocessed_top50.tail()

Unnamed: 0,Date,Winner,Loser,WRank,LRank,higher_rank_won,B365W,B365L,PSW,PSL,Top50,Top100,B365_prob_w,B365_prob_l,B365_logit_prob_w,PS_prob_w,PS_prob_l,PS_logit_prob_w
2179,2019-09-08,Nadal R.,Medvedev D.,2.0,5.0,1,1.16,5.5,1.19,5.53,True,True,0.825826,0.174174,1.556328,0.822917,0.177083,1.536235
2400,2019-10-11,Zverev A.,Federer R.,6.0,3.0,0,2.75,1.44,3.04,1.45,True,True,0.343675,0.656325,-0.646958,0.32294,0.67706,-0.740294
2404,2019-10-13,Medvedev D.,Zverev A.,4.0,6.0,1,1.53,2.5,1.58,2.59,True,True,0.620347,0.379653,0.491023,0.621103,0.378897,0.494233
2601,2019-11-13,Nadal R.,Medvedev D.,1.0,4.0,1,2.1,1.72,2.13,1.8,True,True,0.450262,0.549738,-0.199613,0.458015,0.541985,-0.168335
2604,2019-11-14,Federer R.,Djokovic N.,3.0,2.0,0,3.2,1.36,3.12,1.42,True,True,0.298246,0.701754,-0.855666,0.312775,0.687225,-0.787176


In [24]:
betting_data_validation_preprocessed_top50 = calculate_consensus_probabilities(betting_data_validation_preprocessed_top50, bookmakers)

In [25]:
betting_data_validation_preprocessed_top50.tail()

Unnamed: 0,Date,Winner,Loser,WRank,LRank,higher_rank_won,B365W,B365L,PSW,PSL,Top50,Top100,B365_prob_w,B365_prob_l,B365_logit_prob_w,PS_prob_w,PS_prob_l,PS_logit_prob_w,consensus_logit_prob_w,consensus_prob_w
2179,2019-09-08,Nadal R.,Medvedev D.,2.0,5.0,1,1.16,5.5,1.19,5.53,True,True,0.825826,0.174174,1.556328,0.822917,0.177083,1.536235,1.546281,0.824376
2400,2019-10-11,Zverev A.,Federer R.,6.0,3.0,0,2.75,1.44,3.04,1.45,True,True,0.343675,0.656325,-0.646958,0.32294,0.67706,-0.740294,-0.693626,0.333227
2404,2019-10-13,Medvedev D.,Zverev A.,4.0,6.0,1,1.53,2.5,1.58,2.59,True,True,0.620347,0.379653,0.491023,0.621103,0.378897,0.494233,0.492628,0.620725
2601,2019-11-13,Nadal R.,Medvedev D.,1.0,4.0,1,2.1,1.72,2.13,1.8,True,True,0.450262,0.549738,-0.199613,0.458015,0.541985,-0.168335,-0.183974,0.454136
2604,2019-11-14,Federer R.,Djokovic N.,3.0,2.0,0,3.2,1.36,3.12,1.42,True,True,0.298246,0.701754,-0.855666,0.312775,0.687225,-0.787176,-0.821421,0.305462


In [26]:
betting_data_validation_preprocessed_top50 = calculate_predictions_and_probabilities(betting_data_validation_preprocessed_top50)

In [27]:
# Score the top-50 consensus model: actual outcomes ('higher_rank_won') against the
# predicted labels ('predictions') and predicted probabilities ('consensus_probabilities')
tennis_tools.evaluate_predictions(
    betting_data_validation_preprocessed_top50['higher_rank_won'],
    betting_data_validation_preprocessed_top50['predictions'],
    betting_data_validation_preprocessed_top50['consensus_probabilities'],
)

{0.6354, 0.6538, 1.2181}

## BCM (Bookmaker Consensus Model) for Top 100 Players

In [28]:
betting_data_validation_preprocessed_top100 = preprocess_bookmakers_data(betting_data_df_2019_top_100, bookmakers)

In [29]:
betting_data_validation_preprocessed_top100.tail()

Unnamed: 0,Date,Winner,Loser,WRank,LRank,higher_rank_won,B365W,B365L,PSW,PSL,Top50,Top100,B365_prob_w,B365_prob_l,B365_logit_prob_w,PS_prob_w,PS_prob_l,PS_logit_prob_w
2601,2019-11-13,Nadal R.,Medvedev D.,1.0,4.0,1,2.1,1.72,2.13,1.8,True,True,0.450262,0.549738,-0.199613,0.458015,0.541985,-0.168335
2604,2019-11-14,Federer R.,Djokovic N.,3.0,2.0,0,3.2,1.36,3.12,1.42,True,True,0.298246,0.701754,-0.855666,0.312775,0.687225,-0.787176
2605,2019-11-15,Nadal R.,Tsitsipas S.,1.0,6.0,1,1.44,2.75,1.39,3.26,False,True,0.656325,0.343675,0.646958,0.701075,0.298925,0.852423
2607,2019-11-16,Tsitsipas S.,Federer R.,6.0,3.0,0,3.5,1.3,3.75,1.33,False,True,0.270833,0.729167,-0.990399,0.261811,0.738189,-1.036577
2609,2019-11-17,Tsitsipas S.,Thiem D.,6.0,5.0,0,2.0,1.8,2.0,1.93,False,True,0.473684,0.526316,-0.105361,0.491094,0.508906,-0.035627


In [30]:
betting_data_validation_preprocessed_top100 = calculate_consensus_probabilities(betting_data_validation_preprocessed_top100, bookmakers)

In [31]:
betting_data_validation_preprocessed_top100.tail()

Unnamed: 0,Date,Winner,Loser,WRank,LRank,higher_rank_won,B365W,B365L,PSW,PSL,Top50,Top100,B365_prob_w,B365_prob_l,B365_logit_prob_w,PS_prob_w,PS_prob_l,PS_logit_prob_w,consensus_logit_prob_w,consensus_prob_w
2601,2019-11-13,Nadal R.,Medvedev D.,1.0,4.0,1,2.1,1.72,2.13,1.8,True,True,0.450262,0.549738,-0.199613,0.458015,0.541985,-0.168335,-0.183974,0.454136
2604,2019-11-14,Federer R.,Djokovic N.,3.0,2.0,0,3.2,1.36,3.12,1.42,True,True,0.298246,0.701754,-0.855666,0.312775,0.687225,-0.787176,-0.821421,0.305462
2605,2019-11-15,Nadal R.,Tsitsipas S.,1.0,6.0,1,1.44,2.75,1.39,3.26,False,True,0.656325,0.343675,0.646958,0.701075,0.298925,0.852423,0.749691,0.679111
2607,2019-11-16,Tsitsipas S.,Federer R.,6.0,3.0,0,3.5,1.3,3.75,1.33,False,True,0.270833,0.729167,-0.990399,0.261811,0.738189,-1.036577,-1.013488,0.266298
2609,2019-11-17,Tsitsipas S.,Thiem D.,6.0,5.0,0,2.0,1.8,2.0,1.93,False,True,0.473684,0.526316,-0.105361,0.491094,0.508906,-0.035627,-0.070494,0.482384


In [32]:
betting_data_validation_preprocessed_top100 = calculate_predictions_and_probabilities(betting_data_validation_preprocessed_top100)

In [33]:
# Evaluate the model's performance by comparing the true values ('higher_rank_won')
# with the predicted values ('predictions') and the predicted probabilities ('consensus_probabilities')
# NOTE(review): the same call on the top-50 data renders its result with braces
# ({0.6354, 0.6538, 1.2181}), which suggests evaluate_predictions may return a set.
# If so, the unpacking order below is not guaranteed and the three names could be
# bound to the wrong metrics — confirm the return type in Tennis_Analysis_Tools.
accuracy1, calibration1, log_loss = tennis_tools.evaluate_predictions(
    betting_data_validation_preprocessed_top100 ['higher_rank_won'], 
    betting_data_validation_preprocessed_top100 ['predictions'], 
    betting_data_validation_preprocessed_top100 ['consensus_probabilities']
)

In [34]:
print(f"Metrics for BCM Model: \nAccuracy : {accuracy1}, Calibration : {calibration1}, LogLoss: {log_loss}")

Metrics for BCM Model: 
Accuracy : 0.6491, Calibration : 1.2157, LogLoss: 0.6619
