In [1]:
# Import necessary libraries
import numpy as np                  
import pandas as pd                 
import matplotlib.pyplot as plt     
from sklearn.preprocessing import OneHotEncoder 

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

In [2]:
def preprocess_dataset(betting_data_dfs, unranked_rank=100000):
    """Build the modelling frame (ranks, outcome flag, bookmaker odds) from raw match data.

    The input frame is left untouched: all conversions are done on a copy, so
    the function can be re-run on the same raw data with identical results.

    Parameters
    ----------
    betting_data_dfs : pandas.DataFrame
        Raw match data. Must contain 'WRank', 'LRank' and the odds columns
        'B365W', 'B365L', 'PSW', 'PSL'.
    unranked_rank : float, default 100000
        Rank substituted for missing or non-numeric ranks (e.g. 'NR').

    Returns
    -------
    pandas.DataFrame
        New frame with columns 'WRank', 'LRank', 'higher_rank_won' followed by
        the odds columns. Ranks are floats; odds that cannot be parsed are NaN.
    """
    rank_columns = ['WRank', 'LRank']
    odds_columns = ['B365W', 'B365L', 'PSW', 'PSL']

    # Explicit copy: avoids writing into the caller's frame and avoids
    # assigning into a slice of it (SettingWithCopyWarning).
    betting_data_filtered = betting_data_dfs[rank_columns + odds_columns].copy()

    # errors='coerce' turns every non-numeric marker (including 'NR') into NaN,
    # which is then replaced by the sentinel rank for unranked players.
    for column in rank_columns:
        numeric_rank = pd.to_numeric(betting_data_filtered[column], errors='coerce')
        betting_data_filtered[column] = numeric_rank.fillna(unranked_rank).astype(float)

    # 1 when the winner's rank number is strictly lower (i.e. better) than the
    # loser's, otherwise 0 (ties, including two unranked players, count as 0).
    betting_data_filtered['higher_rank_won'] = (
        betting_data_filtered['WRank'] < betting_data_filtered['LRank']
    ).astype(int)

    # Odds may arrive as text; unparseable entries become NaN.
    betting_data_filtered[odds_columns] = betting_data_filtered[odds_columns].apply(
        pd.to_numeric, errors='coerce'
    )

    # Keep the column order callers already rely on.
    return betting_data_filtered[rank_columns + ['higher_rank_won'] + odds_columns]

In [3]:
def calculate_implied_probabilities(odds_w, odds_l):
    """Convert a winner/loser pair of odds into probabilities that sum to 1.

    Each probability is the reciprocal of its odds, divided by the sum of both
    reciprocals (the overround), which removes the bookmaker margin.

    Parameters
    ----------
    odds_w, odds_l : scalar
        Odds for the winner and the loser of a match.

    Returns
    -------
    tuple
        (prob_w, prob_l). When either odds value is zero or missing the pair
        (0, 0) is returned instead.

    NOTE(review): the (0, 0) sentinel is later clipped to eps by logit() in
    this notebook, so missing odds become a strongly negative logit rather than
    NaN. Confirm that this is the intended treatment of missing bookmakers.
    """
    odds_pair = (odds_w, odds_l)

    # Guard clause: unusable odds short-circuit to the sentinel pair
    if any(odds == 0 for odds in odds_pair) or any(pd.isna(odds) for odds in odds_pair):
        return 0, 0

    # Raw implied probabilities are the reciprocals of the odds
    raw_w, raw_l = (1 / odds for odds in odds_pair)

    # Their sum exceeds 1 by the bookmaker margin; dividing by it normalises
    booksum = raw_w + raw_l
    return raw_w / booksum, raw_l / booksum

In [4]:
def logit(p, eps=1e-6):
    """Return the log-odds log(p / (1 - p)) of a probability.

    The input is first bounded to [eps, 1 - eps] so that probabilities of
    exactly 0 or 1 give finite values instead of -inf / +inf.
    """
    lower_bound, upper_bound = eps, 1 - eps
    bounded = np.clip(p, lower_bound, upper_bound)

    # Odds in favour, then their natural logarithm
    odds_in_favour = bounded / (1 - bounded)
    return np.log(odds_in_favour)

def inverse_logit(y):
    """Map a log-odds value back to a probability (logistic sigmoid).

    Parameters
    ----------
    y : scalar
        Log-odds value. NaN is passed through unchanged.

    Returns
    -------
    float
        Probability in [0, 1], or NaN when ``y`` is NaN.
    """
    # Missing input stays missing
    if pd.isna(y):
        return np.nan

    # Use the algebraically equivalent form whose exponent is never positive,
    # so np.exp cannot overflow. The naive exp(y) / (1 + exp(y)) evaluates to
    # inf / inf = NaN once y is large (roughly y > 709 for float64).
    if y >= 0:
        return 1 / (1 + np.exp(-y))

    growth = np.exp(y)
    return growth / (1 + growth)

In [5]:
def accuracy(actual, predictions):
    """Fraction of positions where prediction equals actual, rounded to 4 decimals."""
    # Element-wise match flags; their mean is the hit rate
    is_hit = actual == predictions
    return np.round(np.mean(is_hit), 4)

def calibration(actual, predictions):
    """Ratio of total predicted value to total actual value, rounded to 4 decimals.

    A result of 1 means the predictions sum to exactly the observed total;
    above 1 they sum to more, below 1 to less.
    """
    predicted_total = np.sum(predictions)
    observed_total = np.sum(actual)
    return np.round(predicted_total / observed_total, 4)

def logloss(actual, predictions):
    """Binary log loss of probability predictions, rounded to 4 decimals.

    Predictions are bounded to [1e-15, 1 - 1e-15] first so that neither
    log(p) nor log(1 - p) is evaluated at zero.
    """
    tiny = 1e-15
    bounded = np.clip(predictions, tiny, 1 - tiny)

    # Per-observation log-likelihood, split by outcome
    when_positive = actual * np.log(bounded)
    when_negative = (1 - actual) * np.log(1 - bounded)
    total_log_likelihood = np.sum(when_positive + when_negative)

    # Negative average log-likelihood
    loss = -(1 / len(actual)) * total_log_likelihood
    return np.round(loss, 4)

In [6]:
# Score a set of predictions with accuracy, calibration and log loss
def evaluate_predictions(actual_outcomes, binary_predictions, probability_predictions):
    """Collect the three evaluation metrics into one dictionary.

    Accuracy compares the actual outcomes with the binary predictions;
    calibration and log loss compare them with the probability predictions.

    Returns
    -------
    dict
        Keys 'accuracy', 'calibration' and 'log_loss'.
    """
    metrics = {}
    metrics['accuracy'] = accuracy(actual_outcomes, binary_predictions)
    metrics['calibration'] = calibration(actual_outcomes, probability_predictions)
    metrics['log_loss'] = logloss(actual_outcomes, probability_predictions)
    return metrics

In [7]:
# Load the 2019 betting data from an Excel workbook (path is relative to the
# notebook's working directory)
betting_data_df_2019 = pd.read_excel("Betting_Odds_Tennis/2019.xlsx")

In [8]:
# Display the raw 2019 frame (pandas truncates the rendering of long/wide frames)
betting_data_df_2019

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,Lsets,Comment,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL
0,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Dimitrov G.,...,0.0,Completed,1.36,3.00,1.36,3.37,1.42,3.60,1.35,3.18
1,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Raonic M.,...,0.0,Completed,1.18,4.50,1.23,4.68,1.27,4.84,1.22,4.26
2,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Kecmanovic M.,...,0.0,Completed,1.57,2.25,1.67,2.32,1.71,2.40,1.63,2.28
3,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Millman J.,...,1.0,Completed,1.40,2.75,1.41,3.13,1.45,3.20,1.40,2.95
4,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Uchiyama Y.,...,0.0,Completed,2.62,1.44,2.73,1.51,3.26,1.53,2.69,1.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2605,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,...,1.0,Completed,1.44,2.75,1.39,3.26,1.48,3.30,1.41,2.93
2606,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,...,0.0,Completed,1.90,1.90,2.14,1.79,2.24,2.06,1.92,1.90
2607,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,...,0.0,Completed,3.50,1.30,3.75,1.33,3.75,1.40,3.39,1.33
2608,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,...,0.0,Completed,1.80,2.00,1.84,2.10,1.87,2.20,1.78,2.06


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the 'count' row shows how many non-missing values each column has
betting_data_df_2019.describe()

Unnamed: 0,ATP,Best of,WRank,LRank,WPts,LPts,W1,L1,W2,L2,...,Wsets,Lsets,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL
count,2610.0,2610.0,2606.0,2597.0,2607.0,2597.0,2589.0,2589.0,2576.0,2576.0,...,2589.0,2589.0,2597.0,2597.0,2599.0,2599.0,2609.0,2609.0,2609.0,2609.0
mean,33.218774,3.388506,57.129317,77.825568,1782.094745,1156.386985,5.826574,4.202008,5.834239,4.036879,...,2.167246,0.431054,1.844926,3.159365,1.933342,3.474467,1.998605,3.678862,1.867348,3.139747
std,18.226778,0.7914,57.173452,80.276616,2055.16526,1194.664111,1.201032,1.822849,1.213551,1.830008,...,0.435385,0.56527,0.893305,3.067155,1.027109,3.597585,1.096063,4.221048,0.910771,2.668966
min,1.0,3.0,1.0,1.0,17.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.002,1.07,1.005,1.07,1.01,1.08,1.01,1.06
25%,19.0,3.0,19.0,34.0,716.0,598.0,6.0,3.0,6.0,3.0,...,2.0,0.0,1.3,1.66,1.33,1.76,1.36,1.82,1.31,1.72
50%,32.0,3.0,45.0,60.0,1022.0,875.0,6.0,4.0,6.0,4.0,...,2.0,0.0,1.57,2.3,1.64,2.44,1.67,2.51,1.6,2.34
75%,50.0,3.0,76.0,95.0,1825.0,1246.0,6.0,6.0,6.0,6.0,...,2.0,1.0,2.1,3.4,2.185,3.625,2.27,3.78,2.12,3.43
max,66.0,5.0,503.0,1491.0,12415.0,12355.0,7.0,7.0,7.0,7.0,...,3.0,2.0,9.0,41.0,11.73,37.8,12.22,67.0,9.64,28.49


In [10]:
# List the column names of the raw 2019 frame
betting_data_df_2019.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'PSW', 'PSL', 'MaxW', 'MaxL',
       'AvgW', 'AvgL'],
      dtype='object')

## Preprocessing Tennis Betting Data

In [11]:
# Build the validation frame from the raw 2019 data: numeric ranks, the
# 'higher_rank_won' outcome flag and the B365 / PS odds columns
betting_data_preprocessed_validation = preprocess_dataset(betting_data_df_2019)

In [12]:
# Per-column means of the numeric columns, used to fill missing numeric values
# in the raw 2019 frame in place.
# NOTE(review): this runs after preprocess_dataset, and
# betting_data_preprocessed_validation is a separate frame, so this imputation
# does not appear to reach the model inputs. Confirm whether it is still needed.
means = betting_data_df_2019.select_dtypes(include=[np.number]).mean()
betting_data_df_2019.fillna(means, inplace=True)

In [13]:
# Fill missing values in the raw 2019 frame with the column means.
# NOTE(review): this repeats the fillna call from the previous cell with the
# same `means`, so it looks redundant. Confirm and consider removing the cell.
betting_data_df_2019.fillna(means, inplace=True)

In [14]:
# Preview the first rows of the preprocessed validation frame
betting_data_preprocessed_validation.head()

Unnamed: 0,WRank,LRank,higher_rank_won,B365W,B365L,PSW,PSL
0,19.0,75.0,1,1.36,3.0,1.36,3.37
1,18.0,67.0,1,1.18,4.5,1.23,4.68
2,131.0,56.0,0,1.57,2.25,1.67,2.32
3,38.0,61.0,1,1.4,2.75,1.41,3.13
4,185.0,102.0,0,2.62,1.44,2.73,1.51


In [15]:
# Columns kept by preprocessing: ranks, outcome flag and bookmaker odds
betting_data_preprocessed_validation.columns

Index(['WRank', 'LRank', 'higher_rank_won', 'B365W', 'B365L', 'PSW', 'PSL'], dtype='object')

## BCM (Bookmaker Consensus Model)

In [16]:
# Bookmaker prefixes used for 2019. Each prefix must have matching odds columns
# '<prefix>W' (winner) and '<prefix>L' (loser) in the preprocessed frame.
bookmakers = ['B365', 'PS']

In [17]:
# For every bookmaker, derive normalised win/loss probabilities from its odds
# and the logit of the win probability, stored as new columns on the 2019 frame

for bookmaker in bookmakers:
    # Row-wise conversion of this bookmaker's winner/loser odds into one
    # (prob_w, prob_l) pair per match
    probability_pairs = betting_data_preprocessed_validation.apply(
        lambda row: calculate_implied_probabilities(row[f'{bookmaker}W'], row[f'{bookmaker}L']),
        axis=1,
    )

    # Transpose the per-match pairs into one sequence per outcome
    win_probabilities, loss_probabilities = zip(*probability_pairs)
    betting_data_preprocessed_validation[f'{bookmaker}_prob_w'] = win_probabilities
    betting_data_preprocessed_validation[f'{bookmaker}_prob_l'] = loss_probabilities

    # Log-odds of this bookmaker's win probability
    win_probability_column = betting_data_preprocessed_validation[f'{bookmaker}_prob_w']
    betting_data_preprocessed_validation[f'{bookmaker}_logit_prob_w'] = win_probability_column.apply(logit)

In [18]:
 # Display the frame with the per-bookmaker probability and logit columns added
 betting_data_preprocessed_validation

Unnamed: 0,WRank,LRank,higher_rank_won,B365W,B365L,PSW,PSL,B365_prob_w,B365_prob_l,B365_logit_prob_w,PS_prob_w,PS_prob_l,PS_logit_prob_w
0,19.0,75.0,1,1.36,3.00,1.36,3.37,0.688073,0.311927,0.791128,0.712474,0.287526,0.907428
1,18.0,67.0,1,1.18,4.50,1.23,4.68,0.792254,0.207746,1.338563,0.791878,0.208122,1.336284
2,131.0,56.0,0,1.57,2.25,1.67,2.32,0.589005,0.410995,0.359855,0.581454,0.418546,0.328744
3,38.0,61.0,1,1.40,2.75,1.41,3.13,0.662651,0.337349,0.675129,0.689427,0.310573,0.797443
4,185.0,102.0,0,2.62,1.44,2.73,1.51,0.354680,0.645320,-0.598531,0.356132,0.643868,-0.592192
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2605,1.0,6.0,1,1.44,2.75,1.39,3.26,0.656325,0.343675,0.646958,0.701075,0.298925,0.852423
2606,7.0,4.0,0,1.90,1.90,2.14,1.79,0.500000,0.500000,0.000000,0.455471,0.544529,-0.178590
2607,6.0,3.0,0,3.50,1.30,3.75,1.33,0.270833,0.729167,-0.990399,0.261811,0.738189,-1.036577
2608,5.0,7.0,1,1.80,2.00,1.84,2.10,0.526316,0.473684,0.105361,0.532995,0.467005,0.132172


In [19]:
# Consensus on the logit scale: the row-wise mean of the bookmakers' logit win
# probabilities, ignoring NaN entries
logit_columns = [f'{bookmaker}_logit_prob_w' for bookmaker in bookmakers]
consensus_logit = betting_data_preprocessed_validation[logit_columns].mean(axis=1, skipna=True)
betting_data_preprocessed_validation['consensus_logit_prob_w'] = consensus_logit

# Map the consensus logit back to a probability with the inverse logit
betting_data_preprocessed_validation['consensus_prob_w'] = consensus_logit.apply(inverse_logit)

In [20]:
# Display the frame with the consensus logit and consensus probability columns added
betting_data_preprocessed_validation

Unnamed: 0,WRank,LRank,higher_rank_won,B365W,B365L,PSW,PSL,B365_prob_w,B365_prob_l,B365_logit_prob_w,PS_prob_w,PS_prob_l,PS_logit_prob_w,consensus_logit_prob_w,consensus_prob_w
0,19.0,75.0,1,1.36,3.00,1.36,3.37,0.688073,0.311927,0.791128,0.712474,0.287526,0.907428,0.849278,0.700416
1,18.0,67.0,1,1.18,4.50,1.23,4.68,0.792254,0.207746,1.338563,0.791878,0.208122,1.336284,1.337423,0.792066
2,131.0,56.0,0,1.57,2.25,1.67,2.32,0.589005,0.410995,0.359855,0.581454,0.418546,0.328744,0.344299,0.585234
3,38.0,61.0,1,1.40,2.75,1.41,3.13,0.662651,0.337349,0.675129,0.689427,0.310573,0.797443,0.736286,0.676183
4,185.0,102.0,0,2.62,1.44,2.73,1.51,0.354680,0.645320,-0.598531,0.356132,0.643868,-0.592192,-0.595362,0.355406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2605,1.0,6.0,1,1.44,2.75,1.39,3.26,0.656325,0.343675,0.646958,0.701075,0.298925,0.852423,0.749691,0.679111
2606,7.0,4.0,0,1.90,1.90,2.14,1.79,0.500000,0.500000,0.000000,0.455471,0.544529,-0.178590,-0.089295,0.477691
2607,6.0,3.0,0,3.50,1.30,3.75,1.33,0.270833,0.729167,-0.990399,0.261811,0.738189,-1.036577,-1.013488,0.266298
2608,5.0,7.0,1,1.80,2.00,1.84,2.10,0.526316,0.473684,0.105361,0.532995,0.467005,0.132172,0.118766,0.529657


In [21]:
# Re-express the consensus from the higher-ranked player's point of view:
# 'consensus_probabilities' is the consensus probability that the higher-ranked
# player wins, and 'predictions' is 1 when that probability exceeds 0.5
def _orient_to_higher_rank(row):
    # The winner is the higher-ranked player exactly when higher_rank_won == 1;
    # otherwise the higher-ranked player's probability is the complement
    if int(row['higher_rank_won']) == 1:
        higher_rank_probability = row['consensus_prob_w']
    else:
        higher_rank_probability = 1 - row['consensus_prob_w']
    return int(higher_rank_probability > 0.5), higher_rank_probability


betting_data_preprocessed_validation[['predictions', 'consensus_probabilities']] = (
    betting_data_preprocessed_validation.apply(
        _orient_to_higher_rank,
        axis=1,
        result_type='expand',
    )
)

In [23]:
# Score the consensus model: 'higher_rank_won' holds the true outcomes,
# 'predictions' the binary calls and 'consensus_probabilities' the predicted
# probabilities
evaluate_predictions(
    actual_outcomes=betting_data_preprocessed_validation['higher_rank_won'],
    binary_predictions=betting_data_preprocessed_validation['predictions'],
    probability_predictions=betting_data_preprocessed_validation['consensus_probabilities'],
)

{'accuracy': 0.6686, 'calibration': 1.0229, 'log_loss': 0.6556}