### Import necessary libraries

In [1]:
# Enable Jupyter Notebook modules import
import import_ipynb

# For numerical operations and arrays
import numpy as np

# For data manipulation and analysis
import pandas as pd

# Custom tools for tennis data analysis
import Tennis_Analysis_Tools as tennis_tools

importing Jupyter notebook from Tennis_Analysis_Tools.ipynb


### Function to preprocess dataset

In [2]:
def preprocess_dataset(betting_data_dfs, odds_columns=None, missing_rank=100000):
    """
    Preprocess the betting dataset to prepare it for analysis.

    This function:
    - Keeps only rows whose 'Comment' value is 'Completed' and renumbers the index.
    - Converts 'WRank' and 'LRank' to numeric; values that cannot be parsed or are
      missing are replaced with `missing_rank`, and the columns are cast to float.
    - Adds 'higher_rank_won': 1 when WRank < LRank, otherwise 0 (ties, including
      two missing ranks, therefore yield 0).
    - Keeps only the essential columns plus the odds columns.
    - Converts the odds columns to numeric (unparseable values become NaN) and fills
      NaN odds with the mean of their own column.

    The input DataFrame is not modified: all work is done on explicit copies, which
    also avoids pandas' chained-assignment (SettingWithCopy) pitfalls.

    Parameters
    ----------
    betting_data_dfs : pandas.DataFrame
        Raw match data. Must contain 'Comment', 'Date', 'Winner', 'Loser',
        'WRank', 'LRank' and every column named in `odds_columns`.
    odds_columns : list of str, optional
        Odds columns to keep and clean. Defaults to
        ['B365W', 'B365L', 'PSW', 'PSL'].
    missing_rank : number, optional
        Placeholder rank used when a ranking is missing or unparseable.
        Defaults to 100000.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with columns 'Date', 'Winner', 'Loser', 'WRank', 'LRank',
        'higher_rank_won' followed by the odds columns.

    Raises
    ------
    KeyError
        If a required column is absent from `betting_data_dfs`.
    """

    # Resolve the odds columns (copy the caller's list so it is never mutated)
    if odds_columns is None:
        odds_columns = ['B365W', 'B365L', 'PSW', 'PSL']
    else:
        odds_columns = list(odds_columns)

    # Columns every downstream step relies on
    essential_columns = ['Date', 'Winner', 'Loser', 'WRank', 'LRank', 'higher_rank_won']

    # Keep completed matches only, with a fresh sequential index, as an independent copy
    completed_matches = (
        betting_data_dfs[betting_data_dfs['Comment'].isin(['Completed'])]
        .reset_index(drop=True)
        .copy()
    )

    # Coerce rankings to float, substituting the placeholder for missing/unparseable values
    for rank_column in ('WRank', 'LRank'):
        completed_matches[rank_column] = (
            pd.to_numeric(completed_matches[rank_column], errors='coerce')
            .fillna(missing_rank)
            .astype(float)
        )

    # 1 if the winner had the better (numerically smaller) rank, else 0
    completed_matches['higher_rank_won'] = (
        completed_matches['WRank'] < completed_matches['LRank']
    ).astype(int)

    # Keep only the columns needed for analysis, again as an independent copy
    betting_data_filtered = completed_matches[essential_columns + odds_columns].copy()

    # Coerce odds to numeric, then impute missing odds with their own column mean
    numeric_odds = betting_data_filtered[odds_columns].apply(pd.to_numeric, errors='coerce')
    betting_data_filtered[odds_columns] = numeric_odds.fillna(numeric_odds.mean())

    # Return the preprocessed dataframe
    return betting_data_filtered

### Functions related to the Bookmakers Consensus Model (BCM)

In [3]:
def calculate_implied_probabilities(odds_w, odds_l):
    """
    Convert a pair of decimal betting odds into normalized implied probabilities.

    Each odd is inverted (1 / odds) and the two results are divided by their sum
    (the overround) so that the returned probabilities add up to 1.

    Parameters
    ----------
    odds_w : number
        Decimal odds offered on the eventual winner.
    odds_l : number
        Decimal odds offered on the eventual loser.

    Returns
    -------
    tuple
        (normalized_prob_w, normalized_prob_l). When either odd is missing
        (None/NaN/pd.NA) or not strictly positive, the sentinel pair (0, 0) is
        returned instead; note that this sentinel does not sum to 1.
    """
    # Test for missing values first: comparing pd.NA with a number cannot be used
    # in a boolean context, so the numeric checks must only see real numbers
    if pd.isna(odds_w) or pd.isna(odds_l):
        return 0, 0

    # Zero or negative odds are invalid; they would give a division by zero or
    # negative "probabilities", so treat them like missing odds
    if odds_w <= 0 or odds_l <= 0:
        return 0, 0

    # Calculate the probabilities from the betting odds
    implied_prob_w = 1 / odds_w  # probability for the winner from betting odds
    implied_prob_l = 1 / odds_l  # probability for the loser from betting odds

    # Calculate the overround (the sum of the probabilities of the winner and loser)
    overround = implied_prob_w + implied_prob_l

    # Normalize the probabilities so they sum to 1 (to correct for the overround)
    normalized_prob_w = implied_prob_w / overround
    normalized_prob_l = implied_prob_l / overround

    # Return the normalized probabilities
    return normalized_prob_w, normalized_prob_l

In [4]:
def logit(p, eps=1e-6):
    """
    Return the log-odds, log(p / (1 - p)), of a probability.

    The input is first limited to the interval [eps, 1 - eps] so that
    probabilities of exactly 0 or 1 do not produce an infinite result.
    """
    bounded = np.clip(p, eps, 1 - eps)
    odds_ratio = bounded / (1 - bounded)
    return np.log(odds_ratio)

def inverse_logit(y):
    """
    Map a scalar log-odds value back to a probability (the logistic sigmoid).

    Parameters
    ----------
    y : number
        A log-odds value. Missing values (NaN/None) are passed through as NaN.

    Returns
    -------
    float
        exp(y) / (1 + exp(y)), evaluated in a form that does not overflow for
        large |y|; NaN when `y` is missing.
    """
    # Return NaN if input is NaN
    if pd.isna(y):
        return np.nan

    # Only ever exponentiate a non-positive number: np.exp of a large positive
    # value overflows to inf, and inf / inf would yield NaN instead of 1.0
    if y >= 0:
        return 1 / (1 + np.exp(-y))

    exp_y = np.exp(y)
    return exp_y / (1 + exp_y)

### Loading and Analyzing betting data for validation dataset (2019)

In [5]:
# Load the betting data for the year 2019 from an Excel file.
# NOTE: this is an absolute, machine-specific path; point it at the local copy of 2019.xlsx.
betting_data_df_2019 = pd.read_excel("/Users/harishthota/Desktop/UOA Project/Betting_Odds_Tennis/2019.xlsx")

In [6]:
# Keep only the matches dated after 2018-12-31, then preview the first rows
betting_data_df_2019 = betting_data_df_2019.loc[
    lambda frame: frame['Date'] > '2018-12-31'
]
betting_data_df_2019.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,Lsets,Comment,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL
5,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Kudla D.,...,1.0,Completed,2.62,1.44,2.8,1.49,2.85,1.55,2.7,1.47
6,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Chardy J.,...,1.0,Completed,2.1,1.66,2.23,1.72,2.26,1.74,2.19,1.68
7,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Murray A.,...,0.0,Completed,1.28,3.5,1.38,3.29,1.39,3.6,1.34,3.26
8,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Kyrgios N.,...,1.0,Completed,1.4,2.75,1.47,2.87,1.5,3.16,1.44,2.8
9,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Tsonga J.W.,...,0.0,Completed,2.25,1.57,2.27,1.7,2.42,1.71,2.27,1.64


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
# of the 2019 data; the 'count' row shows which columns contain missing values
betting_data_df_2019.describe()

Unnamed: 0,ATP,Best of,WRank,LRank,WPts,LPts,W1,L1,W2,L2,...,Wsets,Lsets,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL
count,2593.0,2593.0,2590.0,2580.0,2591.0,2580.0,2572.0,2572.0,2559.0,2559.0,...,2572.0,2572.0,2580.0,2580.0,2582.0,2582.0,2592.0,2592.0,2592.0,2592.0
mean,33.423448,3.391053,57.059073,77.859302,1786.952914,1158.931008,5.824261,4.208398,5.835092,4.033998,...,2.168351,0.43196,1.84583,3.163674,1.93439,3.479733,1.999514,3.684718,1.868291,3.143696
std,18.109567,0.793364,57.232148,80.516542,2060.167885,1198.0869,1.204259,1.821081,1.21246,1.829953,...,0.436609,0.565814,0.895391,3.075953,1.029592,3.607953,1.098668,4.233503,0.912978,2.676414
min,1.0,3.0,1.0,1.0,17.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.002,1.07,1.005,1.07,1.01,1.08,1.01,1.06
25%,19.0,3.0,19.0,34.0,716.5,598.0,6.0,3.0,6.0,3.0,...,2.0,0.0,1.3,1.66,1.33,1.76,1.36,1.81,1.31,1.72
50%,32.0,3.0,45.0,60.0,1023.0,876.5,6.0,4.0,6.0,4.0,...,2.0,0.0,1.57,2.3,1.64,2.435,1.67,2.51,1.6,2.34
75%,50.0,3.0,76.0,95.0,1832.5,1250.0,6.0,6.0,6.0,6.0,...,2.0,1.0,2.1,3.4,2.1875,3.63,2.27,3.7925,2.12,3.43
max,66.0,5.0,503.0,1491.0,12415.0,12355.0,7.0,7.0,7.0,7.0,...,3.0,2.0,9.0,41.0,11.73,37.8,12.22,67.0,9.64,28.49


In [8]:
# Display the column names of the 2019 DataFrame to confirm that the rank, comment
# and bookmaker odds columns used later in the notebook are present
betting_data_df_2019.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'PSW', 'PSL', 'MaxW', 'MaxL',
       'AvgW', 'AvgL'],
      dtype='object')

### Preprocessing Betting data for 2019

In [9]:
# Apply the preprocessing steps to the 2019 data; the result is the validation
# dataset that every later cell in this notebook works on
betting_data_preprocessed_validation = preprocess_dataset(betting_data_df_2019)

In [10]:
# Preview the first rows of the preprocessed validation data
betting_data_preprocessed_validation.head()

Unnamed: 0,Date,Winner,Loser,WRank,LRank,higher_rank_won,B365W,B365L,PSW,PSL
0,2019-01-01,Kudla D.,Fritz T.,63.0,49.0,0,2.62,1.44,2.8,1.49
1,2019-01-01,Chardy J.,Struff J.L.,40.0,57.0,1,2.1,1.66,2.23,1.72
2,2019-01-01,Murray A.,Duckworth J.,240.0,234.0,0,1.28,3.5,1.38,3.29
3,2019-01-01,Kyrgios N.,Harrison R.,35.0,62.0,1,1.4,2.75,1.47,2.87
4,2019-01-01,Tsonga J.W.,Kokkinakis T.,239.0,146.0,0,2.25,1.57,2.27,1.7


### BCM Model for 2019

In [11]:
# Bookmaker prefixes used for the 2019 data. Each prefix is combined with 'W' and 'L'
# to look up that bookmaker's winner/loser odds columns (e.g. 'B365W', 'B365L')
bookmakers = ['B365', 'PS']

In [12]:
# For every bookmaker, derive the normalized win/loss probabilities from its odds
# and the logit of the win probability, storing each as a new column
# of the 2019 validation DataFrame

for bookmaker in bookmakers:
    odds_w_col = f'{bookmaker}W'
    odds_l_col = f'{bookmaker}L'

    # Row-wise call: one (prob_w, prob_l) pair per match
    prob_pairs = betting_data_preprocessed_validation.apply(
        lambda row: calculate_implied_probabilities(row[odds_w_col], row[odds_l_col]),
        axis=1
    )

    # Split the pairs into one sequence per outcome and store them as columns
    prob_w_values, prob_l_values = zip(*prob_pairs)
    betting_data_preprocessed_validation[f'{bookmaker}_prob_w'] = prob_w_values
    betting_data_preprocessed_validation[f'{bookmaker}_prob_l'] = prob_l_values

    # Log-odds of this bookmaker's win probability
    betting_data_preprocessed_validation[f'{bookmaker}_logit_prob_w'] = \
        betting_data_preprocessed_validation[f'{bookmaker}_prob_w'].apply(logit)

In [13]:
# Consensus log-odds: the row-wise mean of the bookmakers' logit columns (NaNs skipped)
logit_columns = [f'{bookmaker}_logit_prob_w' for bookmaker in bookmakers]
betting_data_preprocessed_validation['consensus_logit_prob_w'] = (
    betting_data_preprocessed_validation[logit_columns].mean(axis=1, skipna=True)
)

# Map the consensus log-odds back to a probability with the inverse logit
betting_data_preprocessed_validation['consensus_prob_w'] = (
    betting_data_preprocessed_validation['consensus_logit_prob_w'].apply(inverse_logit)
)

In [14]:
# Preview the per-bookmaker probability/logit columns and the consensus columns
betting_data_preprocessed_validation.head()

Unnamed: 0,Date,Winner,Loser,WRank,LRank,higher_rank_won,B365W,B365L,PSW,PSL,B365_prob_w,B365_prob_l,B365_logit_prob_w,PS_prob_w,PS_prob_l,PS_logit_prob_w,consensus_logit_prob_w,consensus_prob_w
0,2019-01-01,Kudla D.,Fritz T.,63.0,49.0,0,2.62,1.44,2.8,1.49,0.35468,0.64532,-0.598531,0.347319,0.652681,-0.630843,-0.614687,0.350991
1,2019-01-01,Chardy J.,Struff J.L.,40.0,57.0,1,2.1,1.66,2.23,1.72,0.441489,0.558511,-0.23512,0.435443,0.564557,-0.259677,-0.247399,0.438464
2,2019-01-01,Murray A.,Duckworth J.,240.0,234.0,0,1.28,3.5,1.38,3.29,0.732218,0.267782,1.005903,0.704497,0.295503,0.868804,0.937353,0.718565
3,2019-01-01,Kyrgios N.,Harrison R.,35.0,62.0,1,1.4,2.75,1.47,2.87,0.662651,0.337349,0.675129,0.66129,0.33871,0.66905,0.672089,0.661971
4,2019-01-01,Tsonga J.W.,Kokkinakis T.,239.0,146.0,0,2.25,1.57,2.27,1.7,0.410995,0.589005,-0.359855,0.428212,0.571788,-0.289152,-0.324503,0.419579


In [15]:
# Re-express the consensus probability from the point of view of the higher-ranked
# player and derive a 0/1 prediction from it
def higher_rank_outcome(row):
    """Return (prediction, probability) for the higher-ranked player of one match."""
    # consensus_prob_w refers to the match winner; flip it when the winner
    # was not the higher-ranked player
    if int(row['higher_rank_won']) == 1:
        prob_higher_rank = row['consensus_prob_w']
    else:
        prob_higher_rank = 1 - row['consensus_prob_w']
    return int(prob_higher_rank > 0.5), prob_higher_rank

betting_data_preprocessed_validation[['predictions', 'consensus_probabilities']] = \
    betting_data_preprocessed_validation.apply(
        higher_rank_outcome,
        axis=1,
        result_type='expand'
    )

In [16]:
# Preview the new 'predictions' and 'consensus_probabilities' columns
betting_data_preprocessed_validation.head()

Unnamed: 0,Date,Winner,Loser,WRank,LRank,higher_rank_won,B365W,B365L,PSW,PSL,B365_prob_w,B365_prob_l,B365_logit_prob_w,PS_prob_w,PS_prob_l,PS_logit_prob_w,consensus_logit_prob_w,consensus_prob_w,predictions,consensus_probabilities
0,2019-01-01,Kudla D.,Fritz T.,63.0,49.0,0,2.62,1.44,2.8,1.49,0.35468,0.64532,-0.598531,0.347319,0.652681,-0.630843,-0.614687,0.350991,1.0,0.649009
1,2019-01-01,Chardy J.,Struff J.L.,40.0,57.0,1,2.1,1.66,2.23,1.72,0.441489,0.558511,-0.23512,0.435443,0.564557,-0.259677,-0.247399,0.438464,0.0,0.438464
2,2019-01-01,Murray A.,Duckworth J.,240.0,234.0,0,1.28,3.5,1.38,3.29,0.732218,0.267782,1.005903,0.704497,0.295503,0.868804,0.937353,0.718565,0.0,0.281435
3,2019-01-01,Kyrgios N.,Harrison R.,35.0,62.0,1,1.4,2.75,1.47,2.87,0.662651,0.337349,0.675129,0.66129,0.33871,0.66905,0.672089,0.661971,1.0,0.661971
4,2019-01-01,Tsonga J.W.,Kokkinakis T.,239.0,146.0,0,2.25,1.57,2.27,1.7,0.410995,0.589005,-0.359855,0.428212,0.571788,-0.289152,-0.324503,0.419579,1.0,0.580421


### Evaluate the model's performance for 2019 data

In [17]:
# Score the full 2019 validation set: actual outcome, predicted outcome, predicted probability
accuracy_2019, calibration_2019, logloss_2019 = tennis_tools.evaluate_predictions(
    *(
        betting_data_preprocessed_validation[column]
        for column in ('higher_rank_won', 'predictions', 'consensus_probabilities')
    )
)

In [18]:
# Print each validation metric with a bold label (\033[1m ... \033[0m are ANSI bold on/off)
for metric_name, metric_value in (
    ("Accuracy", accuracy_2019),
    ("Calibration", calibration_2019),
    ("Log loss", logloss_2019),
):
    print(f"\033[1m{metric_name} for validation dataset using BCM Model:\033[0m", metric_value)

[1mAccuracy for validation dataset using BCM Model:[0m 0.67449
[1mCalibration for validation dataset using BCM Model:[0m 1.02443
[1mLog loss for validation dataset using BCM Model:[0m 0.59293


### Evaluate the model's performance for top 50 players

In [19]:
# Keep the matches in which at least one player is ranked 50 or better,
# i.e. the better (smaller) of the two ranks is <= 50
betting_data_df_preprocessed_validation_top_50 = betting_data_preprocessed_validation[
    betting_data_preprocessed_validation[['WRank', 'LRank']].min(axis=1).le(50)
]

In [20]:
# Score the top-50 subset: actual outcome, predicted outcome, predicted probability
accuracy_2019_top50, calibration_2019_top50, logloss_2019_top50 = tennis_tools.evaluate_predictions(
    *(
        betting_data_df_preprocessed_validation_top_50[column]
        for column in ('higher_rank_won', 'predictions', 'consensus_probabilities')
    )
)

In [21]:
# Print the top-50 metrics with bold labels (\033[1m ... \033[0m are ANSI bold on/off).
# The labels name the subset so this output is distinguishable from the full-validation metrics.
print("\033[1mAccuracy for validation dataset (top 50 players) using BCM Model:\033[0m", accuracy_2019_top50)
print("\033[1mCalibration for validation dataset (top 50 players) using BCM Model:\033[0m", calibration_2019_top50)
print("\033[1mLog loss for validation dataset (top 50 players) using BCM Model:\033[0m", logloss_2019_top50)

[1mAccuracy for validation dataset using BCM Model:[0m 0.68007
[1mCalibration for validation dataset using BCM Model:[0m 1.03128
[1mLog loss for validation dataset using BCM Model:[0m 0.58263


### Evaluate the model's performance for top 100 players

In [22]:
# Keep the matches in which at least one player is ranked 100 or better,
# i.e. the better (smaller) of the two ranks is <= 100
betting_data_df_preprocessed_validation_top_100 = betting_data_preprocessed_validation[
    betting_data_preprocessed_validation[['WRank', 'LRank']].min(axis=1).le(100)
]

In [23]:
# Score the top-100 subset: actual outcome, predicted outcome, predicted probability
accuracy_2019_top100, calibration_2019_top100, logloss_2019_top100 = tennis_tools.evaluate_predictions(
    *(
        betting_data_df_preprocessed_validation_top_100[column]
        for column in ('higher_rank_won', 'predictions', 'consensus_probabilities')
    )
)

In [24]:
# Print the top-100 metrics with bold labels (\033[1m ... \033[0m are ANSI bold on/off).
# The labels name the subset so this output is distinguishable from the full-validation metrics.
print("\033[1mAccuracy for validation dataset (top 100 players) using BCM Model:\033[0m", accuracy_2019_top100)
print("\033[1mCalibration for validation dataset (top 100 players) using BCM Model:\033[0m", calibration_2019_top100)
print("\033[1mLog loss for validation dataset (top 100 players) using BCM Model:\033[0m", logloss_2019_top100)

[1mAccuracy for validation dataset using BCM Model:[0m 0.67498
[1mCalibration for validation dataset using BCM Model:[0m 1.02693
[1mLog loss for validation dataset using BCM Model:[0m 0.59306
