In [1]:
# Import necessary libraries
import numpy as np                  
import pandas as pd                 
import matplotlib.pyplot as plt     
from sklearn.preprocessing import OneHotEncoder 

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

In [2]:
def preprocess_dataset(df):

    # Convert 'WRank' and 'LRank' to numeric, coerce errors to NaN
    df['WRank'] = pd.to_numeric(df['WRank'], errors='coerce')
    df['LRank'] = pd.to_numeric(df['LRank'], errors='coerce')

    # Fill missing ranks with a high number (indicative of a very low rank) and convert to float
    df['WRank'] = df['WRank'].fillna(100000).astype(float)
    df['LRank'] = df['LRank'].fillna(100000).astype(float)

    # Determine which player had the higher rank and who won
    df['higher_rank_won'] = (df['WRank'] < df['LRank']).astype(int)

    return df

In [3]:
def accuracy(actual, predictions):
    # Calculate the mean of correct predictions
    logr_accuracy_all_predictors = np.round(np.mean(actual == predictions),4)
    return logr_accuracy_all_predictors

def calibration(actual, predictions):
    # Calculate the ratio of the sum of predictions to the sum of actual values
    return np.round((np.sum(predictions) / np.sum(actual)), 4)

def logloss(actual, predictions):
    epsilon = 1e-15  # Small constant to prevent division by zero
    # Clip predictions to avoid log of zero. Values are clipped to the range [epsilon, 1-epsilon]
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    
    
    logr_logloss_all_predictors = -(1 / len(actual)) * np.sum(
        actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
    return np.round(logr_logloss_all_predictors, 4)

In [4]:
# Evaluating predictions using accuracy, calibration and logloss
def evaluate_predictions(actual_outcomes, binary_predictions, probability_predictions):
    accuracy_result = accuracy(actual_outcomes, binary_predictions)
    
    calibration_result = calibration(actual_outcomes, probability_predictions)
    
    logloss_result = logloss(actual_outcomes, probability_predictions)
    
    return {
        'accuracy': accuracy_result,  
        'calibration': calibration_result,  
        'log_loss': logloss_result  
    }

In [5]:
def precompute_top_players(data, top_n):
    # Group data by year, then find top n players based on 'WRank' and 'LRank' for winners and losers.
    top_players_by_year = {}
    for year, group in data.groupby(data['Date'].dt.year):
        top_winners = set(group.nsmallest(top_n, 'WRank')['Winner'].unique())
        top_losers = set(group.nsmallest(top_n, 'LRank')['Loser'].unique())
        top_players_by_year[year] = top_winners.union(top_losers)
    return top_players_by_year

In [6]:
# Initialize a dictionary to store dataframes for each year
betting_data_dfs = {}

In [7]:
# Loop through each year from 2005 to 2019
for current_year in range(2005, 2020): 
    # Determine the file extension based on the year
    file_extension = 'xls' if current_year < 2013 else 'xlsx'
    
    # Construct the file path using the determined file extension
    file_path = f"Betting_Odds_Tennis/{current_year}.{file_extension}"
    
    # Read the Excel file and store it in the dictionary with the year as the key
    betting_data_dfs[current_year] = pd.read_excel(file_path)

In [8]:
betting_data_dfs[2019]

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,Lsets,Comment,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL
0,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Dimitrov G.,...,0.0,Completed,1.36,3.00,1.36,3.37,1.42,3.60,1.35,3.18
1,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Raonic M.,...,0.0,Completed,1.18,4.50,1.23,4.68,1.27,4.84,1.22,4.26
2,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Kecmanovic M.,...,0.0,Completed,1.57,2.25,1.67,2.32,1.71,2.40,1.63,2.28
3,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Millman J.,...,1.0,Completed,1.40,2.75,1.41,3.13,1.45,3.20,1.40,2.95
4,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Uchiyama Y.,...,0.0,Completed,2.62,1.44,2.73,1.51,3.26,1.53,2.69,1.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2605,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,...,1.0,Completed,1.44,2.75,1.39,3.26,1.48,3.30,1.41,2.93
2606,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,...,0.0,Completed,1.90,1.90,2.14,1.79,2.24,2.06,1.92,1.90
2607,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,...,0.0,Completed,3.50,1.30,3.75,1.33,3.75,1.40,3.39,1.33
2608,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,...,0.0,Completed,1.80,2.00,1.84,2.10,1.87,2.20,1.78,2.06


In [9]:
# Combine DataFrames from all years into a single DataFrame, reindexing rows.
betting_data_dfs = pd.concat(betting_data_dfs.values(), ignore_index=True)

In [10]:
# Display a summary of the DataFrame to understand its structure and data types
betting_data_dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40390 entries, 0 to 40389
Data columns (total 48 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         40390 non-null  int64         
 1   Location    40390 non-null  object        
 2   Tournament  40390 non-null  object        
 3   Date        40390 non-null  datetime64[ns]
 4   Series      40390 non-null  object        
 5   Court       40390 non-null  object        
 6   Surface     40390 non-null  object        
 7   Round       40390 non-null  object        
 8   Best of     40390 non-null  int64         
 9   Winner      40390 non-null  object        
 10  Loser       40390 non-null  object        
 11  WRank       40375 non-null  float64       
 12  LRank       40303 non-null  float64       
 13  WPts        38701 non-null  float64       
 14  LPts        38631 non-null  float64       
 15  W1          40155 non-null  float64       
 16  L1          40157 non-

In [11]:
# Generate descriptive statistics for numerical columns in the DataFrame
betting_data_dfs.describe()

Unnamed: 0,ATP,Best of,WRank,LRank,WPts,LPts,W1,L1,W4,L4,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
count,40390.0,40390.0,40375.0,40303.0,38701.0,38631.0,40155.0,40157.0,3647.0,3647.0,...,10671.0,10671.0,28131.0,28142.0,15572.0,15579.0,25354.0,25354.0,25354.0,25354.0
mean,32.974944,3.378311,57.801536,90.38486,1828.537195,1054.728379,5.801992,4.075155,5.783384,3.865643,...,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.99861,7.847864,1.838168,3.547658
std,18.006138,0.783274,72.735132,115.423997,2278.996487,1212.422674,1.232787,1.841617,1.262227,1.903181,...,0.996238,3.646316,1.031691,3.075889,1.004273,3.27251,1.582432,376.24683,1.089277,3.22777
min,1.0,3.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.01,1.02,1.0,1.0,1.0,1.01,1.01,1.01,1.01,1.01
25%,19.0,3.0,16.0,34.0,645.0,502.0,6.0,3.0,6.0,2.0,...,1.24,1.75,1.25,1.73,1.22,1.73,1.3,1.84,1.25,1.74
50%,33.0,3.0,40.0,64.0,1010.0,745.0,6.0,4.0,6.0,4.0,...,1.5,2.5,1.5,2.5,1.5,2.63,1.58,2.75,1.51,2.53
75%,49.0,3.0,75.0,102.0,1890.0,1150.0,6.0,6.0,6.0,6.0,...,2.03,3.85,2.0,4.0,2.0,4.0,2.21,4.47,2.07,3.91
max,67.0,5.0,1890.0,2159.0,16950.0,16950.0,7.0,7.0,7.0,7.0,...,18.0,60.0,26.0,51.0,19.0,81.0,76.0,42586.0,23.45,36.44


In [12]:
# Retrieve and display the column names of the DataFrame
betting_data_dfs.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'CBW', 'CBL', 'EXW', 'EXL', 'IWW',
       'IWL', 'PSW', 'PSL', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL', 'MaxW',
       'MaxL', 'AvgW', 'AvgL'],
      dtype='object')

## Splitting the Dataset into training and validation

In [13]:
split_time = "2019-01-01"
betting_data_dfs_train = betting_data_dfs[betting_data_dfs.Date < split_time]

In [14]:
betting_data_dfs_train.tail()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
37838,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Darcis S.,...,,,,,,,2.47,1.65,2.35,1.59
37839,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Munar J.,...,,,,,,,2.08,1.95,1.94,1.86
37840,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Donskoy E.,...,,,,,,,1.57,2.65,1.51,2.53
37841,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Mmoh M.,...,,,,,,,1.83,2.17,1.74,2.09
37842,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Gulbis E.,...,,,,,,,1.4,3.5,1.35,3.19


In [15]:
betting_data_dfs_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37797 entries, 0 to 37842
Data columns (total 48 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         37797 non-null  int64         
 1   Location    37797 non-null  object        
 2   Tournament  37797 non-null  object        
 3   Date        37797 non-null  datetime64[ns]
 4   Series      37797 non-null  object        
 5   Court       37797 non-null  object        
 6   Surface     37797 non-null  object        
 7   Round       37797 non-null  object        
 8   Best of     37797 non-null  int64         
 9   Winner      37797 non-null  object        
 10  Loser       37797 non-null  object        
 11  WRank       37785 non-null  float64       
 12  LRank       37723 non-null  float64       
 13  WPts        36110 non-null  float64       
 14  LPts        36051 non-null  float64       
 15  W1          37583 non-null  float64       
 16  L1          37585 non-

In [16]:
# Preprocessing betting odds training dataset
betting_data_dfs_train_preprocessed = preprocess_dataset(betting_data_dfs_train) 

In [17]:
betting_data_dfs_train_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37797 entries, 0 to 37842
Data columns (total 49 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   ATP              37797 non-null  int64         
 1   Location         37797 non-null  object        
 2   Tournament       37797 non-null  object        
 3   Date             37797 non-null  datetime64[ns]
 4   Series           37797 non-null  object        
 5   Court            37797 non-null  object        
 6   Surface          37797 non-null  object        
 7   Round            37797 non-null  object        
 8   Best of          37797 non-null  int64         
 9   Winner           37797 non-null  object        
 10  Loser            37797 non-null  object        
 11  WRank            37797 non-null  float64       
 12  LRank            37797 non-null  float64       
 13  WPts             36110 non-null  float64       
 14  LPts             36051 non-null  float

In [18]:
# Calculate means of all numeric columns and fill missing values with these means for the training dataset
means = betting_data_dfs_train_preprocessed.select_dtypes(include=[np.number]).mean()
betting_data_dfs_train_preprocessed.fillna(means, inplace=True)

In [19]:
# Filter the DataFrame to create a validation dataset with records on or after the specified split time
betting_data_df_validation = betting_data_dfs[betting_data_dfs.Date >= split_time]

In [20]:
betting_data_df_validation.tail()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
40385,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,...,,,,,,,1.48,3.3,1.41,2.93
40386,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,...,,,,,,,2.24,2.06,1.92,1.9
40387,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,...,,,,,,,3.75,1.4,3.39,1.33
40388,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,...,,,,,,,1.87,2.2,1.78,2.06
40389,66,London,Masters Cup,2019-11-17,Masters Cup,Indoor,Hard,The Final,3,Tsitsipas S.,...,,,,,,,2.05,1.93,1.96,1.86


In [21]:
betting_data_dfs_train_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37797 entries, 0 to 37842
Data columns (total 49 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   ATP              37797 non-null  int64         
 1   Location         37797 non-null  object        
 2   Tournament       37797 non-null  object        
 3   Date             37797 non-null  datetime64[ns]
 4   Series           37797 non-null  object        
 5   Court            37797 non-null  object        
 6   Surface          37797 non-null  object        
 7   Round            37797 non-null  object        
 8   Best of          37797 non-null  int64         
 9   Winner           37797 non-null  object        
 10  Loser            37797 non-null  object        
 11  WRank            37797 non-null  float64       
 12  LRank            37797 non-null  float64       
 13  WPts             37797 non-null  float64       
 14  LPts             37797 non-null  float

In [22]:
# Apply preprocessing steps to the validation dataset
betting_data_df_validation_preprocessed = preprocess_dataset(betting_data_df_validation)

In [23]:
# Fill missing values in the validation dataset with the mean values calculated from the training dataset
betting_data_df_validation_preprocessed.fillna(means, inplace=True)

In [24]:
betting_data_df_validation_preprocessed.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL,higher_rank_won
37785,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Kudla D.,...,3.542479,1.810226,3.451461,1.796538,3.557943,2.85,1.55,2.7,1.47,0
37786,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Chardy J.,...,3.542479,1.810226,3.451461,1.796538,3.557943,2.26,1.74,2.19,1.68,1
37787,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Murray A.,...,3.542479,1.810226,3.451461,1.796538,3.557943,1.39,3.6,1.34,3.26,0
37788,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Kyrgios N.,...,3.542479,1.810226,3.451461,1.796538,3.557943,1.5,3.16,1.44,2.8,1
37789,1,Brisbane,Brisbane International,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Tsonga J.W.,...,3.542479,1.810226,3.451461,1.796538,3.557943,2.42,1.71,2.27,1.64,0


In [25]:
betting_data_df_validation_preprocessed.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'CBW', 'CBL', 'EXW', 'EXL', 'IWW',
       'IWL', 'PSW', 'PSL', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL', 'MaxW',
       'MaxL', 'AvgW', 'AvgL', 'higher_rank_won'],
      dtype='object')

## Naive Model

In [26]:
def higher_ranked_wins(betting_data_df):
    # Calculate the number of matches won by the higher-ranked player
    matches_won_by_higher_ranked = betting_data_df.apply(
        lambda row: row['WRank'] < row['LRank'], axis=1).sum()

    # Get the total number of matches in the dataset
    total_matches = len(betting_data_df)

    # Calculate the probability that a higher-ranked player wins
    probability_higher_ranked_wins = matches_won_by_higher_ranked / total_matches

    # Return the probability
    return probability_higher_ranked_wins

### Metrics for Trained Data

In [27]:
# Calculate the probability that the higher-ranked player wins in the preprocessed training dataset
prob_higher_rank_win = higher_ranked_wins(betting_data_dfs_train_preprocessed)

In [43]:
# Create a naive prediction array where we assume the higher-ranked player always wins (binary prediction)
naive_predictions = np.full_like(betting_data_dfs_train_preprocessed["higher_rank_won"], fill_value=1)

# Create a naive probability prediction array using the computed probability of higher-ranked players winning
naive_probability_predictions = np.full_like(betting_data_dfs_train_preprocessed["higher_rank_won"], fill_value=prob_higher_rank_win, dtype='float64')

# Print the naive probability predictions
naive_predictions, naive_probability_predictions

(array([1, 1, 1, ..., 1, 1, 1]),
 array([0.66280393, 0.66280393, 0.66280393, ..., 0.66280393, 0.66280393,
        0.66280393]))

In [29]:
# Evaluate the naive predictions against the actual outcomes in the preprocessed training dataset
evaluation_results = evaluate_predictions(betting_data_dfs_train_preprocessed["higher_rank_won"], naive_predictions, naive_probability_predictions)

# Print the evaluation results
print(evaluation_results)

{'accuracy': 0.6628, 'calibration': 1.0, 'log_loss': 0.6392}


## Evaluating Naive Model on top 50 and top 100 players

In [30]:
betting_data_dfs_preprocessed = preprocess_dataset(betting_data_dfs)

In [31]:
top_50_players = precompute_top_players(betting_data_dfs_preprocessed, 50)
top_100_players = precompute_top_players(betting_data_dfs_preprocessed, 100)

In [32]:
top_50_players

{2005: {'Coria G.',
  'Federer R.',
  'Gaudio G.',
  'Henman T.',
  'Hewitt L.',
  'Moya C.',
  'Nadal R.',
  'Roddick A.',
  'Safin M.'},
 2006: {'Davydenko N.',
  'Federer R.',
  'Hewitt L.',
  'Ljubicic I.',
  'Nadal R.',
  'Nalbandian D.',
  'Roddick A.'},
 2007: {'Davydenko N.',
  'Djokovic N.',
  'Federer R.',
  'Ljubicic I.',
  'Nadal R.',
  'Roddick A.'},
 2008: {'Davydenko N.', 'Djokovic N.', 'Federer R.', 'Ferrer D.', 'Nadal R.'},
 2009: {'Djokovic N.', 'Federer R.', 'Federer R. ', 'Murray A.', 'Nadal R.'},
 2010: {'Djokovic N.', 'Federer R.', 'Murray A.', 'Nadal R.'},
 2011: {'Djokovic N.', 'Federer R.', 'Murray A.', 'Nadal R.', 'Soderling R.'},
 2012: {'Djokovic N.',
  'Federer R.',
  'Ferrer D.',
  'Murray A.',
  'Nadal R.',
  'Tsonga J.W.'},
 2013: {'Djokovic N.', 'Federer R.', 'Ferrer D.', 'Murray A.', 'Nadal R.'},
 2014: {'Del Potro J.M.',
  'Djokovic N.',
  'Federer R.',
  'Ferrer D.',
  'Murray A.',
  'Nadal R.',
  'Wawrinka S.'},
 2015: {'Berdych T.',
  'Djokovic N.'

In [33]:
betting_data_df_validation_preprocessed['Top50'] = betting_data_df_validation_preprocessed.apply(
    lambda row: row['Winner'] in top_50_players[2019] and row['Loser'] in top_50_players[2019], axis=1)
betting_data_df_validation_preprocessed['Top100'] = betting_data_df_validation_preprocessed.apply(
    lambda row: row['Winner'] in top_100_players[2019] and row['Loser'] in top_100_players[2019], axis=1)

In [34]:
betting_data_df_validation_top_50 = betting_data_df_validation[betting_data_df_validation['Top50'] == True]
betting_data_df_validation_top_100 = betting_data_df_validation[betting_data_df_validation['Top100'] == True]

In [42]:
betting_data_df_validation_top_50.tail()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL,higher_rank_won,Top50,Top100
39959,52,New York,US Open,2019-09-08,Grand Slam,Outdoor,Hard,The Final,5,Nadal R.,...,3.451461,1.796538,3.557943,1.25,5.64,1.19,4.96,1,True,True
40180,59,Shanghai,Shanghai Masters,2019-10-11,Masters 1000,Outdoor,Hard,Quarterfinals,3,Zverev A.,...,3.451461,1.796538,3.557943,3.05,1.48,2.85,1.43,0,True,True
40184,59,Shanghai,Shanghai Masters,2019-10-13,Masters 1000,Outdoor,Hard,The Final,3,Medvedev D.,...,3.451461,1.796538,3.557943,1.59,2.63,1.54,2.49,1,True,True
40381,66,London,Masters Cup,2019-11-13,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,...,3.451461,1.796538,3.557943,2.28,1.8,2.16,1.71,1,True,True
40384,66,London,Masters Cup,2019-11-14,Masters Cup,Indoor,Hard,Round Robin,3,Federer R.,...,3.451461,1.796538,3.557943,3.33,1.42,3.12,1.37,0,True,True


### Evaluating Model on top 50

In [36]:
# Create a naive prediction array where we assume the higher-ranked player always wins (binary prediction) for the validation dataset
naive_predictions = np.full_like(betting_data_df_validation_top_50["higher_rank_won"], fill_value=1)

# Create a naive probability prediction array using the computed probability of higher-ranked players winning for the validation dataset
naive_probability_predictions = np.round(np.full_like(betting_data_df_validation_top_50["higher_rank_won"], fill_value = prob_higher_rank_win, dtype='float64'),4)

naive_predictions, naive_probability_predictions

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1]),
 array([0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628,
        0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628,
        0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628,
        0.6628, 0.6628]))

In [37]:
# Evaluate the naive predictions against the actual outcomes in the preprocessed validation dataset
evaluation_results_validation = evaluate_predictions(betting_data_df_validation_top_50["higher_rank_won"], naive_predictions, naive_probability_predictions)

evaluation_results_validation

{'accuracy': 0.5385, 'calibration': 1.2309, 'log_loss': 0.7232}

### Evaluating Model on top 100

In [40]:
# Create a naive prediction array where we assume the higher-ranked player always wins (binary prediction) for the validation dataset
naive_predictions = np.full_like(betting_data_df_validation_top_100["higher_rank_won"], fill_value=1)

# Create a naive probability prediction array using the computed probability of higher-ranked players winning for the validation dataset
naive_probability_predictions = np.round(np.full_like(betting_data_df_validation_top_100["higher_rank_won"], fill_value = prob_higher_rank_win, dtype='float64'),4)

naive_predictions, naive_probability_predictions

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628,
        0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628,
        0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628,
        0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628,
        0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628,
        0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628,
        0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628, 0.6628,
        0.6628]))

In [41]:
# Evaluate the naive predictions against the actual outcomes in the preprocessed validation dataset
evaluation_results_validation = evaluate_predictions(betting_data_df_validation_top_100["higher_rank_won"], naive_predictions, naive_probability_predictions)

evaluation_results_validation

{'accuracy': 0.5614, 'calibration': 1.1806, 'log_loss': 0.7077}