In [1]:
# Enable Jupyter Notebook modules import
import import_ipynb

# For numerical operations and arrays
import numpy as np

# For data manipulation and analysis
import pandas as pd

# Custom tools for tennis data analysis
import Tennis_Analysis_Tools as tennis_tools

importing Jupyter notebook from Tennis_Analysis_Tools.ipynb


In [2]:
def preprocess_dataset(betting_data_dfs, encoder=None, fit_encoder=True):
    """Turn raw match rows into a model-ready frame with one-hot encoded categoricals.

    Steps:
      1. Coerce 'WRank' / 'LRank' to floats; anything non-numeric (e.g. 'NR') or
         missing becomes the sentinel rank 100000.
      2. Derive 'higher_rank_won' (1 when the winner had the strictly better rank),
         'higher_rank_points', 'lower_rank_points' and 'points_diff'.
      3. Keep only the essential columns and one-hot encode the categorical ones.
      4. Drop columns that are entirely NaN.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    betting_data_dfs : pandas.DataFrame
        Raw match data. Must contain the essential columns listed below plus
        'WPts' and 'LPts'.
    encoder : object, optional
        Encoder exposing ``fit_transform`` / ``transform`` /
        ``get_feature_names_out``. When omitted, a new
        ``tennis_tools.OneHotEncoder`` is created and fitted on this frame
        (the original behaviour). Pass the same instance for several datasets
        to obtain identical encoded columns.
    fit_encoder : bool, default True
        Only relevant when ``encoder`` is supplied: True fits it on this frame,
        False reuses its existing fit via ``transform`` only.

    Returns
    -------
    pandas.DataFrame
        Preprocessed frame with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If a required column is missing from ``betting_data_dfs``.
    """
    # Rank assigned to unranked players so they always compare as the lower rank.
    unranked_rank = 100000

    categorical_features = ['Location', 'Tournament', 'Series', 'Court', 'Surface', 'Round']
    essential_columns = ['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface', 'Round', 'Best of',
                         'higher_rank_won', 'higher_rank_points', 'lower_rank_points', 'points_diff',
                         'WRank', 'LRank', 'Wsets', 'Lsets', 'Winner', 'Loser']

    # Work on a copy: callers pass slices of the combined frame, and writing into
    # them would both mutate caller data and trigger SettingWithCopyWarning.
    matches = betting_data_dfs.copy()

    # errors='coerce' already maps 'NR' (and any other non-numeric text) to NaN.
    for rank_column in ('WRank', 'LRank'):
        matches[rank_column] = (
            pd.to_numeric(matches[rank_column], errors='coerce')
            .fillna(unranked_rank)
            .astype(float)
        )

    # A smaller rank number is the better rank; equal ranks count as "not won".
    higher_rank_won = matches['WRank'] < matches['LRank']
    matches['higher_rank_won'] = higher_rank_won.astype(int)

    # Select (rather than multiply-and-add) so a missing value for one player
    # does not turn the other player's known points into NaN.
    matches['higher_rank_points'] = np.where(higher_rank_won, matches['WPts'], matches['LPts'])
    matches['lower_rank_points'] = np.where(higher_rank_won, matches['LPts'], matches['WPts'])
    matches['points_diff'] = matches['higher_rank_points'] - matches['lower_rank_points']

    # Keep the essential columns, mark categoricals, and reset the index so the
    # rows line up positionally with the encoder output during concatenation.
    filtered = (
        matches[essential_columns]
        .astype({feature: 'category' for feature in categorical_features})
        .reset_index(drop=True)
    )

    if encoder is None:
        # NOTE(review): tennis_tools.OneHotEncoder is presumably scikit-learn's
        # encoder - confirm. Newer releases name the dense-output flag
        # `sparse_output`; older ones only accept `sparse`.
        try:
            encoder = tennis_tools.OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            encoder = tennis_tools.OneHotEncoder(handle_unknown='ignore', sparse=False)
        # A freshly created encoder has nothing to reuse, so it must be fitted.
        fit_encoder = True

    if fit_encoder:
        encoded_values = encoder.fit_transform(filtered[categorical_features])
    else:
        encoded_values = encoder.transform(filtered[categorical_features])

    encoded_frame = pd.DataFrame(
        encoded_values,
        columns=encoder.get_feature_names_out(categorical_features),
    )

    # Swap the raw categorical columns for their encoded counterparts.
    combined = pd.concat([filtered.drop(columns=categorical_features), encoded_frame], axis=1)

    # Remove columns that carry no information at all (entirely NaN).
    return combined.dropna(axis=1, how='all')

In [3]:
# Year-keyed container for the raw per-season DataFrames (filled by the loading loop).
# Note: the same name is later rebound to the single concatenated DataFrame.
betting_data_dfs = {}

In [4]:
# Load one workbook per season (2005 through 2019) into the year-keyed dictionary.
for current_year in range(2005, 2020):
    # Seasons before 2013 are stored as .xls workbooks; 2013 onwards use .xlsx.
    file_extension = 'xls' if current_year < 2013 else 'xlsx'

    # Relative path of this season's workbook.
    file_path = f"Betting_Odds_Tennis/{current_year}.{file_extension}"

    # Read with the notebook's own pandas import instead of reaching through
    # tennis_tools for it, consistent with the rest of the notebook.
    betting_data_dfs[current_year] = pd.read_excel(file_path)

In [5]:
# Inspect the frame loaded from the 2019 workbook. This lookup only works while
# betting_data_dfs is still the year-keyed dictionary (before the concat cell).
betting_data_dfs[2019]

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,Lsets,Comment,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL
0,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Dimitrov G.,...,0.0,Completed,1.36,3.00,1.36,3.37,1.42,3.60,1.35,3.18
1,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Raonic M.,...,0.0,Completed,1.18,4.50,1.23,4.68,1.27,4.84,1.22,4.26
2,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Kecmanovic M.,...,0.0,Completed,1.57,2.25,1.67,2.32,1.71,2.40,1.63,2.28
3,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Millman J.,...,1.0,Completed,1.40,2.75,1.41,3.13,1.45,3.20,1.40,2.95
4,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Uchiyama Y.,...,0.0,Completed,2.62,1.44,2.73,1.51,3.26,1.53,2.69,1.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2605,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,...,1.0,Completed,1.44,2.75,1.39,3.26,1.48,3.30,1.41,2.93
2606,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,...,0.0,Completed,1.90,1.90,2.14,1.79,2.24,2.06,1.92,1.90
2607,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,...,0.0,Completed,3.50,1.30,3.75,1.33,3.75,1.40,3.39,1.33
2608,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,...,0.0,Completed,1.80,2.00,1.84,2.10,1.87,2.20,1.78,2.06


In [6]:
# Combine the per-year DataFrames into a single DataFrame with a fresh 0..n-1 index.
# This rebinds betting_data_dfs from a dict to a DataFrame, so guard on the type:
# re-running the cell afterwards is then a no-op instead of failing because a
# DataFrame's `.values` attribute is not callable.
if isinstance(betting_data_dfs, dict):
    betting_data_dfs = pd.concat(list(betting_data_dfs.values()), ignore_index=True)

In [7]:
# Summarize the combined DataFrame: row count, column names, non-null counts and dtypes
betting_data_dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40390 entries, 0 to 40389
Data columns (total 48 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         40390 non-null  int64         
 1   Location    40390 non-null  object        
 2   Tournament  40390 non-null  object        
 3   Date        40390 non-null  datetime64[ns]
 4   Series      40390 non-null  object        
 5   Court       40390 non-null  object        
 6   Surface     40390 non-null  object        
 7   Round       40390 non-null  object        
 8   Best of     40390 non-null  int64         
 9   Winner      40390 non-null  object        
 10  Loser       40390 non-null  object        
 11  WRank       40375 non-null  float64       
 12  LRank       40303 non-null  float64       
 13  WPts        38701 non-null  float64       
 14  LPts        38631 non-null  float64       
 15  W1          40155 non-null  float64       
 16  L1          40157 non-

In [8]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
betting_data_dfs.describe()

Unnamed: 0,ATP,Best of,WRank,LRank,WPts,LPts,W1,L1,W4,L4,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
count,40390.0,40390.0,40375.0,40303.0,38701.0,38631.0,40155.0,40157.0,3647.0,3647.0,...,10671.0,10671.0,28131.0,28142.0,15572.0,15579.0,25354.0,25354.0,25354.0,25354.0
mean,32.974944,3.378311,57.801536,90.38486,1828.537195,1054.728379,5.801992,4.075155,5.783384,3.865643,...,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.99861,7.847864,1.838168,3.547658
std,18.006138,0.783274,72.735132,115.423997,2278.996487,1212.422674,1.232787,1.841617,1.262227,1.903181,...,0.996238,3.646316,1.031691,3.075889,1.004273,3.27251,1.582432,376.24683,1.089277,3.22777
min,1.0,3.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.01,1.02,1.0,1.0,1.0,1.01,1.01,1.01,1.01,1.01
25%,19.0,3.0,16.0,34.0,645.0,502.0,6.0,3.0,6.0,2.0,...,1.24,1.75,1.25,1.73,1.22,1.73,1.3,1.84,1.25,1.74
50%,33.0,3.0,40.0,64.0,1010.0,745.0,6.0,4.0,6.0,4.0,...,1.5,2.5,1.5,2.5,1.5,2.63,1.58,2.75,1.51,2.53
75%,49.0,3.0,75.0,102.0,1890.0,1150.0,6.0,6.0,6.0,6.0,...,2.03,3.85,2.0,4.0,2.0,4.0,2.21,4.47,2.07,3.91
max,67.0,5.0,1890.0,2159.0,16950.0,16950.0,7.0,7.0,7.0,7.0,...,18.0,60.0,26.0,51.0,19.0,81.0,76.0,42586.0,23.45,36.44


In [9]:
# List the column names of the combined DataFrame
betting_data_dfs.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'CBW', 'CBL', 'EXW', 'EXL', 'IWW',
       'IWL', 'PSW', 'PSL', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL', 'MaxW',
       'MaxL', 'AvgW', 'AvgL'],
      dtype='object')

## Splitting the dataset into training and validation sets

In [10]:
# Matches dated strictly before split_time form the training set.
split_time = "2019-01-01"

# Take an explicit copy so the training split is an independent frame: later
# column assignments on it neither warn (SettingWithCopyWarning) nor depend on
# the combined DataFrame.
betting_data_train = betting_data_dfs[betting_data_dfs.Date < split_time].copy()

In [11]:
# Show the last rows of the training split to check where it ends
betting_data_train.tail()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
37838,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Darcis S.,...,,,,,,,2.47,1.65,2.35,1.59
37839,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Munar J.,...,,,,,,,2.08,1.95,1.94,1.86
37840,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Donskoy E.,...,,,,,,,1.57,2.65,1.51,2.53
37841,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Mmoh M.,...,,,,,,,1.83,2.17,1.74,2.09
37842,3,Pune,Maharashtra Open,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Gulbis E.,...,,,,,,,1.4,3.5,1.35,3.19


In [12]:
# Column summary (non-null counts and dtypes) of the training split
betting_data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37797 entries, 0 to 37842
Data columns (total 48 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         37797 non-null  int64         
 1   Location    37797 non-null  object        
 2   Tournament  37797 non-null  object        
 3   Date        37797 non-null  datetime64[ns]
 4   Series      37797 non-null  object        
 5   Court       37797 non-null  object        
 6   Surface     37797 non-null  object        
 7   Round       37797 non-null  object        
 8   Best of     37797 non-null  int64         
 9   Winner      37797 non-null  object        
 10  Loser       37797 non-null  object        
 11  WRank       37785 non-null  float64       
 12  LRank       37723 non-null  float64       
 13  WPts        36110 non-null  float64       
 14  LPts        36051 non-null  float64       
 15  W1          37583 non-null  float64       
 16  L1          37585 non-

## Preprocessing betting odds training dataset

In [13]:
# Build the training frame: rank clean-up, points features and one-hot encoding
betting_data_train_preprocessed = preprocess_dataset(betting_data_train) 

In [14]:
# Summary of the preprocessed training frame (column count grows through one-hot encoding)
betting_data_train_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37797 entries, 0 to 37796
Columns: 300 entries, ATP to Round_The Final
dtypes: datetime64[ns](1), float64(294), int64(3), object(2)
memory usage: 86.5+ MB


In [15]:
# Column means of every numeric training column; these are reused later to
# impute the validation set, so they must come from the training data only.
means = betting_data_train_preprocessed.select_dtypes(include=[np.number]).mean()

# Fill missing numeric values with the training means. Reassigning (rather than
# inplace=True) keeps the cell idempotent and free of in-place mutation.
betting_data_train_preprocessed = betting_data_train_preprocessed.fillna(means)

## Preprocessing the betting odds validation dataset

In [16]:
# Validation set: every match dated on or after split_time. The explicit copy
# makes it an independent frame, so later column assignments neither warn
# (SettingWithCopyWarning) nor depend on the combined DataFrame.
betting_data_df_validation = betting_data_dfs[betting_data_dfs.Date >= split_time].copy()

In [17]:
# Show the last rows of the validation split
betting_data_df_validation.tail()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
40385,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,...,,,,,,,1.48,3.3,1.41,2.93
40386,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,...,,,,,,,2.24,2.06,1.92,1.9
40387,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,...,,,,,,,3.75,1.4,3.39,1.33
40388,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,...,,,,,,,1.87,2.2,1.78,2.06
40389,66,London,Masters Cup,2019-11-17,Masters Cup,Indoor,Hard,The Final,3,Tsitsipas S.,...,,,,,,,2.05,1.93,1.96,1.86


In [18]:
# Column summary (non-null counts and dtypes) of the validation split
betting_data_df_validation.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2593 entries, 37785 to 40389
Data columns (total 48 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         2593 non-null   int64         
 1   Location    2593 non-null   object        
 2   Tournament  2593 non-null   object        
 3   Date        2593 non-null   datetime64[ns]
 4   Series      2593 non-null   object        
 5   Court       2593 non-null   object        
 6   Surface     2593 non-null   object        
 7   Round       2593 non-null   object        
 8   Best of     2593 non-null   int64         
 9   Winner      2593 non-null   object        
 10  Loser       2593 non-null   object        
 11  WRank       2590 non-null   float64       
 12  LRank       2580 non-null   float64       
 13  WPts        2591 non-null   float64       
 14  LPts        2580 non-null   float64       
 15  W1          2572 non-null   float64       
 16  L1          2572 no

In [19]:
# Apply the same preprocessing steps to the validation dataset.
# NOTE(review): this call fits its own one-hot encoder on the validation rows, so
# the encoded columns follow the validation categories only (the saved outputs show
# 160 columns here versus 300 for training). Only 'points_diff' is used by the model
# below; any model using the encoded columns would need a shared encoder.
betting_data_df_validation_preprocessed = preprocess_dataset(betting_data_df_validation)

In [20]:
# Impute gaps in the validation frame with the means learned from the training
# data, so no statistic of the validation set leaks into its own preprocessing.
betting_data_df_validation_preprocessed = betting_data_df_validation_preprocessed.fillna(means)

In [21]:
# Preview the first rows of the preprocessed validation frame
betting_data_df_validation_preprocessed.head()

Unnamed: 0,ATP,Date,Best of,higher_rank_won,higher_rank_points,lower_rank_points,points_diff,WRank,LRank,Wsets,...,Surface_Grass,Surface_Hard,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final
0,1,2019-01-01,3,0,974.0,810.0,164.0,63.0,49.0,2.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2019-01-01,3,1,1050.0,875.0,175.0,40.0,57.0,2.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2019-01-01,3,0,206.0,200.0,6.0,240.0,234.0,2.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,2019-01-01,3,1,1125.0,810.0,315.0,35.0,62.0,2.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,2019-01-01,3,0,367.0,200.0,167.0,239.0,146.0,2.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# List the columns of the preprocessed validation frame
betting_data_df_validation_preprocessed.columns

Index(['ATP', 'Date', 'Best of', 'higher_rank_won', 'higher_rank_points',
       'lower_rank_points', 'points_diff', 'WRank', 'LRank', 'Wsets',
       ...
       'Surface_Grass', 'Surface_Hard', 'Round_1st Round', 'Round_2nd Round',
       'Round_3rd Round', 'Round_4th Round', 'Round_Quarterfinals',
       'Round_Round Robin', 'Round_Semifinals', 'Round_The Final'],
      dtype='object', length=160)

## Fitting Logistic Regression Model

In [23]:
from sklearn.linear_model import LogisticRegression

# Intercept-free logistic regression: the single feature is 'points_diff' and the
# target is 'higher_rank_won'. fit() returns the estimator, so it is bound directly.
logr = LogisticRegression(fit_intercept=False).fit(
    betting_data_train_preprocessed[['points_diff']],
    betting_data_train_preprocessed["higher_rank_won"],
)

# Echo the fitted estimator as the cell output.
logr

LogisticRegression(fit_intercept=False)

### Logistic Regression Predictions on the Training Data

In [24]:
# Single-column feature frame the model was trained on.
train_points_diff_features = betting_data_train_preprocessed[['points_diff']]

# Hard class labels for the training rows.
tennis_train_predictions_logr = logr.predict(train_points_diff_features)

# Probability of the positive class (higher_rank_won = 1): column 1 of predict_proba.
tennis_train_prediction_prob_logr = logr.predict_proba(train_points_diff_features)[:, 1]

### Evaluation of the Logistic Regression Model on the Training Data

In [26]:
# Score the training predictions against the true outcomes; the helper from
# Tennis_Analysis_Tools returns the accuracy / calibration / log_loss dict shown below
tennis_tools.evaluate_predictions(betting_data_train_preprocessed["higher_rank_won"], tennis_train_predictions_logr, tennis_train_prediction_prob_logr)

{'accuracy': 0.6627, 'calibration': 0.9329, 'log_loss': 0.6256}

## Evaluating Logistic Model on top 50 and top 100 players

In [27]:
# Preprocess the complete dataset (all years) as input for the top-player lookup below
betting_data_dfs_preprocessed = preprocess_dataset(betting_data_dfs)

In [28]:
# precompute_top_players comes from Tennis_Analysis_Tools; judging by the output
# displayed below, it returns a dict keyed by year whose values are sets of player names.
# NOTE(review): the second argument is presumably the rank cutoff - confirm in the helper.
top_50_players = tennis_tools.precompute_top_players(betting_data_dfs_preprocessed, 50)
top_100_players = tennis_tools.precompute_top_players(betting_data_dfs_preprocessed, 100)

In [29]:
# Display the per-year player sets computed for the top-50 cutoff
top_50_players

{2005: {'Coria G.',
  'Federer R.',
  'Gaudio G.',
  'Henman T.',
  'Hewitt L.',
  'Moya C.',
  'Nadal R.',
  'Roddick A.',
  'Safin M.'},
 2006: {'Davydenko N.',
  'Federer R.',
  'Hewitt L.',
  'Ljubicic I.',
  'Nadal R.',
  'Nalbandian D.',
  'Roddick A.'},
 2007: {'Davydenko N.',
  'Djokovic N.',
  'Federer R.',
  'Ljubicic I.',
  'Nadal R.',
  'Roddick A.'},
 2008: {'Davydenko N.', 'Djokovic N.', 'Federer R.', 'Ferrer D.', 'Nadal R.'},
 2009: {'Djokovic N.', 'Federer R.', 'Federer R. ', 'Murray A.', 'Nadal R.'},
 2010: {'Djokovic N.', 'Federer R.', 'Murray A.', 'Nadal R.'},
 2011: {'Djokovic N.', 'Federer R.', 'Murray A.', 'Nadal R.', 'Soderling R.'},
 2012: {'Djokovic N.',
  'Federer R.',
  'Ferrer D.',
  'Murray A.',
  'Nadal R.',
  'Tsonga J.W.'},
 2013: {'Djokovic N.', 'Federer R.', 'Ferrer D.', 'Murray A.', 'Nadal R.'},
 2014: {'Del Potro J.M.',
  'Djokovic N.',
  'Federer R.',
  'Ferrer D.',
  'Murray A.',
  'Nadal R.',
  'Wawrinka S.'},
 2015: {'Berdych T.',
  'Djokovic N.'

In [30]:
# Flag validation matches in which BOTH players belong to the 2019 top-player set.
# Names are compared after stripping surrounding whitespace, because the player
# sets can contain the same player twice with and without a trailing space
# (e.g. 'Federer R.' and 'Federer R. ').
validation_winners = betting_data_df_validation_preprocessed['Winner'].str.strip()
validation_losers = betting_data_df_validation_preprocessed['Loser'].str.strip()

# Vectorized membership test instead of a row-wise apply: faster, and it also
# yields a proper (empty) boolean column when the validation frame has no rows.
for flag_column, players_by_year in (('Top50', top_50_players), ('Top100', top_100_players)):
    players_2019 = {name.strip() for name in players_by_year[2019]}
    betting_data_df_validation_preprocessed[flag_column] = (
        validation_winners.isin(players_2019) & validation_losers.isin(players_2019)
    )

In [31]:
# Validation matches where both players are in the 2019 top-50 set.
betting_data_df_validation_top_50 = betting_data_df_validation_preprocessed.loc[
    betting_data_df_validation_preprocessed['Top50'].eq(True)
]

# Validation matches where both players are in the 2019 top-100 set.
betting_data_df_validation_top_100 = betting_data_df_validation_preprocessed.loc[
    betting_data_df_validation_preprocessed['Top100'].eq(True)
]

In [32]:
# Show the last rows of the top-50 validation subset
betting_data_df_validation_top_50.tail()

Unnamed: 0,ATP,Date,Best of,higher_rank_won,higher_rank_points,lower_rank_points,points_diff,WRank,LRank,Wsets,...,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final,Top50,Top100
2162,52,2019-09-08,5,1,7945.0,4125.0,3820.0,2.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,True,True
2383,59,2019-10-11,3,0,7130.0,4185.0,2945.0,6.0,3.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,True,True
2387,59,2019-10-13,3,1,4965.0,4185.0,780.0,4.0,6.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,True,True
2584,66,2019-11-13,3,1,9585.0,5705.0,3880.0,1.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,True
2587,66,2019-11-14,3,0,8945.0,6190.0,2755.0,3.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True,True


#### Evaluating Model on top 50

In [33]:
# Generate class predictions for the top-50 subset of the validation data.
# Note: these result names are overwritten by the top-100 cell further down, so
# the evaluation cell that follows must run before that cell does.
tennis_validation_predictions_logr = logr.predict(betting_data_df_validation_top_50[['points_diff']])

# Probability of the positive class (higher_rank_won = 1) for the top-50 subset
tennis_validation_prediction_prob_logr = logr.predict_proba(betting_data_df_validation_top_50[['points_diff']])[:, 1]

In [37]:
# Evaluate the model's predictions on the top-50 validation subset
# (expects the prediction variables to still hold the top-50 results)
tennis_tools.evaluate_predictions(betting_data_df_validation_top_50['higher_rank_won'], tennis_validation_predictions_logr,  tennis_validation_prediction_prob_logr)

{'accuracy': 0.5385, 'calibration': 1.5206, 'log_loss': 1.0499}

#### Evaluating Model on top 100 

In [38]:
# Generate class predictions for the top-100 subset of the validation data.
# Note: this overwrites the variables that previously held the top-50 results.
tennis_validation_predictions_logr = logr.predict(betting_data_df_validation_top_100[['points_diff']])

# Probability of the positive class (higher_rank_won = 1) for the top-100 subset
tennis_validation_prediction_prob_logr = logr.predict_proba(betting_data_df_validation_top_100[['points_diff']])[:, 1]

In [39]:
# Evaluate the model's predictions on the top-100 validation subset
# (expects the prediction variables to hold the top-100 results)
tennis_tools.evaluate_predictions(betting_data_df_validation_top_100['higher_rank_won'], tennis_validation_predictions_logr,  tennis_validation_prediction_prob_logr)

{'accuracy': 0.5614, 'calibration': 1.4134, 'log_loss': 0.8933}