### Import Libraries 

In [1]:
# Standard-library imports
import os                            # For reading the optional data-directory environment override
from pathlib import Path             # For building file paths
import warnings

# Import the import_ipynb library to enable importing Jupyter notebooks as modules
import import_ipynb

# Import custom Tennis Analysis Tools module
import Tennis_Analysis_Tools as tennis_tools

# Import necessary libraries
import numpy as np                   # For numerical operations
import pandas as pd                  # For data manipulation and analysis
import matplotlib.pyplot as plt      # For data visualization
from sklearn.preprocessing import OneHotEncoder  # For one-hot encoding categorical data

# For logistic regression model
from sklearn.linear_model import LogisticRegression

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

importing Jupyter notebook from Tennis_Analysis_Tools.ipynb


### Function to preprocess the dataset

In [2]:
def preprocess_dataset(betting_data_dfs, missing_rank=100000):
    """
    Build the modelling table from raw tennis betting data.

    Keeps only matches whose 'Comment' is 'Completed', coerces the ranking
    columns to numeric, derives the ranking-points features and the
    'higher_rank_won' target, and fills any remaining missing numeric values
    with the column means of the frame being processed.

    Parameters
    ----------
    betting_data_dfs : pandas.DataFrame
        Raw match data. Must contain the columns 'Comment', 'Date', 'WRank',
        'LRank', 'WPts' and 'LPts'. The input frame is not modified.
    missing_rank : int or float, default 100000
        Placeholder rank used when 'WRank' or 'LRank' is missing or cannot be
        parsed as a number.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'Date', 'WRank', 'LRank', 'WPts', 'LPts',
        'higher_rank_points', 'lower_rank_points', 'points_diff' and
        'higher_rank_won', keeping the original index labels of the retained rows.

    Raises
    ------
    KeyError
        If any of the required columns is absent from the input.

    Notes
    -----
    * 'higher_rank_won' is 1 only when WRank is strictly smaller than LRank,
      so equal ranks (including two placeholder ranks) yield 0.
    * The points features are derived before imputation. A row with missing
      'WPts' or 'LPts' therefore receives the column mean of each derived
      feature, not a value recomputed from the imputed points.
    * The means come from the frame passed in, so each split is imputed with
      its own statistics.
    """
    required_columns = ['Comment', 'Date', 'WRank', 'LRank', 'WPts', 'LPts']
    missing_columns = [col for col in required_columns if col not in betting_data_dfs.columns]
    if missing_columns:
        raise KeyError(f"preprocess_dataset: missing required column(s): {missing_columns}")

    # Work on an explicit copy of the completed matches so that the column
    # assignments below never target a slice of the caller's frame.
    completed = betting_data_dfs.loc[betting_data_dfs['Comment'].isin(['Completed'])].copy()

    # Non-numeric rank entries become NaN, then the placeholder rank.
    for rank_column in ('WRank', 'LRank'):
        completed[rank_column] = (
            pd.to_numeric(completed[rank_column], errors='coerce')
            .fillna(missing_rank)
            .astype(float)
        )

    # Target: 1 if the winner held the better (numerically smaller) rank, else 0.
    completed['higher_rank_won'] = (completed['WRank'] < completed['LRank']).astype(int)

    # Use the 0/1 indicator to pick the winner's or the loser's points for each role.
    completed['higher_rank_points'] = (
        completed['higher_rank_won'] * completed['WPts']
        + completed['LPts'] * (1 - completed['higher_rank_won'])
    )
    completed['lower_rank_points'] = (
        (1 - completed['higher_rank_won']) * completed['WPts']
        + completed['LPts'] * completed['higher_rank_won']
    )

    # Points advantage of the higher-ranked player over the lower-ranked one.
    completed['points_diff'] = completed['higher_rank_points'] - completed['lower_rank_points']

    model_columns = [
        'Date', 'WRank', 'LRank', 'WPts', 'LPts',
        'higher_rank_points', 'lower_rank_points', 'points_diff', 'higher_rank_won',
    ]
    model_frame = completed[model_columns]

    # Impute remaining gaps in numeric columns with that column's mean;
    # fillna returns a new frame, so nothing is modified in place.
    column_means = model_frame.select_dtypes(include=[np.number]).mean()
    return model_frame.fillna(column_means)

### Loading the Tennis Betting Data

In [3]:
# Initialize a dictionary to store dataframes for each year (keyed by year).
# NOTE: the same name is later rebound to the single combined DataFrame by the
# pd.concat cell, so re-run this cell first whenever the yearly files are reloaded.
betting_data_dfs = {}

In [4]:
# Directory holding the yearly workbooks (one file per season, named "<year>.xls[x]").
# Set the TENNIS_BETTING_DATA_DIR environment variable to run the notebook on
# another machine; the default is the original author's local folder.
BETTING_DATA_DIR = Path(
    os.environ.get(
        "TENNIS_BETTING_DATA_DIR",
        "/Users/harishthota/Desktop/UOA Project/Betting_Odds_Tennis",
    )
)

# Loop through each year from 2005 to 2019
for current_year in range(2005, 2020):
    # Files before 2013 are stored as .xls, later ones as .xlsx
    file_extension = 'xls' if current_year < 2013 else 'xlsx'

    # Build the path of this year's workbook inside the data directory
    file_path = BETTING_DATA_DIR / f"{current_year}.{file_extension}"

    # Read the Excel file and store it in the dictionary with the year as the key
    betting_data_dfs[current_year] = pd.read_excel(file_path)

In [5]:
# Combine DataFrames from all years into a single DataFrame, reindexing rows.
# This rebinds `betting_data_dfs` from the {year: DataFrame} dictionary to the
# combined frame. The isinstance guard makes a second run of this cell a no-op
# instead of failing on `.values()` once the name already refers to a DataFrame.
if isinstance(betting_data_dfs, dict):
    betting_data_dfs = pd.concat(list(betting_data_dfs.values()), ignore_index=True)

In [6]:
# Display a summary of the combined DataFrame (columns, non-null counts, dtypes)
# to understand its structure and see which columns contain missing values
betting_data_dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40390 entries, 0 to 40389
Data columns (total 48 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         40390 non-null  int64         
 1   Location    40390 non-null  object        
 2   Tournament  40390 non-null  object        
 3   Date        40390 non-null  datetime64[ns]
 4   Series      40390 non-null  object        
 5   Court       40390 non-null  object        
 6   Surface     40390 non-null  object        
 7   Round       40390 non-null  object        
 8   Best of     40390 non-null  int64         
 9   Winner      40390 non-null  object        
 10  Loser       40390 non-null  object        
 11  WRank       40375 non-null  float64       
 12  LRank       40303 non-null  float64       
 13  WPts        38701 non-null  float64       
 14  LPts        38631 non-null  float64       
 15  W1          40155 non-null  float64       
 16  L1          40157 non-

In [7]:
# Generate descriptive statistics (count, mean, std, min, quartiles, max)
# for the numerical columns in the combined DataFrame
betting_data_dfs.describe()

Unnamed: 0,ATP,Best of,WRank,LRank,WPts,LPts,W1,L1,W4,L4,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
count,40390.0,40390.0,40375.0,40303.0,38701.0,38631.0,40155.0,40157.0,3647.0,3647.0,...,10671.0,10671.0,28131.0,28142.0,15572.0,15579.0,25354.0,25354.0,25354.0,25354.0
mean,32.974944,3.378311,57.801536,90.38486,1828.537195,1054.728379,5.801992,4.075155,5.783384,3.865643,...,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.99861,7.847864,1.838168,3.547658
std,18.006138,0.783274,72.735132,115.423997,2278.996487,1212.422674,1.232787,1.841617,1.262227,1.903181,...,0.996238,3.646316,1.031691,3.075889,1.004273,3.27251,1.582432,376.24683,1.089277,3.22777
min,1.0,3.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.01,1.02,1.0,1.0,1.0,1.01,1.01,1.01,1.01,1.01
25%,19.0,3.0,16.0,34.0,645.0,502.0,6.0,3.0,6.0,2.0,...,1.24,1.75,1.25,1.73,1.22,1.73,1.3,1.84,1.25,1.74
50%,33.0,3.0,40.0,64.0,1010.0,745.0,6.0,4.0,6.0,4.0,...,1.5,2.5,1.5,2.5,1.5,2.63,1.58,2.75,1.51,2.53
75%,49.0,3.0,75.0,102.0,1890.0,1150.0,6.0,6.0,6.0,6.0,...,2.03,3.85,2.0,4.0,2.0,4.0,2.21,4.47,2.07,3.91
max,67.0,5.0,1890.0,2159.0,16950.0,16950.0,7.0,7.0,7.0,7.0,...,18.0,60.0,26.0,51.0,19.0,81.0,76.0,42586.0,23.45,36.44


In [8]:
# Retrieve and display the column names of the combined DataFrame
betting_data_dfs.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'CBW', 'CBL', 'EXW', 'EXL', 'IWW',
       'IWL', 'PSW', 'PSL', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL', 'MaxW',
       'MaxL', 'AvgW', 'AvgL'],
      dtype='object')

### Splitting the Dataset into Training and Validation Sets

In [9]:
# Cut-off date for the chronological split: matches dated strictly before it
# make up the training set.
split_time = "2019-01-01"
is_before_split = betting_data_dfs['Date'] < split_time
betting_data_train = betting_data_dfs.loc[is_before_split]

### Preprocessing and Analysis of Betting Odds Training Dataset

In [10]:
# Apply preprocessing steps to the training dataset
betting_data_preprocessed_train = preprocess_dataset(betting_data_train)

In [11]:
# Check the columns, dtypes and non-null counts of the preprocessed training data
betting_data_preprocessed_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36334 entries, 0 to 37842
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                36334 non-null  datetime64[ns]
 1   WRank               36334 non-null  float64       
 2   LRank               36334 non-null  float64       
 3   WPts                36334 non-null  float64       
 4   LPts                36334 non-null  float64       
 5   higher_rank_points  36334 non-null  float64       
 6   lower_rank_points   36334 non-null  float64       
 7   points_diff         36334 non-null  float64       
 8   higher_rank_won     36334 non-null  int64         
dtypes: datetime64[ns](1), float64(7), int64(1)
memory usage: 2.8 MB


### Preprocessing and Analysis of Betting Odds Validation Dataset

In [12]:
# Validation set: every match dated on or after the split date
is_on_or_after_split = betting_data_dfs['Date'] >= split_time
betting_data_validation = betting_data_dfs.loc[is_on_or_after_split]

In [13]:
# Inspect the last rows of the raw validation split
betting_data_validation.tail()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
40385,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,...,,,,,,,1.48,3.3,1.41,2.93
40386,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,...,,,,,,,2.24,2.06,1.92,1.9
40387,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,...,,,,,,,3.75,1.4,3.39,1.33
40388,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,...,,,,,,,1.87,2.2,1.78,2.06
40389,66,London,Masters Cup,2019-11-17,Masters Cup,Indoor,Hard,The Final,3,Tsitsipas S.,...,,,,,,,2.05,1.93,1.96,1.86


In [14]:
# Summarize the raw validation split (non-null counts and dtypes) before preprocessing
betting_data_validation.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2593 entries, 37785 to 40389
Data columns (total 48 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         2593 non-null   int64         
 1   Location    2593 non-null   object        
 2   Tournament  2593 non-null   object        
 3   Date        2593 non-null   datetime64[ns]
 4   Series      2593 non-null   object        
 5   Court       2593 non-null   object        
 6   Surface     2593 non-null   object        
 7   Round       2593 non-null   object        
 8   Best of     2593 non-null   int64         
 9   Winner      2593 non-null   object        
 10  Loser       2593 non-null   object        
 11  WRank       2590 non-null   float64       
 12  LRank       2580 non-null   float64       
 13  WPts        2591 non-null   float64       
 14  LPts        2580 non-null   float64       
 15  W1          2572 non-null   float64       
 16  L1          2572 no

In [15]:
# Apply preprocessing steps to the validation dataset.
# NOTE(review): preprocess_dataset fills missing numeric values with the means of
# the frame it receives, so the validation data is imputed with its own means
# rather than the training means. Confirm this is intended.
betting_data_preprocessed_validation = preprocess_dataset(betting_data_validation)

In [16]:
# Preview the first rows of the preprocessed validation data
betting_data_preprocessed_validation.head()

Unnamed: 0,Date,WRank,LRank,WPts,LPts,higher_rank_points,lower_rank_points,points_diff,higher_rank_won
37785,2019-01-01,63.0,49.0,810.0,974.0,974.0,810.0,164.0,0
37786,2019-01-01,40.0,57.0,1050.0,875.0,1050.0,875.0,175.0,1
37787,2019-01-01,240.0,234.0,200.0,206.0,206.0,200.0,6.0,0
37788,2019-01-01,35.0,62.0,1125.0,810.0,1125.0,810.0,315.0,1
37789,2019-01-01,239.0,146.0,200.0,367.0,367.0,200.0,167.0,0


In [17]:
# List the columns retained in the preprocessed validation data
betting_data_preprocessed_validation.columns

Index(['Date', 'WRank', 'LRank', 'WPts', 'LPts', 'higher_rank_points',
       'lower_rank_points', 'points_diff', 'higher_rank_won'],
      dtype='object')

### Fitting Logistic Regression Model

In [18]:
# Single predictor: the points advantage of the higher-ranked player.
train_features = betting_data_preprocessed_train[['points_diff']]
# Target: 1 if the higher-ranked player won the match, 0 otherwise.
train_target = betting_data_preprocessed_train["higher_rank_won"]

# Logistic regression without an intercept term, fitted on the training data
logr = LogisticRegression(fit_intercept=False)
logr.fit(train_features, train_target)

LogisticRegression(fit_intercept=False)

### Logistic Prediction and Evaluation for Training Data

In [19]:
# Feature frame the fitted model is applied to (training data)
train_points_diff = betting_data_preprocessed_train[['points_diff']]

# Hard class predictions for the training data
tennis_train_predictions_logr = logr.predict(train_points_diff)

# Predicted probabilities for the training data; column 1 of predict_proba
# is the probability of the positive class (higher_rank_won = 1)
tennis_train_prediction_prob_logr = logr.predict_proba(train_points_diff)[:, 1]

In [20]:
# Score the training predictions with the shared evaluation helper, which
# returns the three values unpacked below.
train_actual_outcomes = betting_data_preprocessed_train["higher_rank_won"]
accuracy, calibration, log_loss = tennis_tools.evaluate_predictions(
    train_actual_outcomes,
    tennis_train_predictions_logr,
    tennis_train_prediction_prob_logr,
)

print(f"Accuracy: {accuracy}")
print(f"Calibration: {calibration}")
print(f"Log Loss: {log_loss}")

Accuracy: 0.668
Calibration: 0.93143
Log Loss: 0.62201


### Logistic Prediction and Evaluation for Validation Data

In [21]:
# Feature frame the fitted model is applied to (validation data)
validation_points_diff = betting_data_preprocessed_validation[['points_diff']]

# Hard class predictions for the validation data
tennis_validation_predictions_logr = logr.predict(validation_points_diff)

# Predicted probabilities for the validation data; column 1 of predict_proba
# is the probability of the positive class (higher_rank_won = 1)
tennis_validation_prediction_prob_logr = logr.predict_proba(validation_points_diff)[:, 1]

In [22]:
# Score the validation predictions with the shared evaluation helper, which
# returns the three values unpacked below.
validation_actual_outcomes = betting_data_preprocessed_validation['higher_rank_won']
accuracy, calibration, log_loss = tennis_tools.evaluate_predictions(
    validation_actual_outcomes,
    tennis_validation_predictions_logr,
    tennis_validation_prediction_prob_logr,
)

print(f"Accuracy: {accuracy}")
print(f"Calibration: {calibration}")
print(f"Log Loss: {log_loss}")

Accuracy: 0.61441
Calibration: 1.0091
Log Loss: 0.65069
