### Import Libraries

In [1]:
# Standard-library imports
import os                            # For reading the optional data-directory override
import warnings
from pathlib import Path             # For building file paths

# Import the import_ipynb library to enable importing Jupyter notebooks as modules
import import_ipynb

# Import custom Tennis Analysis Tools module
import Tennis_Analysis_Tools as tennis_tools

# Import necessary libraries
import numpy as np                   # For numerical operations
import pandas as pd                  # For data manipulation and analysis
import matplotlib.pyplot as plt      # For data visualization
from sklearn.preprocessing import OneHotEncoder  # For one-hot encoding categorical data

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

importing Jupyter notebook from Tennis_Analysis_Tools.ipynb


### Function to preprocess the dataset

In [2]:
def preprocess_dataset(df, missing_rank=100000):
    """
    Build the modelling table from the raw tennis betting data.

    Steps, in order:
      1. Keep only rows whose 'Comment' value is 'Completed'.
      2. Coerce 'WRank' and 'LRank' to numeric (unparseable values become NaN),
         replace missing ranks with ``missing_rank`` and cast to float.
      3. Add 'higher_rank_won': 1 when WRank < LRank, otherwise 0. Equal ranks
         (including two players that both received ``missing_rank``) give 0.
      4. Keep 'Date', 'WRank', 'LRank' and 'higher_rank_won', then fill any
         remaining missing values in numeric columns with the column mean.

    The input frame is not modified. The returned frame keeps the original
    index labels of the retained rows (the index is not reset).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least 'Comment', 'Date', 'WRank' and 'LRank'.
    missing_rank : int or float, default 100000
        Value substituted for a missing or non-numeric rank. The default is a
        very large number, i.e. a very low ranking.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date', 'WRank', 'LRank', 'higher_rank_won' for completed
        matches only.

    Raises
    ------
    KeyError
        If one of the required columns is absent from ``df``.
    """
    # Select the completed matches and take an explicit copy, so the column
    # assignments below write to an independent frame instead of a slice of
    # the caller's data (the pattern that triggers SettingWithCopyWarning).
    completed = df.loc[
        df['Comment'].isin(['Completed']),
        ['Date', 'WRank', 'LRank'],
    ].copy()

    # Normalise both rank columns in the same way
    for rank_column in ('WRank', 'LRank'):
        completed[rank_column] = (
            pd.to_numeric(completed[rank_column], errors='coerce')
            .fillna(missing_rank)
            .astype(float)
        )

    # A numerically smaller rank is the better rank, so WRank < LRank means
    # the higher-ranked player won
    completed['higher_rank_won'] = (completed['WRank'] < completed['LRank']).astype(int)

    # Safety net: the numeric columns are already complete at this point, so
    # this normally changes nothing. Non-numeric columns ('Date') are untouched.
    numeric_means = completed.select_dtypes(include=[np.number]).mean()
    return completed.fillna(numeric_means)

### Function to compute the probability that the higher-ranked player wins

In [3]:
def probability_higher_ranked_wins(betting_data_df):
    """
    Return the fraction of matches won by the higher-ranked player.

    A match counts as won by the higher-ranked player when the winner's rank
    ('WRank') is numerically smaller than the loser's rank ('LRank'). Rows
    where the comparison is false (equal ranks, or a NaN rank) count as not won.

    Parameters
    ----------
    betting_data_df : pandas.DataFrame
        Must contain the columns 'WRank' and 'LRank'.

    Returns
    -------
    float
        Wins by the higher-ranked player divided by the number of rows, or
        NaN when the frame has no rows (there is nothing to average).

    Raises
    ------
    KeyError
        If 'WRank' or 'LRank' is missing.
    """
    total_matches = len(betting_data_df)

    # Guard the division: an empty frame has no defined win rate
    if total_matches == 0:
        return float('nan')

    # Column-wise comparison instead of a Python-level call per row
    higher_ranked_won = betting_data_df['WRank'] < betting_data_df['LRank']

    return higher_ranked_won.sum() / total_matches

### Loading Tennis Betting Data

In [4]:
# Container for the yearly DataFrames, keyed by year
betting_data_dfs = dict()

In [5]:
# Folder holding one workbook per year, named "<year>.xls" or "<year>.xlsx".
# Set the TENNIS_ODDS_DIR environment variable to load from another location;
# the fallback is the folder this notebook was originally run against.
DATA_DIR = Path(os.environ.get(
    "TENNIS_ODDS_DIR",
    "/Users/harishthota/Desktop/UOA Project/Betting_Odds_Tennis",
))

FIRST_YEAR, LAST_YEAR = 2005, 2019   # Years to load (both inclusive)
FIRST_XLSX_YEAR = 2013               # Files before this year use the '.xls' extension

# Loop through each year from FIRST_YEAR to LAST_YEAR
for current_year in range(FIRST_YEAR, LAST_YEAR + 1):
    # Determine the file extension based on the year
    file_extension = 'xls' if current_year < FIRST_XLSX_YEAR else 'xlsx'

    # Construct the file path using the determined file extension
    file_path = DATA_DIR / f"{current_year}.{file_extension}"

    # Read the Excel file and store it in the dictionary with the year as the key
    betting_data_dfs[current_year] = pd.read_excel(file_path)

In [6]:
# Combine DataFrames from all years into a single DataFrame, reindexing rows.
# The name `betting_data_dfs` is rebound here from the per-year dictionary to
# the combined DataFrame. The isinstance guard keeps the cell safe to re-run:
# without it a second execution would call `.values()` on a DataFrame, where
# `values` is an array attribute rather than a method, and raise TypeError.
if isinstance(betting_data_dfs, dict):
    betting_data_dfs = pd.concat(betting_data_dfs.values(), ignore_index=True)

### Analyzing Betting Data of Tennis

In [7]:
# Display a summary of the DataFrame to understand its structure and data types.
# The non-null counts expose gaps: in the recorded output WRank and LRank have
# fewer non-null entries than there are rows, so some matches lack a rank.
betting_data_dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40390 entries, 0 to 40389
Data columns (total 48 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         40390 non-null  int64         
 1   Location    40390 non-null  object        
 2   Tournament  40390 non-null  object        
 3   Date        40390 non-null  datetime64[ns]
 4   Series      40390 non-null  object        
 5   Court       40390 non-null  object        
 6   Surface     40390 non-null  object        
 7   Round       40390 non-null  object        
 8   Best of     40390 non-null  int64         
 9   Winner      40390 non-null  object        
 10  Loser       40390 non-null  object        
 11  WRank       40375 non-null  float64       
 12  LRank       40303 non-null  float64       
 13  WPts        38701 non-null  float64       
 14  LPts        38631 non-null  float64       
 15  W1          40155 non-null  float64       
 16  L1          40157 non-

In [8]:
# Generate descriptive statistics for numerical columns in the DataFrame.
# NOTE(review): in the recorded output the maximum of MaxL (42586.0) is far
# above its 75th percentile (4.47) - possibly a data-entry outlier; verify
# before relying on the odds columns.
betting_data_dfs.describe()

Unnamed: 0,ATP,Best of,WRank,LRank,WPts,LPts,W1,L1,W4,L4,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
count,40390.0,40390.0,40375.0,40303.0,38701.0,38631.0,40155.0,40157.0,3647.0,3647.0,...,10671.0,10671.0,28131.0,28142.0,15572.0,15579.0,25354.0,25354.0,25354.0,25354.0
mean,32.974944,3.378311,57.801536,90.38486,1828.537195,1054.728379,5.801992,4.075155,5.783384,3.865643,...,1.815867,3.542479,1.810226,3.451461,1.796538,3.557943,1.99861,7.847864,1.838168,3.547658
std,18.006138,0.783274,72.735132,115.423997,2278.996487,1212.422674,1.232787,1.841617,1.262227,1.903181,...,0.996238,3.646316,1.031691,3.075889,1.004273,3.27251,1.582432,376.24683,1.089277,3.22777
min,1.0,3.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.01,1.02,1.0,1.0,1.0,1.01,1.01,1.01,1.01,1.01
25%,19.0,3.0,16.0,34.0,645.0,502.0,6.0,3.0,6.0,2.0,...,1.24,1.75,1.25,1.73,1.22,1.73,1.3,1.84,1.25,1.74
50%,33.0,3.0,40.0,64.0,1010.0,745.0,6.0,4.0,6.0,4.0,...,1.5,2.5,1.5,2.5,1.5,2.63,1.58,2.75,1.51,2.53
75%,49.0,3.0,75.0,102.0,1890.0,1150.0,6.0,6.0,6.0,6.0,...,2.03,3.85,2.0,4.0,2.0,4.0,2.21,4.47,2.07,3.91
max,67.0,5.0,1890.0,2159.0,16950.0,16950.0,7.0,7.0,7.0,7.0,...,18.0,60.0,26.0,51.0,19.0,81.0,76.0,42586.0,23.45,36.44


### Splitting the Dataset into Training and Validation

In [9]:
# Cut-off date: matches strictly before it form the training set
split_time = "2019-01-01"
betting_data_train = betting_data_dfs.loc[betting_data_dfs["Date"] < split_time]

### Preprocessing and Analysis of Betting Odds Training Dataset

In [10]:
# Run the training split through the preprocessing function
betting_data_preprocessed_train = betting_data_train.pipe(preprocess_dataset)

In [11]:
# Display a concise summary of the preprocessed training dataset.
# Every column should report as many non-null values as there are rows,
# because preprocessing replaces missing ranks with a sentinel value.
betting_data_preprocessed_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36334 entries, 0 to 37842
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Date             36334 non-null  datetime64[ns]
 1   WRank            36334 non-null  float64       
 2   LRank            36334 non-null  float64       
 3   higher_rank_won  36334 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(1)
memory usage: 1.4 MB


### Preprocessing and Analysis of Betting Odds Validation Dataset

In [12]:
# Validation set: every match dated on or after split_time (2019-01-01)
betting_data_validation = betting_data_dfs.loc[betting_data_dfs["Date"] >= split_time]

In [13]:
# Run the validation split through the same preprocessing function as the training split
betting_data_preprocessed_validation = betting_data_validation.pipe(preprocess_dataset)

In [14]:
# Display the first 5 rows of the preprocessed validation dataset.
# The index labels are carried over from the combined DataFrame (preprocessing
# does not reset the index), so they do not start at 0.
betting_data_preprocessed_validation.head()

Unnamed: 0,Date,WRank,LRank,higher_rank_won
37785,2019-01-01,63.0,49.0,0
37786,2019-01-01,40.0,57.0,1
37787,2019-01-01,240.0,234.0,0
37788,2019-01-01,35.0,62.0,1
37789,2019-01-01,239.0,146.0,0


### Naive Prediction and Evaluation for Training Data

In [15]:
# Share of preprocessed training matches won by the higher-ranked player
prob_higher_rank_win = betting_data_preprocessed_train.pipe(probability_higher_ranked_wins)

In [16]:
# Actual training outcomes as a NumPy array; used only as a shape/dtype template
train_outcome_template = betting_data_preprocessed_train["higher_rank_won"].to_numpy()

# Binary baseline: always predict that the higher-ranked player wins
naive_predictions = np.ones_like(train_outcome_template)

# Probability baseline: the training-set win rate of the higher-ranked player,
# repeated for every match and rounded to 5 decimals
naive_probability_predictions = np.full(
    train_outcome_template.shape, prob_higher_rank_win, dtype='float64'
).round(5)

# Show both baselines, separated by a blank line for readability
print("Naive Predictions for Training Data:", naive_predictions)
print()
print("Naive Probability Predictions for Training Data:", naive_probability_predictions)

Naive Predictions for Training Data: [1 1 1 ... 1 1 1]

Naive Probability Predictions for Training Data: [0.66808 0.66808 0.66808 ... 0.66808 0.66808 0.66808]


In [17]:
# Score the naive baselines against the actual training outcomes
actual_train_outcomes = betting_data_preprocessed_train["higher_rank_won"]
evaluation_results_train = tennis_tools.evaluate_predictions(
    actual_train_outcomes, naive_predictions, naive_probability_predictions
)

# The result unpacks into three metrics, in this order
accuracy, calibration, log_loss = evaluation_results_train

# Report the three training metrics, one per line
print(
    f"Accuracy of Training Dataset: {accuracy}",
    f"Calibration of Training Dataset: {calibration}",
    f"Log Loss of Training Dataset: {log_loss}",
    sep="\n",
)

Accuracy of Training Dataset: 0.66808
Calibration of Training Dataset: 1.0
Log Loss of Training Dataset: 0.63553


### Naive Prediction and Evaluation for Validation Data

In [18]:
# Actual validation outcomes as a NumPy array; used only as a shape/dtype template
validation_outcome_template = betting_data_preprocessed_validation["higher_rank_won"].to_numpy()

# Binary baseline: always predict that the higher-ranked player wins
naive_predictions = np.ones_like(validation_outcome_template)

# Probability baseline: reuse the win rate computed from the training dataset,
# repeated for every validation match and rounded to 5 decimals
naive_probability_predictions = np.full(
    validation_outcome_template.shape, prob_higher_rank_win, dtype='float64'
).round(5)

# Show both baselines, separated by a blank line for readability
print("Naive Predictions for Validation Data:", naive_predictions)
print()
print("Naive Probability Predictions for Validation Data:", naive_probability_predictions)

Naive Predictions for Validation Data: [1 1 1 ... 1 1 1]

Naive Probability Predictions for Validation Data: [0.66808 0.66808 0.66808 ... 0.66808 0.66808 0.66808]


In [19]:
# Score the naive baselines against the actual validation outcomes
actual_validation_outcomes = betting_data_preprocessed_validation["higher_rank_won"]
evaluation_results_validation = tennis_tools.evaluate_predictions(
    actual_validation_outcomes, naive_predictions, naive_probability_predictions
)

# The result unpacks into three metrics, in this order
accuracy, calibration, log_loss = evaluation_results_validation

# Report the three validation metrics, one per line
print(
    f"Accuracy of Validation Dataset: {accuracy}",
    f"Calibration of Validation Dataset: {calibration}",
    f"Log Loss of Validation Dataset: {log_loss}",
    sep="\n",
)

Accuracy of Validation Dataset: 0.61361
Calibration of Validation Dataset: 1.08877
Log Loss of Validation Dataset: 0.67363
