In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore') 

In [2]:
def accuracy(actual, predictions):
    logr_accuracy_all_predictors = np.mean(actual == predictions)
    return logr_accuracy_all_predictors  # Corrected indentation

def calibration(actual, predictions):
    return np.sum(predictions) / np.sum(actual)

def logloss(actual, predictions):
    epsilon = 1e-15
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    
    logr_logloss_all_predictors = -(1 / len(actual)) * np.sum(
        actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
    return logr_logloss_all_predictors

In [3]:
def calculate_implied_probabilities(odds_w, odds_l):
    if odds_w == 0 or odds_l == 0 or pd.isna(odds_w) or pd.isna(odds_l):
        return 0, 0 
    implied_prob_w = 1 / odds_w
    implied_prob_l = 1 / odds_l
    overround = implied_prob_w + implied_prob_l
    normalized_prob_w = implied_prob_w / overround
    normalized_prob_l = implied_prob_l / overround
    return normalized_prob_w, normalized_prob_l

In [4]:
def evaluate_predictions(actual_outcomes, binary_predictions,  probability_predictions):
    
    accuracy_result = accuracy(actual_outcomes, binary_predictions)
    
    calibration_result = calibration(actual_outcomes, probability_predictions)
    
    logloss_result = logloss(actual_outcomes, probability_predictions)
    
    return {
        accuracy_result,
        calibration_result,
        logloss_result
    }

In [5]:
tennis_dfs = {}

In [6]:
for year in range(2000, 2020): 
    file_extension = 'xls' if year < 2013 else 'xlsx'
    file_path = f"Betting_Odds_Tennis/{year}.{file_extension}"
    tennis_dfs[year] = pd.read_excel(file_path)

In [7]:
tennis_dfs[2019]

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,Lsets,Comment,B365W,B365L,PSW,PSL,MaxW,MaxL,AvgW,AvgL
0,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Dimitrov G.,...,0.0,Completed,1.36,3.00,1.36,3.37,1.42,3.60,1.35,3.18
1,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Raonic M.,...,0.0,Completed,1.18,4.50,1.23,4.68,1.27,4.84,1.22,4.26
2,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Kecmanovic M.,...,0.0,Completed,1.57,2.25,1.67,2.32,1.71,2.40,1.63,2.28
3,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Millman J.,...,1.0,Completed,1.40,2.75,1.41,3.13,1.45,3.20,1.40,2.95
4,1,Brisbane,Brisbane International,2018-12-31,ATP250,Outdoor,Hard,1st Round,3,Uchiyama Y.,...,0.0,Completed,2.62,1.44,2.73,1.51,3.26,1.53,2.69,1.47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2605,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Nadal R.,...,1.0,Completed,1.44,2.75,1.39,3.26,1.48,3.30,1.41,2.93
2606,66,London,Masters Cup,2019-11-15,Masters Cup,Indoor,Hard,Round Robin,3,Zverev A.,...,0.0,Completed,1.90,1.90,2.14,1.79,2.24,2.06,1.92,1.90
2607,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Tsitsipas S.,...,0.0,Completed,3.50,1.30,3.75,1.33,3.75,1.40,3.39,1.33
2608,66,London,Masters Cup,2019-11-16,Masters Cup,Indoor,Hard,Semifinals,3,Thiem D.,...,0.0,Completed,1.80,2.00,1.84,2.10,1.87,2.20,1.78,2.06


In [8]:
all_years_df = pd.concat(tennis_dfs.values(), ignore_index=True)

In [9]:
all_years_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54908 entries, 0 to 54907
Data columns (total 54 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         54908 non-null  int64         
 1   Location    54908 non-null  object        
 2   Tournament  54908 non-null  object        
 3   Date        54908 non-null  datetime64[ns]
 4   Series      54908 non-null  object        
 5   Court       54908 non-null  object        
 6   Surface     54908 non-null  object        
 7   Round       54908 non-null  object        
 8   Best of     54908 non-null  int64         
 9   Winner      54908 non-null  object        
 10  Loser       54908 non-null  object        
 11  WRank       54889 non-null  object        
 12  LRank       54816 non-null  object        
 13  W1          54624 non-null  float64       
 14  L1          54626 non-null  float64       
 15  W2          54102 non-null  object        
 16  L2          54103 non-

In [10]:
all_years_df.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'W1', 'L1',
       'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets', 'Lsets',
       'Comment', 'CBW', 'CBL', 'GBW', 'GBL', 'IWW', 'IWL', 'SBW', 'SBL',
       'B365W', 'B365L', 'B&WW', 'B&WL', 'EXW', 'EXL', 'PSW', 'PSL', 'WPts',
       'LPts', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL', 'MaxW', 'MaxL',
       'AvgW', 'AvgL'],
      dtype='object')

In [11]:
def preprocess_dataset(df):

    # Not considering Max and Avg columns

    odds_columns = ['B365W', 'B365L', 'B&WW', 'B&WL', 'CBW', 'CBL', 'EXW', 'EXL', 'LBW', 'LBL', 
                'GBW', 'GBL', 'IWW', 'IWL', 'PSW', 'PSL', 'SBW', 'SBL', 'SJW', 'SJL',
                'UBW', 'UBL']


    df['WRank'].replace('NR', np.nan, inplace=True)
    df['LRank'].replace('NR', np.nan, inplace=True)

    df['WRank'] = df['WRank'].fillna(100000).astype(float)
    df['LRank'] = df['LRank'].fillna(100000).astype(float)

    df['higher_rank_won'] = (df['WRank'] < df['LRank']).astype(int)
    df['higher_rank_points'] = df['higher_rank_won'] * df['WPts'] + df['LPts'] * (1 - df['higher_rank_won'])
    df['lower_rank_points'] = (1 - df['higher_rank_won']) * df['WPts'] + df['LPts'] * df['higher_rank_won']
    df['points_diff'] = df['higher_rank_points'] - df['lower_rank_points']

    essential_columns = ['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface', 'Round', 'Best of', 'higher_rank_won', 'higher_rank_points', 'lower_rank_points', 'points_diff', 'WRank', 'LRank', 'Wsets', 'Lsets', 'Winner', 'Loser']

    df_odds = df[essential_columns + odds_columns]


    df_odds[odds_columns] = df_odds[odds_columns].apply(pd.to_numeric, errors='coerce')


    df_odds['EXW'] = df_odds['EXW'].replace('2.,3', 2.3).astype(float)


    df_odds = df_odds.astype({'Location': 'category', 'Tournament': 'category', 'Series': 'category','Court': 'category', 'Surface': 'category', 'Round': 'category' })



    categorical_features = ['Location', 'Tournament', 'Series', 'Court', 'Surface','Round']
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)  # Ensure output is a dense array

    encoded_features = encoder.fit_transform(df_odds[categorical_features])

    df_encoded = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))

    df_odds.reset_index(drop=True, inplace=True)

    df_final = pd.concat([df_odds.drop(columns=categorical_features), df_encoded], axis=1)

    df_final.dropna(axis=1, how='all', inplace=True)

    return df_final

In [12]:
df_odds_preprocessed = preprocess_dataset(all_years_df) 

In [13]:
df_odds_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54908 entries, 0 to 54907
Columns: 396 entries, ATP to Round_The Final
dtypes: datetime64[ns](1), float64(390), int64(3), object(2)
memory usage: 165.9+ MB


In [14]:
means = df_odds_preprocessed.select_dtypes(include=[np.number]).mean()
df_odds_preprocessed.fillna(means, inplace=True)

In [15]:
players_ids = pd.concat([df_odds_preprocessed['Winner'], df_odds_preprocessed['Loser']]).unique()
players_ids

array(['Dosedel S.', 'Enqvist T.', 'Escude N.', ..., 'Dubrivnyy A.',
       'Kachmazov A.', 'Medvedev D. '], dtype=object)

In [16]:
atp_ranking_history = {player_id: [] for player_id in players_ids}
elo_scores_constant_k = {player: 1500 for player in players_ids}
elo_scores_varied_k = elo_scores_constant_k.copy()

In [17]:
def calculate_k(m_i, delta=100, nu=5, sigma=0.1):
    return delta / ((m_i + nu) ** sigma)

In [18]:
def update_elo(winner_elo, loser_elo, games_played_winner, games_played_loser, K, use_varied_k=False):
    if use_varied_k:
        K_winner = calculate_k(games_played_winner)
        K_loser = calculate_k(games_played_loser)
    else:
        K_winner = K_loser = K 

    expected_winner = 1 / (1 + 10 ** ((loser_elo - winner_elo) / 400))
    elo_change_winner = K_winner * (1 - expected_winner)
    elo_change_loser = K_loser * (expected_winner - 1)


    return (winner_elo, winner_elo + elo_change_winner, loser_elo, loser_elo + elo_change_loser, expected_winner)

In [19]:
games_played = {}

In [20]:
def calculate_elo_with_k(df_odds_preprocessed, k):
    players_ids = pd.concat([df_odds_preprocessed['Winner'], df_odds_preprocessed['Loser']]).unique()
    elo_scores = {player: 1500 for player in players_ids}
    games_played = {player: 0 for player in players_ids}


    for i in range(len(df_odds_preprocessed)):
        tennis_match_row = df_odds_preprocessed.iloc[i]
        winner_id = tennis_match_row['Winner']
        loser_id = tennis_match_row['Loser']
        higher_ranked_wins = tennis_match_row['higher_rank_won']

        games_played[winner_id] = games_played.get(winner_id, 0) + 1
        games_played[loser_id] = games_played.get(loser_id, 0) + 1

        winner_elo_before, loser_elo_before = elo_scores_constant_k[winner_id], elo_scores_constant_k[loser_id]


        _, winner_elo_after, _, loser_elo_after, prob_winner = update_elo(
            winner_elo_before, loser_elo_before, 
            games_played[winner_id], games_played[loser_id], k, use_varied_k=False)

        elo_scores_constant_k[winner_id], elo_scores_constant_k[loser_id] = winner_elo_after, loser_elo_after




        df_odds_preprocessed.at[i, 'winner_elo_before'] = winner_elo_before
        df_odds_preprocessed.at[i, 'winner_elo_after'] = winner_elo_after
        df_odds_preprocessed.at[i, 'loser_elo_before'] = loser_elo_before
        df_odds_preprocessed.at[i, 'loser_elo_after'] = loser_elo_after

        prob_winner = 1 / (1 + 10 ** ((loser_elo_before - winner_elo_before) / 400))
        df_odds_preprocessed.at[i, 'prob_winner'] = prob_winner

        if tennis_match_row['higher_rank_won'] == 1:
            df_odds_preprocessed.at[i, 'match_outcome'] = int(df_odds_preprocessed.at[i, 'prob_winner'] > 0.5)  # Convert boolean to int using int()
            df_odds_preprocessed.at[i, 'prob_high_ranked'] = df_odds_preprocessed.at[i, 'prob_winner']
        else:
            df_odds_preprocessed.at[i, 'match_outcome'] = int((1 - df_odds_preprocessed.at[i, 'prob_winner']) > 0.5)  # Convert boolean to int using int()
            df_odds_preprocessed.at[i, 'prob_high_ranked'] = 1 - df_odds_preprocessed.at[i, 'prob_winner']
            
    
    return df_odds_preprocessed

    

In [21]:
split_time = '2018-12-31'
df_odds_validation = df_odds_preprocessed[df_odds_preprocessed.Date > split_time]
df_odds_validation
results = []

In [22]:
for k in range(1,500):
    df_updated = calculate_elo_with_k(df_odds_preprocessed.copy(), k)
    df_validation = df_updated[df_updated.Date > '2018-12-31']
    df_validation["match_outcome"] = df_validation["match_outcome"].astype(int)

    accuracy1, calibration1, logloss1 = evaluate_predictions(df_validation["higher_rank_won"], 
                                                  df_validation["match_outcome"], 
                                                  df_validation['prob_high_ranked'])
    results.append((k, accuracy1, calibration1, logloss1))

In [23]:
optimal_k_logloss = min(results, key=lambda x: x[3])[0]
print(f"Optimal K based on Log Loss: {optimal_k_logloss}")

Optimal K based on Log Loss: 21


In [24]:
for result in results:
    print(f"K={result[0]}: Accuracy={result[1]:.4f}, Calibration={result[2]:.4f}, LogLoss={result[3]:.4f}")

K=1: Accuracy=0.6035, Calibration=0.8685, LogLoss=0.6599
K=2: Accuracy=0.6155, Calibration=0.9149, LogLoss=0.6470
K=3: Accuracy=0.6174, Calibration=0.9483, LogLoss=0.6407
K=4: Accuracy=0.6278, Calibration=0.9724, LogLoss=0.6358
K=5: Accuracy=0.6332, Calibration=0.9900, LogLoss=0.6316
K=6: Accuracy=0.6410, Calibration=1.0030, LogLoss=0.6281
K=7: Accuracy=0.6456, Calibration=1.0127, LogLoss=0.6253
K=8: Accuracy=0.6464, Calibration=1.0202, LogLoss=0.6231
K=9: Accuracy=0.6448, Calibration=1.0261, LogLoss=0.6214
K=10: Accuracy=0.6467, Calibration=1.0308, LogLoss=0.6200
K=11: Accuracy=0.6483, Calibration=1.0347, LogLoss=0.6190
K=12: Accuracy=0.6437, Calibration=1.0379, LogLoss=0.6181
K=13: Accuracy=0.6440, Calibration=1.0407, LogLoss=0.6174
K=14: Accuracy=0.6452, Calibration=1.0430, LogLoss=0.6169
K=15: Accuracy=0.6464, Calibration=1.0451, LogLoss=0.6164
K=16: Accuracy=0.6471, Calibration=1.0469, LogLoss=0.6160
K=17: Accuracy=0.6460, Calibration=1.0485, LogLoss=0.6158
K=18: Accuracy=0.6456, 