In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [2]:
def accuracy(actual, predictions):
    logr_accuracy_all_predictors = np.mean(actual == predictions)
    return logr_accuracy_all_predictors  # Corrected indentation

def calibration(actual, predictions):
    return np.sum(predictions) / np.sum(actual)

def logloss(actual, predictions):
    epsilon = 1e-15
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    
    logr_logloss_all_predictors = -(1 / len(actual)) * np.sum(
        actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
    return logr_logloss_all_predictors

In [3]:
def calculate_implied_probabilities(odds_w, odds_l):
    if odds_w == 0 or odds_l == 0 or pd.isna(odds_w) or pd.isna(odds_l):
        return 0, 0 
    implied_prob_w = 1 / odds_w
    implied_prob_l = 1 / odds_l
    overround = implied_prob_w + implied_prob_l
    normalized_prob_w = implied_prob_w / overround
    normalized_prob_l = implied_prob_l / overround
    return normalized_prob_w, normalized_prob_l

In [4]:
def evaluate_predictions(actual_outcomes, binary_predictions,  probability_predictions):
    
    accuracy_result = accuracy(actual_outcomes, binary_predictions)
    
    calibration_result = calibration(actual_outcomes, probability_predictions)
    
    logloss_result = logloss(actual_outcomes, probability_predictions)
    
    return {
        'accuracy': accuracy_result,
        'calibration': calibration_result,
        'log_loss': logloss_result
    }

In [5]:
tennis_dfs = {}

In [6]:
for year in range(2000, 2020): 
    file_extension = 'xls' if year < 2013 else 'xlsx'
    file_path = f"Betting_Odds_Tennis/{year}.{file_extension}"
    tennis_dfs[year] = pd.read_excel(file_path)

  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():
  for idx, row in parser.parse():


In [7]:
tennis_dfs

{2000:       ATP  Location                          Tournament       Date  \
 0       1  Adelaide  Australian Hardcourt Championships 2000-01-03   
 1       1  Adelaide  Australian Hardcourt Championships 2000-01-03   
 2       1  Adelaide  Australian Hardcourt Championships 2000-01-03   
 3       1  Adelaide  Australian Hardcourt Championships 2000-01-03   
 4       1  Adelaide  Australian Hardcourt Championships 2000-01-03   
 ...   ...       ...                                 ...        ...   
 2958   69    Lisbon                         Masters Cup 2000-11-27   
 2959   69    Lisbon                         Masters Cup 2000-11-27   
 2960   69    Lisbon                         Masters Cup 2000-11-27   
 2961   69    Lisbon                         Masters Cup 2000-11-27   
 2962   69    Lisbon                         Masters Cup 2000-11-27   
 
              Series    Court Surface        Round  Best of       Winner  ...  \
 0     International  Outdoor    Hard    1st Round        3

In [8]:
all_years_df = pd.concat(tennis_dfs.values(), ignore_index=True)

In [9]:
all_years_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54908 entries, 0 to 54907
Data columns (total 54 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         54908 non-null  int64         
 1   Location    54908 non-null  object        
 2   Tournament  54908 non-null  object        
 3   Date        54908 non-null  datetime64[ns]
 4   Series      54908 non-null  object        
 5   Court       54908 non-null  object        
 6   Surface     54908 non-null  object        
 7   Round       54908 non-null  object        
 8   Best of     54908 non-null  int64         
 9   Winner      54908 non-null  object        
 10  Loser       54908 non-null  object        
 11  WRank       54889 non-null  object        
 12  LRank       54816 non-null  object        
 13  W1          54624 non-null  float64       
 14  L1          54626 non-null  float64       
 15  W2          54102 non-null  object        
 16  L2          54103 non-

In [10]:
all_years_df.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'W1', 'L1',
       'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets', 'Lsets',
       'Comment', 'CBW', 'CBL', 'GBW', 'GBL', 'IWW', 'IWL', 'SBW', 'SBL',
       'B365W', 'B365L', 'B&WW', 'B&WL', 'EXW', 'EXL', 'PSW', 'PSL', 'WPts',
       'LPts', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL', 'MaxW', 'MaxL',
       'AvgW', 'AvgL'],
      dtype='object')

In [11]:
# odds_columns = ['B365W', 'B365L', 'B&WW', 'B&WL', 'CBW', 'CBL', 'EXW', 'EXL', 'LBW', 'LBL',
#                 'GBW', 'GBL', 'IWW', 'IWL', 'PSW', 'PSL', 'SBW', 'SBL', 'SJW', 'SJL',
#                 'UBW', 'UBL', 'MaxW', 'MaxL', 'AvgW', 'AvgL']
# essential_columns = ['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface', 'Round', 'Best of',  'WRank', 'LRank', 'Wsets', 'Lsets']

# all_years_df['EXW'] = all_years_df['EXW'].replace('2.,3', 2.3)

# all_years_df['EXW'].unique()

In [12]:
# for col in odds_columns:
#     print(col)
#     all_years_df[col].fillna(all_years_df[col].mean(), inplace=True)

# # Define a high arbitrary value to replace NaN values
# high_value = 0.0001

# # Fill NaN values with the high arbitrary value
# for col in odds_columns:
#     all_years_df[col].fillna(high_value, inplace=True)

In [13]:
# Strategy 1: Always bet on the player with the smallest odd
def smallest_odd_strategy(row, bookmaker):
    winner = row['Winner']
    loser = row['Loser']
    winner_odd = row[f'{bookmaker}W']
    loser_odd = row[f'{bookmaker}L']
    if winner_odd < loser_odd:
        return winner, winner_odd
    else:
        return loser, loser_odd


# Strategy 2: Always bet on the player with the best ATP ranking
def best_ranking_strategy(row, bookmaker):
    winner = row['Winner']
    loser = row['Loser']
    winner_rank = row['WRank']
    loser_rank = row['LRank']
    if winner_rank < loser_rank:
        return winner, row['WRank']
    else:
        return loser, row['LRank']
    

# Strategy 3: Bet completely at random
def random_strategy(row, bookmaker):
    if np.random.rand() > 0.5:
        return row['Winner'], row['WRank']
    else:
        return row['Loser'], row['LRank']

In [14]:
# # Evaluate strategies
# def evaluate_strategy(strategy_func, bookmaker=None):
#     results = []
#     for idx, row in df_odds.iterrows():
#         if bookmaker:
#             player, odd = strategy_func(row, bookmaker)
#         else:
#             player, odd = strategy_func(row)
#         if player == row['Winner']:
#             results.append(odd - 1)  # Win: payout is the odd minus the stake (1 unit)
#         else:
#             results.append(-1)  # Lose: loss is the stake (1 unit)
#     return np.cumsum(results)

# # Apply strategies for two bookmakers (e.g., Bet365 and Expekt)
# bookmakers = ['B365', 'EX']
# strategies = {
#     'Smallest Odd': smallest_odd_strategy,
#     'Best ATP Ranking': best_ranking_strategy,
#     'Random': random_strategy
# }

# # Store cumulative returns
# cumulative_returns = {}
# for bookmaker in bookmakers:
#     for strategy_name, strategy_func in strategies.items():
#         key = f'{strategy_name} ({bookmaker})'
#         if strategy_name == 'Random':
#             # Average over multiple random runs for stability
#             returns = np.mean([evaluate_strategy(strategy_func) for _ in range(100)], axis=0)
#         else:
#             returns = evaluate_strategy(strategy_func, bookmaker)
#         cumulative_returns[key] = returns

# # Convert to DataFrame for plotting
# returns_df = pd.DataFrame(cumulative_returns)


In [15]:
def evaluate_strategy(strategy_func, bookmaker=None):
    results = []
    for idx, row in df_odds.iterrows():
        if bookmaker:
            player, odd = strategy_func(row, bookmaker)
        else:
            player, odd = strategy_func(row)
        if player == row['Winner']:
            results.append(odd - 1)  # Win: payout is the odd minus the stake (1 unit)
        else:
            results.append(-1)  # Lose: loss is the stake (1 unit)
    return np.cumsum(results)

In [16]:
# # Plot cumulative returns
# plt.figure(figsize=(14, 8))
# for col in returns_df.columns:
#     plt.plot(returns_df[col], label=col)
# plt.xlabel('Match Number')
# plt.ylabel('Cumulative Return')
# plt.title('Comparison of Betting Strategies for Two Bookmakers')
# plt.legend()
# plt.show()

In [17]:
def preprocess_dataset(df):
    #df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')
    
    
    odds_columns = ['B365W', 'B365L', 'B&WW', 'B&WL', 'CBW', 'CBL', 'EXW', 'EXL', 'LBW', 'LBL',
                'GBW', 'GBL', 'IWW', 'IWL', 'PSW', 'PSL', 'SBW', 'SBL', 'SJW', 'SJL',
                'UBW', 'UBL', 'MaxW', 'MaxL', 'AvgW', 'AvgL']
    
    essential_columns = ['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface', 'Round', 'Best of',  'WRank', 'LRank', 'Wsets', 'Lsets']
    
    df_odds = df[essential_columns + odds_columns]

    
    df_odds[odds_columns] = df_odds[odds_columns].apply(pd.to_numeric, errors='coerce')


    df_odds['EXW'] = df_odds['EXW'].replace('2.,3', 2.3)
    
    

    
    df_odds['WRank'].replace('NR', np.nan, inplace=True)
    df_odds['LRank'].replace('NR', np.nan, inplace=True)

    df_odds['WRank'].fillna(100000, inplace=True)
    df_odds['LRank'].fillna(100000, inplace=True)
    
    
    df_odds = df_odds.astype({'Location': 'category', 'Tournament': 'category', 'Series': 'category','Court': 'category', 'Surface': 'category', 'Round': 'category' })
    
   
    df_odds['higher_rank_won'] = df_odds['WRank'] < df_odds['LRank']

    categorical_features = ['Location', 'Tournament', 'Series', 'Court', 'Surface','Round']
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)  # Ensure output is a dense array
    
    encoded_features = encoder.fit_transform(df_odds[categorical_features])
    
    df_encoded = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))
    
    df_odds.reset_index(drop=True, inplace=True)
    
    df_final = pd.concat([df_odds.drop(columns=categorical_features), df_encoded], axis=1)
    
    df_final.dropna(axis=1, how='all', inplace = True)

    return df_final

In [18]:
# df_odds = all_years_df[essential_columns + odds_columns].copy()
# df_odds

# df_odds[odds_columns] = df_odds[odds_columns].apply(pd.to_numeric, errors='coerce')

from sklearn.preprocessing import OneHotEncoder




In [19]:
# df_odds['WRank'].replace('NR', np.nan, inplace=True)
# df_odds['LRank'].replace('NR', np.nan, inplace=True)

# df_odds['WRank'].fillna(100000, inplace=True)
# df_odds['LRank'].fillna(100000, inplace=True)

In [20]:
split_time = "2018-01-01"
df_odds_train = all_years_df[all_years_df.Date < split_time]

In [21]:
df_odds_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49663 entries, 0 to 49662
Data columns (total 54 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ATP         49663 non-null  int64         
 1   Location    49663 non-null  object        
 2   Tournament  49663 non-null  object        
 3   Date        49663 non-null  datetime64[ns]
 4   Series      49663 non-null  object        
 5   Court       49663 non-null  object        
 6   Surface     49663 non-null  object        
 7   Round       49663 non-null  object        
 8   Best of     49663 non-null  int64         
 9   Winner      49663 non-null  object        
 10  Loser       49663 non-null  object        
 11  WRank       49648 non-null  object        
 12  LRank       49585 non-null  object        
 13  W1          49415 non-null  float64       
 14  L1          49416 non-null  float64       
 15  W2          48921 non-null  object        
 16  L2          48922 non-

In [22]:
df_odds_train = preprocess_dataset(df_odds_train) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_odds[odds_columns] = df_odds[odds_columns].apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_odds['EXW'] = df_odds['EXW'].replace('2.,3', 2.3)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_odds['WRank'].replace('NR', np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See t

In [23]:
df_odds_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49663 entries, 0 to 49662
Columns: 380 entries, ATP to Round_The Final
dtypes: bool(1), datetime64[ns](1), float64(376), int64(2)
memory usage: 143.7 MB


In [24]:
means = df_odds_train.select_dtypes(include=[np.number]).mean()
df_odds_train.fillna(means, inplace=True)

In [25]:
df_odds_validation = all_years_df[all_years_df.Date >= split_time]
df_odds_validation = preprocess_dataset(df_odds_validation) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_odds[odds_columns] = df_odds[odds_columns].apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_odds['EXW'] = df_odds['EXW'].replace('2.,3', 2.3)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_odds['WRank'].replace('NR', np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See t

In [26]:
df_odds_validation.fillna(means, inplace=True)

In [27]:
df_odds_train

Unnamed: 0,ATP,Date,Best of,WRank,LRank,Wsets,Lsets,B365W,B365L,B&WW,...,Surface_Grass,Surface_Hard,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final
0,1,2000-01-03,3,63.0,77.0,2.0,0.0,1.822448,3.574495,1.728633,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2000-01-03,3,5.0,56.0,2.0,0.0,1.822448,3.574495,1.728633,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2000-01-03,3,40.0,655.0,2.0,1.0,1.822448,3.574495,1.728633,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,2000-01-03,3,65.0,87.0,2.0,0.0,1.822448,3.574495,1.728633,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,2000-01-03,3,81.0,198.0,2.0,1.0,1.822448,3.574495,1.728633,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49658,67,2017-11-18,3,8.0,2.0,2.0,1.0,9.000000,1.070000,1.728633,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
49659,67,2017-11-18,3,6.0,9.0,2.0,1.0,1.400000,3.000000,1.728633,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
49660,67,2017-11-19,3,6.0,8.0,2.0,1.0,1.330000,3.250000,1.728633,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
49661,1,2017-12-31,3,169.0,26.0,2.0,0.0,2.200000,1.610000,1.728633,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
df_odds_validation

Unnamed: 0,ATP,Date,Best of,WRank,LRank,Wsets,Lsets,B365W,B365L,EXW,...,Surface_Grass,Surface_Hard,Round_1st Round,Round_2nd Round,Round_3rd Round,Round_4th Round,Round_Quarterfinals,Round_Round Robin,Round_Semifinals,Round_The Final
0,1,2018-01-01,3,47.0,52.0,2.0,1.0,1.61,2.20,1.620000,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2018-01-01,3,76.0,79.0,2.0,0.0,2.50,1.50,2.350000,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2018-01-01,3,33.0,218.0,2.0,0.0,1.40,2.75,1.470000,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,2018-01-01,3,66.0,120.0,2.0,0.0,2.20,1.61,2.150000,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,2018-01-01,3,54.0,94.0,2.0,0.0,1.83,1.83,1.850000,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5240,66,2019-11-15,3,1.0,6.0,2.0,1.0,1.44,2.75,1.800726,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5241,66,2019-11-15,3,7.0,4.0,2.0,0.0,1.90,1.90,1.800726,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5242,66,2019-11-16,3,6.0,3.0,2.0,0.0,3.50,1.30,1.800726,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5243,66,2019-11-16,3,5.0,7.0,2.0,0.0,1.80,2.00,1.800726,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Naive Model

In [29]:
def higher_ranked_wins(tennis_df):

    matches_won_by_higher_ranked = tennis_df.apply(lambda row: row['WRank'] < row['LRank'], axis=1).sum()

    total_matches = len(tennis_df)

    probability_higher_ranked_wins = matches_won_by_higher_ranked / total_matches

    return probability_higher_ranked_wins

### Metrics for Trained Data

In [30]:
prob_higher_rank_win = higher_ranked_wins(df_odds_train)

In [31]:
naive_predictions = np.full_like(df_odds_train["higher_rank_won"], fill_value = 1)
naive_predictions
naive_probability_predictions = np.full_like(df_odds_train["higher_rank_won"], fill_value = prob_higher_rank_win, dtype = 'float64')                              

In [32]:
evaluate_predictions(df_odds_train["higher_rank_won"], naive_predictions, naive_probability_predictions)

{'accuracy': 0.6565048426393895,
 'calibration': 0.9999999999999999,
 'log_loss': 0.6433266208820796}

### Metrics for Test Data

In [33]:
naive_predictions = np.full_like(df_odds_validation["higher_rank_won"], fill_value = 1)
naive_predictions
naive_probability_predictions = np.full_like(df_odds_validation["higher_rank_won"], fill_value = prob_higher_rank_win, dtype = 'float64')                              

In [34]:
evaluate_predictions(df_odds_validation["higher_rank_won"], naive_predictions, naive_probability_predictions)

{'accuracy': 0.6242135367016206,
 'calibration': 1.0517311849858269,
 'log_loss': 0.664243542167962}

## Logistic Model

### Metrics for Trained Data

In [35]:
from sklearn.linear_model import LogisticRegression

df_odds_train_encoded_X = df_odds_train.drop(columns = ["higher_rank_won", "Date", 'WRank', 'LRank'], axis = 1)
logr= LogisticRegression(fit_intercept=False)
logr1= LogisticRegression(fit_intercept=False)



logr.fit(df_odds_train_encoded_X, df_odds_train["higher_rank_won"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(fit_intercept=False)

In [36]:
tennis_train_predictions_logr = logr.predict(df_odds_train_encoded_X)
tennis_train_predictions_logr

tennis_train_prediction_prob_logr = logr.predict_proba(df_odds_train_encoded_X)[:,1]
tennis_train_prediction_prob_logr

array([0.72481772, 0.72481772, 0.72480639, ..., 0.84608132, 0.36004688,
       0.15681054])

In [37]:
evaluate_predictions(df_odds_train["higher_rank_won"], tennis_train_predictions_logr, tennis_train_prediction_prob_logr)

{'accuracy': 0.8071602601534341,
 'calibration': 1.0010585198103492,
 'log_loss': 0.41208935945223824}

In [38]:
df_odds_validation_X = df_odds_validation.drop(columns = ["higher_rank_won", "Date", 'WRank', 'LRank'], axis = 1)

In [39]:
for column in df_odds_train_encoded_X.columns:
    if column not in df_odds_validation_X.columns:
        df_odds_validation_X[column] = 0

  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation_X[column] = 0
  df_odds_validation

In [40]:
df_odds_validation_X.columns

Index(['ATP', 'Best of', 'Wsets', 'Lsets', 'B365W', 'B365L', 'EXW', 'EXL',
       'LBW', 'LBL',
       ...
       'Tournament_Valencia Open 500', 'Tournament_Verizon Tennis Challenge',
       'Tournament_Vietnam Open', 'Tournament_adidas International',
       'Tournament_adidas Open', 'Tournament_bet-at-home Open',
       'Series_International', 'Series_International Gold', 'Series_Masters',
       'Surface_Carpet'],
      dtype='object', length=391)

In [41]:
for i in df_odds_validation_X.columns:
    if i not in df_odds_train_encoded_X.columns:
        df_odds_train_encoded_X[i] = 0


In [42]:
df_odds_train_encoded_X

Unnamed: 0,ATP,Best of,Wsets,Lsets,B365W,B365L,B&WW,B&WL,CBW,CBL,...,Tournament_Halle Open,Tournament_Hungarian Open,Tournament_Maharashtra Open,Tournament_Millennium Estoril Open,Tournament_New York Open,Tournament_Queen's Club Championships,Tournament_Rosmalen Grass Court Championships,Tournament_Sofia Open,Tournament_Tata Open,Tournament_Zhuhai Open
0,1,3,2.0,0.0,1.822448,3.574495,1.728633,2.547347,1.81208,3.000658,...,0,0,0,0,0,0,0,0,0,0
1,1,3,2.0,0.0,1.822448,3.574495,1.728633,2.547347,1.81208,3.000658,...,0,0,0,0,0,0,0,0,0,0
2,1,3,2.0,1.0,1.822448,3.574495,1.728633,2.547347,1.81208,3.000658,...,0,0,0,0,0,0,0,0,0,0
3,1,3,2.0,0.0,1.822448,3.574495,1.728633,2.547347,1.81208,3.000658,...,0,0,0,0,0,0,0,0,0,0
4,1,3,2.0,1.0,1.822448,3.574495,1.728633,2.547347,1.81208,3.000658,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49658,67,3,2.0,1.0,9.000000,1.070000,1.728633,2.547347,1.81208,3.000658,...,0,0,0,0,0,0,0,0,0,0
49659,67,3,2.0,1.0,1.400000,3.000000,1.728633,2.547347,1.81208,3.000658,...,0,0,0,0,0,0,0,0,0,0
49660,67,3,2.0,1.0,1.330000,3.250000,1.728633,2.547347,1.81208,3.000658,...,0,0,0,0,0,0,0,0,0,0
49661,1,3,2.0,0.0,2.200000,1.610000,1.728633,2.547347,1.81208,3.000658,...,0,0,0,0,0,0,0,0,0,0


In [43]:
df_odds_validation_X

Unnamed: 0,ATP,Best of,Wsets,Lsets,B365W,B365L,EXW,EXL,LBW,LBL,...,Tournament_Valencia Open 500,Tournament_Verizon Tennis Challenge,Tournament_Vietnam Open,Tournament_adidas International,Tournament_adidas Open,Tournament_bet-at-home Open,Series_International,Series_International Gold,Series_Masters,Surface_Carpet
0,1,3,2.0,1.0,1.61,2.20,1.620000,2.150000,1.670000,2.200000,...,0,0,0,0,0,0,0,0,0,0
1,1,3,2.0,0.0,2.50,1.50,2.350000,1.530000,2.370000,1.570000,...,0,0,0,0,0,0,0,0,0,0
2,1,3,2.0,0.0,1.40,2.75,1.470000,2.500000,1.570000,2.370000,...,0,0,0,0,0,0,0,0,0,0
3,1,3,2.0,0.0,2.20,1.61,2.150000,1.620000,2.100000,1.730000,...,0,0,0,0,0,0,0,0,0,0
4,1,3,2.0,0.0,1.83,1.83,1.850000,1.850000,1.910000,1.910000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5240,66,3,2.0,1.0,1.44,2.75,1.800726,3.283226,1.806538,3.485135,...,0,0,0,0,0,0,0,0,0,0
5241,66,3,2.0,0.0,1.90,1.90,1.800726,3.283226,1.806538,3.485135,...,0,0,0,0,0,0,0,0,0,0
5242,66,3,2.0,0.0,3.50,1.30,1.800726,3.283226,1.806538,3.485135,...,0,0,0,0,0,0,0,0,0,0
5243,66,3,2.0,0.0,1.80,2.00,1.800726,3.283226,1.806538,3.485135,...,0,0,0,0,0,0,0,0,0,0


In [44]:
logr1.fit(df_odds_train_encoded_X, df_odds_train["higher_rank_won"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(fit_intercept=False)

In [45]:
### Metrics after retraining training data

In [46]:
tennis_train_predictions_logr = logr1.predict(df_odds_train_encoded_X)
tennis_train_predictions_logr

tennis_train_prediction_prob_logr = logr1.predict_proba(df_odds_train_encoded_X)[:,1]
tennis_train_prediction_prob_logr

array([0.73002965, 0.73002965, 0.7303924 , ..., 0.84350215, 0.3602344 ,
       0.15353259])

In [47]:
evaluate_predictions(df_odds_train["higher_rank_won"], tennis_train_predictions_logr, tennis_train_prediction_prob_logr)

{'accuracy': 0.8073414815858889,
 'calibration': 0.9997660961758722,
 'log_loss': 0.41209162081035133}

### Metrics for Validation Data

In [50]:
df_odds_validation_X

Unnamed: 0,ATP,Best of,Wsets,Lsets,B365W,B365L,EXW,EXL,LBW,LBL,...,Tournament_Valencia Open 500,Tournament_Verizon Tennis Challenge,Tournament_Vietnam Open,Tournament_adidas International,Tournament_adidas Open,Tournament_bet-at-home Open,Series_International,Series_International Gold,Series_Masters,Surface_Carpet
0,1,3,2.0,1.0,1.61,2.20,1.620000,2.150000,1.670000,2.200000,...,0,0,0,0,0,0,0,0,0,0
1,1,3,2.0,0.0,2.50,1.50,2.350000,1.530000,2.370000,1.570000,...,0,0,0,0,0,0,0,0,0,0
2,1,3,2.0,0.0,1.40,2.75,1.470000,2.500000,1.570000,2.370000,...,0,0,0,0,0,0,0,0,0,0
3,1,3,2.0,0.0,2.20,1.61,2.150000,1.620000,2.100000,1.730000,...,0,0,0,0,0,0,0,0,0,0
4,1,3,2.0,0.0,1.83,1.83,1.850000,1.850000,1.910000,1.910000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5240,66,3,2.0,1.0,1.44,2.75,1.800726,3.283226,1.806538,3.485135,...,0,0,0,0,0,0,0,0,0,0
5241,66,3,2.0,0.0,1.90,1.90,1.800726,3.283226,1.806538,3.485135,...,0,0,0,0,0,0,0,0,0,0
5242,66,3,2.0,0.0,3.50,1.30,1.800726,3.283226,1.806538,3.485135,...,0,0,0,0,0,0,0,0,0,0
5243,66,3,2.0,0.0,1.80,2.00,1.800726,3.283226,1.806538,3.485135,...,0,0,0,0,0,0,0,0,0,0


In [52]:
tennis_validation_predictions_logr = logr1.predict(df_odds_validation_X)
tennis_validation_predictions_logr

tennis_validation_prediction_prob_logr = logr1.predict_proba(df_odds_validation_X)[:,1]
tennis_validation_prediction_prob_logr

Feature names must be in the same order as they were in fit.

Feature names must be in the same order as they were in fit.



array([0.39030241, 0.03345907, 0.69587696, ..., 0.01128994, 0.39562815,
       0.2523281 ])

In [53]:
evaluate_predictions(df_odds_validation['higher_rank_won'], tennis_validation_predictions_logr,  tennis_validation_prediction_prob_logr)

{'accuracy': 0.76930409914204,
 'calibration': 0.8395392477913478,
 'log_loss': 0.5135596806031469}

## BCM Model

In [None]:
for bookmaker in ['B365', 'B&W', 'CB', 'EX', 'LB', 'GB', 'IW', 'PS', 'SB', 'SJ', 'UB', 'Max', 'Avg']:
    df_odds_train[f'{bookmaker}_prob_w'], df_odds[f'{bookmaker}_prob_l'] = zip(*df_odds_train.apply(lambda row: calculate_implied_probabilities(row[f'{bookmaker}W'], row[f'{bookmaker}L']), axis=1))

In [None]:
 df_odds

In [None]:
def logit(p):
    if p == 0 or p == 1 or pd.isna(p):
        return np.nan
    return np.log(p / (1 - p))

def inverse_logit(y):
    if pd.isna(y):
        return np.nan
    return np.exp(y) / (1 + np.exp(y))

In [None]:
bookmakers = ['B365', 'B&W', 'CB', 'EX', 'LB', 'GB', 'IW', 'PS', 'SB', 'SJ', 'UB', 'Max', 'Avg']

In [None]:
for bookmaker in bookmakers :
    df_odds_train[f'{bookmaker}_logit_prob_w'] = df_odds_train[f'{bookmaker}_prob_w'].apply(logit)

df_odds_train['consensus_logit_prob_w'] = df_odds_train[[f'{bookmaker}_logit_prob_w' for bookmaker in ['B365', 'B&W', 'CB', 'EX', 'LB', 'GB', 'IW', 'PS', 'SB', 'SJ', 'UB', 'Max', 'Avg']]].mean(axis=1, skipna=True)

df_odds_train['consensus_prob_w'] = df_odds_train['consensus_logit_prob_w'].apply(inverse_logit)

In [None]:
df_odds_train

In [None]:
for bookmaker in bookmakers :
    df_odds_validation[f'{bookmaker}_logit_prob_w'] = df_odds_validation[f'{bookmaker}_prob_w'].apply(logit)

df_odds_validation['consensus_logit_prob_w'] = df_odds_train[[f'{bookmaker}_logit_prob_w' for bookmaker in ['B365', 'B&W', 'CB', 'EX', 'LB', 'GB', 'IW', 'PS', 'SB', 'SJ', 'UB', 'Max', 'Avg']]].mean(axis=1, skipna=True)

df_odds_validation['consensus_prob_w'] = df_odds_validation['consensus_logit_prob_w'].apply(inverse_logit)

In [None]:
df_odds['actual_winner'] = df_odds['WRank'] < df_odds['LRank']

predictions = df_odds['consensus_prob_w'] > 0.5

valid_indices = df_odds['consensus_prob_w'].notna()

accuracy_score = accuracy(df_odds.loc[valid_indices, 'actual_winner'], predictions[valid_indices])
accuracy_score

In [None]:
calibration(df_odds['actual_winner'],df_odds['consensus_prob_w'] )

In [None]:
logloss(df_odds['actual_winner'],df_odds['consensus_prob_w'])