In [1]:
# Import necessary libraries
import numpy as np                  
import pandas as pd                 
import matplotlib.pyplot as plt     
from sklearn.preprocessing import OneHotEncoder 

# Suppress warnings for cleaner output.
# NOTE(review): no category or module is given, so this filter hides every
# warning raised after this cell runs (including deprecation and numerical
# warnings from numpy/pandas). Consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [2]:
def preprocess_dataset(df, missing_rank=100000):
    """Clean the rank columns and flag matches won by the better-ranked player.

    The frame is modified in place (columns are overwritten/added on the
    object passed in) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'WRank' (winner's rank) and 'LRank' (loser's rank).
        Values that cannot be parsed as numbers are treated as missing.
    missing_rank : float, default 100000
        Sentinel written in place of a missing rank. It should be larger
        than any real rank so that an unranked player never counts as the
        better-ranked one.

    Returns
    -------
    pandas.DataFrame
        `df` with 'WRank' and 'LRank' as floats and a new integer column
        'higher_rank_won' (1 when WRank < LRank, otherwise 0).
    """
    for rank_column in ('WRank', 'LRank'):
        df[rank_column] = (
            pd.to_numeric(df[rank_column], errors='coerce')  # unparseable -> NaN
            .fillna(missing_rank)
            .astype(float)
        )

    # A smaller rank number means a better player. The comparison is strict,
    # so equal ranks (including two missing ranks that both received the
    # sentinel) yield 0.
    df['higher_rank_won'] = (df['WRank'] < df['LRank']).astype(int)

    return df

In [3]:
def accuracy(actual, predictions):
    """Return the fraction of predictions equal to the actual outcome.

    Parameters
    ----------
    actual : array-like
        Observed outcomes.
    predictions : array-like
        Predicted outcomes, in the same order as `actual`.

    Returns
    -------
    numpy.float64
        Share of positions where the two inputs agree, rounded to 4 decimals.
    """
    # Convert to ndarrays so the comparison is always element-wise and
    # positional: two Python lists would compare as a single bool, and two
    # pandas Series with different index labels would refuse to compare.
    actual = np.asarray(actual)
    predictions = np.asarray(predictions)
    return np.round(np.mean(actual == predictions), 4)

def calibration(actual, predictions):
    """Return total predicted mass divided by total observed outcomes.

    A value of 1 means the predictions sum to exactly the number of observed
    positives; above 1 the predictions are too high in aggregate, below 1
    too low. The result is rounded to 4 decimals. A zero sum of `actual` is
    not guarded against.
    """
    predicted_total = np.sum(predictions)
    observed_total = np.sum(actual)
    ratio = predicted_total / observed_total
    return np.round(ratio, 4)

def logloss(actual, predictions):
    """Return the mean binary cross-entropy of probabilistic predictions.

    Parameters
    ----------
    actual : array-like
        Observed binary outcomes (0 or 1).
    predictions : array-like
        Predicted probabilities of outcome 1, in the same order as `actual`.

    Returns
    -------
    numpy.float64
        Average log loss, rounded to 4 decimals.
    """
    # Work on float ndarrays so the arithmetic is positional: Python lists
    # cannot be subtracted from a scalar, and pandas Series would be aligned
    # on index labels rather than on position.
    actual = np.asarray(actual, dtype=float)
    predictions = np.asarray(predictions, dtype=float)

    epsilon = 1e-15  # keeps probabilities away from 0 and 1 so log() stays finite
    predictions = np.clip(predictions, epsilon, 1 - epsilon)

    total_log_likelihood = np.sum(
        actual * np.log(predictions) + (1 - actual) * np.log(1 - predictions))
    return np.round(-(1 / len(actual)) * total_log_likelihood, 4)

In [4]:
# Evaluating predictions using accuracy, calibration and logloss
def evaluate_predictions(actual_outcomes, binary_predictions, probability_predictions):
    """Score one set of predictions with all three metrics.

    Accuracy is computed from the hard (binary) predictions; calibration and
    log loss are computed from the probability predictions.

    Returns a dict with the keys 'accuracy', 'calibration' and 'log_loss'.
    """
    return {
        'accuracy': accuracy(actual_outcomes, binary_predictions),
        'calibration': calibration(actual_outcomes, probability_predictions),
        'log_loss': logloss(actual_outcomes, probability_predictions),
    }

In [5]:
def precompute_top_players(data, top_n):
    """Map each calendar year to the set of its `top_n` best-ranked players.

    A player's standing in a year is the best (numerically smallest) rank
    recorded for them in any match that year, whether they won or lost.
    Selection is done over distinct players, so one player appearing in many
    matches occupies a single slot. Ties at the cutoff are broken by the
    alphabetical order of the player name.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a datetime 'Date' column plus 'Winner', 'WRank', 'Loser' and
        'LRank'. The rank columns must be numeric.
    top_n : int
        Number of players to keep per year. A year with fewer distinct
        players returns all of them.

    Returns
    -------
    dict
        {year: set of player names}.
    """
    top_players_by_year = {}
    for year, group in data.groupby(data['Date'].dt.year):
        # Stack winners and losers into one long (player, rank) table so both
        # sides of each match are ranked together.
        appearances = pd.concat(
            [
                group[['Winner', 'WRank']].rename(
                    columns={'Winner': 'player', 'WRank': 'rank'}),
                group[['Loser', 'LRank']].rename(
                    columns={'Loser': 'player', 'LRank': 'rank'}),
            ],
            ignore_index=True,
        )
        # Reduce to one row per player before taking the n smallest;
        # otherwise a frequently playing top player would fill every slot.
        best_rank = appearances.groupby('player')['rank'].min()
        top_players_by_year[year] = set(best_rank.nsmallest(top_n).index)
    return top_players_by_year