### Summary

In this notebook, I create and run functions to:
* create the ELO ratings
* measure the performance of ELO ratings

With the default value of 32 for the hyper-parameter `K`, the predictions from ELO are under-confident: when the ELO ratings predict that a player will win 80% of the time, that player actually wins about 90% of the time, and similarly for other percentages. After a (manual) grid search, we see that `K=100` gives well-calibrated predictions.

### Setup and data loading

The data loaded here is the output of the notebook `01-la-processing.ipynb`.

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [None]:
# Trailing slash is required: file names are appended to this prefix
# with plain string concatenation (e.g. dir_processed+"matches.csv").
dir_processed = "../data/processed/"

In [None]:
# Load the processed match history; index_col=0 uses the first CSV
# column as the row index instead of creating a new one.
matches = pd.read_csv(dir_processed+"matches.csv", index_col=0)

In [None]:
# Preview the first rows of the loaded match table.
matches.head()

### ELO analysis

Define and run the various functions to carry out and evaluate the elo rating system.

In [None]:
def calculate_predicted_score(
    rating1: float,
    rating2: float
) -> float:
    """
    Expected score of one player against another under the ELO model.

    Parameters
    ----------
    rating1, rating2: float
        Current ELO ratings of the first and second player.

    Returns
    -------
    float
        Expected score, between 0 and 1, of the player rated `rating1`
        when facing the player rated `rating2`. Equal ratings give 0.5
        and a 400-point advantage gives 10/11.
    """
    # Rating gap expressed in units of 400 points, seen from player 2.
    rating_gap = (rating2 - rating1) / 400
    odds_against = 10**rating_gap
    return 1 / (1 + odds_against)

In [None]:
def calculate_new_ratings(
    rating_winner: float,
    rating_loser: float,
    predicted_score: float,
    K: float = 32,
) -> (float, float):
    """
    Apply the ELO update for one finished match.

    Parameters
    ----------
    rating_winner, rating_loser: float
        Ratings of the winner and of the loser just before the match.
    predicted_score: float in range [0, 1]
        Expected score of the eventual winner before the match.
    K: float, default 32
        Scale of the adjustment; larger values move ratings further
        per match.

    Returns
    -------
    new_rating_winner, new_rating_loser: float
        Ratings after the match. The winner gains exactly what the
        loser gives up.
    """
    # The winner's actual score is 1, so the adjustment is proportional
    # to how unexpected the win was.
    points_exchanged = K * (1 - predicted_score)
    return rating_winner + points_exchanged, rating_loser - points_exchanged

In [None]:
# Manual check of the two functions above: sweep the winner's rating
# from -500 to +500 against a loser rated 0 and print each update.

for delta in range(-500, 501, 50):
    predicted_score = calculate_predicted_score(delta, 0)
    new_rating_winner, new_rating_loser = calculate_new_ratings(
        delta, 0, predicted_score
    )

    # One print call; the trailing empty string yields the blank
    # separator line between consecutive reports.
    print(
        f'Old winner rating: {delta:3}.',
        'Old loser rating: 0',
        f'Predicted score: {predicted_score}',
        f'New winner rating: {new_rating_winner}',
        f'New loser rating: {new_rating_loser}',
        '',
        sep='\n',
    )

In [None]:
def update_ratings_single(player_ratings, winner_name, loser_name, K=32):
    """
    Apply the result of one match to the ratings dictionary.

    A player who does not yet appear in `player_ratings` is first
    registered with a rating of 1500. The dictionary is modified in
    place and also returned.

    Parameters
    ----------
    player_ratings: Dict[str, float]
        Current ratings keyed by player name.
    winner_name, loser_name: str
        Names of the winner and of the loser of the match.
    K: float, default 32
        Constant that determines how much the ratings are adjusted.

    Returns
    -------
    player_ratings
        The same dictionary, holding the post-match ratings.
    rating_winner_old
        Winner's rating before the match.
    rating_winner_new
        Winner's rating after the match.
    rating_loser_old
        Loser's rating before the match.
    rating_loser_new
        Loser's rating after the match.
    predicted_score
        Pre-match expected score of the winner.
    """
    # setdefault registers an unseen player at 1500 and returns the
    # rating that is now stored for that name.
    rating_winner_old = player_ratings.setdefault(winner_name, 1500)
    rating_loser_old = player_ratings.setdefault(loser_name, 1500)

    predicted_score = calculate_predicted_score(rating_winner_old, rating_loser_old)
    rating_winner_new, rating_loser_new = calculate_new_ratings(
        rating_winner_old, rating_loser_old, predicted_score, K=K
    )

    player_ratings[winner_name] = rating_winner_new
    player_ratings[loser_name] = rating_loser_new

    outcome = (
        player_ratings,
        rating_winner_old,
        rating_winner_new,
        rating_loser_old,
        rating_loser_new,
        predicted_score,
    )
    return outcome

In [None]:
# manually test update_ratings_single

# 'a' and 'b' start with known ratings; 'c' and 'd' are absent, so
# update_ratings_single creates them at 1500 on first use.
test_ratings = {'a': 1000, 'b': 900}

# Only the returned dictionary is kept; the other five values are discarded.
test_ratings, _, _, _, _, _ = update_ratings_single(test_ratings, 'a', 'b')
test_ratings, _, _, _, _, _ = update_ratings_single(test_ratings, 'c', 'd')
# Reverse of the previous result: this time 'd' beats 'c'.
test_ratings, _, _, _, _, _ = update_ratings_single(test_ratings, 'd', 'c')

test_ratings

In [None]:
def update_ratings(matches: pd.DataFrame, K=32, player_ratings=None) -> tuple:
    """
    Update elo ratings based on all match results in `matches`.

    Matches are processed one at a time in row order, so the rows are
    expected to be in chronological order.

    Parameters
    ----------
    matches: pd.DataFrame
        dataframe of match history; must contain the columns
        `winner_name` and `loser_name`. It is not modified.
    K: float, default 32
        Constant that determines how much the ratings are adjusted
    player_ratings: Dict[str, float], optional
        dictionary of players' elo ratings before the matches
        in `matches` were played. If omitted, a new empty dictionary
        is created for this call, so every player starts unrated.
        A dictionary that is passed in is updated in place.

    Returns
    -------
    player_ratings: Dict[str, float]
        updated player_ratings
    pd.DataFrame
        copy of matches dataframe with new columns for:
        * rating_winner_old
        * rating_winner_new
        * rating_loser_old
        * rating_loser_new
        * predicted_score
    """
    # A `{}` default would be created once at definition time and then
    # mutated by update_ratings_single, leaking ratings from one call
    # into the next. Build a fresh dictionary per call instead.
    if player_ratings is None:
        player_ratings = {}

    new_columns = [
        "rating_winner_old",
        "rating_winner_new",
        "rating_loser_old",
        "rating_loser_new",
        "predicted_score",
    ]

    df = matches.copy()

    # One record per match, with values in the same order as new_columns.
    records = []
    name_pairs = zip(df["winner_name"], df["loser_name"])
    # `total` lets tqdm show real progress; a bare iterator has no length.
    for winner_name, loser_name in tqdm(name_pairs, total=len(df)):
        (
            player_ratings,
            rating_winner_old,
            rating_winner_new,
            rating_loser_old,
            rating_loser_new,
            predicted_score,
        ) = update_ratings_single(player_ratings, winner_name, loser_name, K=K)

        records.append(
            (
                rating_winner_old,
                rating_winner_new,
                rating_loser_old,
                rating_loser_new,
                predicted_score,
            )
        )

    for position, column in enumerate(new_columns):
        df[column] = [record[position] for record in records]

    return player_ratings, df

In [None]:
# Rate every match with the default K. `matches` is rebound to the
# returned copy, which carries the extra rating and prediction columns.
player_ratings, matches = update_ratings(matches)
matches.head()

In [None]:
def view_player_history(df: pd.DataFrame, player: str) -> None:
    """
    Print one line per match in which `player` took part.

    Each line shows, in this order:
    * name of winner (first 10 characters)
    * name of loser (first 10 characters)
    * winner's elo rating before the match
    * loser's elo rating before the match
    * winner's seed in the tournament
    * loser's seed in the tournament
    * predicted score from elo ratings for the match

    Parameters
    ----------
    df
        dataframe of matches as outputted by `update_ratings`
    player: str
        name of the player, matched exactly against `winner_name`
        and `loser_name`

    Returns
    -------
    None
    """
    involved = (df.winner_name == player) | (df.loser_name == player)

    for _, match in df[involved].iterrows():
        # Names are cut to 10 characters so the columns line up.
        winner = match.winner_name[0:10]
        loser = match.loser_name[0:10]
        winner_rating = match.rating_winner_old
        loser_rating = match.rating_loser_old
        winner_seed = match.winner_seed
        loser_seed = match.loser_seed
        prediction = match.predicted_score
        print(
            f'{winner:10} beat {loser:10} '
            f'{winner_rating:.0f} vs {loser_rating:.0f}   '
            f'{winner_seed:3} vs {loser_seed:3}   '
            f'{prediction:.2f}'
        )

In [None]:
# Spot-check the rating history of a single player.
view_player_history(matches, 'Ramy Ashour')

In [None]:
def evaluate_calibration(df_input: pd.DataFrame, N: int = 2) -> pd.DataFrame:
    """
    Evaluate how well calibrated the ELO ratings are.

    Every match is viewed from the side of the pre-match favourite
    (the "better player"): the favourite's predicted score is bucketed
    and compared with how often favourites in that bucket actually won.

    Parameters
    ----------
    df_input
        Dataframe as outputted by `update_ratings`; only the
        `predicted_score` column (the winner's pre-match expected
        score) is read. It is not modified.
    N: int
        Number of times each bucket of size 0.1 is broken up.
        See the index of the returned dataframe for an example

    Returns
    -------
    pd.DataFrame
        * index is the favourite's predicted score, rounded to the
          nearest 0.1/N. For example, if N=2, then rounded to nearest
          0.05, so index is 0.5, 0.55, 0.6,...,0.95, 1
        * columns are a two-level index under
          `true_score_better_player`: `count` is the number of matches
          in the bucket and `mean` is the average true score of the
          favourite in those matches
    """
    df = df_input.copy()

    def favourite_bucket(winner_score):
        # Bucket the winner's score, then mirror it around 0.5 when the
        # winner was the underdog so the value describes the favourite.
        rounded = round(N * winner_score, 1) / N
        return rounded if winner_score > 0.5 else 1 - rounded

    def favourite_true_score(winner_score):
        if winner_score > 0.5:
            # The favourite won.
            return 1.0
        if winner_score < 0.5:
            # The underdog won, so the favourite scored 0.
            return 0.0
        # Exactly 0.5 (e.g. two players with equal ratings): there is
        # no favourite, so credit half a win instead of counting the
        # match as a loss, which would bias the 0.5 bucket toward 0.
        return 0.5

    df["predicted_score_better_player"] = df.predicted_score.apply(favourite_bucket)
    df["true_score_better_player"] = df.predicted_score.apply(favourite_true_score)

    return (
        df
        .groupby("predicted_score_better_player")
        .agg({"true_score_better_player": ["count", "mean"]})
    )

In [None]:
# Calibration over all matches, using the ratings computed above.
evaluate_calibration(matches)

In [None]:
def evaluate_calibration_seeds(df: pd.DataFrame, seeds, N: int = 2, null=False):
    """
    Evaluate calibration of ratings for players whose seed is in `seeds`.

    Parameters
    ----------
    df
        Dataframe as outputted by `update_ratings`; must also contain
        the columns `winner_seed` and `loser_seed`.
    seeds
        Collection of seed values to keep. Values are compared with
        `isin`, so they must have the same type as the entries of the
        seed columns.
    N: int, default 2
        Bucket granularity, forwarded to `evaluate_calibration`.
    null: bool, default False
        If False, keep only matches where both players' seeds are in
        `seeds`. If True, additionally keep matches where one player's
        seed is in `seeds` and the other player's seed is missing.

    Returns
    -------
    pd.DataFrame
        Result of `evaluate_calibration` on the selected matches.
    """
    winner_selected = df.winner_seed.isin(seeds)
    loser_selected = df.loser_seed.isin(seeds)

    indices = winner_selected & loser_selected

    if null:
        # Selected seed against an opponent with no seed, either way round.
        null_loss = winner_selected & df.loser_seed.isnull()
        null_win = df.winner_seed.isnull() & loser_selected
        indices = indices | null_loss | null_win

    # Forward N: it was previously accepted but silently ignored.
    return evaluate_calibration(df[indices], N=N)

In [None]:
# Matches in which both players are seeded 1 or 2.
evaluate_calibration_seeds(matches, ['1','2'])

In [None]:
# Matches of seed 1 players, including those against opponents with no seed.
evaluate_calibration_seeds(matches, ['1'], null=True)

In [None]:
# Matches of seed 2 players, including those against opponents with no seed.
evaluate_calibration_seeds(matches, ['2'], null=True)

## Tuning for K
Looking at the above, we see that the predicted scores are under-confident: they are generally smaller than the observed scores. This suggests that the ELO ratings are not updated enough after each game, i.e. that `K` is too small. Hence, I will try various values of `K` and see which gives the best-calibrated results.

In [None]:
def do_full_analysis(file=dir_processed+"matches.csv", K=32):
    """
    Load the match history, rate it from scratch and report calibration.

    Parameters
    ----------
    file: str
        Path of the processed matches CSV. The default is built from
        `dir_processed` once, when this function is defined.
    K: float, default 32
        Constant that determines how much the ratings are adjusted.

    Returns
    -------
    pd.DataFrame
        Calibration table as returned by `evaluate_calibration`.
    """
    # Read the path that was passed in; the argument used to be ignored
    # in favour of a hard-coded path.
    matches = pd.read_csv(file, index_col=0)
    # Pass an explicit empty dictionary so every run starts with all
    # players unrated, independent of any earlier update_ratings call.
    _, matches = update_ratings(matches, K=K, player_ratings={})
    return evaluate_calibration(matches)

In [None]:
# K below the default of 32: each result moves the ratings less.
K=10
do_full_analysis(K=K)

In [None]:
# K above the default of 32: each result moves the ratings more.
K=50
do_full_analysis(K=K)

In [None]:
# Roughly three times the default adjustment per match.
K=100
do_full_analysis(K=K)

In [None]:
# Double the previous setting, to see the effect of going past K=100.
K=200
do_full_analysis(K=K)

In [None]:
# Largest K tried in this sweep.
K=500
do_full_analysis(K=K)

Based on the above, it looks like K=100 is a good value for K.