In [54]:
#notebook to calculate accuracy of the models and compare to benchmarks community votes and betting odds
#import necessary packages
import pandas as pd
import numpy as np

In [55]:
#function to calculate rps score
def rps(predictions, observed):
    """Vectorized version of Ranked Probability Score.
    A lower value is a better score.
    From: Colin Catlin, https://syllepsis.live/2022/01/22/ranked-probability-score-in-python/

    Args:
        predictions (pd.DataFrame): each column is an outcome category
            with values as the 0 to 1 probability of that category
        observed (pd.DataFrame): each column is an outcome category
            with values of 0 OR 1 with 1 being that category occurred
    """
    assert (
        predictions.shape == observed.shape
    ), "prediction and observed array shapes must match"
    ncat = predictions.shape[1] - 1
    return (
        np.sum(
            (np.cumsum(predictions, axis=1) - np.cumsum(observed, axis=1)) ** 2, axis=1
        ) / ncat
    )
predictions_df = pd.DataFrame({
    'H': [1, 0.9, 0.8, 0.5, 0.35, 0.6, 0.6, 0.6, 0.5, 0.55],
    'D': [0, 0.1, 0.1, 0.25, 0.3, 0.3, 0.3, 0.1, 0.45, 0.1],
    'A': [0, 0, 0.1, 0.25, 0.35, 0.1, 0.1, 0.3, 0.05, 0.35],
    "model": np.tile(["a", "b"], 5),
    "match": np.repeat(np.arange(1, 6), 2),
    "outcome": np.repeat(["H", "H", "D", "H", "H"], 2),
})
expected_result = [0, 0.005, 0.025, 0.15625, 0.1225, 0.185, 0.085, 0.125, 0.12625, 0.1625]
predictions = predictions_df[["H", "D", "A"]]
observed = pd.get_dummies(predictions_df["outcome"]).reindex(
    columns=predictions.columns, fill_value=0
)
rps_result = rps(predictions, observed).round(5)


#print(predictions)
#print(f"do results match expected: {all(expected_result == rps_result)}")

#print(np.mean(rps_result))

In [56]:
#calculate rps score for betting odds
PL_20_21 = pd.read_csv('Data/2020-21/Fixtures/E_20_21.csv')
PL_21_22 = pd.read_csv('Data/2021-22/Fixtures/E_21_22.csv')
PL_20_21['B365H'] = 1/PL_20_21['B365H']
PL_20_21['B365D'] = 1/PL_20_21['B365D']
PL_20_21['B365A'] = 1/PL_20_21['B365A']
PL_20_21['SUM'] = PL_20_21['B365H'] + PL_20_21['B365D'] + PL_20_21['B365A']
PL_20_21['B365H'] = PL_20_21['B365H']/PL_20_21['SUM']
PL_20_21['B365D'] = PL_20_21['B365D']/PL_20_21['SUM']
PL_20_21['B365A'] = PL_20_21['B365A']/PL_20_21['SUM']
PL_21_22['B365H'] = 1/PL_21_22['B365H']
PL_21_22['B365D'] = 1/PL_21_22['B365D']
PL_21_22['B365A'] = 1/PL_21_22['B365A']
PL_21_22['SUM'] = PL_21_22['B365H'] + PL_21_22['B365D'] + PL_21_22['B365A']
PL_21_22['B365H'] = PL_21_22['B365H']/PL_21_22['SUM']
PL_21_22['B365D'] = PL_21_22['B365D']/PL_21_22['SUM']
PL_21_22['B365A'] = PL_21_22['B365A']/PL_21_22['SUM']
probabilities_20 = PL_20_21[['HomeTeam', 'AwayTeam', 'B365H', 'B365D', 'B365A', 'FTR']]
probabilities_20 = probabilities_20.rename(columns={'B365H':'H','B365D':'D','B365A':'A'})
predictions_20 = probabilities_20[["H", "D", "A"]]
observed_20 = pd.get_dummies(probabilities_20["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)

probabilities_21 = PL_21_22[['HomeTeam', 'AwayTeam', 'B365H', 'B365D', 'B365A', 'FTR']]
probabilities_21 = probabilities_21[-350:]
probabilities_21 = probabilities_21.rename(columns={'B365H':'H','B365D':'D','B365A':'A'})
predictions_21 = probabilities_21[["H", "D", "A"]]
observed_21 = pd.get_dummies(probabilities_21["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)



rps_result_21 = rps(predictions_21, observed_21).round(5)
rps_result_20 = rps(predictions_20, observed_20).round(5)

rps_betting_odds = np.mean(rps_result_21)
print(rps_betting_odds)

0.19134634285714286


In [61]:
#calculate rps score for community votes
PL_21_22.replace(['Leeds', 'Leicester', 'Man City', 'Man United', 'Newcastle', 'Norwich', 'Sheff Utd', 'Spurs'],
                 ['Leeds United', 'Leicester City', 'Manchester City', 'Manchester Utd', 'Newcastle Utd',
                  'Norwich City', 'Sheffield Utd', 'Tottenham'], inplace=True)
PL_21_22['game_id'] = PL_21_22['HomeTeam'] + PL_21_22['AwayTeam']
PL_21_22['HomeTeam'].drop_duplicates()
transfermarkt_probs_21 = pd.read_csv('Data/2021-22/Fixtures/tranfermarkt_21_probs.csv')
test = pd.merge(PL_21_22, transfermarkt_probs_21, on='game_id', how='left')

predictions = test[["H", "D", "A"]]
observed = pd.get_dummies(test["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)
rps_result_community = rps(predictions, observed).round(5)

rps_community_votes = np.mean(rps_result_community)

print(rps_community_votes)

0.22040236842105265


In [58]:
#calculate rps score for rf model benchmark
probs_benchmark = pd.read_csv('Data/Total/probs_benchmark_info.csv')

predictions = probs_benchmark[["H", "D", "A"]]
observed = pd.get_dummies(probabilities_21["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)
rps_result = rps(predictions, observed).round(5)

rps_rf_benchmark = np.mean(rps_result)
print(rps_rf_benchmark)

0.2156126842105263


In [59]:
#calculate rps score for rf model benchmark
probs_player_stats= pd.read_csv('Data/Total/probs_player_stats_info.csv')

predictions = probs_player_stats[["H", "D", "A"]]
observed = pd.get_dummies(probabilities_21["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)
rps_result = rps(predictions, observed).round(5)

rps_rf_player_stats = np.mean(rps_result)
print(rps_rf_player_stats)

0.21135973684210527


In [60]:
print('RPS score random forest benchmark:', rps_rf_benchmark)
print('RPS score random forest incl player stats:', rps_rf_player_stats)
print('RPS score community votes:', rps_community_votes)
print('RPS score betting odds:', rps_betting_odds)

RPS score random forest benchmark: 0.2156126842105263
RPS score random forest incl player stats: 0.21135973684210527
RPS score community votes: 0.22040236842105265
RPS score betting odds: 0.19134634285714286
