In [1]:
#notebook to calculate accuracy of the models and compare to benchmarks community votes and betting odds
#import necessary packages
import pandas as pd
import numpy as np

In [2]:
def rps(predictions, observed):
    """Vectorized version of Ranked Probability Score.
    A lower value is a better score.
    From: Colin Catlin, https://syllepsis.live/2022/01/22/ranked-probability-score-in-python/

    Args:
        predictions (pd.DataFrame): each column is an outcome category
            with values as the 0 to 1 probability of that category
        observed (pd.DataFrame): each column is an outcome category
            with values of 0 OR 1 with 1 being that category occurred
    """
    assert (
        predictions.shape == observed.shape
    ), "prediction and observed array shapes must match"
    ncat = predictions.shape[1] - 1
    return (
        np.sum(
            (np.cumsum(predictions, axis=1) - np.cumsum(observed, axis=1)) ** 2, axis=1
        ) / ncat
    )


"""
Sample data is of soccer/football matches,
5 matches, with 2 different models predicting (with 'a' the better model)
3 Categories
Where H = home team wins, D = draw, A = away wins

ORDER MATTERS (of H, D, and A), which is the whole point of this metric.
"""
predictions_df = pd.DataFrame({
    'H': [1, 0.9, 0.8, 0.5, 0.35, 0.6, 0.6, 0.6, 0.5, 0.55],
    'D': [0, 0.1, 0.1, 0.25, 0.3, 0.3, 0.3, 0.1, 0.45, 0.1],
    'A': [0, 0, 0.1, 0.25, 0.35, 0.1, 0.1, 0.3, 0.05, 0.35],
    "model": np.tile(["a", "b"], 5),
    "match": np.repeat(np.arange(1, 6), 2),
    "outcome": np.repeat(["H", "H", "D", "H", "H"], 2),
})
expected_result = [0, 0.005, 0.025, 0.15625, 0.1225, 0.185, 0.085, 0.125, 0.12625, 0.1625]
predictions = predictions_df[["H", "D", "A"]]
observed = pd.get_dummies(predictions_df["outcome"]).reindex(
    columns=predictions.columns, fill_value=0
)
rps_result = rps(predictions, observed).round(5)


print(predictions)
#print(f"do results match expected: {all(expected_result == rps_result)}")

print(np.mean(rps_result))

      H     D     A
0  1.00  0.00  0.00
1  0.90  0.10  0.00
2  0.80  0.10  0.10
3  0.50  0.25  0.25
4  0.35  0.30  0.35
5  0.60  0.30  0.10
6  0.60  0.30  0.10
7  0.60  0.10  0.30
8  0.50  0.45  0.05
9  0.55  0.10  0.35
0.09925


In [11]:
PL_20_21 = pd.read_csv('Data/2020-21/Fixtures/E_20_21.csv')
PL_21_22 = pd.read_csv('Data/2021-22/Fixtures/E_21_22.csv')
PL_20_21['B365H'] = 1/PL_20_21['B365H']
PL_20_21['B365D'] = 1/PL_20_21['B365D']
PL_20_21['B365A'] = 1/PL_20_21['B365A']
PL_20_21['SUM'] = PL_20_21['B365H'] + PL_20_21['B365D'] + PL_20_21['B365A']
PL_20_21['B365H'] = PL_20_21['B365H']/PL_20_21['SUM']
PL_20_21['B365D'] = PL_20_21['B365D']/PL_20_21['SUM']
PL_20_21['B365A'] = PL_20_21['B365A']/PL_20_21['SUM']
PL_21_22['B365H'] = 1/PL_21_22['B365H']
PL_21_22['B365D'] = 1/PL_21_22['B365D']
PL_21_22['B365A'] = 1/PL_21_22['B365A']
PL_21_22['SUM'] = PL_21_22['B365H'] + PL_21_22['B365D'] + PL_21_22['B365A']
PL_21_22['B365H'] = PL_21_22['B365H']/PL_21_22['SUM']
PL_21_22['B365D'] = PL_21_22['B365D']/PL_21_22['SUM']
PL_21_22['B365A'] = PL_21_22['B365A']/PL_21_22['SUM']
probabilities_20 = PL_20_21[['HomeTeam', 'AwayTeam', 'B365H', 'B365D', 'B365A', 'FTR']]
probabilities_20 = probabilities_20.rename(columns={'B365H':'H','B365D':'D','B365A':'A'})
predictions_20 = probabilities_20[["H", "D", "A"]]
observed_20 = pd.get_dummies(probabilities_20["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)

probabilities_21 = PL_21_22[['HomeTeam', 'AwayTeam', 'B365H', 'B365D', 'B365A', 'FTR']]
probabilities_21 = probabilities_21[-350:]
probabilities_21 = probabilities_21.rename(columns={'B365H':'H','B365D':'D','B365A':'A'})
predictions_21 = probabilities_21[["H", "D", "A"]]
observed_21 = pd.get_dummies(probabilities_21["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)



rps_result_21 = rps(predictions_21, observed_21).round(5)
rps_result_20 = rps(predictions_20, observed_20).round(5)

probabilities_21
print(np.mean(rps_result_21))

0.19134634285714286


In [12]:
probs_benchmark = pd.read_csv('Data/Total/probs_benchmark.csv')

predictions = probs_benchmark[["H", "D", "A"]]
observed = pd.get_dummies(probabilities_21["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)
rps_result = rps(predictions, observed).round(5)


print(np.mean(rps_result))
print(predictions)

0.2158239210526316
            H         D         A
0    0.533596  0.260783  0.205621
1    0.718345  0.095985  0.185670
2    0.151457  0.614028  0.234515
3    0.100253  0.720705  0.179042
4    0.850411  0.069783  0.079807
..        ...       ...       ...
345  0.204146  0.318307  0.477548
346  0.547199  0.146458  0.306342
347  0.771624  0.121954  0.106423
348  0.761513  0.122945  0.115542
349  0.229868  0.541379  0.228753

[350 rows x 3 columns]


In [13]:
probs_player_stats= pd.read_csv('Data/Total/probs_player_stats.csv')

predictions = probs_player_stats[["H", "D", "A"]]
observed = pd.get_dummies(probabilities_21["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)
rps_result = rps(predictions, observed).round(5)


print(np.mean(rps_result))
print(predictions)

0.21415544736842104
            H         D         A
0    0.631022  0.200140  0.168838
1    0.660644  0.136281  0.203074
2    0.151899  0.674000  0.174101
3    0.112982  0.771997  0.115021
4    0.848845  0.075893  0.075262
..        ...       ...       ...
345  0.259396  0.378656  0.361948
346  0.626677  0.173323  0.200000
347  0.604902  0.264070  0.131028
348  0.629223  0.209536  0.161241
349  0.179034  0.591900  0.229065

[350 rows x 3 columns]


In [24]:
PL_21_22.replace(['Leeds','Leicester','Man City' ,'Man United','Newcastle', 'Norwich','Sheff Utd', 'Spurs'], ['Leeds United', 'Leicester City', 'Manchester City', 'Manchester Utd', 'Newcastle Utd', 'Norwich City','Sheffield Utd', 'Tottenham'], inplace=True)
PL_21_22['game_id'] = PL_21_22['HomeTeam'] + PL_21_22['AwayTeam']
PL_21_22['HomeTeam'].drop_duplicates()

0           Brentford
1      Manchester Utd
2             Burnley
3             Chelsea
4             Everton
5      Leicester City
6             Watford
7        Norwich City
8       Newcastle Utd
9           Tottenham
10          Liverpool
11        Aston Villa
12     Crystal Palace
13       Leeds United
14    Manchester City
15           Brighton
16        Southampton
17             Wolves
18            Arsenal
19           West Ham
Name: HomeTeam, dtype: object

In [26]:
transfermarkt_probs_21 = pd.read_csv('Data/2021-22/Fixtures/tranfermarkt_21_probs.csv')
test =  pd.merge(PL_21_22, transfermarkt_probs_21, on= 'game_id', how='left')


In [27]:
predictions = test[["H", "D", "A"]]
observed = pd.get_dummies(test["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)
rps_result_community = rps(predictions, observed).round(5)


print('RPS score community votes:', np.mean(rps_result_community))


RPS score community votes: 0.22040236842105265


In [65]:
probs = pd.read_csv('Data/2020-21/Fixtures/probabilities.csv')
probs
predictions = probs[["H", "D", "A"]]
observed = pd.get_dummies(probabilities["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)
rps_result_rf_player_stats = rps(predictions, observed).round(5)


print('RPS score random forest incl player stats:', np.mean(rps_result_rf_player_stats))

RPS score random forest incl player stats: 0.2381925


In [70]:
probs = pd.read_csv('Data/2020-21/Fixtures/probabilities_ohne_pl_stats.csv')
probs
predictions = probs[["H", "D", "A"]]
observed = pd.get_dummies(probabilities["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)
rps_result_rf = rps(predictions, observed).round(5)


print('RPS score random forest incl player stats:', np.mean(rps_result_rf))

RPS score random forest incl player stats: 0.24053026315789475


In [66]:
PL_20_21 = pd.read_csv('Data/2020-21/Fixtures/E_20_21.csv')
PL_20_21['B365H'] = 1/PL_20_21['B365H']
PL_20_21['B365D'] = 1/PL_20_21['B365D']
PL_20_21['B365A'] = 1/PL_20_21['B365A']
PL_20_21['SUM'] = PL_20_21['B365H'] + PL_20_21['B365D'] + PL_20_21['B365A']
PL_20_21['B365H'] = PL_20_21['B365H']/PL_20_21['SUM']
PL_20_21['B365D'] = PL_20_21['B365D']/PL_20_21['SUM']
PL_20_21['B365A'] = PL_20_21['B365A']/PL_20_21['SUM']
probabilities = PL_20_21[['HomeTeam', 'AwayTeam', 'B365H', 'B365D', 'B365A', 'FTR']]
probabilities = probabilities.rename(columns={'B365H':'H','B365D':'D','B365A':'A'})
predictions = probabilities[["H", "D", "A"]]
observed = pd.get_dummies(probabilities["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)
observed
rps_result_betting_odds = rps(predictions, observed).round(5)

print('RPS score betting odds:', np.mean(rps_result_betting_odds))

RPS score betting odds: 0.21371036842105265


In [71]:
print('RPS score random forest benchmark:', np.mean(rps_result_rf))
print('RPS score random forest incl player stats:', np.mean(rps_result_rf_player_stats))
print('RPS score community votes:', np.mean(rps_result_community))
print('RPS score betting odds:', np.mean(rps_result_betting_odds))

RPS score random forest benchmark: 0.24053026315789475
RPS score random forest incl player stats: 0.2381925
RPS score community votes: 0.2701898265895954
RPS score betting odds: 0.21371036842105265
