In [1]:
#import necessary packages
import pandas as pd
import numpy as np

In [39]:
def rps(predictions, observed):
    """Compute the Ranked Probability Score (RPS) for every row at once.

    Lower is better; a forecast that puts all of its probability on the
    category that occurred scores 0.
    From: Colin Catlin, https://syllepsis.live/2022/01/22/ranked-probability-score-in-python/

    Args:
        predictions (pd.DataFrame): one column per outcome category, in
            ranked order, holding the forecast probability (0 to 1) of
            that category
        observed (pd.DataFrame): same layout as ``predictions``, holding 1
            for the category that occurred and 0 for the others

    Returns:
        One score per row: the squared differences between the cumulative
        predicted and cumulative observed distributions, summed across the
        categories and divided by (number of categories - 1).
    """
    same_shape = predictions.shape == observed.shape
    assert same_shape, "prediction and observed array shapes must match"

    # Both distributions are accumulated along the category axis, so the
    # order of the columns determines the score.
    cumulative_forecast = np.cumsum(predictions, axis=1)
    cumulative_outcome = np.cumsum(observed, axis=1)
    squared_gap = (cumulative_forecast - cumulative_outcome) ** 2

    # Normalise by the number of categories minus one.
    n_thresholds = predictions.shape[1] - 1
    return np.sum(squared_gap, axis=1) / n_thresholds


"""
Sample data is of soccer/football matches,
5 matches, with 2 different models predicting (with 'a' the better model)
3 Categories
Where H = home team wins, D = draw, A = away wins

ORDER MATTERS (of H, D, and A), which is the whole point of this metric.
"""
# Ten forecasts: matches 1-5, each predicted by model "a" and model "b".
# "outcome" holds the actual result of the match and is repeated for both models.
predictions_df = pd.DataFrame({
    'H': [1, 0.9, 0.8, 0.5, 0.35, 0.6, 0.6, 0.6, 0.5, 0.55],
    'D': [0, 0.1, 0.1, 0.25, 0.3, 0.3, 0.3, 0.1, 0.45, 0.1],
    'A': [0, 0, 0.1, 0.25, 0.35, 0.1, 0.1, 0.3, 0.05, 0.35],
    "model": np.tile(["a", "b"], 5),
    "match": np.repeat(np.arange(1, 6), 2),
    "outcome": np.repeat(["H", "H", "D", "H", "H"], 2),
})
# Reference RPS value for each of the ten rows above, in row order.
expected_result = [0, 0.005, 0.025, 0.15625, 0.1225, 0.185, 0.085, 0.125, 0.12625, 0.1625]
predictions = predictions_df[["H", "D", "A"]]
# One-hot encode the outcome; reindex so the columns follow the same H, D, A
# order as the predictions and a category that never occurred (here "A")
# still gets an all-zero column.
observed = pd.get_dummies(predictions_df["outcome"]).reindex(
    columns=predictions.columns, fill_value=0
)
rps_result = rps(predictions, observed).round(5)

# Guard the rps() implementation against the reference values. A tolerance is
# used instead of == because the scores are floats rounded to 5 decimals.
np.testing.assert_allclose(rps_result, expected_result, rtol=0, atol=1e-5)

print(predictions)

print(np.mean(rps_result))

      H     D     A
0  1.00  0.00  0.00
1  0.90  0.10  0.00
2  0.80  0.10  0.10
3  0.50  0.25  0.25
4  0.35  0.30  0.35
5  0.60  0.30  0.10
6  0.60  0.30  0.10
7  0.60  0.10  0.30
8  0.50  0.45  0.05
9  0.55  0.10  0.35


In [58]:
PL_20_21 = pd.read_csv('Data/2020-21/Fixtures/E_20_21.csv')

# Replace each B365* odds column (presumably Bet365 decimal odds - confirm
# against the data source) by its reciprocal, overwriting the column.
odds_columns = ['B365H', 'B365D', 'B365A']
for odds_col in odds_columns:
    PL_20_21[odds_col] = 1/PL_20_21[odds_col]

# The reciprocals do not add up to exactly 1 per match, so keep the row total
# in SUM and rescale the three columns by it.
PL_20_21['SUM'] = PL_20_21['B365H'] + PL_20_21['B365D'] + PL_20_21['B365A']
for odds_col in odds_columns:
    PL_20_21[odds_col] = PL_20_21[odds_col]/PL_20_21['SUM']

# Narrow to the columns needed for scoring and use the H/D/A names that the
# other probability sources use.
probabilities = PL_20_21[
    ['HomeTeam', 'AwayTeam', 'B365H', 'B365D', 'B365A', 'FTR']
].rename(columns={'B365H':'H','B365D':'D','B365A':'A'})
predictions = probabilities[["H", "D", "A"]]

# One-hot encode the result column FTR in the same column order as the predictions.
result_dummies = pd.get_dummies(probabilities["FTR"])
observed = result_dummies.reindex(columns=predictions.columns, fill_value=0)
rps_result = rps(predictions, observed).round(5)


print(np.mean(rps_result))
print(predictions)

0.21371036842105265
            H         D         A
0    0.158548  0.219697  0.621756
1    0.306573  0.292424  0.401003
2    0.741801  0.158251  0.099948
3    0.441558  0.279221  0.279221
4    0.249733  0.263607  0.486660
..        ...       ...       ...
375  0.826366  0.110830  0.062804
376  0.662454  0.190787  0.146759
377  0.339623  0.264151  0.396226
378  0.573888  0.236729  0.189383
379  0.378758  0.270541  0.350701

[380 rows x 3 columns]


In [53]:
# Score the model probabilities stored in probabilities.csv.
probs = pd.read_csv('Data/2020-21/Fixtures/probabilities.csv')
predictions = probs[["H", "D", "A"]]

# The actual results are taken from `probabilities`, which is built in the
# betting-odds cell; that cell has to run first.
# NOTE(review): this assumes the rows of probabilities.csv are in the same
# order as the fixtures file - confirm.
result_dummies = pd.get_dummies(probabilities["FTR"])
observed = result_dummies.reindex(columns=predictions.columns, fill_value=0)
rps_result = rps(predictions, observed).round(5)


print(np.mean(rps_result))
print(predictions)

0.2381925
        H     D     A
0    0.06  0.91  0.03
1    0.81  0.18  0.01
2    0.91  0.08  0.01
3    0.72  0.28  0.00
4    0.06  0.91  0.03
..    ...   ...   ...
375  0.86  0.10  0.04
376  0.42  0.41  0.17
377  0.80  0.15  0.05
378  0.85  0.11  0.04
379  0.10  0.79  0.11

[380 rows x 3 columns]


In [47]:
# Build a per-fixture key by concatenating the home and away team names
# (no separator); the same key is used later to merge with transfermarkt_probs.
home_names = PL_20_21['HomeTeam']
away_names = PL_20_21['AwayTeam']
PL_20_21['game_id'] = home_names + away_names
PL_20_21

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,SUM,game_id
0,E0,12/09/2020,12:30,Fulham,Arsenal,0,3,A,0,1,...,2.01,1.89,2.02,1.91,2.13,1.92,2.02,1.87,1.051208,FulhamArsenal
1,E0,12/09/2020,15:00,Crystal Palace,Southampton,1,0,H,1,0,...,1.78,2.13,1.79,2.17,1.85,2.18,1.79,2.12,1.052214,Crystal PalaceSouthampton
2,E0,12/09/2020,17:30,Liverpool,Leeds,4,3,H,3,2,...,1.85,2.05,1.85,2.08,1.90,2.16,1.84,2.04,1.053180,LiverpoolLeeds
3,E0,12/09/2020,20:00,West Ham,Newcastle,0,2,A,0,0,...,2.03,1.87,2.04,1.88,2.09,1.91,2.02,1.86,1.053352,West HamNewcastle
4,E0,13/09/2020,14:00,West Brom,Leicester,0,3,A,0,0,...,1.92,1.98,1.93,1.99,1.95,2.01,1.91,1.97,1.053756,West BromLeicester
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,E0,23/05/2021,16:00,Liverpool,Crystal Palace,2,0,H,1,0,...,1.86,2.04,1.88,2.03,1.98,2.14,1.88,2.00,1.061507,LiverpoolCrystal Palace
376,E0,23/05/2021,16:00,Man City,Everton,5,0,H,2,0,...,2.01,1.89,1.99,1.89,2.20,2.00,2.03,1.85,1.048291,Man CityEverton
377,E0,23/05/2021,16:00,Sheffield United,Burnley,1,0,H,1,0,...,2.04,1.86,2.05,1.86,2.17,1.90,2.03,1.84,1.051587,Sheffield UnitedBurnley
378,E0,23/05/2021,16:00,West Ham,Southampton,3,0,H,2,0,...,2.00,1.90,2.02,1.91,2.06,2.01,1.99,1.89,1.056061,West HamSouthampton


In [54]:
# Load the transfermarkt H/D/A probabilities, which already carry a game_id column.
# NOTE(review): the file name is spelled 'tranfermarkt' (no 's'); kept as-is
# because it presumably matches the file on disk.
transfermarkt_csv = 'Data/2020-21/Fixtures/tranfermarkt_probs.csv'
transfermarkt_probs = pd.read_csv(transfermarkt_csv)
transfermarkt_probs

Unnamed: 0.1,Unnamed: 0,home_team,away_team,H,D,A,game_id
0,0,Chelsea,Crystal Palace,0.939,0.041,0.020,ChelseaCrystal Palace
1,1,Everton,Brighton,0.955,0.045,0.000,EvertonBrighton
2,2,Leeds,Man City,0.063,0.137,0.800,LeedsMan City
3,3,Newcastle,Burnley,0.840,0.121,0.039,NewcastleBurnley
4,4,Leicester,West Ham,0.971,0.024,0.005,LeicesterWest Ham
...,...,...,...,...,...,...,...
345,5,Liverpool,Crystal Palace,0.962,0.025,0.013,LiverpoolCrystal Palace
346,6,Man City,Everton,0.834,0.102,0.064,Man CityEverton
347,7,Sheffield United,Burnley,0.172,0.242,0.586,Sheffield UnitedBurnley
348,8,West Ham,Southampton,0.879,0.089,0.032,West HamSouthampton


In [56]:
# Attach the transfermarkt probabilities to the fixtures via game_id, then
# discard incomplete rows.
# NOTE(review): dropna() with no subset removes a row when ANY column is NaN,
# so it drops fixtures without a transfermarkt match but also fixtures with a
# gap in an unrelated column - confirm this is intended.
test = PL_20_21.merge(transfermarkt_probs, how='left', on='game_id').dropna()
test

Unnamed: 0.1,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgCAHH,AvgCAHA,SUM,game_id,Unnamed: 0,home_team,away_team,H,D,A
28,E0,03/10/2020,12:30,Chelsea,Crystal Palace,4,0,H,0,0,...,1.90,1.98,1.049812,ChelseaCrystal Palace,0.0,Chelsea,Crystal Palace,0.939,0.041,0.020
29,E0,03/10/2020,15:00,Everton,Brighton,4,2,H,2,1,...,2.03,1.86,1.056395,EvertonBrighton,1.0,Everton,Brighton,0.955,0.045,0.000
30,E0,03/10/2020,17:30,Leeds,Man City,1,1,D,0,1,...,1.91,1.96,1.059104,LeedsMan City,2.0,Leeds,Man City,0.063,0.137,0.800
31,E0,03/10/2020,20:00,Newcastle,Burnley,3,1,H,1,0,...,2.07,1.83,1.051747,NewcastleBurnley,3.0,Newcastle,Burnley,0.840,0.121,0.039
32,E0,04/10/2020,12:00,Leicester,West Ham,0,3,A,0,2,...,2.06,1.83,1.049689,LeicesterWest Ham,4.0,Leicester,West Ham,0.971,0.024,0.005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,E0,23/05/2021,16:00,Liverpool,Crystal Palace,2,0,H,1,0,...,1.88,2.00,1.061507,LiverpoolCrystal Palace,5.0,Liverpool,Crystal Palace,0.962,0.025,0.013
376,E0,23/05/2021,16:00,Man City,Everton,5,0,H,2,0,...,2.03,1.85,1.048291,Man CityEverton,6.0,Man City,Everton,0.834,0.102,0.064
377,E0,23/05/2021,16:00,Sheffield United,Burnley,1,0,H,1,0,...,2.03,1.84,1.051587,Sheffield UnitedBurnley,7.0,Sheffield United,Burnley,0.172,0.242,0.586
378,E0,23/05/2021,16:00,West Ham,Southampton,3,0,H,2,0,...,1.99,1.89,1.056061,West HamSouthampton,8.0,West Ham,Southampton,0.879,0.089,0.032


In [64]:
# Score the community-vote probabilities (the H/D/A columns that the merge
# brought into `test`). Predictions and results come from the same frame, so
# every forecast is paired with the result of its own match.
predictions = test[["H", "D", "A"]]
# One-hot encode the result column FTR in the same column order as the predictions.
observed = pd.get_dummies(test["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)
rps_result_community = rps(predictions, observed).round(5)


print('RPS score community votes:', np.mean(rps_result_community))


RPS score community votes: 0.2701898265895954


In [65]:
# Random forest including player stats: score the probabilities in probabilities.csv.
probs = pd.read_csv('Data/2020-21/Fixtures/probabilities.csv')
predictions = probs[["H", "D", "A"]]

# Actual results come from `probabilities` (built in the betting-odds cell).
# NOTE(review): assumes probabilities.csv lists the matches in the same row
# order as the fixtures file - confirm.
ftr_dummies = pd.get_dummies(probabilities["FTR"])
observed = ftr_dummies.reindex(columns=predictions.columns, fill_value=0)
rps_result_rf_player_stats = rps(predictions, observed).round(5)


print('RPS score random forest incl player stats:', np.mean(rps_result_rf_player_stats))

RPS score random forest incl player stats: 0.2381925


In [70]:
# Random forest benchmark: probabilities from the file whose name says
# "ohne_pl_stats" (German: without player stats).
probs = pd.read_csv('Data/2020-21/Fixtures/probabilities_ohne_pl_stats.csv')
predictions = probs[["H", "D", "A"]]
# Actual results come from `probabilities` (built in the betting-odds cell).
# NOTE(review): assumes this CSV lists the matches in the same row order as
# the fixtures file - confirm.
observed = pd.get_dummies(probabilities["FTR"]).reindex(
    columns=predictions.columns, fill_value=0)
rps_result_rf = rps(predictions, observed).round(5)


# The label matches the one the summary cell uses for rps_result_rf; the old
# label wrongly said "incl player stats".
print('RPS score random forest benchmark:', np.mean(rps_result_rf))

RPS score random forest incl player stats: 0.24053026315789475


In [66]:
# Reload the fixtures so the B365* columns hold the original odds again.
PL_20_21 = pd.read_csv('Data/2020-21/Fixtures/E_20_21.csv')

# Reciprocal of each odds column (presumably Bet365 decimal odds - confirm).
inv_home = 1/PL_20_21['B365H']
inv_draw = 1/PL_20_21['B365D']
inv_away = 1/PL_20_21['B365A']

# The reciprocals do not add up to exactly 1 per match; SUM keeps the row
# total and the three columns are overwritten with the rescaled values.
PL_20_21['SUM'] = inv_home + inv_draw + inv_away
PL_20_21['B365H'] = inv_home/PL_20_21['SUM']
PL_20_21['B365D'] = inv_draw/PL_20_21['SUM']
PL_20_21['B365A'] = inv_away/PL_20_21['SUM']

# Keep only what is needed for scoring, under the shared H/D/A column names.
scoring_columns = ['HomeTeam', 'AwayTeam', 'B365H', 'B365D', 'B365A', 'FTR']
probabilities = PL_20_21[scoring_columns]
probabilities = probabilities.rename(columns={'B365H':'H','B365D':'D','B365A':'A'})
predictions = probabilities[["H", "D", "A"]]

# One-hot encode the result column FTR in the same column order as the predictions.
ftr_dummies = pd.get_dummies(probabilities["FTR"])
observed = ftr_dummies.reindex(columns=predictions.columns, fill_value=0)
rps_result_betting_odds = rps(predictions, observed).round(5)

print('RPS score betting odds:', np.mean(rps_result_betting_odds))

RPS score betting odds: 0.21371036842105265


In [71]:
# Summary: mean RPS per probability source (lower is better). Each entry pairs
# the printed label with the per-match scores computed in the cells above.
summary_rows = (
    ('RPS score random forest benchmark:', rps_result_rf),
    ('RPS score random forest incl player stats:', rps_result_rf_player_stats),
    ('RPS score community votes:', rps_result_community),
    ('RPS score betting odds:', rps_result_betting_odds),
)
for label, per_match_scores in summary_rows:
    print(label, np.mean(per_match_scores))

RPS score random forest benchmark: 0.24053026315789475
RPS score random forest incl player stats: 0.2381925
RPS score community votes: 0.2701898265895954
RPS score betting odds: 0.21371036842105265
