In [73]:
import pandas as pd

In [74]:
split_date = '2022-10-01'

In [75]:
matches = pd.read_csv("premier_league_2019-2024.csv", index_col=0)

In [76]:
matches.shape # rows and columns
# 2 seasons * 20 teams * 38 matches

(3466, 129)

In [77]:
matches["targetWins"] = (matches["result"] == "W").astype("int")
matches["targetLosses"] = (matches["result"] == "L").astype("int")

In [78]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes == 1

In [79]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes #each opponent now has their own code

In [80]:
matches["date"] = pd.to_datetime(matches["date"])
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int") #remove : and minutes, keep just hour
matches["day_code"] = matches["date"].dt.dayofweek

del matches["time"]
del matches["season"]

In [81]:
matches["formation_code"] = matches["formation"].astype("category").cat.codes #each opponent now has their own code
matches["referee_code"] = matches["referee"].astype("category").cat.codes #each opponent now has their own code

In [82]:
matches

Unnamed: 0,date,round,day,venue,result,gf,ga,opponent,xg,xga,...,won%,team,targetWins,targetLosses,venue_code,opp_code,hour,day_code,formation_code,referee_code
0,2019-08-09,Matchweek 1,Fri,Home,W,4.0,1.0,Norwich City,1.8,0.9,...,68.2,Liverpool,1,0,True,17,20,4,15,16
1,2019-08-17,Matchweek 2,Sat,Away,W,2.0,1.0,Southampton,1.3,1.9,...,55.9,Liverpool,1,0,False,20,15,5,15,0
2,2019-08-24,Matchweek 3,Sat,Home,W,3.0,1.0,Arsenal,2.5,1.0,...,38.9,Liverpool,1,0,True,0,17,5,15,2
3,2019-08-31,Matchweek 4,Sat,Away,W,3.0,0.0,Burnley,1.0,0.9,...,52.3,Liverpool,1,0,False,5,17,5,15,3
4,2019-09-14,Matchweek 5,Sat,Home,W,3.0,1.0,Newcastle Utd,3.0,0.3,...,65.8,Liverpool,1,0,True,16,12,5,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,2023-12-22,Matchweek 18,Fri,Away,D,1.0,1.0,Aston Villa,0.6,0.9,...,50.0,Sheffield United,0,0,False,1,20,4,4,2
18,2023-12-26,Matchweek 19,Tue,Home,L,2.0,3.0,Luton Town,2.4,0.6,...,50.0,Sheffield United,0,1,True,13,15,1,17,25
19,2023-12-30,Matchweek 20,Sat,Away,L,0.0,2.0,Manchester City,0.3,2.2,...,37.5,Sheffield United,0,1,False,14,15,5,4,7
20,2024-01-21,Matchweek 21,Sun,Home,D,2.0,2.0,West Ham,2.5,2.3,...,46.2,Sheffield United,0,0,True,24,14,6,10,17


In [83]:
matches.dtypes

date              datetime64[ns]
round                     object
day                       object
venue                     object
result                    object
                       ...      
opp_code                    int8
hour                       int64
day_code                   int32
formation_code              int8
referee_code                int8
Length: 135, dtype: object

In [84]:
from sklearn.ensemble import RandomForestClassifier

In [85]:
rfWins = RandomForestClassifier(n_estimators=1000, min_samples_split=2, random_state=1, max_depth=3)

In [86]:
train = matches[matches["date"] < split_date]

In [87]:
test = matches[matches["date"] > split_date]

In [88]:
predictors = ["venue_code", "opp_code", "hour", "day_code", "formation_code", "referee_code"]

In [89]:
rfWins.fit(train[predictors], train["targetWins"])

In [90]:
predsWins = rfWins.predict(test[predictors])

In [91]:
from sklearn.metrics import accuracy_score

In [92]:
errorWins = accuracy_score(test["targetWins"], predsWins)

In [93]:
print(errorWins)

0.6107899807321773


In [94]:
combinedWins = pd.DataFrame(dict(actual=test["targetWins"], predicted=predsWins))

In [95]:
pd.crosstab(index=combinedWins["actual"], columns=combinedWins["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,626,2
1,402,8


In [96]:
from sklearn.metrics import precision_score

precision_score(test["targetWins"], predsWins)

0.8

In [97]:
grouped_matches = matches.groupby("team")

In [98]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [99]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [100]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [101]:
matches_rolling = matches_rolling.droplevel('team')

In [102]:
matches_rolling.index = range(matches_rolling.shape[0])

In [103]:
def make_predictions(data, predictors):
    train = data[data["date"] < split_date]
    test = data[data["date"] > split_date]

    rfWins.fit(train[predictors], train["targetWins"])

    predsWinsTrain = rfWins.predict(train[predictors])
    predsWinsTest = rfWins.predict(test[predictors])

    # test
    predsProba = rfWins.predict_proba(test[predictors])[:, 1]
    count = 0
    for num in predsProba:
        if(num >= 0.5):
            count += 1
    print(count)

    combinedWinsTrain = pd.DataFrame(dict(actual=train["targetWins"], predicted=predsWinsTrain), index=train.index)
    errorWinsTrain = precision_score(train["targetWins"], predsWinsTrain)

    combinedWinsTest = pd.DataFrame(dict(actual=test["targetWins"], predicted=predsWinsTest), index=test.index)
    errorWinsTest = precision_score(test["targetWins"], predsWinsTest)

    print(errorWinsTrain)
    print(errorWinsTest)

    return combinedWinsTest, errorWinsTest

In [104]:
combinedWins, errorWins = make_predictions(matches_rolling, predictors + new_cols)

66
0.7454545454545455
0.6363636363636364


In [105]:
combinedWins = combinedWins.merge(matches_rolling[["result", "gf", "ga", "team", "opponent", "venue", "date", "hour", "formation_code", "referee_code"]], left_index=True, right_index=True)

In [106]:
test = combinedWins[(combinedWins["predicted"] == 1) & (combinedWins["venue"] == "Home") & (combinedWins["hour"] < 16)].sort_values("date")
test

Unnamed: 0,actual,predicted,result,gf,ga,team,opponent,venue,date,hour,formation_code,referee_code
2001,1,1,W,6.0,3.0,Manchester City,Manchester Utd,Home,2022-10-02,14,15,16
2341,1,1,W,5.0,1.0,Newcastle United,Brentford,Home,2022-10-08,15,15,10
2002,1,1,W,4.0,0.0,Manchester City,Southampton,Home,2022-10-08,15,12,1
2004,1,1,W,3.0,1.0,Manchester City,Brighton,Home,2022-10-22,15,15,4
658,1,1,W,1.0,0.0,Brighton and Hove Albion,Bournemouth,Home,2023-02-04,15,4,4
2019,1,1,W,2.0,0.0,Manchester City,Newcastle Utd,Home,2023-03-04,12,12,27
661,1,1,W,4.0,0.0,Brighton and Hove Albion,West Ham,Home,2023-03-04,15,12,28
138,1,1,W,4.0,1.0,Arsenal,Crystal Palace,Home,2023-03-19,14,15,28
139,1,1,W,4.0,1.0,Arsenal,Leeds United,Home,2023-04-01,15,15,6
1836,1,1,W,3.0,2.0,Liverpool,Nott'ham Forest,Home,2023-04-22,15,15,16


In [107]:
print(len(test))
print(len(test[test["actual"] == 1]))
print(int(len(test[test["actual"] == 1]) / len(test) * 10000) / 100, "%")

24
20
83.33 %


In [108]:
correctW = 0
for index, entity in combinedWins.iterrows():
	if entity['actual'] == entity['predicted'] and entity['predicted'] == 1:
		correctW += 1

uncorrectW = len(combinedWins[combinedWins['predicted'] == 1]) - correctW

print("Wins:")
print("Correct:", correctW)
print("Wrong:", uncorrectW)
print("Ratio:", correctW / (correctW + uncorrectW) * 100, "%")
print("\n")
print(len(combinedWins[combinedWins['actual'] == combinedWins['predicted']]) / len(combinedWins))

Wins:
Correct: 42
Wrong: 24
Ratio: 63.63636363636363 %


0.621256038647343
