In [1658]:
import pandas as pd
import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
import numpy as np

In [1659]:
current_time = datetime.datetime.today()
year = str(current_time.year)
month = str(current_time.month)
day = str(current_time.day)

if len(month) == 1:
	month = "0" + month
if len(day) == 1:
	day = "0" + day

split_date = year + '-' + month + '-' + day
split_date

split_date = '2023-04-01'

In [1660]:
li = []

li.append(pd.read_csv("../data/premier_league_2019-2023.csv", index_col=0))
# li.append(pd.read_csv("../data/la_liga_2019-2023.csv", index_col=0))
# li.append(pd.read_csv("../data/bundesliga_2019-2023.csv", index_col=0))
# li.append(pd.read_csv("../data/serie_a_2019-2023.csv", index_col=0))
matches = pd.concat(li, axis=0, ignore_index=True)
matches.shape

(3466, 129)

In [1661]:
matches["target"] = (matches["result"] == "W").astype("int")

In [1662]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes == 1
matches["opponent_code"] = matches["opponent"].astype("category").cat.codes #each opponent now has their own code

In [1663]:
matches["date"] = pd.to_datetime(matches["date"])
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int") #remove : and minutes, keep just hour
matches["day_code"] = matches["date"].dt.dayofweek

In [1664]:
matches["formation_code"] = matches["formation"].astype("category").cat.codes #each opponent now has their own code
matches["referee_code"] = matches["referee"].astype("category").cat.codes #each opponent now has their own code

In [1665]:
rf = RandomForestClassifier(n_estimators=10000, random_state=1)

In [1666]:
train = matches[matches["date"] < split_date]
test = matches[matches["date"] > split_date]

In [1667]:
predictors = ["venue_code", "opponent_code", "hour", "day_code", "formation_code", "referee_code"] # add date

In [1668]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date") # date wird immer spaeter von 2019 bis 2024

    weights = [0.333333, 0.333333, 0.333333] # Weights der Games

    rolling_stats = group[cols].rolling(3, closed='left').apply(lambda x: (x * weights).sum())
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)

    return group

In [1669]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt", "sota"]
# , "saves", "opp", "stp"
new_cols = [f"{c}_rolling" for c in cols]
opp_cols = [f"{c}_rolling_opp" for c in cols]

In [1670]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team')

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [1671]:
def rolling_averages_opponent(matches_rolling):
    grouped = matches_rolling.groupby("team")

    for index, game in matches_rolling.iterrows():
        date = game['date']
        opponent_name = game['opponent']
        team = game['team']

        opponent = grouped.get_group(opponent_name)

        year = str(date.year)
        month = str(date.month)
        day = str(date.day)
        
        if len(month) == 1:
            month = "0" + month
        if len(day) == 1:
            day = "0" + day
              
        date_str = year + "-" + month + "-" + day
        opponent_game = opponent[opponent["date"] == date_str]

        for column in opponent:
            if not "rolling" in column:
                continue
            
            matches_rolling.loc[index, column + "_opp"] = opponent_game[column].values[0] if not opponent_game.empty else None
        # columns = opponent.columns["rolling" in opponent.columns]
        # print(columns)
        # print(opponent)
        # print(game)    

        # break
    
    return matches_rolling

In [1672]:
def rolling_averages_opponent2(matches_rolling):
    matches_rolling['date_str'] = matches_rolling['date'].dt.strftime('%Y-%m-%d')
    matches_rolling.set_index(['team', 'date_str'], inplace=True)

    rolling_cols = [col for col in matches_rolling.columns if 'rolling' in col]
    print(rolling_cols)
    for col in rolling_cols:
        matches_rolling[col + '_opp'] = matches_rolling.groupby('opponent')[col].shift()

    matches_rolling.reset_index(inplace=True)
    matches_rolling = matches_rolling.dropna(subset=opp_cols)
    return matches_rolling

In [1673]:
matches_rolling = rolling_averages_opponent2(matches_rolling)
# matches_rolling.head(50)

['gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'dist_rolling', 'fk_rolling', 'pk_rolling', 'pkatt_rolling', 'sota_rolling']


In [1674]:
matches_rolling.index = range(matches_rolling.shape[0])

In [1675]:
matches_rolling = matches_rolling[matches_rolling["venue"] == "Home"]
print(len(matches_rolling))

1682


In [1676]:
def make_predictions(data, predictors):
    train = data[data["date"] < split_date]
    test = data[data["date"] > split_date]

    rf.fit(train[predictors], train["target"])

    predsWinsTrain = rf.predict(train[predictors])
    predsWinsTest = rf.predict(test[predictors])
    print(predsWinsTest)

    probabilities = rf.predict_proba(test[predictors])
    probabilities = probabilities[:, 0]

    combinedWinsTrain = pd.DataFrame(dict(actual=train["target"], predicted=predsWinsTrain), index=train.index)
    errorWinsTrain = precision_score(train["target"], predsWinsTrain)

    combinedWinsTest = pd.DataFrame(dict(actual=test["target"], predicted=predsWinsTest), index=test.index)
    combinedWinsTest['probabilities'] = probabilities
    
    errorWinsTest = precision_score(test["target"], predsWinsTest)

    print(errorWinsTrain)
    print(errorWinsTest)

    return combinedWinsTest, errorWinsTest

In [1677]:
combinedWins, errorWins = make_predictions(matches_rolling, predictors + new_cols + opp_cols)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 1 1 0 1 1 1 1 1 0 0 0
 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 0 0 1 0 0
 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 0 0 0 1 0 0
 1 1 1 1 1 1 0 1 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
1.0
0.7281553398058253


In [1678]:
combinedWins = combinedWins.merge(matches_rolling[["result", "gf", "ga", "team", "opponent", "venue", "date", "hour", "formation_code", "referee_code"]], left_index=True, right_index=True)

In [1679]:
finalGuess = combinedWins[(combinedWins["predicted"] == 1)].sort_values("date")
finalGuess

Unnamed: 0,actual,predicted,probabilities,result,gf,ga,team,opponent,venue,date,hour,formation_code,referee_code
3156,1,1,0.4566,W,1.0,0.0,West Ham United,Southampton,Home,2023-04-02,14,15,20
1510,1,1,0.4204,W,2.0,1.0,Leeds United,Nott'ham Forest,Home,2023-04-04,19,12,23
285,1,1,0.4260,W,2.0,0.0,Aston Villa,Nott'ham Forest,Home,2023-04-08,15,17,2
1997,1,1,0.4088,W,3.0,1.0,Manchester City,Leicester City,Home,2023-04-15,17,1,6
946,0,1,0.4338,L,1.0,2.0,Chelsea,Brighton,Home,2023-04-15,15,12,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1837,1,1,0.3849,W,4.0,2.0,Liverpool,Newcastle Utd,Home,2024-01-01,20,15,2
144,1,1,0.3827,W,5.0,0.0,Arsenal,Crystal Palace,Home,2024-01-20,12,15,20
671,0,1,0.3398,D,0.0,0.0,Brighton and Hove Albion,Wolves,Home,2024-01-22,19,15,4
1856,1,1,0.4789,W,4.0,0.0,Luton Town,Brighton,Home,2024-01-30,19,4,23


In [1680]:
print(len(finalGuess))
print(len(finalGuess[finalGuess["actual"] == 1]))
print(int(len(finalGuess[finalGuess["actual"] == 1]) / len(finalGuess) * 10000) / 100, "%")
num = len(finalGuess[finalGuess["actual"] == 1]) / len(finalGuess)
print("Minimum win:", 1 + (1.0 - num) / num)

103
75
72.81 %
Minimum win: 1.3733333333333333


In [1681]:
print(len(combinedWins[combinedWins["actual"] == combinedWins["predicted"]]) / len(combinedWins))
print(len(combinedWins))

0.6634920634920635
315
