In [5]:
import pandas as pd

In [18]:
import numpy as np

In [6]:
matches = pd.read_csv('PremierLeague.csv', index_col=0)

In [13]:
matches["Date"] = pd.to_datetime(matches["Date"])

In [19]:
matches["hour"] = matches["Time"].apply(lambda x: int(x.split(":")[0]) if pd.notna(x) and x != '' else np.nan)

In [215]:
result_mapping = {
    "H": 1,
    "D": 0,
    "A": -1
}

In [216]:
matches['Result'] = matches["FullTimeResult"].map(result_mapping)

In [217]:
def implied_probability(odds):
    return 1 / odds

In [261]:
matches['ProbHomeWin'] = matches['MarketAvgHomeTeam'].apply(implied_probability)
matches['ProbDraw'] = matches['MarketAvgDraw'].apply(implied_probability)
matches['ProbAwayWin'] = matches['MarketAvgAwayTeam'].apply(implied_probability)

In [1014]:
train = matches[matches['Date'] < '2023-06-01'].copy()
test = matches[matches['Date'] > '2023-06-01'].copy()

In [1015]:
matches['RefNumber'] = matches['Referee'].astype('category').cat.codes
train = train[train['RefNumber'] != -1]

In [1040]:
#Prioritizing More Recent Games
mostRecent = ['2023-2024', '2022-2023']
somewhatRecent = ['2021-2022', '2020-2021']
lowkeyFarBack = [ '2002-2003', '2003-2004','2004-2005', '2005-2006', '2006-2007', '2007-2008', '2008-2009', '2009-2010', '2010-11', '2011-12']
tooFarBack = ['1993-1994', '1994-1995', '1995-1996', '1996-1997', '1997-1998', '1998-1999', '1999-2000', '2000-2001', '2001-2002']

conditions = [
    train['Season'].isin(mostRecent),
    train['Season'].isin(somewhatRecent),
    train['Season'].isin(lowkeyFarBack),
    train['Season'].isin(tooFarBack)
]

weight1 = 3.00
weight2 = 1.00
weight3 = 0.11
weight4 = 0.03

weights = [weight1, weight2, weight3, weight4]

train.loc[:,'Weight'] = np.select(conditions, weights, default=0.33)

In [1017]:
from sklearn.preprocessing import LabelEncoder

In [1018]:
le = LabelEncoder()

In [1019]:
matches["HomeEncoded"] = le.fit_transform(matches["HomeTeam"])
matches["AwayEncoded"] = le.transform(matches["AwayTeam"])

In [1020]:
train.loc[:,"HomeEncoded"] = train["HomeTeam"].map(dict(zip(matches["HomeTeam"], matches["HomeEncoded"])))
train.loc[:,"AwayEncoded"] = train["AwayTeam"].map(dict(zip(matches["AwayTeam"], matches["AwayEncoded"])))

In [1021]:
test["HomeEncoded"] = test["HomeTeam"].map(dict(zip(matches["HomeTeam"], matches["HomeEncoded"])))
test["AwayEncoded"] = test["AwayTeam"].map(dict(zip(matches["AwayTeam"], matches["AwayEncoded"])))

In [1022]:
#Form Train
train['HomeForm'] = train.groupby('HomeTeam')['Result'].rolling(window=3).mean().reset_index(level=0, drop=True)
train['AwayForm'] = train.groupby('AwayTeam')['Result'].rolling(window=3).mean().reset_index(level=0, drop=True)


In [1023]:
#Form Test
test['HomeForm'] = test.groupby('HomeTeam')['Result'].rolling(window=3).mean().reset_index(level=0, drop=True)
test['AwayForm'] = test.groupby('AwayTeam')['Result'].rolling(window=3).mean().reset_index(level=0, drop=True)

In [1024]:
#Goal Train
train['HomeLast5AvgGoals'] = train.groupby('HomeTeam')['FullTimeHomeTeamGoals'].transform(lambda x: x.rolling(5, closed='left').mean())
train['AwayLast5AvgGoals'] = train.groupby('AwayTeam')['FullTimeAwayTeamGoals'].transform(lambda x: x.rolling(5, closed='left').mean())

# Goal difference in the last 5 matches
train['HomeGoalDiffLast5'] = train.groupby('HomeTeam')['FullTimeHomeTeamGoals'].transform(lambda x: x.rolling(5, closed='left').sum()) - \
                               train.groupby('HomeTeam')['FullTimeAwayTeamGoals'].transform(lambda x: x.rolling(5, closed='left').sum())
train['AwayGoalDiffLast5'] = train.groupby('AwayTeam')['FullTimeAwayTeamGoals'].transform(lambda x: x.rolling(5, closed='left').sum()) - \
                               train.groupby('AwayTeam')['FullTimeHomeTeamGoals'].transform(lambda x: x.rolling(5, closed='left').sum())

In [1025]:
#Goal Train
test['HomeLast5AvgGoals'] = test.groupby('HomeTeam')['FullTimeHomeTeamGoals'].transform(lambda x: x.rolling(5, closed='left').mean())
test['AwayLast5AvgGoals'] = test.groupby('AwayTeam')['FullTimeAwayTeamGoals'].transform(lambda x: x.rolling(5, closed='left').mean())

# Goal difference in the last 5 matches
test['HomeGoalDiffLast5'] = test.groupby('HomeTeam')['FullTimeHomeTeamGoals'].transform(lambda x: x.rolling(5, closed='left').sum()) - \
                               test.groupby('HomeTeam')['FullTimeAwayTeamGoals'].transform(lambda x: x.rolling(5, closed='left').sum())
test['AwayGoalDiffLast5'] = test.groupby('AwayTeam')['FullTimeAwayTeamGoals'].transform(lambda x: x.rolling(5, closed='left').sum()) - \
                               test.groupby('AwayTeam')['FullTimeHomeTeamGoals'].transform(lambda x: x.rolling(5, closed='left').sum())

In [1026]:
features = ["HomeEncoded", "AwayEncoded", "HomeForm", "AwayForm", "HomeLast5AvgGoals", 'AwayLast5AvgGoals', 'HomeGoalDiffLast5', 'AwayGoalDiffLast5', 'hour', 'ProbHomeWin', 'ProbAwayWin', 'ProbDraw', 'RefNumber']

In [1027]:
xtrain = train[features]
ytrain = train['Result']

In [1028]:
xTest = test[features]
yTest = test['Result']

In [1029]:
from sklearn.ensemble import RandomForestClassifier

In [1030]:
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [1031]:
model.fit(xtrain, ytrain, sample_weight=train['Weight'])

In [1032]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [1033]:
predictions = model.predict(xTest)
pred_proba = model.predict_proba(xTest)
acc = accuracy_score(yTest, predictions)
precision = precision_score(yTest, predictions, average='weighted')

In [1034]:
test['PredResult'] = predictions
test['PredProbAwayWin'] = pred_proba[:, 0]
test['PredProbDraw'] = pred_proba[:, 1]
test['PredProbHomeWin'] = pred_proba[:, 2]

In [1035]:
test[['HomeTeam', 'AwayTeam', 'Result', 'PredResult', 'PredProbHomeWin', 'PredProbDraw', 'PredProbAwayWin']]

Unnamed: 0_level_0,HomeTeam,AwayTeam,Result,PredResult,PredProbHomeWin,PredProbDraw,PredProbAwayWin
MatchID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-2024_Burnley_Man City,Burnley,Man City,-1,-1,0.22,0.22,0.56
2023-2024_Arsenal_Nott'm Forest,Arsenal,Nott'm Forest,1,1,0.60,0.21,0.19
2023-2024_Bournemouth_West Ham,Bournemouth,West Ham,0,1,0.39,0.34,0.27
2023-2024_Brighton_Luton,Brighton,Luton,1,1,0.54,0.25,0.21
2023-2024_Everton_Fulham,Everton,Fulham,-1,0,0.26,0.49,0.25
...,...,...,...,...,...,...,...
2024-2025_Tottenham_Everton,Tottenham,Everton,1,1,0.83,0.11,0.06
2024-2025_Aston Villa_Arsenal,Aston Villa,Arsenal,-1,-1,0.07,0.05,0.88
2024-2025_Bournemouth_Newcastle,Bournemouth,Newcastle,0,-1,0.17,0.31,0.52
2024-2025_Wolves_Chelsea,Wolves,Chelsea,-1,-1,0.02,0.16,0.82
