In [202]:
import pandas as pd

In [203]:
#First column in data is actually the index column
matches = pd.read_csv("matches.csv", index_col=0)

In [204]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


In [205]:
# We are missing values for some of the teams because we have 2 seasons of data for most teams but some teams have only 1 season.
matches["team"].value_counts()

Southampton                 72
Brighton and Hove Albion    72
Manchester United           72
West Ham United             72
Newcastle United            72
Burnley                     71
Leeds United                71
Crystal Palace              71
Manchester City             71
Wolverhampton Wanderers     71
Tottenham Hotspur           71
Arsenal                     71
Leicester City              70
Chelsea                     70
Aston Villa                 70
Everton                     70
Liverpool                   38
Fulham                      38
West Bromwich Albion        38
Sheffield United            38
Brentford                   34
Watford                     33
Norwich City                33
Name: team, dtype: int64

In [206]:
#Machine learning algorithms don't work with objects so we look variables we can use as predictors
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [207]:
#Creating predictors for ML model

In [208]:
#Convert date time a datetime data type and sort by date
matches["date"] = pd.to_datetime(matches["date"])
matches.sort_values("date",ascending=False)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
38,2022-04-25,20:00,Premier League,Matchweek 34,Mon,Home,D,0.0,0.0,Leeds United,...,Match Report,,17.0,7.0,13.8,0.0,0.0,0.0,2022,Crystal Palace
36,2022-04-25,20:00,Premier League,Matchweek 34,Mon,Away,D,0.0,0.0,Crystal Palace,...,Match Report,,9.0,2.0,16.5,0.0,0.0,0.0,2022,Leeds United
36,2022-04-24,14:00,Premier League,Matchweek 34,Sun,Away,L,0.0,1.0,Burnley,...,Match Report,,10.0,4.0,18.6,0.0,0.0,0.0,2022,Wolverhampton Wanderers
38,2022-04-24,14:00,Premier League,Matchweek 34,Sun,Home,D,2.0,2.0,Southampton,...,Match Report,,8.0,5.0,11.2,0.0,0.0,0.0,2022,Brighton and Hove Albion
36,2022-04-24,14:00,Premier League,Matchweek 34,Sun,Home,W,1.0,0.0,Wolves,...,Match Report,,13.0,5.0,18.8,0.0,0.0,0.0,2022,Burnley
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2020-09-12,15:00,Premier League,Matchweek 1,Sat,Away,L,0.0,1.0,Crystal Palace,...,Match Report,,9.0,5.0,15.6,2.0,0.0,0.0,2021,Southampton
1,2020-09-12,17:30,Premier League,Matchweek 1,Sat,Home,W,4.0,3.0,Leeds United,...,Match Report,,20.0,4.0,17.0,0.0,2.0,2.0,2021,Liverpool
0,2020-09-12,12:30,Premier League,Matchweek 1,Sat,Home,L,0.0,3.0,Arsenal,...,Match Report,,5.0,2.0,26.0,0.0,0.0,0.0,2021,Fulham
0,2020-09-12,15:00,Premier League,Matchweek 1,Sat,Home,W,1.0,0.0,Southampton,...,Match Report,,6.0,3.0,10.1,0.0,0.0,0.0,2021,Crystal Palace


In [209]:
#Whether a game is home or away might affect team performance. We can convert this category into a code to use as a predictor

matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [210]:
#Use regex to extract only the hour from the time the match is supposed to start.
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")


In [211]:
#We create a code that tells us the day of the week.For example if the game is on Saturday the code will be 6
matches["day_code"] = matches["date"].dt.dayofweek

In [212]:
#We are going to give our opponents a code.
matches["opponent_code"] = matches["opponent"].astype("category").cat.codes

In [213]:
#We want to predict whether a team won or not. If result is a win the code is 1 otherwise this variable will be equal to 0.
matches["target_result"]=(matches["result"] == "W").astype("int")

In [214]:
''' Now it's time to train machine learning model. We will use a Random Forest Classifier from scikit learn to predict 
to deal with non linearities in the code'''

" Now it's time to train machine learning model. We will use a Random Forest Classifier from scikit learn to predict \nto deal with non linearities in the code"

In [215]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [216]:
#We must make sure all training data happens before testing data.
train = matches[matches["date"] < '2021-01-01']
test = matches[matches["date"] > '2021-01-01']
predictors = ["venue_code", "opponent_code", "hour", "day_code"]

In [217]:
 #Train randm forest model to using predictors to predict our target
rf.fit(train[predictors], train["target_result"])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [218]:
# We pass in our test data to make predictions
preds = rf.predict(test[predictors])

In [219]:
#Testing accuracy of our random forest model.
from sklearn.metrics import accuracy_score
error = accuracy_score(test["target_result"], preds)
error

0.5944186046511628

In [220]:
#We creating a table combining actual values with predicted values to look at differences.
combined_data = pd.DataFrame(dict(actual=test["target_result"], predicted=preds))
pd.crosstab(index=combined_data["actual"], columns=combined_data["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,572,95
1,341,67


In [221]:
#We care about how many times our team won when we predicted a win. This value is pretty low at only ~41%.
#we need to make improvements to the model

from sklearn.metrics import precision_score

precision_score(test["target_result"], preds)

0.41358024691358025

In [222]:
#These are the columns we want to do rolling averages for

cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]


In [223]:
# To improve precision calculate rolling averages for team performance. Create a df for each team.
#For example, if we ar eon matchweek 10 then we can see how the team did in the previous 9 matchweeks use that to make better predicitons
team_df = matches.groupby("team")
group = team_df.get_group("Chelsea").sort_values("date")

#We create a function that takes in the team,the columns we want to vrete a rolling avg on,new cols where the averages are stored.
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean() #closed=left means that we don't include current week in rolling avg calculation
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols) #We need to get rid of missing values because some ML models don't work well with it.
    return group

rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,opponent_code,target_result,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2020-10-03,12:30,Premier League,Matchweek 4,Sat,Home,W,4.0,0.0,Crystal Palace,...,6,1,2.000000,2.000000,12.333333,5.666667,20.366667,0.666667,0.333333,0.666667
6,2020-10-17,15:00,Premier League,Matchweek 5,Sat,Home,D,3.0,3.0,Southampton,...,17,0,2.333333,1.666667,14.333333,5.666667,18.933333,0.666667,0.666667,1.000000
8,2020-10-24,17:30,Premier League,Matchweek 6,Sat,Away,D,0.0,0.0,Manchester Utd,...,13,0,3.333333,2.000000,17.000000,6.666667,15.300000,0.666667,0.666667,0.666667
10,2020-10-31,15:00,Premier League,Matchweek 7,Sat,Away,W,3.0,0.0,Burnley,...,4,1,2.333333,1.000000,11.000000,3.333333,15.300000,0.000000,0.666667,0.666667
12,2020-11-07,17:30,Premier League,Matchweek 8,Sat,Home,W,4.0,1.0,Sheffield Utd,...,16,1,2.000000,1.000000,10.666667,5.000000,15.733333,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,2022-03-13,14:00,Premier League,Matchweek 29,Sun,Home,W,1.0,0.0,Newcastle Utd,...,14,1,2.666667,0.333333,12.000000,5.000000,15.600000,0.666667,0.000000,0.000000
47,2022-04-02,15:00,Premier League,Matchweek 31,Sat,Home,L,1.0,4.0,Brentford,...,2,0,2.666667,0.333333,11.333333,5.000000,15.133333,0.666667,0.000000,0.000000
49,2022-04-09,15:00,Premier League,Matchweek 32,Sat,Away,W,6.0,0.0,Southampton,...,17,1,1.666667,1.666667,14.666667,6.000000,16.100000,0.666667,0.000000,0.000000
52,2022-04-20,19:45,Premier League,Matchweek 25,Wed,Home,L,2.0,4.0,Arsenal,...,0,0,2.666667,1.333333,17.666667,8.333333,17.000000,0.333333,0.000000,0.000000


In [224]:
#Apply this rolling matches function to all teams in the Premier League
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [225]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,opponent_code,target_result,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,6,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,...,16,1,2.000000,1.333333,7.666667,3.666667,14.733333,0.666667,0.000000,0.000000
Arsenal,7,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,...,12,0,1.666667,1.666667,5.333333,3.666667,15.766667,0.000000,0.000000,0.000000
Arsenal,9,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,...,10,0,1.000000,1.666667,7.000000,3.666667,16.733333,0.666667,0.000000,0.000000
Arsenal,11,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,...,13,1,0.666667,1.000000,9.666667,4.000000,16.033333,1.000000,0.000000,0.000000
Arsenal,13,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,...,1,0,0.333333,0.666667,9.666667,2.666667,18.033333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,32,2022-03-13,14:00,Premier League,Matchweek 29,Sun,Away,W,1.0,0.0,Everton,...,7,1,1.333333,1.000000,12.333333,3.666667,19.300000,0.000000,0.000000,0.000000
Wolverhampton Wanderers,33,2022-03-18,20:00,Premier League,Matchweek 30,Fri,Home,L,2.0,3.0,Leeds United,...,9,0,1.666667,0.666667,12.333333,4.333333,19.600000,0.000000,0.000000,0.000000
Wolverhampton Wanderers,34,2022-04-02,15:00,Premier League,Matchweek 31,Sat,Home,W,2.0,1.0,Aston Villa,...,1,1,2.333333,1.000000,13.000000,5.333333,19.833333,0.000000,0.000000,0.000000
Wolverhampton Wanderers,35,2022-04-08,20:00,Premier League,Matchweek 32,Fri,Away,L,0.0,1.0,Newcastle Utd,...,14,0,1.666667,1.333333,13.000000,5.000000,18.533333,0.000000,0.000000,0.000000


In [226]:
'''We want each index to be unique'''
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,opponent_code,target_result,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,...,16,1,2.000000,1.333333,7.666667,3.666667,14.733333,0.666667,0.000000,0.000000
1,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,...,12,0,1.666667,1.666667,5.333333,3.666667,15.766667,0.000000,0.000000,0.000000
2,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,...,10,0,1.000000,1.666667,7.000000,3.666667,16.733333,0.666667,0.000000,0.000000
3,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,...,13,1,0.666667,1.000000,9.666667,4.000000,16.033333,1.000000,0.000000,0.000000
4,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,...,1,0,0.333333,0.666667,9.666667,2.666667,18.033333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1312,2022-03-13,14:00,Premier League,Matchweek 29,Sun,Away,W,1.0,0.0,Everton,...,7,1,1.333333,1.000000,12.333333,3.666667,19.300000,0.000000,0.000000,0.000000
1313,2022-03-18,20:00,Premier League,Matchweek 30,Fri,Home,L,2.0,3.0,Leeds United,...,9,0,1.666667,0.666667,12.333333,4.333333,19.600000,0.000000,0.000000,0.000000
1314,2022-04-02,15:00,Premier League,Matchweek 31,Sat,Home,W,2.0,1.0,Aston Villa,...,1,1,2.333333,1.000000,13.000000,5.333333,19.833333,0.000000,0.000000,0.000000
1315,2022-04-08,20:00,Premier League,Matchweek 32,Fri,Away,L,0.0,1.0,Newcastle Utd,...,14,0,1.666667,1.333333,13.000000,5.000000,18.533333,0.000000,0.000000,0.000000


In [227]:
#Prediction function
def predict_data(data, predictors):
    train = data[data["date"] < '2021-01-01']
    test = data[data["date"] > '2021-01-01']
    rf.fit(train[predictors], train["target_result"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target_result"], predicted=preds), index=test.index)
    error = precision_score(test["target_result"], preds)
    return combined, error

In [228]:
combined_data, error = predict_data(matches_rolling, predictors + new_cols)
error

0.5050505050505051

In [229]:
#This doesn't tell us about which team played which match
combined_data

Unnamed: 0,actual,predicted
13,1,1
14,0,0
15,1,0
16,1,0
17,0,0
...,...,...
1312,1,0
1313,0,0
1314,1,1
1315,0,1


In [230]:
#We can fix it by merging using index

combined_data = combined_data.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)
combined_data

Unnamed: 0,actual,predicted,date,team,opponent,result
13,1,1,2021-01-02,Arsenal,West Brom,W
14,0,0,2021-01-14,Arsenal,Crystal Palace,D
15,1,0,2021-01-18,Arsenal,Newcastle Utd,W
16,1,0,2021-01-26,Arsenal,Southampton,W
17,0,0,2021-01-30,Arsenal,Manchester Utd,D
...,...,...,...,...,...,...
1312,1,0,2022-03-13,Wolverhampton Wanderers,Everton,W
1313,0,0,2022-03-18,Wolverhampton Wanderers,Leeds United,L
1314,1,1,2022-04-02,Wolverhampton Wanderers,Aston Villa,W
1315,0,1,2022-04-08,Wolverhampton Wanderers,Newcastle Utd,L


In [231]:
#Sometimes the team name appears differently than the opponent. This dictionary takes care of the case when the 
#Team name=opponent then return that name otherwise map that team name to the opponent name
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton", "Manchester United": "Manchester Utd", "Newcastle United": "Newcastle Utd", "Tottenham Hotspur": "Tottenham", "West Ham United": "West Ham", "Wolverhampton Wanderers": "Wolves"} 
mapping = MissingDict(**map_values)

In [232]:
#Merge team field with opponent field
combined_data["new_team"] = combined_data["team"].map(mapping)
merged = combined_data.merge(combined_data, left_on=["date", "new_team"], right_on=["date", "opponent"])

#When model predict team a win and team b lose what actually happened.
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] ==0)]["actual_x"].value_counts()

1    44
0    38
Name: actual_x, dtype: int64

In [233]:
'''End result--> We trained our model on data from September 2020 to Janaury 2021 
and tested on a dataset from Janauary 2021 till April 2022. 
44/80 times when our model said a team would win they actually won. This is around a 54% accuracy of the model.
'''

'End result--> We trained our model on data from September 2020 to Janaury 2021 \nand tested on a dataset from Janauary 2021 till April 2022. \n44/80 times when our model said a team would win they actually won. This is around a 54% accuracy of the model.\n'