# Predicting Premier League Winners using ML

## Reading match data into a pandas dataframe

In [1]:
import pandas as pd
matches = pd.read_csv('matches.csv', index_col=0)

In [2]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


In [3]:
matches.shape

(1520, 27)

In [4]:
matches.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'season', 'team'],
      dtype='object')

## Investigating missing data

In the EPL, there are 38 matches played each season, and there are 20 teams per season. We've also scraped data for 2 full seasons therefore we should have 1520 obs in total which seems to be the case


In [5]:
matches['team'].value_counts()

Manchester City             76
Wolverhampton Wanderers     76
Burnley                     76
Leeds United                76
Everton                     76
Southampton                 76
Aston Villa                 76
Liverpool                   76
Newcastle United            76
Crystal Palace              76
Brighton and Hove Albion    76
Leicester City              76
West Ham United             76
Manchester United           76
Arsenal                     76
Tottenham Hotspur           76
Chelsea                     76
Brentford                   38
Watford                     38
Norwich City                38
Fulham                      38
West Bromwich Albion        38
Sheffield United            38
Name: team, dtype: int64

Since this data was collected at the end of the EPL we know that Bournemouth, Fulham and Nottingham Forest will be taking the place of Burnley, Watford and Norwich City in the Premier League. This explains why those observations don't have the same amount of match data as the rest of the other teams in out dataset

In [6]:
matches['round'].value_counts()

Matchweek 1     40
Matchweek 29    40
Matchweek 22    40
Matchweek 23    40
Matchweek 24    40
Matchweek 25    40
Matchweek 26    40
Matchweek 27    40
Matchweek 28    40
Matchweek 31    40
Matchweek 2     40
Matchweek 32    40
Matchweek 30    40
Matchweek 34    40
Matchweek 35    40
Matchweek 36    40
Matchweek 33    40
Matchweek 37    40
Matchweek 21    40
Matchweek 20    40
Matchweek 19    40
Matchweek 18    40
Matchweek 3     40
Matchweek 4     40
Matchweek 5     40
Matchweek 6     40
Matchweek 7     40
Matchweek 8     40
Matchweek 9     40
Matchweek 10    40
Matchweek 11    40
Matchweek 12    40
Matchweek 13    40
Matchweek 14    40
Matchweek 15    40
Matchweek 16    40
Matchweek 17    40
Matchweek 38    40
Name: round, dtype: int64

We can see that there are inherently no missing values since the data has been collected at the end of the season.

## Data Cleaning

In [7]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf                int64
ga                int64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [8]:
del matches["notes"]
del matches['comp']

In [9]:
matches["date"] = pd.to_datetime(matches["date"])

In [10]:
matches["target"] = (matches["result"] == "W").astype("int") #Codified win as 1 and lose/draw as 0 for the ML model

In [11]:
matches.head()

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,match report,sh,sot,dist,fk,pk,pkatt,season,team,target
1,2021-08-15,16:30,Matchweek 1,Sun,Away,L,0,1,Tottenham,1.9,...,Match Report,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City,0
2,2021-08-21,15:00,Matchweek 2,Sat,Home,W,5,0,Norwich City,2.7,...,Match Report,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City,1
3,2021-08-28,12:30,Matchweek 3,Sat,Home,W,5,0,Arsenal,3.8,...,Match Report,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City,1
4,2021-09-11,15:00,Matchweek 4,Sat,Away,W,1,0,Leicester City,2.9,...,Match Report,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City,1
6,2021-09-18,15:00,Matchweek 5,Sat,Home,D,0,0,Southampton,1.1,...,Match Report,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City,0


## Minor Feature Engineering

In [12]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes
#Converts the string into categories, then converts those categories into numbers

In [13]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
#Converts the string into categories, then converts those categories into unique numbers for each opposing team

In [14]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
#Used regular expression to remove the ':' and minutes to keep just the hour, then convert to integer type

In [15]:
matches["day_code"] = matches["date"].dt.dayofweek
#Codified each day into a number for each distinct match day

In [16]:
matches.iloc[:, -4:] #New columns we created

Unnamed: 0,venue_code,opp_code,hour,day_code
1,0,18,16,6
2,1,15,15,5
3,1,0,12,5
4,0,10,15,5
6,1,17,15,5
...,...,...,...,...
38,0,18,19,6
39,1,6,15,5
40,0,7,19,6
41,0,14,18,2


## Creating our intial machine learning model

In [17]:
matches["target"] = (matches["result"] == "W").astype("int") #The target we want to predict converting our boolean to integers

In [18]:
from sklearn.ensemble import RandomForestClassifier 

In [19]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
# n_estimator is the no. of individual decision trees we want to train
# min_sample_split is the no. of samples we want to have in the leaf of a tree before we split it
# random_state means that if we re-run this model multiple times we get the same result

In [20]:
train = matches[matches["date"] < '2022-01-01']

In [21]:
test = matches[matches["date"] > '2022-01-01']

In [22]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [23]:
rf.fit(train[predictors], train["target"]) #Training our model

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [24]:
preds = rf.predict(test[predictors])

## Determining the accuracy of our intial ML model

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
error = accuracy_score(test["target"], preds)
error

0.6056701030927835

In [27]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,186,47
1,106,49


From the intial model we built we can see that when predicting a loss or a draw were mostly correct in when this would occur (186 correct vs 106 wrong). However when we came to preicting team wins were actually wrong more times than we were right

Lets use another metric to see if we can get a second look at the accuracy of our model

In [28]:
from sklearn.metrics import precision_score

precision_score(test["target"], preds)

0.5104166666666666

## Improving the accuracy of our model

In [29]:
grouped_matches = matches.groupby("team")

In [30]:
group = grouped_matches.get_group("Manchester City").sort_values("date")

In [31]:
def rolling_averages(group, cols, new_cols): #Functions creates rolling averages for groupbed matches
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [32]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"] #columns we want to predict the rolling averages for
new_cols = [f"{c}_rolling" for c in cols] #Format string for new columns we create

rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2020-10-17,17:30,Matchweek 5,Sat,Home,W,1,0,Arsenal,1.5,...,17,5,2.000000,2.333333,17.333333,4.666667,18.900000,1.333333,0.333333,0.333333
7,2020-10-24,12:30,Matchweek 6,Sat,Away,D,1,1,West Ham,1.1,...,12,5,1.333333,2.000000,17.333333,3.666667,17.733333,0.666667,0.000000,0.000000
9,2020-10-31,12:30,Matchweek 7,Sat,Away,W,1,0,Sheffield Utd,1.5,...,12,5,1.000000,0.666667,16.666667,4.333333,18.233333,0.666667,0.000000,0.000000
11,2020-11-08,16:30,Matchweek 8,Sun,Home,D,1,1,Liverpool,1.6,...,16,6,1.000000,0.333333,14.333333,6.666667,18.466667,1.000000,0.000000,0.000000
12,2020-11-21,17:30,Matchweek 9,Sat,Away,L,0,2,Tottenham,1.3,...,17,5,1.000000,0.666667,12.000000,5.666667,19.366667,1.000000,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2022-04-30,17:30,Matchweek 35,Sat,Away,W,4,0,Leeds United,2.7,...,17,5,3.333333,1.000000,16.000000,5.333333,16.700000,0.333333,0.333333,0.333333
54,2022-05-08,16:30,Matchweek 36,Sun,Home,W,5,0,Newcastle Utd,3.1,...,16,6,4.000000,0.333333,18.666667,6.000000,16.333333,0.000000,0.333333,0.333333
55,2022-05-11,20:15,Matchweek 33,Wed,Away,W,5,1,Wolves,3.1,...,20,2,4.666667,0.333333,20.000000,7.333333,15.166667,0.333333,0.333333,0.333333
56,2022-05-15,14:00,Matchweek 37,Sun,Away,D,2,2,West Ham,2.5,...,14,6,4.666667,0.333333,18.333333,6.666667,14.933333,0.333333,0.000000,0.000000


In [33]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [34]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,6,2020-10-04,14:00,Matchweek 4,Sun,Home,W,2,1,Sheffield Utd,0.4,...,14,6,2.000000,1.333333,7.666667,3.666667,14.733333,0.666667,0.000000,0.000000
Arsenal,7,2020-10-17,17:30,Matchweek 5,Sat,Away,L,0,1,Manchester City,0.9,...,17,5,1.666667,1.666667,5.333333,3.666667,15.766667,0.000000,0.000000,0.000000
Arsenal,9,2020-10-25,19:15,Matchweek 6,Sun,Home,L,0,1,Leicester City,0.9,...,19,6,1.000000,1.666667,7.000000,3.666667,16.733333,0.666667,0.000000,0.000000
Arsenal,11,2020-11-01,16:30,Matchweek 7,Sun,Away,W,1,0,Manchester Utd,1.1,...,16,6,0.666667,1.000000,9.666667,4.000000,16.033333,1.000000,0.000000,0.000000
Arsenal,13,2020-11-08,19:15,Matchweek 8,Sun,Home,L,0,3,Aston Villa,1.5,...,19,6,0.333333,0.666667,9.666667,2.666667,18.033333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,37,2022-04-30,15:00,Matchweek 35,Sat,Home,L,0,3,Brighton,0.6,...,15,5,0.666667,1.000000,8.666667,3.333333,16.966667,0.000000,0.000000,0.000000
Wolverhampton Wanderers,38,2022-05-07,15:00,Matchweek 36,Sat,Away,D,2,2,Chelsea,2.1,...,15,5,0.000000,1.666667,8.666667,2.333333,18.233333,0.333333,0.000000,0.000000
Wolverhampton Wanderers,39,2022-05-11,20:15,Matchweek 33,Wed,Home,L,1,5,Manchester City,0.5,...,20,2,0.666667,2.000000,11.666667,3.000000,17.133333,0.333333,0.000000,0.000000
Wolverhampton Wanderers,40,2022-05-15,14:00,Matchweek 37,Sun,Home,D,1,1,Norwich City,0.9,...,14,6,1.000000,3.333333,10.666667,2.666667,16.400000,0.333333,0.000000,0.000000


In [35]:
matches_rolling = matches_rolling.droplevel('team') #droplevel to remove multlevel index

In [36]:
matches_rolling

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
6,2020-10-04,14:00,Matchweek 4,Sun,Home,W,2,1,Sheffield Utd,0.4,...,14,6,2.000000,1.333333,7.666667,3.666667,14.733333,0.666667,0.000000,0.000000
7,2020-10-17,17:30,Matchweek 5,Sat,Away,L,0,1,Manchester City,0.9,...,17,5,1.666667,1.666667,5.333333,3.666667,15.766667,0.000000,0.000000,0.000000
9,2020-10-25,19:15,Matchweek 6,Sun,Home,L,0,1,Leicester City,0.9,...,19,6,1.000000,1.666667,7.000000,3.666667,16.733333,0.666667,0.000000,0.000000
11,2020-11-01,16:30,Matchweek 7,Sun,Away,W,1,0,Manchester Utd,1.1,...,16,6,0.666667,1.000000,9.666667,4.000000,16.033333,1.000000,0.000000,0.000000
13,2020-11-08,19:15,Matchweek 8,Sun,Home,L,0,3,Aston Villa,1.5,...,19,6,0.333333,0.666667,9.666667,2.666667,18.033333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,2022-04-30,15:00,Matchweek 35,Sat,Home,L,0,3,Brighton,0.6,...,15,5,0.666667,1.000000,8.666667,3.333333,16.966667,0.000000,0.000000,0.000000
38,2022-05-07,15:00,Matchweek 36,Sat,Away,D,2,2,Chelsea,2.1,...,15,5,0.000000,1.666667,8.666667,2.333333,18.233333,0.333333,0.000000,0.000000
39,2022-05-11,20:15,Matchweek 33,Wed,Home,L,1,5,Manchester City,0.5,...,20,2,0.666667,2.000000,11.666667,3.000000,17.133333,0.333333,0.000000,0.000000
40,2022-05-15,14:00,Matchweek 37,Sun,Home,D,1,1,Norwich City,0.9,...,14,6,1.000000,3.333333,10.666667,2.666667,16.400000,0.333333,0.000000,0.000000


In [37]:
matches_rolling.index = range(matches_rolling.shape[0]) #Created new indices strong from 0 up till the range of row observations

## Retraining our Machine Learning model

In [38]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    
    #splits the data into train and test
    
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    
    #Fits the model and creates predictions
    
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    error = precision_score(test["target"], preds)
    
    #Return our precision value for model evaluation
    
    return combined, error

In [41]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [42]:
combined.head(10)

Unnamed: 0,actual,predicted,date,team,opponent,result
55,0,0,2022-01-23,Arsenal,Burnley,D
56,1,1,2022-02-10,Arsenal,Wolves,W
57,1,0,2022-02-19,Arsenal,Brentford,W
58,1,1,2022-02-24,Arsenal,Wolves,W
59,1,1,2022-03-06,Arsenal,Watford,W
60,1,1,2022-03-13,Arsenal,Leicester City,W
61,0,0,2022-03-16,Arsenal,Liverpool,L
62,1,0,2022-03-19,Arsenal,Aston Villa,W
63,0,0,2022-04-04,Arsenal,Crystal Palace,L
64,0,0,2022-04-09,Arsenal,Brighton,L


In [43]:
error

0.6027397260273972