**Portfolio Project: Predicting EPL Football Match Winners With Machine Learning**

In [1]:
# Reading in the initial data

import pandas as pd

matches = pd.read_csv("matches.csv", index_col=0)

In [2]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


In [3]:
matches.shape

(1389, 27)

In [4]:
matches["team"].value_counts()

Southampton                 72
Brighton and Hove Albion    72
Manchester United           72
West Ham United             72
Newcastle United            72
Burnley                     71
Leeds United                71
Crystal Palace              71
Manchester City             71
Wolverhampton Wanderers     71
Tottenham Hotspur           71
Arsenal                     71
Leicester City              70
Chelsea                     70
Aston Villa                 70
Everton                     70
Liverpool                   38
Fulham                      38
West Bromwich Albion        38
Sheffield United            38
Brentford                   34
Watford                     33
Norwich City                33
Name: team, dtype: int64

In [5]:
matches['round'].value_counts()

Matchweek 1     39
Matchweek 16    39
Matchweek 34    39
Matchweek 32    39
Matchweek 31    39
Matchweek 29    39
Matchweek 28    39
Matchweek 26    39
Matchweek 25    39
Matchweek 24    39
Matchweek 23    39
Matchweek 2     39
Matchweek 19    39
Matchweek 17    39
Matchweek 20    39
Matchweek 15    39
Matchweek 5     39
Matchweek 3     39
Matchweek 13    39
Matchweek 12    39
Matchweek 4     39
Matchweek 11    39
Matchweek 10    39
Matchweek 9     39
Matchweek 8     39
Matchweek 14    39
Matchweek 7     39
Matchweek 6     39
Matchweek 30    37
Matchweek 27    37
Matchweek 22    37
Matchweek 21    37
Matchweek 18    37
Matchweek 33    32
Matchweek 35    20
Matchweek 36    20
Matchweek 37    20
Matchweek 38    20
Name: round, dtype: int64

**CLEANING DATA FOR MACHINE LEARNING**

In [6]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [7]:
matches['date'] = pd.to_datetime(matches['date'])
matches.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                     float64
ga                     float64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
sh                     float64
sot                    float64
dist                   float64
fk                     float64
pk                     float64
pkatt                  float64
season                   int64
team                    object
dtype: object

**CREATING PREDICTORS FOR MACHINE LEARNING**

In [8]:
matches['venue_code'] = matches["venue"].astype('category').cat.codes
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,notes,sh,sot,dist,fk,pk,pkatt,season,team,venue_code
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City,1
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City,1
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City,0
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,...,,8.0,1.0,17.4,0.0,0.0,0.0,2021,Sheffield United,0
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,...,,7.0,0.0,11.4,1.0,0.0,0.0,2021,Sheffield United,1
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,...,,10.0,3.0,17.0,0.0,0.0,0.0,2021,Sheffield United,0
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,...,,11.0,1.0,16.0,1.0,0.0,0.0,2021,Sheffield United,0


In [9]:
matches['venue_code'] = matches["venue"].astype('category').cat.codes
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,notes,sh,sot,dist,fk,pk,pkatt,season,team,venue_code
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City,1
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City,1
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City,0
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,...,,8.0,1.0,17.4,0.0,0.0,0.0,2021,Sheffield United,0
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,...,,7.0,0.0,11.4,1.0,0.0,0.0,2021,Sheffield United,1
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,...,,10.0,3.0,17.0,0.0,0.0,0.0,2021,Sheffield United,0
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,...,,11.0,1.0,16.0,1.0,0.0,0.0,2021,Sheffield United,0


In [10]:
matches['opp_code'] = matches['opponent'].astype('category').cat.codes

In [11]:
matches['hour'] = matches['time'].str.replace(":.+", "", regex=True).astype("int")

In [12]:
matches['day_code'] = matches["date"].dt.dayofweek
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,dist,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,16.9,1.0,0.0,0.0,2022,Manchester City,0,18,16,6
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,17.3,1.0,0.0,0.0,2022,Manchester City,1,15,15,5
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,14.3,0.0,0.0,0.0,2022,Manchester City,1,0,12,5
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,14.0,0.0,0.0,0.0,2022,Manchester City,0,10,15,5
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,15.7,1.0,0.0,0.0,2022,Manchester City,1,17,15,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,...,17.4,0.0,0.0,0.0,2021,Sheffield United,0,18,19,6
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,...,11.4,1.0,0.0,0.0,2021,Sheffield United,1,6,15,5
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,...,17.0,0.0,0.0,0.0,2021,Sheffield United,0,7,19,6
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,...,16.0,1.0,0.0,0.0,2021,Sheffield United,0,14,18,2


In [13]:
matches["target"] = (matches['result'] == 'W').astype('int')
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,1.0,0.0,0.0,2022,Manchester City,0,18,16,6,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,1.0,0.0,0.0,2022,Manchester City,1,15,15,5,1
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,0.0,0.0,0.0,2022,Manchester City,1,0,12,5,1
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,0.0,0.0,0.0,2022,Manchester City,0,10,15,5,1
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,1.0,0.0,0.0,2022,Manchester City,1,17,15,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0.0,4.0,Tottenham,...,0.0,0.0,0.0,2021,Sheffield United,0,18,19,6,0
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0.0,2.0,Crystal Palace,...,1.0,0.0,0.0,2021,Sheffield United,1,6,15,5,0
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1.0,0.0,Everton,...,0.0,0.0,0.0,2021,Sheffield United,0,7,19,6,1
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Newcastle Utd,...,1.0,0.0,0.0,2021,Sheffield United,0,14,18,2,0


**TRAINING AN INITIAL ML MODEL**

In [14]:
from sklearn.ensemble import RandomForestClassifier


In [15]:
RF = RandomForestClassifier(n_estimators=58, min_samples_split=10, random_state=1)

In [16]:
train = matches[matches["date"] < '2022-01-01']

In [17]:
test = matches[matches['date'] > '2022-01-01']

In [18]:
predictors = ['venue_code', 'opp_code', 'hour', 'day_code']

In [19]:
RF.fit(train[predictors], train['target'])

RandomForestClassifier(min_samples_split=10, n_estimators=58, random_state=1)

In [20]:
preds = RF.predict(test[predictors])

In [21]:
from sklearn.metrics import accuracy_score

In [22]:
acc = accuracy_score(test['target'], preds)

In [23]:
acc

0.605072463768116

In [24]:
combined = pd.DataFrame(dict(actual=test['target'], prediction= preds))

In [25]:
pd.crosstab(index = combined['actual'], columns= combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,138,34
1,75,29


In [26]:
from sklearn.metrics import precision_score
precision_score (test['target'], preds)

0.4603174603174603

**Improving the Model with Rolling Averages**

In [27]:
segmented_matches = matches.groupby("team")

In [28]:
group = segmented_matches.get_group("Arsenal")

In [29]:
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
0,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,...,1.0,0.0,0.0,2022,Arsenal,0,2,20,4,0
1,2021-08-22,16:30,Premier League,Matchweek 2,Sun,Home,L,0.0,2.0,Chelsea,...,0.0,0.0,0.0,2022,Arsenal,1,5,16,6,0
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Away,L,0.0,5.0,Manchester City,...,0.0,0.0,0.0,2022,Arsenal,0,12,12,5,0
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,1.0,0.0,0.0,2022,Arsenal,1,15,15,5,1
5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,1.0,0.0,0.0,2022,Arsenal,0,4,15,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2021-05-02,14:00,Premier League,Matchweek 34,Sun,Away,W,2.0,0.0,Newcastle Utd,...,1.0,0.0,0.0,2021,Arsenal,0,14,14,6,1
54,2021-05-09,19:00,Premier League,Matchweek 35,Sun,Home,W,3.0,1.0,West Brom,...,1.0,0.0,0.0,2021,Arsenal,1,20,19,6,1
55,2021-05-12,20:15,Premier League,Matchweek 36,Wed,Away,W,1.0,0.0,Chelsea,...,0.0,0.0,0.0,2021,Arsenal,0,5,20,2,1
56,2021-05-19,19:00,Premier League,Matchweek 37,Wed,Away,W,3.0,1.0,Crystal Palace,...,0.0,0.0,0.0,2021,Arsenal,0,6,19,2,1


In [30]:
def rolling_averages(group, cols, new_cols):
    rolling_stats = group[cols].rolling(3, closed ="left").mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset= new_cols)
    return group

In [31]:
cols = ['gf', 'ga', 'sh', 'sot', 'dist', 'fk','pk', 'pkatt']
new_cols=[f"{c}_rolling" for c in cols]

In [32]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [33]:
rolling_averages(group, cols, new_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.600000,0.333333,0.0,0.0
5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,13.866667,0.333333,0.0,0.0
7,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.766667,0.666667,0.0,0.0
8,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.000000,0.666667,0.0,0.0
9,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,18.966667,0.666667,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2021-05-02,14:00,Premier League,Matchweek 34,Sun,Away,W,2.0,0.0,Newcastle Utd,...,6,1,1.333333,0.666667,15.333333,4.333333,16.833333,0.666667,0.0,0.0
54,2021-05-09,19:00,Premier League,Matchweek 35,Sun,Home,W,3.0,1.0,West Brom,...,6,1,1.000000,0.666667,17.000000,4.000000,16.000000,0.666667,0.0,0.0
55,2021-05-12,20:15,Premier League,Matchweek 36,Wed,Away,W,1.0,0.0,Chelsea,...,2,1,1.666667,0.666667,16.000000,4.666667,17.100000,0.666667,0.0,0.0
56,2021-05-19,19:00,Premier League,Matchweek 37,Wed,Away,W,3.0,1.0,Crystal Palace,...,2,1,2.000000,0.333333,13.000000,4.333333,17.933333,0.666667,0.0,0.0


In [34]:
matches_rolling = matches.groupby('team').apply(lambda x: rolling_averages (x, cols, new_cols))

In [35]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.600000,0.333333,0.0,0.0
Arsenal,5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,13.866667,0.333333,0.0,0.0
Arsenal,7,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.766667,0.666667,0.0,0.0
Arsenal,8,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.000000,0.666667,0.0,0.0
Arsenal,9,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,18.966667,0.666667,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,37,2021-05-03,18:00,Premier League,Matchweek 34,Mon,Away,D,1.0,1.0,West Brom,...,0,0,0.666667,1.333333,9.666667,3.000000,17.266667,0.666667,0.0,0.0
Wolverhampton Wanderers,38,2021-05-09,12:00,Premier League,Matchweek 35,Sun,Home,W,2.0,1.0,Brighton,...,6,1,0.666667,1.666667,14.000000,5.333333,18.433333,0.333333,0.0,0.0
Wolverhampton Wanderers,39,2021-05-16,14:05,Premier League,Matchweek 36,Sun,Away,L,0.0,2.0,Tottenham,...,6,0,1.000000,2.000000,15.666667,6.000000,20.000000,0.666667,0.0,0.0
Wolverhampton Wanderers,40,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Everton,...,2,0,1.000000,1.333333,16.666667,6.333333,18.233333,0.333333,0.0,0.0


In [36]:
matches_rolling = matches_rolling.droplevel('team')
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.600000,0.333333,0.0,0.0
5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,13.866667,0.333333,0.0,0.0
7,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.766667,0.666667,0.0,0.0
8,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.000000,0.666667,0.0,0.0
9,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,18.966667,0.666667,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37,2021-05-03,18:00,Premier League,Matchweek 34,Mon,Away,D,1.0,1.0,West Brom,...,0,0,0.666667,1.333333,9.666667,3.000000,17.266667,0.666667,0.0,0.0
38,2021-05-09,12:00,Premier League,Matchweek 35,Sun,Home,W,2.0,1.0,Brighton,...,6,1,0.666667,1.666667,14.000000,5.333333,18.433333,0.333333,0.0,0.0
39,2021-05-16,14:05,Premier League,Matchweek 36,Sun,Away,L,0.0,2.0,Tottenham,...,6,0,1.000000,2.000000,15.666667,6.000000,20.000000,0.666667,0.0,0.0
40,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Everton,...,2,0,1.000000,1.333333,16.666667,6.333333,18.233333,0.333333,0.0,0.0


In [37]:
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.600000,0.333333,0.0,0.0
1,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,13.866667,0.333333,0.0,0.0
2,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.766667,0.666667,0.0,0.0
3,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.000000,0.666667,0.0,0.0
4,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,18.966667,0.666667,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1312,2021-05-03,18:00,Premier League,Matchweek 34,Mon,Away,D,1.0,1.0,West Brom,...,0,0,0.666667,1.333333,9.666667,3.000000,17.266667,0.666667,0.0,0.0
1313,2021-05-09,12:00,Premier League,Matchweek 35,Sun,Home,W,2.0,1.0,Brighton,...,6,1,0.666667,1.666667,14.000000,5.333333,18.433333,0.333333,0.0,0.0
1314,2021-05-16,14:05,Premier League,Matchweek 36,Sun,Away,L,0.0,2.0,Tottenham,...,6,0,1.000000,2.000000,15.666667,6.000000,20.000000,0.666667,0.0,0.0
1315,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0.0,1.0,Everton,...,2,0,1.000000,1.333333,16.666667,6.333333,18.233333,0.333333,0.0,0.0


**Retraining Our Machine Learning Model**

In [38]:
def make_predictions(data, predictors):
    train = data[data['date']< '2022-01-01']
    test = data[data['date']> '2020-01-01']
    RF.fit(train[predictors], train['target'])
    preds= RF.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test ['target'], predicted = preds), index= test.index)
    precision = precision_score(test['target'], preds)
    return combined, precision

In [39]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)
precision

0.9291338582677166

In [40]:
combined

Unnamed: 0,actual,predicted
0,1,1
1,1,1
2,1,1
3,0,0
4,0,0
...,...,...
1312,0,0
1313,1,1
1314,0,0
1315,0,0


In [41]:
combined = combined.merge (matches_rolling[['date', 'team', 'opponent', 'result',]], left_index = True, right_index = True)
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
0,1,1,2021-09-11,Arsenal,Norwich City,W
1,1,1,2021-09-18,Arsenal,Burnley,W
2,1,1,2021-09-26,Arsenal,Tottenham,W
3,0,0,2021-10-02,Arsenal,Brighton,D
4,0,0,2021-10-18,Arsenal,Crystal Palace,D
...,...,...,...,...,...,...
1312,0,0,2021-05-03,Wolverhampton Wanderers,West Brom,D
1313,1,1,2021-05-09,Wolverhampton Wanderers,Brighton,W
1314,0,0,2021-05-16,Wolverhampton Wanderers,Tottenham,L
1315,0,0,2021-05-19,Wolverhampton Wanderers,Everton,L


**Combining Home And Away predictions**

In [42]:
class MissingDict(dict):
    _missing_ = lambda self, key: key
    
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "Westham United": "WestHam",
    "Wolverhampton Wanderers": "Wolves"
}
mapping = MissingDict(**map_values)

In [43]:
combined['new_team'] = combined['team'].map(mapping)
combined

Unnamed: 0,actual,predicted,date,team,opponent,result,new_team
0,1,1,2021-09-11,Arsenal,Norwich City,W,
1,1,1,2021-09-18,Arsenal,Burnley,W,
2,1,1,2021-09-26,Arsenal,Tottenham,W,
3,0,0,2021-10-02,Arsenal,Brighton,D,
4,0,0,2021-10-18,Arsenal,Crystal Palace,D,
...,...,...,...,...,...,...,...
1312,0,0,2021-05-03,Wolverhampton Wanderers,West Brom,D,Wolves
1313,1,1,2021-05-09,Wolverhampton Wanderers,Brighton,W,Wolves
1314,0,0,2021-05-16,Wolverhampton Wanderers,Tottenham,L,Wolves
1315,0,0,2021-05-19,Wolverhampton Wanderers,Everton,L,Wolves


In [48]:
merged = combined.merge(combined, left_on = ['date', 'new_team'], right_on= ['date', 'opponent'])
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,1,1,2021-09-11,Brighton and Hove Albion,Brentford,W,Brighton,0,0,Brentford,Brighton,L,
1,1,0,2021-09-19,Brighton and Hove Albion,Leicester City,W,Brighton,0,0,Leicester City,Brighton,L,
2,0,0,2021-09-27,Brighton and Hove Albion,Crystal Palace,D,Brighton,0,0,Crystal Palace,Brighton,D,
3,0,0,2021-10-02,Brighton and Hove Albion,Arsenal,D,Brighton,0,0,Arsenal,Brighton,D,
4,0,0,2021-10-16,Brighton and Hove Albion,Norwich City,D,Brighton,0,0,Norwich City,Brighton,D,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,0,0,2021-05-03,Wolverhampton Wanderers,West Brom,D,Wolves,0,0,West Bromwich Albion,Wolves,D,
330,1,1,2021-05-09,Wolverhampton Wanderers,Brighton,W,Wolves,0,0,Brighton and Hove Albion,Wolves,L,Brighton
331,0,0,2021-05-16,Wolverhampton Wanderers,Tottenham,L,Wolves,1,1,Tottenham Hotspur,Wolves,W,Tottenham
332,0,0,2021-05-19,Wolverhampton Wanderers,Everton,L,Wolves,1,1,Everton,Wolves,W,


In [49]:
merged[(merged['predicted_x']==1) & (merged['predicted_y']==0)]['actual_x'].value_counts()

1    87
0     8
Name: actual_x, dtype: int64

In [50]:
87/40

2.175