In [None]:
import pandas as pd

In [None]:
# Load the scraped match data into a DataFrame; the first CSV column becomes the row index.
matches = pd.read_csv("matches.csv",index_col=0)

In [None]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,...,Match Report,,13.0,1.0,18.7,1.0,1,1,2023,Manchester City
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,...,Match Report,,19.0,7.0,17.5,0.0,0,0,2023,Manchester City
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,...,Match Report,,21.0,10.0,16.2,1.0,0,0,2023,Manchester City
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,...,Match Report,,18.0,5.0,14.1,0.0,0,0,2023,Manchester City
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,...,Match Report,,17.0,9.0,14.8,0.0,0,0,2023,Manchester City


In [None]:
matches.shape

(1520, 27)

In [None]:
matches['team'].value_counts()

Manchester City             76
Crystal Palace              76
Southampton                 76
Leeds United                76
Leicester City              76
Everton                     76
West Ham United             76
Wolverhampton Wanderers     76
Arsenal                     76
Chelsea                     76
Brentford                   76
Tottenham Hotspur           76
Aston Villa                 76
Brighton and Hove Albion    76
Liverpool                   76
Newcastle United            76
Manchester United           76
Fulham                      38
Bournemouth                 38
Nottingham Forest           38
Burnley                     38
Watford                     38
Norwich City                38
Name: team, dtype: int64

In [None]:
matches['round'].value_counts()

Matchweek 1     40
Matchweek 30    40
Matchweek 23    40
Matchweek 12    40
Matchweek 24    40
Matchweek 25    40
Matchweek 26    40
Matchweek 27    40
Matchweek 29    40
Matchweek 31    40
Matchweek 2     40
Matchweek 33    40
Matchweek 34    40
Matchweek 28    40
Matchweek 35    40
Matchweek 36    40
Matchweek 37    40
Matchweek 32    40
Matchweek 22    40
Matchweek 21    40
Matchweek 7     40
Matchweek 20    40
Matchweek 3     40
Matchweek 4     40
Matchweek 5     40
Matchweek 6     40
Matchweek 8     40
Matchweek 9     40
Matchweek 10    40
Matchweek 11    40
Matchweek 13    40
Matchweek 14    40
Matchweek 15    40
Matchweek 16    40
Matchweek 17    40
Matchweek 18    40
Matchweek 19    40
Matchweek 38    40
Name: round, dtype: int64

In [None]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf                int64
ga                int64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk                int64
pkatt             int64
season            int64
team             object
dtype: object

In [None]:
# Parse the 'date' column (read from CSV as plain strings) into datetime values,
# so that the date comparisons and the .dt accessor used later work.
matches['date'] = pd.to_datetime(matches['date'])

In [None]:
# Encode the venue as an integer category code so the model can consume it
# (in the preview further down, Away shows as 0 and Home as 1).
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [None]:
# Encode each opponent name as an integer category code.
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [None]:
# Extract the kick-off hour: strip the first ':' and everything after it
# from the "time" string (e.g. "16:30" -> "16"), then convert to int.
matches["hour"] = matches['time'].str.replace(":.+","",regex = True).astype("int")

In [None]:
# Day of the week as an integer (Monday = 0 ... Sunday = 6).
matches["day_code"] = matches['date'].dt.dayofweek

In [None]:
# Binary prediction target: 1 when the result is "W", 0 for any other result.
matches["target"] = (matches["result"] == "W").astype("int")

In [None]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,...,1.0,1,1,2023,Manchester City,0,21,16,6,1
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,...,0.0,0,0,2023,Manchester City,1,2,15,5,1
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3,3,Newcastle Utd,...,1.0,0,0,2023,Manchester City,0,15,16,6,0
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4,2,Crystal Palace,...,0.0,0,0,2023,Manchester City,1,7,15,5,1
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6,0,Nott'ham Forest,...,0.0,0,0,2023,Manchester City,1,17,19,2,1


In [None]:
#creating a initial Random Forest model
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline model: 50 trees, a node needs at least 10 samples to be split,
# and a fixed random_state so repeated runs build the same forest.
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [None]:
# Training set: every match played strictly before 2023-01-01.
train = matches[matches["date"] < '2023-01-01']

In [None]:
# Test set: every match played on or after 2023-01-01.
# ">=" (not ">") so that matches dated exactly 2023-01-01 are not silently
# excluded from both the training set and the test set.
test = matches[matches["date"] >= '2023-01-01']

In [None]:
# Feature columns for the baseline model; all are known before kick-off.
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [None]:
rf.fit(train[predictors], train["target"])

In [None]:
preds = rf.predict(test[predictors])

In [None]:
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix

In [None]:
acc = accuracy_score(test['target'],preds)

In [None]:
acc

0.5981308411214953

In [None]:
print(confusion_matrix(test["target"], preds))

[[203  60]
 [112  53]]


In [None]:
precision_score(test["target"], preds)

0.4690265486725664

In [None]:
# Group the matches by team so that per-team statistics can be computed.
grouped_matches = matches.groupby("team")

In [None]:
def rolling_averages(group, cols, new_cols, window=3):
    """Add rolling means of the previous ``window`` matches to one team's rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain a "date" column and every column in
        ``cols``. The caller's frame is not modified (sorting returns a copy).
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names for the resulting columns, in the same order as ``cols``.
    window : int, default 3
        Number of preceding rows to average over.

    Returns
    -------
    pandas.DataFrame
        ``group`` sorted by date with the ``new_cols`` columns added. Rows
        whose rolling values are missing (for example the first ``window``
        rows, which lack enough history) are dropped.
    """
    group = group.sort_values("date")
    # closed='left' leaves the current row out of its own window, so each
    # average is built only from matches played before that row.
    rolling_stats = group[cols].rolling(window, closed='left').mean()
    group[new_cols] = rolling_stats
    return group.dropna(subset=new_cols)

In [None]:
# Per-match numeric columns for which rolling averages are computed.
cols = ["gf","ga","sh","sot","dist","fk","pk","pkatt"]

In [None]:
# Names of the derived rolling-average columns, e.g. "gf" -> "gf_rolling".
new_cols = [f"{c}_rolling" for c in cols]

In [None]:
group = grouped_matches.get_group("Manchester City")

In [None]:
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,5,1,3.333333,0.333333,19.666667,6.000000,16.866667,0.666667,0.000000,0.000000
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,5,0,3.666667,0.000000,22.000000,7.333333,15.866667,0.333333,0.000000,0.000000
8,2021-09-25,12:30,Premier League,Matchweek 6,Sat,Away,W,1,0,Chelsea,...,5,1,2.000000,0.000000,22.000000,6.333333,15.166667,0.333333,0.000000,0.000000
10,2021-10-03,16:30,Premier League,Matchweek 7,Sun,Away,D,2,2,Liverpool,...,6,0,0.666667,0.000000,18.666667,4.000000,15.933333,0.333333,0.000000,0.000000
11,2021-10-16,15:00,Premier League,Matchweek 8,Sat,Home,W,2,0,Burnley,...,5,1,1.000000,0.666667,14.333333,2.333333,16.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2023-05-06,15:00,Premier League,Matchweek 35,Sat,Home,W,2,1,Leeds United,...,5,1,3.000000,0.666667,13.666667,8.000000,15.433333,0.000000,0.333333,0.333333
54,2023-05-14,14:00,Premier League,Matchweek 36,Sun,Away,W,3,0,Everton,...,6,1,2.333333,0.666667,14.666667,7.000000,16.366667,0.666667,0.333333,0.666667
56,2023-05-21,16:00,Premier League,Matchweek 37,Sun,Home,W,1,0,Chelsea,...,6,1,2.666667,0.333333,14.000000,5.666667,18.100000,1.333333,0.000000,0.333333
57,2023-05-24,20:00,Premier League,Matchweek 32,Wed,Away,D,1,1,Brighton,...,2,0,2.000000,0.333333,13.666667,4.000000,18.933333,1.333333,0.000000,0.333333


In [None]:
# Compute the rolling averages team by team and stack the results.
# Iterating over the groups hands each team's full frame (including the
# "team" column) to rolling_averages; DataFrameGroupBy.apply passing the
# grouping column to the function is deprecated since pandas 2.2.
# The dict keys become an outer index level named "team", which gives the
# same (team, original row label) index that groupby(...).apply(...) produced.
matches_rolling = pd.concat(
    {
        team: rolling_averages(team_matches, cols, new_cols)
        for team, team_matches in matches.groupby("team")
    },
    names=["team"],
)

In [None]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1,0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.000000,0.000000
Arsenal,5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1,0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.000000,0.000000
Arsenal,7,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3,1,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.000000,0.000000
Arsenal,8,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0,0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.000000,0.000000
Arsenal,9,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2,2,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,39,2023-04-29,15:00,Premier League,Matchweek 34,Sat,Away,L,0,6,Brighton,...,5,0,1.666667,0.666667,11.666667,4.666667,18.700000,0.666667,0.333333,0.333333
Wolverhampton Wanderers,40,2023-05-06,15:00,Premier League,Matchweek 35,Sat,Home,W,1,0,Aston Villa,...,5,1,1.000000,2.666667,11.333333,2.333333,18.800000,0.666667,0.333333,0.333333
Wolverhampton Wanderers,41,2023-05-13,15:00,Premier League,Matchweek 36,Sat,Away,L,0,2,Manchester Utd,...,5,0,1.000000,2.000000,8.000000,2.000000,17.766667,0.000000,0.333333,0.333333
Wolverhampton Wanderers,42,2023-05-20,15:00,Premier League,Matchweek 37,Sat,Home,D,1,1,Everton,...,5,0,0.333333,2.666667,7.000000,1.333333,15.600000,0.000000,0.000000,0.000000


In [None]:
# Remove the outer "team" index level, leaving only the original row labels
# (the team name is still available as a regular column).
# NOTE(review): not idempotent; a second run fails because the "team" level is already gone.
matches_rolling = matches_rolling.droplevel('team')

In [None]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1,0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.000000,0.000000
5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1,0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.000000,0.000000
7,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3,1,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.000000,0.000000
8,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0,0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.000000,0.000000
9,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2,2,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,2023-04-29,15:00,Premier League,Matchweek 34,Sat,Away,L,0,6,Brighton,...,5,0,1.666667,0.666667,11.666667,4.666667,18.700000,0.666667,0.333333,0.333333
40,2023-05-06,15:00,Premier League,Matchweek 35,Sat,Home,W,1,0,Aston Villa,...,5,1,1.000000,2.666667,11.333333,2.333333,18.800000,0.666667,0.333333,0.333333
41,2023-05-13,15:00,Premier League,Matchweek 36,Sat,Away,L,0,2,Manchester Utd,...,5,0,1.000000,2.000000,8.000000,2.000000,17.766667,0.000000,0.333333,0.333333
42,2023-05-20,15:00,Premier League,Matchweek 37,Sat,Home,D,1,1,Everton,...,5,0,0.333333,2.666667,7.000000,1.333333,15.600000,0.000000,0.000000,0.000000


In [None]:
# Replace the row labels with a fresh 0..n-1 index, discarding the old labels,
# so that every row has a unique label for the index-based merge later on.
matches_rolling = matches_rolling.reset_index(drop=True)

In [None]:
def make_predictions(data, predictors, model=None, split_date='2023-01-01'):
    """Fit a classifier on matches before ``split_date`` and evaluate it on the rest.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime "date" column, a binary "target" column and
        every column listed in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    model : estimator with ``fit`` / ``predict``, optional
        Defaults to the notebook-level ``rf``; the model is (re)fitted in place.
    split_date : str or datetime-like, default '2023-01-01'
        Rows dated before this go to training; rows on or after it go to testing.

    Returns
    -------
    tuple
        ``(accuracy, combined, precision)`` where ``combined`` is a DataFrame
        indexed like the test rows with "actual" and "predicted" columns.
    """
    if model is None:
        model = rf  # fall back to the notebook-level random forest

    train = data[data["date"] < split_date]
    # ">=" (not ">") so that matches dated exactly on the split date are
    # evaluated instead of being dropped from both sets.
    test = data[data["date"] >= split_date]

    model.fit(train[predictors], train["target"])
    preds = model.predict(test[predictors])

    acc = accuracy_score(test["target"], preds)
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    # The third return value is the precision score (callers bind it to `error`).
    precision = precision_score(test["target"], preds)
    return acc, combined, precision

In [None]:
acc, combined, error = make_predictions(matches_rolling, predictors + new_cols)

In [None]:
acc

0.633177570093458

In [None]:
error

0.5384615384615384

In [None]:
# Attach the match details to each prediction by joining on the shared row index.
# NOTE(review): this overwrites `combined`, so re-running the cell merges onto the
# already-merged frame; re-run make_predictions first to get a clean `combined`.
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [None]:
combined.head(10)

Unnamed: 0,actual,predicted,date,team,opponent,result
51,0,1,2023-01-03,Arsenal,Newcastle Utd,D
52,1,0,2023-01-15,Arsenal,Tottenham,W
53,1,1,2023-01-22,Arsenal,Manchester Utd,W
54,0,1,2023-02-04,Arsenal,Everton,L
55,0,1,2023-02-11,Arsenal,Brentford,D
56,0,0,2023-02-15,Arsenal,Manchester City,L
57,1,1,2023-02-18,Arsenal,Aston Villa,W
58,1,1,2023-02-25,Arsenal,Leicester City,W
59,1,0,2023-03-01,Arsenal,Everton,W
60,1,1,2023-03-04,Arsenal,Bournemouth,W


In [None]:
#rectifying the name conflicts in the team and opponent column

class MissingDict(dict):
    """A dict whose lookups fall back to the key itself when the key is absent."""

    def __missing__(self, key):
        # Invoked by dict.__getitem__ for unknown keys; nothing is stored.
        return key

# Full club names as they appear in the "team" column, mapped to the shorter
# forms used in the "opponent" column.
map_values = {"Brighton and Hove Albion": "Brighton",
              "Manchester United": "Manchester Utd",
              "Newcastle United": "Newcastle Utd",
              "Tottenham Hotspur": "Tottenham",
              "West Ham United": "West Ham",
              "Wolverhampton Wanderers": "Wolves"}
# Names not listed above are returned unchanged (see MissingDict.__missing__).
mapping = MissingDict(**map_values)

In [None]:
# Normalise team names to the spelling used in the "opponent" column.
combined["new_team"] = combined["team"].map(mapping)

In [None]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result,new_team
51,0,1,2023-01-03,Arsenal,Newcastle Utd,D,Arsenal
52,1,0,2023-01-15,Arsenal,Tottenham,W,Arsenal
53,1,1,2023-01-22,Arsenal,Manchester Utd,W,Arsenal
54,0,1,2023-02-04,Arsenal,Everton,L,Arsenal
55,0,1,2023-02-11,Arsenal,Brentford,D,Arsenal
...,...,...,...,...,...,...,...
1446,0,0,2023-04-29,Wolverhampton Wanderers,Brighton,L,Wolves
1447,1,0,2023-05-06,Wolverhampton Wanderers,Aston Villa,W,Wolves
1448,0,0,2023-05-13,Wolverhampton Wanderers,Manchester Utd,L,Wolves
1449,0,0,2023-05-20,Wolverhampton Wanderers,Everton,D,Wolves


In [None]:
# Pair each prediction with the opposing side's prediction for the same match:
# a row matches another row when its normalised team name (new_team) equals the
# other row's opponent on the same date. Left-hand columns get the suffix _x and
# right-hand columns _y; the default inner join drops rows that have no counterpart.
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [None]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,0,1,2023-01-03,Arsenal,Newcastle Utd,D,Arsenal,0,0,Newcastle United,Arsenal,D,Newcastle Utd
1,1,0,2023-01-15,Arsenal,Tottenham,W,Arsenal,0,0,Tottenham Hotspur,Arsenal,L,Tottenham
2,1,1,2023-01-22,Arsenal,Manchester Utd,W,Arsenal,0,1,Manchester United,Arsenal,L,Manchester Utd
3,0,1,2023-02-04,Arsenal,Everton,L,Arsenal,1,0,Everton,Arsenal,W,Everton
4,0,1,2023-02-11,Arsenal,Brentford,D,Arsenal,0,0,Brentford,Arsenal,D,Brentford
...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,0,0,2023-04-29,Wolverhampton Wanderers,Brighton,L,Wolves,1,1,Brighton and Hove Albion,Wolves,W,Brighton
403,1,0,2023-05-06,Wolverhampton Wanderers,Aston Villa,W,Wolves,0,1,Aston Villa,Wolves,L,Aston Villa
404,0,0,2023-05-13,Wolverhampton Wanderers,Manchester Utd,L,Wolves,1,1,Manchester United,Wolves,W,Manchester Utd
405,0,0,2023-05-20,Wolverhampton Wanderers,Everton,D,Wolves,0,0,Everton,Wolves,D,Everton


In [None]:
# Among matches where the model predicted a win for team x and no win for team y,
# count how often team x actually won (1) or did not (0).
merged.loc[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0), "actual_x"].value_counts()

1    50
0    39
Name: actual_x, dtype: int64