In [3]:
import pandas as pd

In [4]:
matches = pd.read_csv("matches.csv", index_col=0)

In [5]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2.0,0.0,Ipswich Town,...,Match Report,,18.0,5.0,14.8,0.0,0,0,2024,Liverpool
1,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2.0,0.0,Brentford,...,Match Report,,19.0,8.0,13.6,1.0,0,0,2024,Liverpool
2,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3.0,0.0,Manchester Utd,...,Match Report,,11.0,3.0,13.4,0.0,0,0,2024,Liverpool
3,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0.0,1.0,Nott'ham Forest,...,Match Report,,14.0,5.0,14.9,0.0,0,0,2024,Liverpool
5,2024-09-21,15:00,Premier League,Matchweek 5,Sat,Home,W,3.0,0.0,Bournemouth,...,Match Report,,19.0,12.0,16.6,0.0,0,0,2024,Liverpool


In [6]:
matches.shape

(1130, 28)

In [7]:
matches["team"].value_counts()

team
Nottingham Forest           57
Bournemouth                 57
Manchester City             57
West Ham United             57
Crystal Palace              57
Tottenham Hotspur           57
Fulham                      57
Wolverhampton Wanderers     57
Chelsea                     56
Liverpool                   56
Arsenal                     56
Newcastle United            56
Brighton and Hove Albion    56
Brentford                   56
Aston Villa                 56
Manchester United           56
Everton                     56
Burnley                     38
Luton Town                  38
Sheffield United            38
Leicester City              19
Southampton                 19
Ipswich Town                18
Name: count, dtype: int64

In [8]:
matches.dtypes

date              object
time              object
comp              object
round             object
day               object
venue             object
result            object
gf               float64
ga               float64
opponent          object
xg               float64
xga              float64
poss             float64
attendance       float64
captain           object
formation         object
opp formation     object
referee           object
match report      object
notes            float64
sh               float64
sot              float64
dist             float64
fk               float64
pk                 int64
pkatt              int64
season             int64
team              object
dtype: object

In [9]:
matches["date"] = pd.to_datetime(matches["date"])

In [10]:
matches.dtypes

date             datetime64[ns]
time                     object
comp                     object
round                    object
day                      object
venue                    object
result                   object
gf                      float64
ga                      float64
opponent                 object
xg                      float64
xga                     float64
poss                    float64
attendance              float64
captain                  object
formation                object
opp formation            object
referee                  object
match report             object
notes                   float64
sh                      float64
sot                     float64
dist                    float64
fk                      float64
pk                        int64
pkatt                     int64
season                    int64
team                     object
dtype: object

In [11]:
matches["venue_code"]=matches["venue"].astype("category").cat.codes

In [12]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [13]:
matches["hour"] = matches["time"].str.replace(":.+","",regex=True).astype("int")

In [14]:
matches["day_code"]=matches["date"].dt.dayofweek

In [15]:
matches["target"] = (matches["result"] == 'W').astype("int")

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [18]:
train = matches[matches["date"]<'2024-01-01']

In [19]:
test = matches[matches["date"]>'2024-01-01']

In [20]:
predictors = ["venue_code", "opp_code","hour","day_code"]

In [21]:
rf.fit(train[predictors], train["target"])

In [22]:
preds = rf.predict(test[predictors])

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
acc = accuracy_score(test["target"], preds)

In [25]:
combined = pd.DataFrame(dict(actual=test["target"],prediction=preds))

In [26]:
pd.crosstab(index=combined["actual"], columns = combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,358,107
1,184,87


In [27]:
from sklearn.metrics import precision_score

In [28]:
precision_score(test["target"],preds)

0.4484536082474227

In [29]:
grouped_matches = matches.groupby("team")

In [30]:
group = grouped_matches.get_group("Arsenal")

In [31]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed = 'left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [32]:
cols = ["gf","ga","sh", "sot","dist","fk","pk","pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [33]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [34]:
rolling_averages(group,cols,new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,...,6,1,1.666667,1.0,15.333333,6.0,16.433333,0.0,0.666667,0.666667
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,...,6,1,2.0,1.0,16.0,5.333333,15.066667,0.0,0.666667,0.666667
7,2023-09-24,14:00,Premier League,Matchweek 6,Sun,Home,D,2.0,2.0,Tottenham,...,6,0,2.0,1.0,16.0,6.0,15.4,0.0,0.333333,0.333333
9,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Away,W,4.0,0.0,Bournemouth,...,5,1,2.0,1.0,14.0,4.333333,16.433333,0.0,0.333333,0.333333
11,2023-10-08,16:30,Premier League,Matchweek 8,Sun,Home,W,1.0,0.0,Manchester City,...,6,1,2.333333,0.666667,12.666667,4.666667,16.6,0.0,1.0,1.0
12,2023-10-21,17:30,Premier League,Matchweek 9,Sat,Away,D,2.0,2.0,Chelsea,...,5,0,2.333333,0.666667,12.333333,4.0,17.2,0.0,1.0,1.0
14,2023-10-28,15:00,Premier League,Matchweek 10,Sat,Home,W,5.0,0.0,Sheffield Utd,...,5,1,2.333333,0.666667,12.666667,3.666667,16.333333,0.0,0.666667,0.666667
16,2023-11-04,17:30,Premier League,Matchweek 11,Sat,Away,L,0.0,1.0,Newcastle Utd,...,5,0,2.666667,0.666667,12.333333,3.666667,16.833333,0.0,0.333333,0.333333
18,2023-11-11,15:00,Premier League,Matchweek 12,Sat,Home,W,3.0,1.0,Burnley,...,5,1,2.333333,1.0,13.0,3.333333,17.333333,0.0,0.333333,0.333333
19,2023-11-25,17:30,Premier League,Matchweek 13,Sat,Away,W,1.0,0.0,Brentford,...,5,1,2.666667,0.666667,14.0,4.333333,17.166667,0.0,0.333333,0.333333


In [35]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x,cols,new_cols))

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x,cols,new_cols))


In [36]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,...,6,1,1.666667,1.000000,15.333333,6.000000,16.433333,0.000000,0.666667,0.666667
Arsenal,5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,...,6,1,2.000000,1.000000,16.000000,5.333333,15.066667,0.000000,0.666667,0.666667
Arsenal,7,2023-09-24,14:00,Premier League,Matchweek 6,Sun,Home,D,2.0,2.0,Tottenham,...,6,0,2.000000,1.000000,16.000000,6.000000,15.400000,0.000000,0.333333,0.333333
Arsenal,9,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Away,W,4.0,0.0,Bournemouth,...,5,1,2.000000,1.000000,14.000000,4.333333,16.433333,0.000000,0.333333,0.333333
Arsenal,11,2023-10-08,16:30,Premier League,Matchweek 8,Sun,Home,W,1.0,0.0,Manchester City,...,6,1,2.333333,0.666667,12.666667,4.666667,16.600000,0.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,16,2024-12-09,20:00,Premier League,Matchweek 15,Mon,Away,L,1.0,2.0,West Ham,...,0,0,2.000000,3.000000,8.666667,3.333333,19.033333,0.333333,0.000000,0.000000
Wolverhampton Wanderers,17,2024-12-14,15:00,Premier League,Matchweek 16,Sat,Home,L,1.0,2.0,Ipswich Town,...,5,0,1.000000,3.333333,11.666667,3.333333,19.500000,0.333333,0.000000,0.000000
Wolverhampton Wanderers,18,2024-12-22,14:00,Premier League,Matchweek 17,Sun,Away,W,3.0,0.0,Leicester City,...,6,1,0.666667,2.666667,13.666667,4.333333,19.000000,0.000000,0.000000,0.000000
Wolverhampton Wanderers,19,2024-12-26,17:30,Premier League,Matchweek 18,Thu,Home,W,2.0,0.0,Manchester Utd,...,3,1,1.666667,1.333333,14.333333,5.000000,16.966667,0.333333,0.000000,0.000000


In [37]:
matches_rolling = matches_rolling.droplevel("team")

In [38]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,...,6,1,1.666667,1.000000,15.333333,6.000000,16.433333,0.000000,0.666667,0.666667
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,...,6,1,2.000000,1.000000,16.000000,5.333333,15.066667,0.000000,0.666667,0.666667
7,2023-09-24,14:00,Premier League,Matchweek 6,Sun,Home,D,2.0,2.0,Tottenham,...,6,0,2.000000,1.000000,16.000000,6.000000,15.400000,0.000000,0.333333,0.333333
9,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Away,W,4.0,0.0,Bournemouth,...,5,1,2.000000,1.000000,14.000000,4.333333,16.433333,0.000000,0.333333,0.333333
11,2023-10-08,16:30,Premier League,Matchweek 8,Sun,Home,W,1.0,0.0,Manchester City,...,6,1,2.333333,0.666667,12.666667,4.666667,16.600000,0.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16,2024-12-09,20:00,Premier League,Matchweek 15,Mon,Away,L,1.0,2.0,West Ham,...,0,0,2.000000,3.000000,8.666667,3.333333,19.033333,0.333333,0.000000,0.000000
17,2024-12-14,15:00,Premier League,Matchweek 16,Sat,Home,L,1.0,2.0,Ipswich Town,...,5,0,1.000000,3.333333,11.666667,3.333333,19.500000,0.333333,0.000000,0.000000
18,2024-12-22,14:00,Premier League,Matchweek 17,Sun,Away,W,3.0,0.0,Leicester City,...,6,1,0.666667,2.666667,13.666667,4.333333,19.000000,0.000000,0.000000,0.000000
19,2024-12-26,17:30,Premier League,Matchweek 18,Thu,Home,W,2.0,0.0,Manchester Utd,...,3,1,1.666667,1.333333,14.333333,5.000000,16.966667,0.333333,0.000000,0.000000


In [39]:
matches_rolling.index = range(matches_rolling.shape[0])

In [40]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3.0,1.0,Manchester Utd,...,6,1,1.666667,1.000000,15.333333,6.000000,16.433333,0.000000,0.666667,0.666667
1,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1.0,0.0,Everton,...,6,1,2.000000,1.000000,16.000000,5.333333,15.066667,0.000000,0.666667,0.666667
2,2023-09-24,14:00,Premier League,Matchweek 6,Sun,Home,D,2.0,2.0,Tottenham,...,6,0,2.000000,1.000000,16.000000,6.000000,15.400000,0.000000,0.333333,0.333333
3,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Away,W,4.0,0.0,Bournemouth,...,5,1,2.000000,1.000000,14.000000,4.333333,16.433333,0.000000,0.333333,0.333333
4,2023-10-08,16:30,Premier League,Matchweek 8,Sun,Home,W,1.0,0.0,Manchester City,...,6,1,2.333333,0.666667,12.666667,4.666667,16.600000,0.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056,2024-12-09,20:00,Premier League,Matchweek 15,Mon,Away,L,1.0,2.0,West Ham,...,0,0,2.000000,3.000000,8.666667,3.333333,19.033333,0.333333,0.000000,0.000000
1057,2024-12-14,15:00,Premier League,Matchweek 16,Sat,Home,L,1.0,2.0,Ipswich Town,...,5,0,1.000000,3.333333,11.666667,3.333333,19.500000,0.333333,0.000000,0.000000
1058,2024-12-22,14:00,Premier League,Matchweek 17,Sun,Away,W,3.0,0.0,Leicester City,...,6,1,0.666667,2.666667,13.666667,4.333333,19.000000,0.000000,0.000000,0.000000
1059,2024-12-26,17:30,Premier League,Matchweek 18,Thu,Home,W,2.0,0.0,Manchester Utd,...,3,1,1.666667,1.333333,14.333333,5.000000,16.966667,0.333333,0.000000,0.000000


In [47]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2024-01-01']
    test = data[data["date"] > '2024-01-01']
    rf.fit(train[predictors],train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual = test["target"], predicted = preds), index = test.index)
    precision = precision_score(test["target"],preds)
    return combined, precision

In [48]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [50]:
precision

0.5225225225225225

In [51]:
combined

Unnamed: 0,actual,predicted
17,1,0
18,1,1
19,1,1
20,1,0
21,1,1
...,...,...
1056,0,0
1057,0,0
1058,1,0
1059,1,1


In [55]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [56]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
17,1,0,2024-01-20,Arsenal,Crystal Palace,W
18,1,1,2024-01-30,Arsenal,Nott'ham Forest,W
19,1,1,2024-02-04,Arsenal,Liverpool,W
20,1,0,2024-02-11,Arsenal,West Ham,W
21,1,1,2024-02-17,Arsenal,Burnley,W
...,...,...,...,...,...,...
1056,0,0,2024-12-09,Wolverhampton Wanderers,West Ham,L
1057,0,0,2024-12-14,Wolverhampton Wanderers,Ipswich Town,L
1058,1,0,2024-12-22,Wolverhampton Wanderers,Leicester City,W
1059,1,1,2024-12-26,Wolverhampton Wanderers,Manchester Utd,W


In [57]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton and Hove Albion" : "Brighton",
    "Manchester United" :"Manchester Utd",
    "Newcastle United" : "Newcastle Utd",
    "Tottenham Hotspur" : "Tottenham" ,
    "West Ham United" : "West Ham",
    "Wolverhamption Wanderers": "Wolves",
    }
mapping = MissingDict(**map_values)

In [59]:
combined["new_team"] = combined["team"].map(mapping)

In [60]:
merged = combined.merge(combined, left_on =["date", "new_team"], right_on=["date","opponent"])

In [61]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,1,0,2024-01-20,Arsenal,Crystal Palace,W,Arsenal,0,0,Crystal Palace,Arsenal,L,Crystal Palace
1,1,1,2024-01-30,Arsenal,Nott'ham Forest,W,Arsenal,0,0,Nottingham Forest,Arsenal,L,Nottingham Forest
2,1,1,2024-02-04,Arsenal,Liverpool,W,Arsenal,0,0,Liverpool,Arsenal,L,Liverpool
3,1,0,2024-02-11,Arsenal,West Ham,W,Arsenal,0,0,West Ham United,Arsenal,L,West Ham
4,1,1,2024-02-17,Arsenal,Burnley,W,Arsenal,0,0,Burnley,Arsenal,L,Burnley
...,...,...,...,...,...,...,...,...,...,...,...,...,...
622,1,1,2024-12-09,West Ham United,Wolves,W,West Ham,0,0,Wolverhampton Wanderers,West Ham,L,Wolverhampton Wanderers
623,0,0,2024-12-16,West Ham United,Bournemouth,D,West Ham,0,1,Bournemouth,West Ham,D,Bournemouth
624,0,1,2024-12-21,West Ham United,Brighton,D,West Ham,0,0,Brighton and Hove Albion,West Ham,D,Brighton
625,1,0,2024-12-26,West Ham United,Southampton,W,West Ham,0,0,Southampton,West Ham,L,Southampton


In [63]:
merged[(merged["predicted_x"] ==1) & (merged["predicted_y"]==0)]["actual_x"].value_counts()

actual_x
1    88
0    67
Name: count, dtype: int64