In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
matches = pd.read_csv("matchess.csv", index_col=0)

In [3]:
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,Match Report,,10.0,2.0,15.0,1.0,0.0,0.0,2023,Arsenal
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,Match Report,,19.0,7.0,12.6,0.0,0.0,0.0,2023,Arsenal
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,Match Report,,14.0,6.0,14.6,0.0,0.0,0.0,2023,Arsenal
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,Match Report,,23.0,8.0,16.6,1.0,0.0,0.0,2023,Arsenal
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,West Ham,...,Match Report,,13.0,1.0,18.2,1.0,1.0,1.0,2023,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Away,L,0.0,2.0,Aston Villa,...,Match Report,,10.0,3.0,20.7,0.0,0.0,0.0,2022,Norwich City
39,2022-05-08,14:00,Premier League,Matchweek 36,Sun,Home,L,0.0,4.0,West Ham,...,Match Report,,8.0,2.0,21.5,1.0,0.0,0.0,2022,Norwich City
40,2022-05-11,19:45,Premier League,Matchweek 21,Wed,Away,L,0.0,3.0,Leicester City,...,Match Report,,9.0,5.0,16.2,0.0,0.0,0.0,2022,Norwich City
41,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,1.0,1.0,Wolves,...,Match Report,,11.0,2.0,13.4,0.0,0.0,0.0,2022,Norwich City


In [4]:
next_matches = pd.read_csv("results - results.csv", index_col=0)

In [5]:
next_matches

Unnamed: 0,date,venue,result,gf,ga,opponent,xg,xga,poss,sh,...,venue_c,opp_c,target,gfR,gaR,possR,shR,sotR,distR,fkR
,2022-08-30,Home,W,2,4,Brentford,0,2,25,2,...,1,3,,1,1,46,11,4,15,0
,2022-08-30,Away,L,1,1,Crystal Palace,3,2,58,20,...,0,7,,3,2,45,11,5,16,0
,2022-08-30,Home,W,1,2,Brighton and Hove Albion,1,2,29,11,...,1,4,,2,1,39,11,4,16,0
,2022-08-30,Away,L,1,0,Fulham,2,1,43,13,...,0,9,,1,0,47,13,3,17,0
,2022-08-30,Home,W,0,1,Chelsea,2,1,49,18,...,1,6,,2,2,41,11,3,16,0
,2022-08-30,Away,L,2,1,Southampton,2,2,46,7,...,0,18,,1,2,63,15,4,17,0
,2022-08-30,Home,D,0,1,Everton,1,2,57,10,...,1,8,,2,1,45,12,5,16,0
,2022-08-30,Away,D,1,1,Leeds United,2,3,42,14,...,0,10,,1,1,43,14,5,16,0
,2022-08-31,Home,D,5,1,Aston Villa,2,1,71,23,...,1,1,,3,1,51,14,5,14,0
,2022-08-31,Away,D,2,3,Arsenal,1,0,58,9,...,0,0,,1,2,57,13,3,17,0


In [6]:
matches["date"] = pd.to_datetime(matches["date"])
next_matches["date"] = pd.to_datetime(next_matches["date"])

In [7]:
matches = matches.sort_values(by='date')
next_matches = next_matches.sort_values(by='date')

In [8]:
#Creating numeric values for Home and Away, so that machine can learn from it.
matches['venue_c'] = matches['venue'].astype('category').cat.codes
next_matches['venue_c'] = next_matches['venue'].astype('category').cat.codes

In [9]:
#Creating unique code for each opponent squad.
matches['opp_c'] = matches['opponent'].astype('category').cat.codes
next_matches['opp_c'] = next_matches['opponent'].astype('category').cat.codes

In [10]:
#Creating a normalized hour time.
matches['hour'] = matches['time'].str.replace(":.+", "", regex=True).astype("int")

In [11]:
#Monday = 0, Tuesday = 1, etc.
matches['day_c'] = matches['date'].dt.dayofweek
next_matches['day_c'] = next_matches['date'].dt.dayofweek

In [12]:
#Setting up target to predict. (Wether the team won or not). Win = 1, Loss or Draw = 0.
matches['target'] = (matches['result'] == "W").astype("int")
next_matches['target'] = (next_matches['result'] == "W").astype("int")

In [13]:
matches = matches.drop(columns=['round', 'day', 'comp', 'attendance', 'captain', 'formation', 'referee', 'match report', 'notes'])

In [14]:
matches

Unnamed: 0,date,time,venue,result,gf,ga,opponent,xg,xga,poss,...,fk,pk,pkatt,season,team,venue_c,opp_c,hour,day_c,target
0,2021-08-13,20:00,Home,W,2.0,0.0,Arsenal,1.3,1.4,34.0,...,0.0,0.0,0.0,2022,Brentford,1,0,20,4,1
0,2021-08-13,20:00,Away,L,0.0,2.0,Brentford,1.4,1.3,66.0,...,1.0,0.0,0.0,2022,Arsenal,0,3,20,4,0
0,2021-08-14,15:00,Away,L,2.0,3.0,Watford,1.1,1.2,61.0,...,1.0,1.0,1.0,2022,Aston Villa,0,20,15,5,0
0,2021-08-14,15:00,Home,L,1.0,2.0,Brighton,1.0,1.3,35.0,...,1.0,0.0,0.0,2022,Burnley,1,4,15,5,0
0,2021-08-14,17:30,Away,W,3.0,0.0,Norwich City,2.0,1.4,51.0,...,1.0,0.0,0.0,2022,Liverpool,0,16,17,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,2022-08-28,14:00,Away,W,1.0,0.0,Aston Villa,0.4,0.6,42.0,...,1.0,0.0,0.0,2023,West Ham United,0,1,14,6,1
4,2022-08-28,14:00,Away,D,1.0,1.0,Wolves,2.0,0.8,62.0,...,1.0,0.0,0.0,2023,Newcastle United,0,22,14,6,0
4,2022-08-28,14:00,Home,D,1.0,1.0,Newcastle Utd,0.8,2.0,38.0,...,1.0,0.0,0.0,2023,Wolverhampton Wanderers,1,15,14,6,0
4,2022-08-28,14:00,Home,L,0.0,1.0,West Ham,0.6,0.4,58.0,...,0.0,0.0,0.0,2023,Aston Villa,1,21,14,6,0


In [15]:
next_matches

Unnamed: 0,date,venue,result,gf,ga,opponent,xg,xga,poss,sh,...,opp_c,target,gfR,gaR,possR,shR,sotR,distR,fkR,day_c
,2022-08-30,Home,W,2,4,Brentford,0,2,25,2,...,3,1,1,1,46,11,4,15,0,1
,2022-08-30,Away,L,1,1,Crystal Palace,3,2,58,20,...,6,0,3,2,45,11,5,16,0,1
,2022-08-30,Home,W,1,2,Brighton and Hove Albion,1,2,29,11,...,4,1,2,1,39,11,4,16,0,1
,2022-08-30,Away,L,1,0,Fulham,2,1,43,13,...,8,0,1,0,47,13,3,17,0,1
,2022-08-30,Home,W,0,1,Chelsea,2,1,49,18,...,5,1,2,2,41,11,3,16,0,1
,2022-08-30,Away,L,2,1,Southampton,2,2,46,7,...,14,0,1,2,63,15,4,17,0,1
,2022-08-30,Home,D,0,1,Everton,1,2,57,10,...,7,0,2,1,45,12,5,16,0,1
,2022-08-30,Away,D,1,1,Leeds United,2,3,42,14,...,9,0,1,1,43,14,5,16,0,1
,2022-08-31,Away,D,2,0,West Ham United,3,1,45,18,...,16,0,2,1,48,13,6,13,0,2
,2022-08-31,Home,D,1,0,Tottenham,0,1,42,7,...,15,0,0,2,44,12,2,16,1,2


In [16]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [17]:
train = matches[matches['date'] < '2022-01-01']

In [18]:
test = matches[matches['date'] > '2022-01-01']

In [19]:
matches[matches['date'] > '2022-01-01'][430:]

Unnamed: 0,date,time,venue,result,gf,ga,opponent,xg,xga,poss,...,fk,pk,pkatt,season,team,venue_c,opp_c,hour,day_c,target
2,2022-08-20,15:00,Away,L,2.0,3.0,Fulham,1.5,1.9,57.0,...,0.0,0.0,0.0,2023,Brentford,0,9,15,5,0
2,2022-08-20,15:00,Home,L,1.0,2.0,Southampton,0.3,0.7,62.0,...,1.0,0.0,0.0,2023,Leicester City,1,18,15,5,0
2,2022-08-20,15:00,Away,L,1.0,3.0,Crystal Palace,1.2,3.1,48.0,...,0.0,0.0,0.0,2023,Aston Villa,0,7,15,5,0
2,2022-08-20,15:00,Home,W,3.0,2.0,Brentford,1.9,1.5,43.0,...,1.0,0.0,0.0,2023,Fulham,1,3,15,5,1
2,2022-08-20,12:30,Away,L,0.0,1.0,Tottenham,0.8,1.5,49.0,...,1.0,0.0,0.0,2023,Wolverhampton Wanderers,0,19,12,5,0
2,2022-08-20,17:30,Home,L,0.0,3.0,Arsenal,0.3,1.8,41.0,...,1.0,0.0,0.0,2023,Bournemouth,1,0,17,5,0
2,2022-08-20,15:00,Away,D,1.0,1.0,Everton,1.0,1.8,52.0,...,1.0,0.0,0.0,2023,Nottingham Forest,0,8,15,5,0
2,2022-08-20,15:00,Home,W,3.0,1.0,Aston Villa,3.1,1.2,52.0,...,1.0,0.0,1.0,2023,Crystal Palace,1,1,15,5,1
2,2022-08-20,15:00,Home,D,1.0,1.0,Nott'ham Forest,1.8,1.0,48.0,...,1.0,0.0,0.0,2023,Everton,1,17,15,5,0
2,2022-08-20,15:00,Away,W,2.0,1.0,Leicester City,0.7,0.3,38.0,...,0.0,0.0,0.0,2023,Southampton,0,11,15,5,1


In [20]:
predictors = ['venue_c', 'opp_c', 'poss', 'xg', 'xga']

In [21]:
rf.fit(train[predictors], train['target'])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [22]:
test[predictors]

Unnamed: 0,venue_c,opp_c,poss,xg,xga
22,1,1,45.0,0.5,1.1
19,1,4,50.0,1.6,1.5
28,0,6,45.0,1.4,1.3
21,1,5,54.0,1.3,0.9
21,0,8,50.0,1.5,1.6
...,...,...,...,...,...
5,0,1,42.0,0.4,0.6
4,0,22,62.0,2.0,0.8
4,1,15,38.0,0.8,2.0
4,1,21,58.0,0.6,0.4


In [23]:
preds = rf.predict(test[predictors])

In [24]:
acc = accuracy_score(test['target'], preds)

In [25]:
acc

0.7393162393162394

In [26]:
combined = pd.DataFrame(dict(actual=test['target'], prediction=preds))

In [27]:
combined

Unnamed: 0,actual,prediction
22,1,0
19,0,0
28,0,0
21,1,0
21,1,0
...,...,...
5,1,0
4,0,1
4,0,0
4,0,0


In [28]:
pd.crosstab(index=combined['actual'], columns=combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,247,37
1,85,99


In [29]:
precision_score(test['target'], preds)

0.7279411764705882

In [30]:
grouped_matches = matches.groupby('team')

In [31]:
group = grouped_matches.get_group("Manchester City")

In [32]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [33]:
cols = ['gf', 'ga', 'poss', 'sh', 'sot', 'dist', 'fk']
new_cols = [f"{c}R" for c in cols]

In [34]:
matches_rolling = matches.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))

In [35]:
matches_rolling = matches_rolling.droplevel('team')

In [36]:
matches_rolling.index = range(matches_rolling.shape[0])

In [37]:
matches_rolling = round(matches_rolling)

In [38]:
#matches_rolling

In [39]:
#matches_rolling.loc[2]

In [40]:
matches_rolling = matches_rolling.drop(columns=['pk', 'pkatt', 'day_c', 'hour', 'time', 'season'])

### Here's where we get the stats for a team to copy and paste them.

In [41]:
team = "Manchester United"

matches_rolling[matches_rolling['team'] == team][-5:].drop(columns=['venue_c', 'venue', 'opponent', 'fkR'])

Unnamed: 0,date,result,gf,ga,xg,xga,poss,sh,sot,dist,fk,team,opp_c,target,gfR,gaR,possR,shR,sotR,distR
500,2022-05-22,L,0.0,1.0,1.0,0.0,61.0,10.0,4.0,20.0,1.0,Manchester United,7,0,1.0,2.0,52.0,10.0,4.0,18.0
501,2022-08-07,L,1.0,2.0,2.0,2.0,63.0,17.0,5.0,16.0,0.0,Manchester United,4,0,1.0,2.0,61.0,11.0,4.0,20.0
502,2022-08-13,L,0.0,4.0,1.0,2.0,67.0,15.0,4.0,19.0,0.0,Manchester United,3,0,0.0,2.0,60.0,14.0,5.0,18.0
503,2022-08-22,W,2.0,1.0,2.0,2.0,30.0,13.0,5.0,20.0,2.0,Manchester United,12,1,0.0,2.0,64.0,14.0,4.0,18.0
504,2022-08-27,W,1.0,0.0,1.0,2.0,51.0,11.0,4.0,15.0,1.0,Manchester United,18,1,1.0,2.0,53.0,15.0,5.0,18.0


In [42]:
#x = list(range(len(next_matches)))
#x

In [43]:
#len(next_matches)

In [44]:
idx = pd.Series(list(range(len(next_matches))))

In [45]:
next_matches = next_matches.set_index(idx)
next_matches = pd.DataFrame(next_matches)

In [46]:
matches_rolling["date"] = pd.to_datetime(matches_rolling["date"])
next_matches["date"] = pd.to_datetime(next_matches["date"])

In [47]:
predicted = [matches_rolling, next_matches]
predicted = pd.concat(predicted)
predicted = predicted.fillna('D')

In [48]:
predicted.sort_values('date')[-20:]

Unnamed: 0,date,venue,result,gf,ga,opponent,xg,xga,poss,sh,...,opp_c,target,gfR,gaR,possR,shR,sotR,distR,fkR,day_c
579,2022-08-28,Home,L,0.0,2.0,Tottenham,1.0,3.0,55.0,18.0,...,19,0,1.0,1.0,45.0,11.0,4.0,16.0,1.0,D
77,2022-08-28,Home,L,0.0,1.0,West Ham,1.0,0.0,58.0,9.0,...,21,0,1.0,2.0,57.0,13.0,3.0,17.0,0.0,D
6,2022-08-30,Home,D,0.0,1.0,Everton,1.0,2.0,57.0,10.0,...,7,0,2.0,1.0,45.0,12.0,5.0,16.0,0.0,1.0
5,2022-08-30,Away,L,2.0,1.0,Southampton,2.0,2.0,46.0,7.0,...,14,0,1.0,2.0,63.0,15.0,4.0,17.0,0.0,1.0
4,2022-08-30,Home,W,0.0,1.0,Chelsea,2.0,1.0,49.0,18.0,...,5,1,2.0,2.0,41.0,11.0,3.0,16.0,0.0,1.0
7,2022-08-30,Away,D,1.0,1.0,Leeds United,2.0,3.0,42.0,14.0,...,9,0,1.0,1.0,43.0,14.0,5.0,16.0,0.0,1.0
2,2022-08-30,Home,W,1.0,2.0,Brighton and Hove Albion,1.0,2.0,29.0,11.0,...,4,1,2.0,1.0,39.0,11.0,4.0,16.0,0.0,1.0
1,2022-08-30,Away,L,1.0,1.0,Crystal Palace,3.0,2.0,58.0,20.0,...,6,0,3.0,2.0,45.0,11.0,5.0,16.0,0.0,1.0
0,2022-08-30,Home,W,2.0,4.0,Brentford,0.0,2.0,25.0,2.0,...,3,1,1.0,1.0,46.0,11.0,4.0,15.0,0.0,1.0
3,2022-08-30,Away,L,1.0,0.0,Fulham,2.0,1.0,43.0,13.0,...,8,0,1.0,0.0,47.0,13.0,3.0,17.0,0.0,1.0


In [49]:
def make_predictions(data, predictors):
    train = data[data['date'] < '2022-08-16']
    test = data[data['date'] > '2022-08-16']
    rf.fit(train[predictors], train['target'])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test['target'], predicted=preds), index=test.index)
    precision = precision_score(test['target'], preds)
    return combined, precision

In [50]:
combined, precision = make_predictions(predicted, predictors + new_cols)

In [51]:
make_predictions(predicted, predictors + new_cols)

(     actual  predicted
 37        1          1
 38        1          1
 76        0          0
 77        0          0
 78        0          0
 116       0          0
 117       0          1
 155       1          1
 156       1          1
 229       0          0
 230       1          0
 268       1          1
 269       0          0
 307       0          0
 308       0          0
 309       0          0
 347       1          1
 348       0          0
 386       0          0
 387       0          0
 425       0          0
 426       1          1
 464       0          1
 465       1          1
 503       1          0
 504       1          0
 542       0          0
 543       0          1
 579       0          0
 617       1          0
 618       0          1
 656       1          1
 657       1          1
 730       0          0
 731       1          0
 769       0          0
 770       0          0
 0         1          0
 1         0          1
 2         1          0
 3         0    

In [52]:
combined

Unnamed: 0,actual,predicted
37,1,1
38,1,1
76,0,0
77,0,0
78,0,0
116,0,0
117,0,1
155,1,1
156,1,1
229,0,0


In [53]:
precision

0.5

In [54]:
combined = combined.merge(predicted[['date', 'team', 'opponent', 'result']], left_index=True, right_index=True)

In [55]:
class MissingDict(dict):
    __missing__ = lambda self, key: key
    
map_values = {
    "Brighton and Hove Albion": "Brighton", 
              "Manchester United": "Manchester Utd", 
              "Newcastle United": "Newcastle Utd", 
              "Tottenham Hotspur": "Tottenham", 
              "West Ham United": "West Ham", 
              "Wolverhampton Wanderers": "Wolves",
            "Nott'ham Forest": "Nottingham"
} 
mapping = MissingDict(**map_values)

In [56]:
combined["new_team"] = combined["team"].map(mapping)

In [62]:
combined.sort_values(by='date')[-10:]

Unnamed: 0,actual,predicted,date,team,opponent,result,new_team
12,0,1,2022-08-31,Arsenal,Aston Villa,D,Arsenal
17,0,1,2022-08-31,Newcastle United,Liverpool,D,Newcastle Utd
8,0,1,2022-08-31,Tottenham Hotspur,West Ham United,D,Tottenham
16,0,0,2022-08-31,Wolverhampton Wanderers,Bournemouth,D,Wolves
9,0,0,2022-08-31,West Ham United,Tottenham,D,West Ham
15,0,1,2022-08-31,Liverpool,Newcastle,D,Liverpool
10,0,0,2022-08-31,Nottingham Forest,Manchester City,D,Nottingham Forest
14,0,0,2022-08-31,Aston Villa,Arsenal,D,Aston Villa
11,0,1,2022-08-31,Manchester City,Nottingham,D,Manchester City
13,0,0,2022-08-31,Bournemouth,Wolves,D,Bournemouth


In [58]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [59]:
merged = merged.sort_values(by='date')
merged[-18:]

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
38,0,1,2022-08-28,Newcastle United,Wolves,D,Newcastle Utd,0,0,Wolverhampton Wanderers,Newcastle Utd,D,Wolves
42,1,1,2022-08-28,Tottenham Hotspur,Nott'ham Forest,W,Tottenham,0,0,Nottingham Forest,Tottenham,L,Nottingham Forest
46,0,0,2022-08-28,Wolverhampton Wanderers,Newcastle Utd,D,Wolves,0,1,Newcastle United,Wolves,D,Newcastle Utd
16,0,0,2022-08-28,Aston Villa,West Ham,L,Aston Villa,1,0,West Ham United,Aston Villa,W,West Ham
6,0,0,2022-08-30,Everton,Leeds United,D,Everton,0,0,Leeds United,Everton,D,Leeds United
5,0,0,2022-08-30,Leeds United,Everton,D,Leeds United,0,0,Everton,Leeds United,D,Everton
4,0,0,2022-08-30,Chelsea,Southampton,L,Chelsea,1,1,Southampton,Chelsea,W,Southampton
3,1,1,2022-08-30,Southampton,Chelsea,W,Southampton,0,0,Chelsea,Southampton,L,Chelsea
2,1,0,2022-08-30,Fulham,Brighton and Hove Albion,W,Fulham,0,1,Brighton and Hove Albion,Fulham,L,Brighton
1,0,1,2022-08-30,Brentford,Crystal Palace,L,Brentford,1,0,Crystal Palace,Brentford,W,Crystal Palace


In [60]:
#Let's now get only the predictions, where one team was predicted to win and the other to lose.
merged[(merged['predicted_x'] == 1) & (merged['predicted_y'] == 0) & (merged['actual_x'] == 1)].shape

(10, 13)

In [None]:
#Let's search for some draws.
merged[(merged['predicted_x'] == 1) & (merged['predicted_y'] == 0)].shape

In [None]:
merged.shape

In [None]:
merged.describe()