In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
matches = pd.read_csv("upcoming.csv", index_col=0)

In [3]:
matches["date"] = pd.to_datetime(matches["date"])
matches = matches.sort_values(by='date')

#Creating numeric values for Home and Away, so that machine can learn from it.
matches['venue_c'] = matches['venue'].astype('category').cat.codes

#Creating unique code for each opponent squad.
matches['opp_c'] = matches['opponent'].astype('category').cat.codes

#Creating a normalized hour time.
matches['hour'] = matches['time'].str.replace(":.+", "", regex=True).astype("int")

#Monday = 0, Tuesday = 1, etc.
matches['day_c'] = matches['date'].dt.dayofweek

#Setting up target to predict. (Wether the team won or not). Win = 1, Loss or Draw = 0.
matches['target'] = (matches['result'] == "W").astype("int")

matches = matches.drop(columns=['round', 'day', 'comp'])

In [4]:
matches

Unnamed: 0,date,time,venue,result,gf,ga,opponent,xg,xga,poss,...,sot,dist,pk,pkatt,team,venue_c,opp_c,hour,day_c,target
0,2018-08-10,20:00,Home,W,2.0,1.0,Leicester City,1.7,1.0,46.0,...,5.0,19.0,1.0,1.0,Manchester United,1,13,20,4,1
0,2018-08-10,20:00,Away,L,1.0,2.0,Manchester Utd,1.0,1.7,54.0,...,4.0,20.2,0.0,0.0,Leicester City,0,16,20,4,0
0,2018-08-11,15:00,Home,L,0.0,2.0,Crystal Palace,0.6,1.0,66.0,...,6.0,19.0,0.0,0.0,Fulham,1,8,15,5,0
0,2018-08-11,15:00,Home,L,0.0,3.0,Chelsea,0.3,2.0,37.0,...,2.0,16.2,0.0,0.0,Huddersfield Town,1,7,15,5,0
0,2018-08-11,15:00,Away,W,2.0,0.0,Fulham,1.0,0.6,34.0,...,10.0,16.6,0.0,0.0,Crystal Palace,0,10,15,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45,2023-05-28,15:00,Away,W,2.0,1.0,Southampton,1.5,0.8,71.0,...,6.0,17.4,0.0,0.0,Liverpool,0,21,15,6,1
39,2023-05-28,15:00,Away,L,1.0,2.0,Aston Villa,1.7,1.4,60.0,...,2.0,14.4,1.0,1.0,Brighton and Hove Albion,0,1,15,6,0
44,2023-05-28,15:00,Away,D,1.0,1.0,Leeds United,0.7,1.1,62.0,...,4.0,16.2,0.0,0.0,Tottenham Hotspur,0,12,15,6,0
39,2023-05-28,15:00,Away,L,0.0,6.0,Crystal Palace,0.4,3.7,26.0,...,1.0,18.9,0.0,0.0,Nottingham Forest,0,8,15,6,0


In [5]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

train = matches[matches['date'] < '2021-01-01']
test = matches[matches['date'] > '2021-01-01']

In [6]:
predictors = ['venue_c', 'opp_c', 'poss', 'xg', 'xga']

In [7]:
rf.fit(train[predictors], train['target'])

preds = rf.predict(test[predictors])

In [8]:
acc = accuracy_score(test['target'], preds)
acc

0.7304170905391658

In [9]:
combined = pd.DataFrame(dict(actual=test['target'], prediction=preds))

In [10]:
pd.crosstab(index=combined['actual'], columns=combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1044,218
1,312,392


In [11]:
precision_score(test['target'], preds)

0.6426229508196721

In [12]:
grouped_matches = matches.groupby('team')

In [13]:
group = grouped_matches.get_group("Manchester City")

In [14]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [15]:
cols = ['gf', 'ga', 'poss', 'sh', 'sot', 'dist']
new_cols = [f"{c}R" for c in cols]

In [16]:
matches_rolling = matches.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team')
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling = round(matches_rolling)

In [17]:
matches_rolling

Unnamed: 0,date,time,venue,result,gf,ga,opponent,xg,xga,poss,...,opp_c,hour,day_c,target,gfR,gaR,possR,shR,sotR,distR
0,2018-09-02,13:30,Away,W,3.0,2.0,Cardiff City,2.0,1.0,72.0,...,6,13,6,1,2.0,2.0,47.0,14.0,6.0,18.0
1,2018-09-15,15:00,Away,W,2.0,1.0,Newcastle Utd,1.0,0.0,63.0,...,17,15,5,1,3.0,2.0,57.0,16.0,9.0,17.0
2,2018-09-23,16:00,Home,W,2.0,0.0,Everton,1.0,1.0,61.0,...,9,16,6,1,3.0,1.0,65.0,15.0,8.0,17.0
3,2018-09-29,15:00,Home,W,2.0,0.0,Watford,2.0,2.0,63.0,...,23,15,5,1,2.0,1.0,65.0,13.0,6.0,16.0
4,2018-10-07,12:00,Away,W,5.0,1.0,Fulham,2.0,1.0,52.0,...,10,12,6,1,2.0,0.0,62.0,10.0,3.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3714,2023-04-29,15:00,Away,D,0.0,0.0,Brighton,2.0,0.0,65.0,...,4,15,5,0,0.0,0.0,65.0,17.0,3.0,17.0
3715,2023-05-06,15:00,Home,D,0.0,0.0,Aston Villa,2.0,0.0,65.0,...,1,15,5,0,0.0,0.0,65.0,17.0,3.0,17.0
3716,2023-05-13,15:00,Away,D,0.0,0.0,Manchester Utd,2.0,0.0,65.0,...,16,15,5,0,0.0,0.0,65.0,17.0,3.0,17.0
3717,2023-05-20,15:00,Home,D,0.0,0.0,Everton,2.0,0.0,65.0,...,9,15,5,0,0.0,0.0,65.0,17.0,3.0,17.0


In [18]:
def make_predictions(data, predictors):
    train = data[data['date'] <= '2022-09-01']
    test = data[data['date'] > '2022-09-01']
    rf.fit(train[predictors], train['target'])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test['target'], predicted=preds), index=test.index)
    precision = precision_score(test['target'], preds)
    return combined, precision

In [19]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [20]:
make_predictions(matches_rolling, predictors + new_cols)

(      actual  predicted
 154        1          1
 155        1          1
 156        1          1
 157        1          1
 158        1          1
 ...      ...        ...
 3714       0          1
 3715       0          1
 3716       0          1
 3717       0          1
 3718       0          1
 
 [660 rows x 2 columns],
 0.4727272727272727)

In [21]:
precision

0.4727272727272727

In [22]:
combined = combined.merge(matches_rolling[['date', 'team', 'opponent', 'result']], left_index=True, right_index=True)

In [23]:
class MissingDict(dict):
    __missing__ = lambda self, key: key
    
map_values = {
    "Brighton and Hove Albion": "Brighton", 
              "Manchester United": "Manchester Utd", 
              "Newcastle United": "Newcastle Utd", 
              "Tottenham Hotspur": "Tottenham", 
              "West Ham United": "West Ham", 
              "Wolverhampton Wanderers": "Wolves",
            "Nott'ham Forest": "Nottingham"
} 
mapping = MissingDict(**map_values)

In [24]:
#combined["new_team"] = combined["team"].map(mapping)

In [25]:
merged = combined.merge(combined, left_on=["date", "team"], right_on=["date", "opponent"])

In [27]:
merged.sort_values(by='date').drop(columns=['actual_x', 'actual_y', 'result_y', 'result_x'])[:15]

Unnamed: 0,predicted_x,date,team_x,opponent_x,predicted_y,team_y,opponent_y
330,1,2022-09-03,Liverpool,Everton,0,Everton,Liverpool
99,0,2022-09-03,Brentford,Leeds United,1,Leeds United,Brentford
132,0,2022-09-03,Chelsea,West Ham,0,West Ham United,Chelsea
66,0,2022-09-03,Bournemouth,Nott'ham Forest,0,Nottingham Forest,Bournemouth
363,1,2022-09-03,Manchester City,Aston Villa,0,Aston Villa,Manchester City
264,1,2022-09-03,Leeds United,Brentford,0,Brentford,Leeds United
165,0,2022-09-03,Crystal Palace,Newcastle Utd,0,Newcastle United,Crystal Palace
33,0,2022-09-03,Aston Villa,Manchester City,1,Manchester City,Aston Villa
396,0,2022-09-03,Southampton,Wolves,1,Wolverhampton Wanderers,Southampton
231,0,2022-09-03,Fulham,Tottenham,0,Tottenham Hotspur,Fulham


In [None]:
#Let's now get only the predictions, where one team was predicted to win and the other to lose.
merged[(merged['predicted_x'] == 1) & (merged['predicted_y'] == 0) & (merged['actual_x'] == 1)].shape

In [None]:
merged[(merged['predicted_x'] == 1) & (merged['predicted_y'] == 0)].shape

In [None]:
merged.shape

In [None]:
merged.describe()