In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
# Load the match data, using the first CSV column as the row index.
# NOTE(review): the file name is spelled "matchess.csv" — confirm it matches the file on disk.
matches = pd.read_csv("matchess.csv", index_col=0)

In [3]:
matches.shape

(2359, 27)

In [4]:
# Parse the 'date' column into datetime values so it can be compared against
# dates and used with the .dt accessor.
matches["date"] = pd.to_datetime(matches["date"])

In [5]:
# Encode 'venue' as integer category codes so the model can consume it
# (in the output shown further down, Away -> 0 and Home -> 1).
matches['venue_code'] = matches['venue'].astype('category').cat.codes

In [6]:
# Give every opponent its own integer category code.
matches['opp_code'] = matches['opponent'].astype('category').cat.codes

In [7]:
# Kick-off hour as an integer: strip everything from the first ':' onward
# (e.g. "16:30" -> 16).
matches['hour'] = matches['time'].str.replace(":.+", "", regex=True).astype("int")

In [8]:
# Day of the week as an integer: Monday = 0, Tuesday = 1, ..., Sunday = 6.
matches['day_code'] = matches['date'].dt.dayofweek

In [9]:
# Target to predict: whether the team won. Win = 1, loss or draw = 0.
matches['target'] = (matches['result'] == "W").astype("int")

In [10]:
# Random forest classifier: 50 trees, a node must hold at least 10 samples
# before it may be split, and a fixed random_state so repeated runs build the
# same model.
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [11]:
# Train on matches played strictly before 2022-03-01.
train = matches[matches['date'] < '2022-03-01']

In [12]:
# Evaluate on matches from the cut-off date onward. `>=` rather than `>` so
# that fixtures played on 2022-03-01 itself are not silently left out of both
# the training and the test set.
test = matches[matches['date'] >= '2022-03-01']

In [13]:
# Feature columns used to fit and evaluate the model.
predictors = ['venue_code', 'opp_code', 'hour', 'day_code']

In [14]:
# Fit the forest on the training rows: features in, win / not-win target out.
rf.fit(train[predictors], train['target'])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [15]:
# Predict win (1) / not-win (0) for the held-out rows.
preds = rf.predict(test[predictors])

In [16]:
# Accuracy: share of test rows where the prediction equals the actual target.
acc = accuracy_score(test['target'], preds)

In [17]:
acc

0.5714285714285714

In [18]:
# Put the true labels side by side with the model's predictions.
combined = pd.DataFrame({'actual': test['target'], 'prediction': preds})

In [19]:
# Confusion table: rows are the actual outcome, columns the predicted one.
pd.crosstab(index=combined['actual'], columns=combined['prediction'])
# In the table shown below: a win was predicted 9 times and 7 of those were right;
# a loss or draw was predicted 5 times and 1 of those was right.

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,2
1,4,7


In [20]:
# Precision: of the matches predicted as wins, the share that really were wins.
precision_score(test['target'], preds)

0.7777777777777778

In [21]:
# Group the matches by team so each team's history can be handled on its own.
grouped_matches = matches.groupby('team')

In [22]:
# Pull out a single team's matches as a sample frame to try the rolling logic on.
group = grouped_matches.get_group("Manchester City")

In [23]:
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,1.0,0,0,2023,Manchester City,0,23,16,6,0
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,1.0,0,0,2023,Manchester City,1,18,15,5,1
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,0.0,0,0,2023,Manchester City,1,0,12,5,1
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,0.0,0,0,2023,Manchester City,0,13,15,5,1
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,1.0,0,0,2023,Manchester City,1,20,15,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2018-04-22,16:30,Premier League,Matchweek 35,Sun,Home,W,5.0,0.0,Swansea City,...,1.0,0,1,2021,Manchester City,1,22,16,6,1
53,2018-04-29,14:15,Premier League,Matchweek 36,Sun,Away,W,4.0,1.0,West Ham,...,1.0,0,0,2021,Manchester City,0,26,14,6,1
54,2018-05-06,13:30,Premier League,Matchweek 37,Sun,Home,D,0.0,0.0,Huddersfield,...,0.0,0,0,2021,Manchester City,1,11,13,6,0
55,2018-05-09,20:00,Premier League,Matchweek 31,Wed,Home,W,3.0,1.0,Brighton,...,1.0,0,0,2021,Manchester City,1,4,20,2,1


In [24]:
def rolling_averages(group, cols, new_cols):
    """Add trailing 3-match averages of `cols` to one team's matches.

    The frame is ordered by 'date'; for every row the mean of the three
    preceding rows is computed for each column in `cols` (the current row is
    excluded by ``closed='left'``) and stored under the matching name in
    `new_cols`. Rows without three earlier matches end up with NaN averages
    and are dropped.

    Parameters
    ----------
    group : DataFrame holding a 'date' column and every column in `cols`.
    cols : list of source column names to average.
    new_cols : list of output column names, paired with `cols` by position.

    Returns
    -------
    A new, date-sorted DataFrame with the extra columns; `group` itself is
    left untouched because sorting produces a copy.
    """
    ordered = group.sort_values('date')
    prior_means = ordered[cols].rolling(3, closed='left').mean()
    ordered[new_cols] = prior_means
    return ordered.dropna(subset=new_cols)

In [25]:
# Per-match statistics to average, and the names of the derived columns
# (each source name with "_rolling" appended).
cols = ['gf', 'ga', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt']
new_cols = [stat + "_rolling" for stat in cols]

In [26]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [27]:
# Try the function on the single-team frame and display the result.
rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
3,2017-09-09,12:30,Premier League,Matchweek 4,Sat,Home,W,5.0,0.0,Liverpool,...,5,1,1.666667,0.666667,17.333333,6.000000,18.233333,1.333333,0.000000,0.000000
5,2017-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,6.0,0.0,Watford,...,5,1,2.666667,0.666667,17.000000,8.000000,16.500000,0.666667,0.000000,0.000000
7,2017-09-23,15:00,Premier League,Matchweek 6,Sat,Home,W,5.0,0.0,Crystal Palace,...,5,1,4.333333,0.333333,19.333333,9.000000,15.966667,0.333333,0.333333,0.333333
9,2017-09-30,17:30,Premier League,Matchweek 7,Sat,Away,W,1.0,0.0,Chelsea,...,5,1,5.333333,0.000000,21.666667,9.333333,14.866667,0.000000,0.333333,0.333333
10,2017-10-14,15:00,Premier League,Matchweek 8,Sat,Home,W,7.0,2.0,Stoke City,...,5,1,4.000000,0.000000,23.000000,7.666667,15.866667,0.333333,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,2022-04-30,17:30,Premier League,Matchweek 35,Sat,Away,W,4.0,0.0,Leeds United,...,5,1,3.333333,1.000000,16.000000,5.333333,16.700000,0.333333,0.333333,0.333333
54,2022-05-08,16:30,Premier League,Matchweek 36,Sun,Home,W,5.0,0.0,Newcastle Utd,...,6,1,4.000000,0.333333,18.666667,6.000000,16.333333,0.000000,0.333333,0.333333
55,2022-05-11,20:15,Premier League,Matchweek 33,Wed,Away,W,5.0,1.0,Wolves,...,2,1,4.666667,0.333333,20.000000,7.333333,15.166667,0.333333,0.333333,0.333333
56,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,2.0,2.0,West Ham,...,6,0,4.666667,0.333333,18.333333,6.666667,14.933333,0.333333,0.000000,0.000000


In [28]:
# Build the rolling features one team at a time, so a team's averages are
# computed only from that team's own earlier matches.
matches_rolling = matches.groupby('team').apply(rolling_averages, cols, new_cols)

In [29]:
# Remove the 'team' level from the index; the team name is still available as a column.
matches_rolling = matches_rolling.droplevel('team')

In [30]:
# Renumber the rows 0..n-1 so every row has a unique label.
matches_rolling = matches_rolling.reset_index(drop=True)

In [31]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2017-09-09,15:00,Premier League,Matchweek 4,Sat,Home,W,3.0,0.0,Bournemouth,...,5,1,1.333333,2.666667,18.000000,5.666667,17.600000,0.000000,0.000000,0.000000
1,2017-09-17,13:30,Premier League,Matchweek 5,Sun,Away,D,0.0,0.0,Chelsea,...,6,0,1.000000,1.666667,14.666667,5.333333,16.233333,0.333333,0.000000,0.000000
2,2017-09-25,20:00,Premier League,Matchweek 6,Mon,Home,W,2.0,0.0,West Brom,...,0,1,1.000000,1.333333,12.000000,3.666667,16.033333,0.333333,0.000000,0.000000
3,2017-10-01,12:00,Premier League,Matchweek 7,Sun,Home,W,2.0,0.0,Brighton,...,6,1,1.666667,0.000000,14.333333,5.333333,16.800000,1.333333,0.333333,0.333333
4,2017-10-14,17:30,Premier League,Matchweek 8,Sat,Away,L,1.0,2.0,Watford,...,5,0,1.333333,0.000000,17.333333,5.000000,17.833333,1.666667,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2270,2020-07-08,18:00,Premier League,Matchweek 34,Wed,Away,L,0.0,1.0,Sheffield Utd,...,2,0,0.666667,0.666667,10.000000,1.666667,18.000000,0.333333,0.000000,0.000000
2271,2020-07-12,12:00,Premier League,Matchweek 35,Sun,Home,W,3.0,0.0,Everton,...,6,1,0.333333,1.000000,8.333333,1.333333,17.633333,0.333333,0.000000,0.000000
2272,2020-07-15,18:00,Premier League,Matchweek 36,Wed,Away,D,1.0,1.0,Burnley,...,2,0,1.000000,1.000000,9.666667,3.000000,18.500000,0.333333,0.333333,0.333333
2273,2020-07-20,20:15,Premier League,Matchweek 37,Mon,Home,W,2.0,0.0,Crystal Palace,...,0,1,1.333333,0.666667,11.000000,4.333333,17.633333,0.333333,0.333333,0.333333


In [32]:
def make_predictions(data, predictors, cutoff='2022-03-01', model=None):
    """Fit a classifier on matches before `cutoff` and score it on the rest.

    Parameters
    ----------
    data : DataFrame with a 'date' column, a 'target' column and every column
        named in `predictors`.
    predictors : list of column names used as model features.
    cutoff : date (string or timestamp) separating the two partitions. Rows
        dated strictly before it are used for training; rows dated on or
        after it are used for testing. Defaults to '2022-03-01'.
    model : object with ``fit(X, y)`` and ``predict(X)``. When omitted, the
        notebook-level ``rf`` classifier is used (and is re-fitted in place).

    Returns
    -------
    (combined, precision) : `combined` is a DataFrame indexed like the test
        rows with columns 'actual' and 'predicted'; `precision` is the
        precision score of the predictions on the test rows.
    """
    estimator = rf if model is None else model
    train = data[data['date'] < cutoff]
    # `>=` rather than `>`: matches played on the cutoff date itself must land
    # in the test set instead of being dropped from both partitions.
    test = data[data['date'] >= cutoff]
    estimator.fit(train[predictors], train['target'])
    preds = estimator.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test['target'], predicted=preds), index=test.index)
    precision = precision_score(test['target'], preds)
    return combined, precision

In [33]:
# Re-run the train/evaluate step with the rolling averages added to the original predictors.
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [34]:
precision

0.75

In [35]:
# Attach match context (date, team, opponent, result) to each prediction by joining on the row index.
# NOTE(review): this overwrites `combined`, so running the cell twice merges the same
# columns again; rebuild `combined` with make_predictions before re-running.
combined = combined.merge(matches_rolling[['date', 'team', 'opponent', 'result']], left_index=True, right_index=True)

In [51]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result,new_team
111,1,0,2022-08-05,Arsenal,Crystal Palace,W,Arsenal
112,1,0,2022-08-13,Arsenal,Leicester City,W,Arsenal
113,1,1,2022-08-20,Arsenal,Bournemouth,W,Arsenal
1353,1,1,2022-03-06,Manchester City,Manchester Utd,W,Manchester City
1354,0,1,2022-03-14,Manchester City,Crystal Palace,D,Manchester City
1355,1,1,2022-04-02,Manchester City,Burnley,W,Manchester City
1356,0,1,2022-04-10,Manchester City,Liverpool,D,Manchester City
1357,1,1,2022-04-20,Manchester City,Brighton,W,Manchester City
1358,1,1,2022-04-23,Manchester City,Watford,W,Manchester City
1359,1,1,2022-04-30,Manchester City,Leeds United,W,Manchester City


In [46]:
class MissingDict(dict):
    """dict that hands back the looked-up key itself when the key is absent."""

    def __missing__(self, key):
        return key


# Long club names mapped to the shorter spellings that appear in the
# 'opponent' column; any name not listed here is passed through unchanged.
map_values = dict([
    ("Brighton and Hove Albion", "Brighton"),
    ("Manchester United", "Manchester Utd"),
    ("Newcastle United", "Newcastle Utd"),
    ("Tottenham Hotspur", "Tottenham"),
    ("West Ham United", "West Ham"),
    ("Wolverhampton Wanderers", "Wolves"),
])
mapping = MissingDict(map_values)

In [52]:
# Normalise team names to the spelling used in the 'opponent' column. Because
# `mapping` defines __missing__, names without an entry are kept as they are
# instead of becoming NaN.
combined["new_team"] = combined["team"].map(mapping)

In [53]:
# Pair each team's row with the other side's row for the same match: same date,
# and this side's normalised team name equal to the other side's 'opponent'.
# The *_x columns are this side, the *_y columns the other side.
# NOTE(review): in the output shown below the result has no rows — confirm the
# test period contains both teams' rows for at least some matches.
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [55]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y


In [None]:
# Keep only the pairings where this side was predicted to win and the other
# side was predicted not to win, then count how those matches actually ended.
win_vs_not_win = (merged['predicted_x'] == 1) & (merged['predicted_y'] == 0)
merged.loc[win_vs_not_win, 'actual_x'].value_counts()

In [None]:
# Look for likely draws: pairings where neither side was predicted to win,
# counted by how this side's match actually ended.
both_not_win = (merged['predicted_x'] == 0) & (merged['predicted_y'] == 0)
merged.loc[both_not_win, 'actual_x'].value_counts()