In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
matches = pd.read_csv("matchess.csv", index_col=0)

In [3]:
matches["date"] = pd.to_datetime(matches["date"])

In [4]:
#Creating numeric values for Home and Away, so that machine can learn from it.
matches['venue_code'] = matches['venue'].astype('category').cat.codes

In [5]:
#Creating unique code for each opponent squad.
matches['opp_code'] = matches['opponent'].astype('category').cat.codes

In [6]:
#Creating a normalized hour time.
matches['hour'] = matches['time'].str.replace(":.+", "", regex=True).astype("int")

In [7]:
#Monday = 0, Tuesday = 1, etc.
matches['day_code'] = matches['date'].dt.dayofweek

In [8]:
#Setting up target to predict. (Wether the team won or not). Win = 1, Loss or Draw = 0.
matches['target'] = (matches['result'] == "W").astype("int")

In [9]:
rf = RandomForestClassifier(n_estimators=150, min_samples_split=10, random_state=1)

In [10]:
train = matches[matches['date'] < '2022-01-01']

In [11]:
test = matches[matches['date'] > '2022-01-01']

In [12]:
predictors = ['venue_code', 'opp_code', 'hour', 'day_code']

In [13]:
rf.fit(train[predictors], train['target'])

RandomForestClassifier(min_samples_split=10, n_estimators=150, random_state=1)

In [14]:
preds = rf.predict(test[predictors])

In [15]:
acc = accuracy_score(test['target'], preds)

In [16]:
acc

0.5892857142857143

In [17]:
combined = pd.DataFrame(dict(actual=test['target'], prediction=preds))

In [18]:
pd.crosstab(index=combined['actual'], columns=combined['prediction'])
#We predicted win 59 times and were right 31 times.
#Predicted loss or draw 217 times and 141 were right.

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,219,53
1,131,45


In [19]:
precision_score(test['target'], preds)

0.45918367346938777

In [20]:
grouped_matches = matches.groupby('team')

In [21]:
group = grouped_matches.get_group("Manchester City")

In [22]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [23]:
cols = ['gf', 'ga', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt']
new_cols = [f"{c}_rolling" for c in cols]

In [24]:
matches_rolling = matches.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))

In [25]:
matches_rolling = matches_rolling.droplevel('team')

In [26]:
matches_rolling.index = range(matches_rolling.shape[0])

In [27]:
#matches_rolling

In [28]:
def make_predictions(data, predictors):
    train = data[data['date'] < '2022-01-01']
    test = data[data['date'] > '2022-01-01']
    rf.fit(train[predictors], train['target'])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test['target'], predicted=preds), index=test.index)
    precision = precision_score(test['target'], preds)
    return combined, precision

In [29]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [30]:
precision

0.4803921568627451

In [31]:
combined = combined.merge(matches_rolling[['date', 'team', 'opponent', 'result']], left_index=True, right_index=True)

In [32]:
#combined

In [33]:
class MissingDict(dict):
    __missing__ = lambda self, key: key
    
map_values = {
    "Brighton and Hove Albion": "Brighton", 
              "Manchester United": "Manchester Utd", 
              "Newcastle United": "Newcastle Utd", 
              "Tottenham Hotspur": "Tottenham", 
              "West Ham United": "West Ham", 
              "Wolverhampton Wanderers": "Wolves"
} 
mapping = MissingDict(**map_values)

In [34]:
combined["new_team"] = combined["team"].map(mapping)

In [35]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result,new_team
17,0,0,2022-01-23,Arsenal,Burnley,D,Arsenal
18,1,0,2022-02-10,Arsenal,Wolves,W,Arsenal
19,1,0,2022-02-19,Arsenal,Brentford,W,Arsenal
20,1,1,2022-02-24,Arsenal,Wolves,W,Arsenal
21,1,1,2022-03-06,Arsenal,Watford,W,Arsenal
...,...,...,...,...,...,...,...
746,0,0,2022-05-15,Wolverhampton Wanderers,Norwich City,D,Wolves
747,0,0,2022-05-22,Wolverhampton Wanderers,Liverpool,L,Wolves
748,0,0,2022-08-06,Wolverhampton Wanderers,Leeds United,L,Wolves
749,0,0,2022-08-13,Wolverhampton Wanderers,Fulham,D,Wolves


In [36]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [37]:
merged[25:60]

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
25,0,0,2022-02-19,Aston Villa,Watford,L,Aston Villa,1,0,Watford,Aston Villa,W,Watford
26,1,0,2022-02-26,Aston Villa,Brighton,W,Aston Villa,0,0,Brighton and Hove Albion,Aston Villa,L,Brighton
27,1,0,2022-03-05,Aston Villa,Southampton,W,Aston Villa,0,1,Southampton,Aston Villa,L,Southampton
28,1,0,2022-03-10,Aston Villa,Leeds United,W,Aston Villa,0,0,Leeds United,Aston Villa,L,Leeds United
29,0,1,2022-03-13,Aston Villa,West Ham,L,Aston Villa,1,0,West Ham United,Aston Villa,W,West Ham
30,0,0,2022-03-19,Aston Villa,Arsenal,L,Aston Villa,1,0,Arsenal,Aston Villa,W,Arsenal
31,0,0,2022-04-02,Aston Villa,Wolves,L,Aston Villa,1,0,Wolverhampton Wanderers,Aston Villa,W,Wolves
32,0,0,2022-04-09,Aston Villa,Tottenham,L,Aston Villa,1,1,Tottenham Hotspur,Aston Villa,W,Tottenham
33,0,0,2022-04-23,Aston Villa,Leicester City,D,Aston Villa,0,0,Leicester City,Aston Villa,D,Leicester City
34,1,0,2022-04-30,Aston Villa,Norwich City,W,Aston Villa,0,0,Norwich City,Aston Villa,L,Norwich City


In [38]:
#Let's now get only the predictions, where one team was predicted to win and the other to lose.
merged[(merged['predicted_x'] == 1) & (merged['predicted_y'] == 0)]['actual_x'].value_counts()

1    42
0    42
Name: actual_x, dtype: int64

In [41]:
#Let's search for some draws.
merged[(merged['predicted_x'] == 0) & (merged['predicted_y'] == 0)]['actual_x'].value_counts()

0    147
1     99
Name: actual_x, dtype: int64