In [592]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime as dt

In [593]:
df = pd.read_csv('Matches.csv', index_col=[0])

#### Check for Missing Data

In [594]:
df.shape

(682, 27)

In [595]:
#for each team, figure out how many games are there for 
df['team'].value_counts()

Manchester United           35
West Ham United             35
Southampton                 35
Brighton and Hove Albion    35
Newcastle United            35
Manchester City             34
Crystal Palace              34
Watford                     34
Leeds United                34
Burnley                     34
Brentford                   34
Norwich City                34
Liverpool                   34
Wolverhampton Wanderers     34
Tottenham Hotspur           34
Arsenal                     34
Chelsea                     34
Aston Villa                 33
Everton                     33
Leicester City              33
Name: team, dtype: int64

In [596]:
df['round'].value_counts()

Matchweek 1     20
Matchweek 16    20
Matchweek 34    20
Matchweek 32    20
Matchweek 31    20
Matchweek 29    20
Matchweek 28    20
Matchweek 26    20
Matchweek 25    20
Matchweek 24    20
Matchweek 23    20
Matchweek 20    20
Matchweek 2     20
Matchweek 17    20
Matchweek 19    20
Matchweek 15    20
Matchweek 9     20
Matchweek 3     20
Matchweek 4     20
Matchweek 5     20
Matchweek 6     20
Matchweek 7     20
Matchweek 14    20
Matchweek 8     20
Matchweek 10    20
Matchweek 11    20
Matchweek 12    20
Matchweek 13    20
Matchweek 27    18
Matchweek 22    18
Matchweek 21    18
Matchweek 30    18
Matchweek 18    18
Matchweek 35    18
Matchweek 33    12
Matchweek 37     2
Name: round, dtype: int64

#### Data Cleaning for ML

In [597]:
df.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

Objects cannot be used in ML algorythms. Convert dates into datetime.

In [598]:
df['date'] = df['date'].astype('datetime64[ns]')

In [599]:
df.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                     float64
ga                     float64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
sh                     float64
sot                    float64
dist                   float64
fk                     float64
pk                     float64
pkatt                  float64
season                   int64
team                    object
dtype: object

In [600]:
df = df.sort_values('date', ascending=True)
df.index = df['date']

In [601]:
df

Unnamed: 0_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-08-13,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Away,L,0.0,2.0,Brentford,...,Match Report,,22.0,4.0,18.9,1.0,0.0,0.0,2022,Arsenal
2021-08-13,2021-08-13,20:00,Premier League,Matchweek 1,Fri,Home,W,2.0,0.0,Arsenal,...,Match Report,,8.0,3.0,12.1,0.0,0.0,0.0,2022,Brentford
2021-08-14,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,L,2.0,3.0,Watford,...,Match Report,,10.0,1.0,21.7,1.0,1.0,1.0,2022,Aston Villa
2021-08-14,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Home,L,1.0,2.0,Brighton,...,Match Report,,14.0,3.0,15.0,1.0,0.0,0.0,2022,Burnley
2021-08-14,2021-08-14,15:00,Premier League,Matchweek 1,Sat,Away,L,0.0,3.0,Chelsea,...,Match Report,,4.0,1.0,10.9,0.0,0.0,0.0,2022,Crystal Palace
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-01,2022-05-01,16:30,Premier League,Matchweek 35,Sun,Home,L,1.0,2.0,Arsenal,...,Match Report,,8.0,3.0,13.7,0.0,0.0,0.0,2022,West Ham United
2022-05-01,2022-05-01,14:00,Premier League,Matchweek 35,Sun,Away,L,0.0,1.0,Everton,...,Match Report,,17.0,5.0,16.8,0.0,0.0,0.0,2022,Chelsea
2022-05-01,2022-05-01,14:00,Premier League,Matchweek 35,Sun,Home,W,1.0,0.0,Chelsea,...,Match Report,,9.0,4.0,16.2,1.0,0.0,0.0,2022,Everton
2022-05-01,2022-05-01,14:00,Premier League,Matchweek 35,Sun,Away,L,1.0,3.0,Tottenham,...,Match Report,,6.0,1.0,18.4,0.0,0.0,0.0,2022,Leicester City


#### Create Predictors for ML

First encode 'venue' so it can be parsed numerically.

In [602]:
df['venue_encoded'] = df['venue'].map({'Away':0, 'Home':1})

Encode now the opponents to each team.

In [603]:
#Return Series of codes as well as the index.
df['opponent_encoded'] = df['opponent'].astype('category').cat.codes

Perhaps the hour in which the game takes place is a factor. Encode it.

In [604]:
df['hour_encoded'] = df['date'].dt.hour.astype('int')

Enconde also day of the week.

In [605]:
df['day_of_week'] = df['date'].dt.dayofweek

Now that the dimensions of interest are encoded, let's define the target and input variables. the result is the target variable, and the outcome we are intersted about is whether a team won or didn't. 

In [606]:
#df['result_encoded'] = df['result'].astype('category').cat.codes
df['result_encoded'] = df['result'].map({'W':1, 'L':0, 'D':0})

#### Creating the Initial ML Models (Random Forest and Logistic Regression)

Initial model will be a random forest, followed by a logistic regression model for predicting the result of the games. I will split the data taking a 66% of the indexed values as the train set, and 33% for the test set. 

In [607]:
rfc = RandomForestClassifier(n_estimators=100, random_state=1057, min_samples_split=10, max_depth=50)

In [608]:
#get length of the data set and 33% percent tranche of it
len(df) - len(df)*.33

456.94

Given the above, I will take all input values from the first index position to position 457. For the test set, all values from pistion 458 to the end of data.
The target variable is the result (in its encoded format), while the inout variables are all the other predictors I have created above.

In [609]:
#define input and target variables and split train and test data on a 66/33 split for all the values given index position
X = df[['venue_encoded', 'opponent_encoded', 'day_of_week', 'hour_encoded']]
y = df['result_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=221)

In [610]:
model = rfc.fit(X_train, y_train)
pred = model.predict(X_test)

In [611]:
#print classification report 
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.60      0.76      0.67       134
           1       0.43      0.26      0.32        92

    accuracy                           0.56       226
   macro avg       0.51      0.51      0.50       226
weighted avg       0.53      0.56      0.53       226



precision of the model is very low, close to render predictions on whether there was a win or not almost by chance. Same can be said of the accuracy.

However, let's wrap the values in a dataframe, for clarity of interpretation. Which adds to the information shown with the confusion matrix, a cross tab indicating the times in which predictions for each label were correct or not. For the case of losses and ties (0), the model is able to tell it right more times than it tells it wrong. however, when it comes to wins, the tendency is opposed. 

In [612]:
actual_predicted_values = pd.DataFrame(dict(actual=y_test, predicted=pred))
actual_predicted_values 

Unnamed: 0_level_0,actual,predicted
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-04,0,0
2021-08-21,0,0
2022-03-10,1,0
2022-02-26,0,0
2022-02-13,1,0
...,...,...
2021-11-20,1,0
2021-11-28,0,0
2021-09-11,0,0
2022-02-26,0,1


In [613]:
print(confusion_matrix(y_test, pred))

[[102  32]
 [ 68  24]]


Let's compare the performace of this model to that of a logistic regression one.

In [614]:
lr = LogisticRegression(solver='liblinear', random_state=320, max_iter=50)

In [615]:
model = lr.fit(X_train, y_train)
pred= lr.predict(X_test)

In [616]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.59      1.00      0.74       134
           1       0.00      0.00      0.00        92

    accuracy                           0.59       226
   macro avg       0.30      0.50      0.37       226
weighted avg       0.35      0.59      0.44       226



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [617]:
print(confusion_matrix(y_test, pred))

[[134   0]
 [ 92   0]]


The model fails to do anythuing noteworthy of the 'Win' label. Let's see if other predictores can be extracted out of the data set to make the models stronger.

#### Using Moving Averages to Improve Precision

Given that the data is time-series based, rolling averages can be extracted, as they can capture better the flowing nature of the results during a regular season. To that end, the data is going to be grouped given the team.

In [618]:
#group data by team and the create a variable with a data frame from a single team
team_stats = df.groupby('team')
group = team_stats.get_group('Arsenal')

To calculate rolling averages and use that information in the models, a function calculating moving averages for all teams can be created. 

In [619]:
def rolling_avg (group, col, new_cols):
    group = group
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

The data for goals scored and conceded, shots taken and on target, distance of shots, free and penalty kicks, as well as penalty kick attempts.

In [620]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols] #add '_rolling' to the name of the columns

In [621]:
rolling_avg(group, cols, new_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_of_week,result_encoded,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-09-11,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,1,0.0,3.0,9.666667,2.333333,14.6,0.333333,0.0,0.0
2021-09-18,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.0,13.866667,0.333333,0.0,0.0
2021-09-26,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.0,14.766667,0.666667,0.0,0.0
2021-10-02,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.0,0.666667,0.0,0.0
2021-10-18,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,0,1.333333,0.333333,11.0,4.0,18.966667,0.666667,0.0,0.0
2021-10-22,2021-10-22,20:00,Premier League,Matchweek 9,Fri,Home,W,3.0,1.0,Aston Villa,...,4,1,1.666667,1.0,12.333333,5.0,17.866667,0.666667,0.0,0.0
2021-10-30,2021-10-30,12:30,Premier League,Matchweek 10,Sat,Away,W,2.0,0.0,Leicester City,...,5,1,1.666667,1.0,15.333333,5.333333,17.766667,1.0,0.0,0.333333
2021-11-07,2021-11-07,14:00,Premier League,Matchweek 11,Sun,Home,W,1.0,0.0,Watford,...,6,1,2.333333,1.0,15.666667,6.333333,16.766667,0.666667,0.0,0.333333
2021-11-20,2021-11-20,17:30,Premier League,Matchweek 12,Sat,Away,L,0.0,4.0,Liverpool,...,5,0,2.0,0.333333,14.333333,6.0,17.466667,0.666667,0.0,0.666667
2021-11-27,2021-11-27,12:30,Premier League,Matchweek 13,Sat,Home,W,2.0,0.0,Newcastle Utd,...,5,1,1.0,1.333333,9.0,4.333333,17.7,0.333333,0.0,0.333333


Let's apply this approach to all of the matches.

In [622]:
rolling_matches = df.groupby('team').apply(lambda x: rolling_avg(x, cols, new_cols))

In [623]:
rolling_matches

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_of_week,result_encoded,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,2021-09-11,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,1,0.000000,3.000000,9.666667,2.333333,14.600000,0.333333,0.0,0.0
Arsenal,2021-09-18,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,1,0.333333,2.333333,12.333333,3.000000,13.866667,0.333333,0.0,0.0
Arsenal,2021-09-26,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,1,0.666667,1.666667,14.666667,3.000000,14.766667,0.666667,0.0,0.0
Arsenal,2021-10-02,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,0,1.666667,0.333333,18.333333,5.333333,18.000000,0.666667,0.0,0.0
Arsenal,2021-10-18,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,0,1.333333,0.333333,11.000000,4.000000,18.966667,0.666667,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,2022-03-18,2022-03-18,20:00,Premier League,Matchweek 30,Fri,Home,L,2.0,3.0,Leeds United,...,4,0,1.666667,0.666667,12.333333,4.333333,19.600000,0.000000,0.0,0.0
Wolverhampton Wanderers,2022-04-02,2022-04-02,15:00,Premier League,Matchweek 31,Sat,Home,W,2.0,1.0,Aston Villa,...,5,1,2.333333,1.000000,13.000000,5.333333,19.833333,0.000000,0.0,0.0
Wolverhampton Wanderers,2022-04-08,2022-04-08,20:00,Premier League,Matchweek 32,Fri,Away,L,0.0,1.0,Newcastle Utd,...,4,0,1.666667,1.333333,13.000000,5.000000,18.533333,0.000000,0.0,0.0
Wolverhampton Wanderers,2022-04-24,2022-04-24,14:00,Premier League,Matchweek 34,Sun,Away,L,0.0,1.0,Burnley,...,6,0,1.333333,1.666667,10.000000,4.666667,17.633333,0.000000,0.0,0.0


In [624]:
rolling_matches = rolling_matches.droplevel('team')

In [625]:
#reset the index
rolling_matches = rolling_matches.reset_index()

ValueError: cannot insert date, already exists

#### Re-training the Model

With the new dimensions, let's add them to the previous model to assess if the metrics and performace improve.

In [626]:
# look at the columns so grabbing the new ones is easier
rolling_matches.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'season', 'team', 'venue_encoded', 'opponent_encoded',
       'hour_encoded', 'day_of_week', 'result_encoded', 'gf_rolling',
       'ga_rolling', 'sh_rolling', 'sot_rolling', 'dist_rolling', 'fk_rolling',
       'pk_rolling', 'pkatt_rolling'],
      dtype='object')

In [627]:
rfc = RandomForestClassifier(n_estimators=100, random_state=9254, max_depth=50)

In [628]:
X = rolling_matches[['venue_encoded', 'opponent_encoded', 'day_of_week', 'hour_encoded','gf_rolling',
       'ga_rolling', 'sh_rolling', 'sot_rolling', 'dist_rolling', 'fk_rolling',
       'pk_rolling', 'pkatt_rolling' ]]
y = rolling_matches['result_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=426)

In [629]:
model = rfc.fit(X_train, y_train)
pred = model.predict(X_test)

In [630]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.74      0.77      0.75       138
           1       0.48      0.44      0.46        68

    accuracy                           0.66       206
   macro avg       0.61      0.60      0.61       206
weighted avg       0.65      0.66      0.66       206



In [631]:
print(confusion_matrix(y_test, pred))

[[106  32]
 [ 38  30]]


In [632]:
rolling_rfc_predictions = pd.DataFrame(dict(actual=y_test, predicted=pred))
rolling_rfc_predictions

Unnamed: 0_level_0,actual,predicted
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-11,1,1
2021-11-21,0,0
2021-11-27,0,0
2021-10-23,0,1
2022-01-19,1,0
...,...,...
2022-04-06,0,0
2022-02-08,0,0
2022-03-05,0,0
2022-02-23,1,0


The performance has increased across the board. 

In [635]:
lr = LogisticRegression(solver='liblinear', random_state=320, max_iter=50)
model = lr.fit(X_train, y_train)
pred= lr.predict(X_test)

In [636]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.69      0.81      0.75       138
           1       0.41      0.26      0.32        68

    accuracy                           0.63       206
   macro avg       0.55      0.54      0.53       206
weighted avg       0.60      0.63      0.61       206



In [637]:
rolling_lr_predictions = pd.DataFrame(dict(actual=y_test, predicted=pred))
rolling_lr_predictions

Unnamed: 0_level_0,actual,predicted
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-12-11,1,1
2021-11-21,0,0
2021-11-27,0,0
2021-10-23,0,0
2022-01-19,1,0
...,...,...
2022-04-06,0,0
2022-02-08,0,0
2022-03-05,0,0
2022-02-23,1,0
