## Import Libraries

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import PredefinedSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt

## Set Options

In [2]:
# Declare constants
# Seed passed as `random_state` to every RandomForestClassifier built in this
# notebook, so the tuning runs and the final fit all use the same seed.
RANDOM_STATE = 24

## Load Features

In [3]:
# Load the pre-computed feature table from disk; the first CSV column becomes the row index
features_path = "../Data/Features/features_200209.csv"
feature_df = pd.read_csv(features_path, index_col=0)
feature_df

Unnamed: 0,HomeTeam,AwayTeam,Date,Season,Div,FTR,home_prev_3_result,home_prev_3_win,home_prev_3_loss,home_prev_3_draw,...,x5_LWW,x5_WDD,x5_WDL,x5_WDW,x5_WLD,x5_WLL,x5_WLW,x5_WWD,x5_WWL,x5_WWW
0,Bournemouth,Aston Villa,2015-08-08,1516,E0,A,WWD,2,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Chelsea,Swansea,2015-08-08,1516,E0,D,WLD,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Everton,Watford,2015-08-08,1516,E0,D,LWL,1,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Leicester,Sunderland,2015-08-08,1516,E0,H,WDW,2,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Man United,Tottenham,2015-08-08,1516,E0,H,DDW,1,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16771,Newcastle,West Brom,2011-05-22,1011,E0,D,DWL,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
16772,Stoke,Wigan,2011-05-22,1011,E0,A,LWD,1,1,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16773,Tottenham,Birmingham,2011-05-22,1011,E0,H,WLD,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16774,West Ham,Sunderland,2011-05-22,1011,E0,A,LDL,0,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Fix data types.
# pd.to_datetime parses the whole column at once and, unlike a row-wise
# strptime, also accepts values that are already datetimes, so this cell can be
# re-run without raising.
feature_df['Date'] = pd.to_datetime(feature_df['Date'], format="%Y-%m-%d")
# Season codes such as '0809' lose their leading zeros when parsed as numbers;
# zero-pad back to four characters (this also handles codes that lost more than
# one zero, which a single '0' prefix would not).
feature_df['Season'] = feature_df['Season'].astype(str).str.zfill(4)
feature_df

Unnamed: 0,HomeTeam,AwayTeam,Date,Season,Div,FTR,home_prev_3_result,home_prev_3_win,home_prev_3_loss,home_prev_3_draw,...,x5_LWW,x5_WDD,x5_WDL,x5_WDW,x5_WLD,x5_WLL,x5_WLW,x5_WWD,x5_WWL,x5_WWW
0,Bournemouth,Aston Villa,2015-08-08,1516,E0,A,WWD,2,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Chelsea,Swansea,2015-08-08,1516,E0,D,WLD,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Everton,Watford,2015-08-08,1516,E0,D,LWL,1,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Leicester,Sunderland,2015-08-08,1516,E0,H,WDW,2,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Man United,Tottenham,2015-08-08,1516,E0,H,DDW,1,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16771,Newcastle,West Brom,2011-05-22,1011,E0,D,DWL,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
16772,Stoke,Wigan,2011-05-22,1011,E0,A,LWD,1,1,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16773,Tottenham,Birmingham,2011-05-22,1011,E0,H,WLD,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16774,West Ham,Sunderland,2011-05-22,1011,E0,A,LDL,0,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Inspect the dtype of every column as a {column name: dtype} mapping
feature_df.dtypes.to_dict()

{'HomeTeam': dtype('O'),
 'AwayTeam': dtype('O'),
 'Date': dtype('<M8[ns]'),
 'Season': dtype('O'),
 'Div': dtype('O'),
 'FTR': dtype('O'),
 'home_prev_3_result': dtype('O'),
 'home_prev_3_win': dtype('int64'),
 'home_prev_3_loss': dtype('int64'),
 'home_prev_3_draw': dtype('int64'),
 'home_prev_3_win_pct': dtype('float64'),
 'home_prev_3_loss_pct': dtype('float64'),
 'home_prev_3_gd': dtype('float64'),
 'home_prev_3_gc': dtype('float64'),
 'home_prev_3_gs': dtype('float64'),
 'home_prev_3_cs': dtype('int64'),
 'away_prev_3_result': dtype('O'),
 'away_prev_3_win': dtype('int64'),
 'away_prev_3_loss': dtype('int64'),
 'away_prev_3_draw': dtype('int64'),
 'away_prev_3_win_pct': dtype('float64'),
 'away_prev_3_loss_pct': dtype('float64'),
 'away_prev_3_gd': dtype('float64'),
 'away_prev_3_gc': dtype('float64'),
 'away_prev_3_gs': dtype('float64'),
 'away_prev_3_cs': dtype('int64'),
 'x0_Arsenal': dtype('float64'),
 'x0_Aston Villa': dtype('float64'),
 'x0_Birmingham': dtype('float64'),
 '

## Split Training and Test Datasets

In [6]:
# Column groups: raw categorical columns (dropped later, once the folds are
# built) and columns that must not be passed to the model.
cat_feature = ['HomeTeam','AwayTeam','Season','Div','home_prev_3_result','away_prev_3_result']
remove_feature = ['Date','FTR']
# Hold out the 18-19 season as the final test set; all other seasons form the training set
is_test_season = feature_df['Season'] == '1819'
test_df = feature_df[is_test_season]
train_df = feature_df[~is_test_season]
# Pull out the target column `FTR` before removing it from the feature frames
train_label, test_label = train_df['FTR'], test_df['FTR']
train_df = train_df.drop(columns=remove_feature)
test_df = test_df.drop(columns=remove_feature)
print(f"Size of training set: {len(train_df)}\nSize of test set: {len(test_df)}\nNumber of features: {train_df.shape[1]}")

Size of training set: 3800
Size of test set: 380
Number of features: 162


In [7]:
# Seasons for doing a grouped k-fold cross-validation
cv_seasons = list(train_df['Season'].unique())
cv_seasons.sort()
cv_seasons

['0809',
 '0910',
 '1011',
 '1112',
 '1213',
 '1314',
 '1415',
 '1516',
 '1617',
 '1718']

In [8]:
# Define cross-validation folds
cv_fold = []
for ind,season in enumerate(cv_seasons):
    test_seasons = cv_seasons[ind]
    buffer_seasons = None
    if ind < (len(cv_seasons)-1):
        buffer_seasons = cv_seasons[ind+1]
    train_seasons = [s for s in cv_seasons if s not in [test_seasons,buffer_seasons]]
    train_ind = np.array(train_df['Season'].isin(train_seasons))
    test_ind = np.array(train_df['Season'].isin([test_seasons]))
    cv_fold.append((train_ind,test_ind))
cv_fold

[(array([ True,  True,  True, ...,  True,  True,  True]),
  array([False, False, False, ..., False, False, False])),
 (array([ True,  True,  True, ..., False, False, False]),
  array([False, False, False, ..., False, False, False])),
 (array([ True,  True,  True, ..., False, False, False]),
  array([False, False, False, ...,  True,  True,  True])),
 (array([ True,  True,  True, ...,  True,  True,  True]),
  array([False, False, False, ..., False, False, False])),
 (array([ True,  True,  True, ...,  True,  True,  True]),
  array([False, False, False, ..., False, False, False])),
 (array([ True,  True,  True, ...,  True,  True,  True]),
  array([False, False, False, ..., False, False, False])),
 (array([False, False, False, ...,  True,  True,  True]),
  array([False, False, False, ..., False, False, False])),
 (array([False, False, False, ...,  True,  True,  True]),
  array([ True,  True,  True, ..., False, False, False])),
 (array([ True,  True,  True, ...,  True,  True,  True]),
  arra

In [9]:
# The raw categorical columns are already one-hot-encoded elsewhere in the
# table, so remove them from both the training and the test frame
train_df, test_df = (frame.drop(columns=cat_feature) for frame in (train_df, test_df))

## Training Random Forest Classifier

In [12]:
# Hyperparameter tuning (number of trees): record the mean cross-validated
# test score over the season folds for each forest size
n_trees = [2**exp for exp in range(6, 11)]
acc_score = []
for n in n_trees:
    clf = RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=n)
    scores = cross_validate(clf, X=train_df, y=train_label, cv=cv_fold)
    mean_score = scores['test_score'].mean()
    acc_score.append((n, mean_score))
acc_score

[(256, 0.505), (512, 0.4989473684210527), (1024, 0.49973684210526315)]

In [14]:
# Hyperparameter tuning (maximum tree depth): record the mean cross-validated
# test score over the season folds for each depth limit, with the forest size
# held at 2**8 trees
max_depth = [2**exp for exp in range(4, 8)]
acc_score = []
for depth in max_depth:
    clf = RandomForestClassifier(
        n_estimators=2**8,
        max_depth=depth,
        random_state=RANDOM_STATE,
    )
    scores = cross_validate(clf, X=train_df, y=train_label, cv=cv_fold)
    mean_score = scores['test_score'].mean()
    acc_score.append((depth, mean_score))
acc_score

[(16, 0.503421052631579),
 (32, 0.5042105263157894),
 (64, 0.5047368421052632),
 (128, 0.505)]

In [15]:
# Fit the final random forest on the whole training set
# (2**8 trees, depth limit 2**7)
clf = RandomForestClassifier(
    n_estimators=2**8,
    max_depth=2**7,
    random_state=RANDOM_STATE,
).fit(train_df, train_label)

In [16]:
# Score the fitted random forest classifier on the held-out test set
clf.score(X=test_df, y=test_label)

0.5473684210526316

In [17]:
# Rank the features by importance, highest first, as (feature name, importance) pairs.
# `feature_importances_` is a property that is re-aggregated from all trees on
# every access, so read it once rather than once per feature.
importances = clf.feature_importances_
feature_importance = sorted(
    zip(train_df.columns, importances),
    key=lambda pair: pair[1],
    reverse=True,
)
feature_importance

[('away_prev_3_gd', 0.05026762675637532),
 ('home_prev_3_gd', 0.04986946295079396),
 ('home_prev_3_gs', 0.04422415601093636),
 ('away_prev_3_gs', 0.04390998145080014),
 ('home_prev_3_gc', 0.04192587732865479),
 ('away_prev_3_gc', 0.041100261613743555),
 ('away_prev_3_cs', 0.021883588467510488),
 ('home_prev_3_cs', 0.021511146574798235),
 ('home_prev_3_draw', 0.01637730147242599),
 ('away_prev_3_draw', 0.014570665067377711),
 ('home_prev_3_loss_pct', 0.014441166688238615),
 ('home_prev_3_win_pct', 0.014303758216627082),
 ('home_prev_3_win', 0.01412082631740414),
 ('home_prev_3_loss', 0.01408857037177153),
 ('away_prev_3_win_pct', 0.013632883257297512),
 ('away_prev_3_win', 0.013622312990466932),
 ('away_prev_3_loss', 0.013560877370016693),
 ('away_prev_3_loss_pct', 0.01320192659801215),
 ('x2_1516', 0.009834173540351538),
 ('x2_1213', 0.008963885605340978),
 ('x2_1011', 0.00890528538737261),
 ('x2_1112', 0.00872898590093225),
 ('x2_1718', 0.00872126710934842),
 ('x2_0910', 0.00871429244