In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.metrics import classification_report

In [2]:
# Load the match table; the first CSV column is used as the row index.
matches = pd.read_csv('matches.csv', index_col=0)

# Quick size check: (number of rows, number of columns).
n_rows, n_cols = matches.shape
print((n_rows, n_cols))

(2244, 28)


In [3]:
# How many rows does each team contribute? (most frequent first)
matches.loc[:, 'Team'].value_counts()

Team
Chelsea                     113
Aston Villa                 113
Tottenham Hotspur           113
Manchester United           113
Newcastle United            112
Manchester City             112
Arsenal                     112
Liverpool                   112
Brentford                   112
Nottingham Forest           112
Fulham                      112
Brighton and Hove Albion    112
Crystal Palace              112
Everton                     112
Wolverhampton Wanderers     112
Bournemouth                 112
West Ham United             112
Leicester City               74
Southampton                  74
Luton Town                   38
Sheffield United             38
Burnley                      38
Leeds United                 38
Ipswich Town                 36
Name: count, dtype: int64

In [4]:
# Inspect the dtype pandas inferred for every column of the loaded frame,
# to see which columns still need converting before they can be used as
# numeric model inputs.
matches.dtypes

Date              object
Time              object
Comp              object
Round             object
Day               object
Venue             object
Result            object
GF                 int64
GA                 int64
Opponent          object
xG               float64
xGA              float64
Poss               int64
Attendance       float64
Captain           object
Formation         object
Opp Formation     object
Referee           object
Match Report      object
Notes            float64
Sh                 int64
SoT                int64
Dist             float64
FK                 int64
PK                 int64
PKatt              int64
Season             int64
Team              object
dtype: object

In [5]:
# Build the numeric columns the model consumes, in one chained step.
# `assign` evaluates its keyword arguments in order, so `Day_code` sees the
# already-converted datetime `Date` column.
matches = matches.assign(
    # Parse the date strings into real datetimes.
    Date=lambda frame: pd.to_datetime(frame['Date']),
    # Integer category codes for the venue and the opponent.
    Venue_code=lambda frame: frame['Venue'].astype('category').cat.codes,
    Opp_code=lambda frame: frame['Opponent'].astype('category').cat.codes,
    # Kick-off hour: drop everything from the first ':' onwards, keep the int.
    Hour=lambda frame: frame['Time'].str.replace(":.+", "", regex=True).astype('int'),
    # Day of week as an integer (pandas convention: Monday == 0).
    Day_code=lambda frame: frame['Date'].dt.dayofweek,
    # Integer-coded match result.
    Target=lambda frame: frame['Result'].astype('category').cat.codes,
)

In [6]:
# Chronological hold-out split on the match date.
#
# The cutoff itself must belong to exactly one side. Using `<` for train and
# `>` for test would silently drop every match played on the cutoff day from
# both sets, so the test side uses `>=`.
# Requires `matches['Date']` to already be a datetime column.
SPLIT_DATE = pd.Timestamp('2025-01-01')

is_before_cutoff = matches['Date'] < SPLIT_DATE
train = matches[is_before_cutoff]              # strictly before the cutoff
test = matches[matches['Date'] >= SPLIT_DATE]  # cutoff day and later

# Feature columns fed to the classifier.
predictors = ['Venue_code', 'Opp_code', 'Hour', 'Day_code']

In [7]:
# Tune a random forest on the training rows with an exhaustive grid search.
# The fixed random_state keeps the forest itself reproducible between runs.
rf_model = RandomForestClassifier(random_state=42)

# Candidate hyper-parameters: every combination below is evaluated.
param_grid = dict(
    n_estimators=np.arange(30, 50, 5),   # 30, 35, 40, 45
    min_samples_split=np.arange(5, 10),  # 5, 6, 7, 8, 9
)

# Candidates are ranked by accuracy; n_jobs=-1 uses all available cores.
# NOTE(review): no `cv` argument is passed, so scikit-learn's default
# cross-validation splitter is used -- confirm that non-chronological folds
# are acceptable for this date-ordered data.
clf = GridSearchCV(rf_model, param_grid, scoring='accuracy', n_jobs=-1)
clf.fit(train[predictors], train['Result'])

# Best configuration found by the search, as a fitted estimator.
refined_rf_model = clf.best_estimator_

In [8]:
# Show the hyper-parameter combination the grid search selected.
clf.best_params_

{'min_samples_split': np.int64(8), 'n_estimators': np.int64(35)}

In [15]:
# Score the tuned forest on the held-out rows and print per-class
# precision / recall / F1. classification_report returns a plain string,
# so it is printed rather than displayed.
preds = refined_rf_model.predict(test[predictors])
report = classification_report(y_true=test['Result'], y_pred=preds)
print(report)

              precision    recall  f1-score   support

           D       0.38      0.19      0.25        78
           L       0.46      0.51      0.48       134
           W       0.48      0.55      0.51       134

    accuracy                           0.46       346
   macro avg       0.44      0.42      0.42       346
weighted avg       0.45      0.46      0.44       346



In [None]:
# Group the match rows by team; this only builds the GroupBy object,
# nothing is aggregated yet.
team_matches = matches.groupby(by='Team')
