In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
# Load the match data (first CSV column is the index) and keep a single row
# per (Date, Time, Team, Season) combination
matches = (
    pd.read_csv("matches2.csv", index_col=0)
    .drop_duplicates(subset=["Date", "Time", "Team", "Season"])
)
matches.head()

Unnamed: 0,Team,Opponent,Round,Season,Result,Date,Time,GF,GA,Sh,...,Poss,Captain,Referee,Attendance,Formation,Match Report,Notes,Day,Venue,Comp
1,Manchester City,Tottenham,Matchweek 1,2022,L,2021-08-15,16:30,0,1,18,...,64,Fernandinho,Anthony Taylor,58262.0,4-3-3,Match Report,,Sun,Away,Premier League
2,Manchester City,Norwich City,Matchweek 2,2022,W,2021-08-21,15:00,5,0,16,...,67,İlkay Gündoğan,Graham Scott,51437.0,4-3-3,Match Report,,Sat,Home,Premier League
3,Manchester City,Arsenal,Matchweek 3,2022,W,2021-08-28,12:30,5,0,25,...,80,İlkay Gündoğan,Martin Atkinson,52276.0,4-3-3,Match Report,,Sat,Home,Premier League
4,Manchester City,Leicester City,Matchweek 4,2022,W,2021-09-11,15:00,1,0,25,...,61,İlkay Gündoğan,Paul Tierney,32087.0,4-3-3,Match Report,,Sat,Away,Premier League
6,Manchester City,Southampton,Matchweek 5,2022,D,2021-09-18,15:00,0,0,16,...,63,Fernandinho,Jonathan Moss,52698.0,4-3-3,Match Report,,Sat,Home,Premier League


In [3]:
# Show the dtype of every column in the loaded frame
matches.dtypes

Team             object
Opponent         object
Round            object
Season            int64
Result           object
Date             object
Time             object
GF                int64
GA                int64
Sh                int64
SoT               int64
Dist            float64
FK              float64
PK                int64
PKatt             int64
xG              float64
xGA             float64
Poss              int64
Captain          object
Referee          object
Attendance      float64
Formation        object
Match Report     object
Notes           float64
Day              object
Venue            object
Comp             object
dtype: object

In [4]:
# Derive numeric predictor columns and the binary target from the raw columns.
# The lambdas are evaluated in order, so dayCode sees the already-parsed Date.
matches = matches.assign(
    # Parse the date strings into datetimes
    Date=lambda frame: pd.to_datetime(frame["Date"]),
    # Integer category codes for the venue and the opponent
    venueCode=lambda frame: frame["Venue"].astype("category").cat.codes,
    oppCode=lambda frame: frame["Opponent"].astype("category").cat.codes,
    # Kick-off hour: strip everything from the colon onwards
    hour=lambda frame: frame["Time"].str.replace(":.+", "", regex=True).astype("int"),
    # Day of the week as a number
    dayCode=lambda frame: frame["Date"].dt.dayofweek,
    # 1 when the result is a win, 0 for any other result
    target=lambda frame: (frame["Result"] == "W").astype("int"),
)
matches

Unnamed: 0,Team,Opponent,Round,Season,Result,Date,Time,GF,GA,Sh,...,Match Report,Notes,Day,Venue,Comp,venueCode,oppCode,hour,dayCode,target
1,Manchester City,Tottenham,Matchweek 1,2022,L,2021-08-15,16:30,0,1,18,...,Match Report,,Sun,Away,Premier League,0,15,16,6,0
2,Manchester City,Norwich City,Matchweek 2,2022,W,2021-08-21,15:00,5,0,16,...,Match Report,,Sat,Home,Premier League,1,13,15,5,1
3,Manchester City,Arsenal,Matchweek 3,2022,W,2021-08-28,12:30,5,0,25,...,Match Report,,Sat,Home,Premier League,1,0,12,5,1
4,Manchester City,Leicester City,Matchweek 4,2022,W,2021-09-11,15:00,1,0,25,...,Match Report,,Sat,Away,Premier League,0,9,15,5,1
6,Manchester City,Southampton,Matchweek 5,2022,D,2021-09-18,15:00,0,0,16,...,Match Report,,Sat,Home,Premier League,1,14,15,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,Huddersfield Town,Leeds United,Matchweek 35,2021,W,2022-04-30,17:30,4,0,19,...,Match Report,,Sat,Away,Premier League,0,8,17,5,1
54,Huddersfield Town,Newcastle Utd,Matchweek 36,2021,W,2022-05-08,16:30,5,0,21,...,Match Report,,Sun,Home,Premier League,1,12,16,6,1
55,Huddersfield Town,Wolves,Matchweek 33,2021,W,2022-05-11,20:15,5,1,15,...,Match Report,,Wed,Away,Premier League,0,18,20,2,1
56,Huddersfield Town,West Ham,Matchweek 37,2021,D,2022-05-15,14:00,2,2,30,...,Match Report,,Sun,Away,Premier League,0,17,14,6,0


In [5]:
# Baseline random forest classifier; random_state is pinned so reruns give the same model
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=1,
)

In [6]:
# Training set: every match dated before 2022-01-01
train = matches.loc[matches["Date"] < "2022-01-01"]

In [7]:
# Test set: every match dated on or after 2022-01-01
test = matches.loc[matches["Date"] >= "2022-01-01"]

In [8]:
# Feature columns fed to the model
predictors = [
    "venueCode",
    "oppCode",
    "hour",
    "dayCode",
]

In [9]:
# Train the forest on the training rows: predictor columns as features, "target" as label
rf.fit(X=train[predictors], y=train["target"])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [10]:
# Predict the target for every row of the test set
preds = rf.predict(X=test[predictors])

In [11]:
# Accuracy: share of test matches whose predicted target equals the actual target
accuracy = accuracy_score(y_true=test["target"], y_pred=preds)
accuracy

0.6111111111111112

In [14]:
# Precision: of the matches the model predicted as wins, the share that really were wins
# Here about 71% of the predicted wins were actual wins
precision_score(y_true=test["target"], y_pred=preds)

0.7142857142857143

In [15]:
# Next step towards better precision: add more predictors.
# Group the matches per team and pull out one team's rows as a working example.
groupedMatches = matches.groupby(by="Team")
group = groupedMatches.get_group(name="Manchester City")
group

Unnamed: 0,Team,Opponent,Round,Season,Result,Date,Time,GF,GA,Sh,...,Match Report,Notes,Day,Venue,Comp,venueCode,oppCode,hour,dayCode,target
1,Manchester City,Tottenham,Matchweek 1,2022,L,2021-08-15,16:30,0,1,18,...,Match Report,,Sun,Away,Premier League,0,15,16,6,0
2,Manchester City,Norwich City,Matchweek 2,2022,W,2021-08-21,15:00,5,0,16,...,Match Report,,Sat,Home,Premier League,1,13,15,5,1
3,Manchester City,Arsenal,Matchweek 3,2022,W,2021-08-28,12:30,5,0,25,...,Match Report,,Sat,Home,Premier League,1,0,12,5,1
4,Manchester City,Leicester City,Matchweek 4,2022,W,2021-09-11,15:00,1,0,25,...,Match Report,,Sat,Away,Premier League,0,9,15,5,1
6,Manchester City,Southampton,Matchweek 5,2022,D,2021-09-18,15:00,0,0,16,...,Match Report,,Sat,Home,Premier League,1,14,15,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,Manchester City,Leeds United,Matchweek 35,2021,W,2022-04-30,17:30,4,0,19,...,Match Report,,Sat,Away,Premier League,0,8,17,5,1
54,Manchester City,Newcastle Utd,Matchweek 36,2021,W,2022-05-08,16:30,5,0,21,...,Match Report,,Sun,Home,Premier League,1,12,16,6,1
55,Manchester City,Wolves,Matchweek 33,2021,W,2022-05-11,20:15,5,1,15,...,Match Report,,Wed,Away,Premier League,0,18,20,2,1
56,Manchester City,West Ham,Matchweek 37,2021,D,2022-05-15,14:00,2,2,30,...,Match Report,,Sun,Away,Premier League,0,17,14,6,0


In [16]:
# Add rolling averages as extra features for the model
# Define a function that derives new rolling-average columns from existing columns

def rollingAverages(group, cols, newCols, window=3):
    """Add rolling-mean columns built from the rows preceding each match.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process (in this notebook, the matches of a single team).
        Must contain a "Date" column and every column named in ``cols``.
    cols : list of str
        Existing numeric columns to average.
    newCols : list of str
        Names for the resulting rolling-average columns, one per entry of
        ``cols`` and in the same order.
    window : int, default 3
        Number of preceding rows (matches) averaged for each row.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by "Date" with ``newCols`` added. Rows where any
        of ``newCols`` is missing are dropped, which includes the earliest
        rows that do not yet have ``window`` previous rows. The frame passed
        in is not modified.
    """
    # Sort chronologically so "previous rows" means "earlier matches";
    # sort_values returns a new frame, so the caller's frame stays untouched
    group = group.sort_values("Date")
    # closed="left" leaves the current row out of its own window, so each
    # average only uses information available before that match
    rollingStats = group[cols].rolling(window, closed="left").mean()
    # Attach the averages under their new column names
    group[newCols] = rollingStats
    # Discard rows that lack a complete rolling history
    group = group.dropna(subset=newCols)
    return group

In [17]:
# Match statistics that will get a rolling average
cols = [
    "GF", "GA", "Sh", "SoT",
    "Dist", "FK", "PK", "PKatt",
]
# Matching output names: each statistic's name with "_Rolling" appended
newCols = [statName + "_Rolling" for statName in cols]
newCols

['GF_Rolling',
 'GA_Rolling',
 'Sh_Rolling',
 'SoT_Rolling',
 'Dist_Rolling',
 'FK_Rolling',
 'PK_Rolling',
 'PKatt_Rolling']

In [18]:
# Preview the result for the example team: each *_Rolling column holds the
# average over that team's previous 3 matches
rollingAverages(group=group, cols=cols, newCols=newCols)

Unnamed: 0,Team,Opponent,Round,Season,Result,Date,Time,GF,GA,Sh,...,dayCode,target,GF_Rolling,GA_Rolling,Sh_Rolling,SoT_Rolling,Dist_Rolling,FK_Rolling,PK_Rolling,PKatt_Rolling
2,Manchester City,Norwich City,Matchweek 2,2021,W,2021-08-21,15:00,5,0,16,...,5,1,1.666667,0.666667,17.333333,4.000000,17.033333,1.000000,0.0,0.000000
3,Manchester City,Arsenal,Matchweek 3,2022,W,2021-08-28,12:30,5,0,25,...,5,1,3.333333,0.333333,16.666667,4.000000,17.166667,1.000000,0.0,0.000000
3,Manchester City,Arsenal,Matchweek 3,2021,W,2021-08-28,12:30,5,0,25,...,5,1,5.000000,0.000000,19.000000,6.000000,16.300000,0.666667,0.0,0.000000
4,Manchester City,Leicester City,Matchweek 4,2022,W,2021-09-11,15:00,1,0,25,...,5,1,5.000000,0.000000,22.000000,8.000000,15.300000,0.333333,0.0,0.000000
4,Manchester City,Leicester City,Matchweek 4,2021,W,2021-09-11,15:00,1,0,25,...,5,1,3.666667,0.000000,25.000000,9.333333,14.200000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55,Manchester City,Wolves,Matchweek 33,2022,W,2022-05-11,20:15,5,1,15,...,2,1,5.000000,0.333333,19.000000,7.666667,14.966667,0.666667,0.0,0.000000
56,Manchester City,West Ham,Matchweek 37,2021,D,2022-05-15,14:00,2,2,30,...,6,0,5.000000,0.666667,17.000000,6.333333,14.433333,0.333333,0.0,0.000000
56,Manchester City,West Ham,Matchweek 37,2022,D,2022-05-15,14:00,2,2,30,...,6,0,4.000000,1.333333,20.000000,5.666667,15.166667,0.666667,0.0,0.333333
57,Manchester City,Aston Villa,Matchweek 38,2022,W,2022-05-22,16:00,3,2,24,...,6,1,3.000000,1.666667,25.000000,6.333333,16.433333,1.333333,0.0,0.666667


In [19]:
# Compute the rolling averages team by team, then remove the "Team" index
# level that groupby/apply adds on top of the original index
matchesRolling = (
    matches.groupby("Team")
    .apply(lambda teamMatches: rollingAverages(teamMatches, cols, newCols))
    .droplevel("Team")
)
# Replace the index with 0..n-1 so every row has a unique label
matchesRolling.index = range(len(matchesRolling))
matchesRolling

Unnamed: 0,Team,Opponent,Round,Season,Result,Date,Time,GF,GA,Sh,...,dayCode,target,GF_Rolling,GA_Rolling,Sh_Rolling,SoT_Rolling,Dist_Rolling,FK_Rolling,PK_Rolling,PKatt_Rolling
0,Arsenal,Norwich City,Matchweek 2,2021,W,2021-08-21,15:00,5,0,16,...,5,1,1.666667,0.666667,17.333333,4.000000,17.033333,1.000000,0.0,0.000000
1,Arsenal,Arsenal,Matchweek 3,2022,W,2021-08-28,12:30,5,0,25,...,5,1,3.333333,0.333333,16.666667,4.000000,17.166667,1.000000,0.0,0.000000
2,Arsenal,Arsenal,Matchweek 3,2021,W,2021-08-28,12:30,5,0,25,...,5,1,5.000000,0.000000,19.000000,6.000000,16.300000,0.666667,0.0,0.000000
3,Arsenal,Leicester City,Matchweek 4,2022,W,2021-09-11,15:00,1,0,25,...,5,1,5.000000,0.000000,22.000000,8.000000,15.300000,0.333333,0.0,0.000000
4,Arsenal,Leicester City,Matchweek 4,2021,W,2021-09-11,15:00,1,0,25,...,5,1,3.666667,0.000000,25.000000,9.333333,14.200000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1554,Wolverhampton Wanderers,Wolves,Matchweek 33,2022,W,2022-05-11,20:15,5,1,15,...,2,1,5.000000,0.333333,19.000000,7.666667,14.966667,0.666667,0.0,0.000000
1555,Wolverhampton Wanderers,West Ham,Matchweek 37,2021,D,2022-05-15,14:00,2,2,30,...,6,0,5.000000,0.666667,17.000000,6.333333,14.433333,0.333333,0.0,0.000000
1556,Wolverhampton Wanderers,West Ham,Matchweek 37,2022,D,2022-05-15,14:00,2,2,30,...,6,0,4.000000,1.333333,20.000000,5.666667,15.166667,0.666667,0.0,0.333333
1557,Wolverhampton Wanderers,Aston Villa,Matchweek 38,2022,W,2022-05-22,16:00,3,2,24,...,6,1,3.000000,1.666667,25.000000,6.333333,16.433333,1.333333,0.0,0.666667


In [20]:
# Create a prediction function to retrain the machine learning model
# It repeats the split / fit / predict steps used in the earlier cells
def makePredictions(data, predictors, model=None, split_date="2022-01-01"):
    """Fit a classifier on earlier matches and score it on later ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a "Date" column, a "target" column and every column
        named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    model : estimator with ``fit`` and ``predict``, optional
        Model to train. When omitted, the notebook-level ``rf`` is used,
        which matches the previous behaviour. Either way the model object
        is refit by this call.
    split_date : str, default "2022-01-01"
        Rows dated before this value form the training set; rows dated on
        or after it form the test set.

    Returns
    -------
    combined : pandas.DataFrame
        Columns ``actual`` and ``predicted``, indexed like the test rows.
    precision : float
        ``precision_score`` of the predictions against the actual targets.
    """
    # Default to the shared notebook model so existing calls keep working
    if model is None:
        model = rf

    # Chronological split: train on the past, evaluate on the future
    isTraining = data["Date"] < split_date
    train = data[isTraining]
    test = data[data["Date"] >= split_date]

    # Fit on the training rows and predict the held-out rows
    model.fit(train[predictors], train["target"])
    preds = model.predict(test[predictors])

    # Put actual and predicted targets side by side, keeping the test index
    combined = pd.DataFrame(
        {"actual": test["target"], "predicted": preds},
        index=test.index,
    )
    precision = precision_score(test["target"], preds)
    return combined, precision

In [21]:
# Retrain and evaluate using the original predictors plus the rolling-average columns
combined, precision = makePredictions(
    data=matchesRolling,
    predictors=predictors + newCols,
)

In [22]:
# Precision increased from about 71% to about 76%
precision

0.7574525745257452