In [1]:
import pandas as pd 

In [2]:
matches = pd.read_csv("matches.csv", index_col = 0)

In [3]:
# converting all objects to int or float to be processed by the machine learning software
matches["date"] = pd.to_datetime(matches["date"])
matches["h/a"] = matches["venue"].astype("category").cat.codes # converting venue to a home (1) or away (0) number
matches["opp"] = matches["opponent"].astype("category").cat.codes # converting opponents to a number
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int") ## converting hours to number in case a team plays better at a certain time
matches["day"] = matches["date"].dt.dayofweek # converting day of week of game to a number

matches["target"] = (matches["result"] == "W").astype("int") # setting a win to the value 1

In [4]:
from sklearn.ensemble import RandomForestClassifier

In [5]:
rf = RandomForestClassifier(n_estimators = 100, min_samples_split=10, random_state=1)
train = matches[matches["date"] < '2022-01-01'] 
test = matches[matches["date"] > '2022-01-01']
predictors = ["h/a", "opp", "hour", "day"]
rf.fit(train[predictors], train["target"])
RandomForestClassifier(min_samples_split = 10, n_estimators = 100, random_state = 1)
preds = rf.predict(test[predictors]) # making prediction

In [6]:
from sklearn.metrics import accuracy_score

In [7]:
acc = accuracy_score(test["target"], preds) # testing accuracy
acc
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,140,32
1,75,29


In [8]:
from sklearn.metrics import precision_score

In [9]:
precision_score(test["target"], preds)

0.47540983606557374

In [10]:
grouped_matches = matches.groupby("team") 
group = grouped_matches.get_group("Manchester United").sort_values("date")

In [11]:
def rolling_averages(group, cols, new_cols): # function to take into consideration form of a team
    group = group.sort_values("date") # sorting games by date 
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols) # droping missing values and replacing with empty
    return group 


In [12]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"] 
new_cols = [f"{c}_rolling" for c in cols] # creating new columns with rolling average values 

In [54]:
rolling_averages(group, cols, new_cols) # calling function and generating average of last 3 games

matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team') # dropping extra index level

matches_rolling.index = range(matches_rolling.shape[0]) # adding new index
matches_rolling.head()

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,hour,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2020-10-04,14:00,Premier League,Matchweek 4,6,Home,W,2.0,1.0,Sheffield Utd,...,14,1,2.0,1.333333,7.666667,3.666667,14.733333,0.666667,0.0,0.0
1,2020-10-17,17:30,Premier League,Matchweek 5,5,Away,L,0.0,1.0,Manchester City,...,17,0,1.666667,1.666667,5.333333,3.666667,15.766667,0.0,0.0,0.0
2,2020-10-25,19:15,Premier League,Matchweek 6,6,Home,L,0.0,1.0,Leicester City,...,19,0,1.0,1.666667,7.0,3.666667,16.733333,0.666667,0.0,0.0
3,2020-11-01,16:30,Premier League,Matchweek 7,6,Away,W,1.0,0.0,Manchester Utd,...,16,1,0.666667,1.0,9.666667,4.0,16.033333,1.0,0.0,0.0
4,2020-11-08,19:15,Premier League,Matchweek 8,6,Home,L,0.0,3.0,Aston Villa,...,19,0,0.333333,0.666667,9.666667,2.666667,18.033333,1.0,0.333333,0.333333


In [14]:
def make_predictions(data, predictors): # making the predictions
    train = data[data["date"] < '2022-01-01'] 
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors]) # making prediction
    combined = pd.DataFrame(dict(actual=test["target"], prediction=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision # returning the values for the prediction

In [15]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [16]:
precision

0.6458333333333334

In [52]:
combined.head()

Unnamed: 0,actual,prediction,date_x,team_x,opponent_x,result_x,new_team,date_y,team_y,opponent_y,result_y
55,0,1,2022-01-23,Arsenal,Burnley,D,Arsenal,2022-01-23,Arsenal,Burnley,D
56,1,0,2022-02-10,Arsenal,Wolves,W,Arsenal,2022-02-10,Arsenal,Wolves,W
57,1,0,2022-02-19,Arsenal,Brentford,W,Arsenal,2022-02-19,Arsenal,Brentford,W
58,1,1,2022-02-24,Arsenal,Wolves,W,Arsenal,2022-02-24,Arsenal,Wolves,W
59,1,1,2022-03-06,Arsenal,Watford,W,Arsenal,2022-03-06,Arsenal,Watford,W


In [50]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index = True, right_index = True)
combined.head()

Unnamed: 0,actual,prediction,date_x,team_x,opponent_x,result_x,new_team,date_y,team_y,opponent_y,result_y
55,0,1,2022-01-23,Arsenal,Burnley,D,Arsenal,2022-01-23,Arsenal,Burnley,D
56,1,0,2022-02-10,Arsenal,Wolves,W,Arsenal,2022-02-10,Arsenal,Wolves,W
57,1,0,2022-02-19,Arsenal,Brentford,W,Arsenal,2022-02-19,Arsenal,Brentford,W
58,1,1,2022-02-24,Arsenal,Wolves,W,Arsenal,2022-02-24,Arsenal,Wolves,W
59,1,1,2022-03-06,Arsenal,Watford,W,Arsenal,2022-03-06,Arsenal,Watford,W


In [19]:
class MissingDict(dict): # creating a class that inherits from the dictionary class
    __missing__ = lambda self, key: key # in case a team name is missing

map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Tottenham Hotspur": "Tottenham", 
    "West Ham United": "West Ham", 
    "Wolverhampton Wanderers": "Wolves"
}

In [20]:
mapping = MissingDict(**map_values)
mapping["West Ham United"]

'West Ham'

In [48]:
combined["new_team"] = combined["team"].map(mapping)
combined.head()

Unnamed: 0,actual,prediction,date,team,opponent,result,new_team
55,0,1,2022-01-23,Arsenal,Burnley,D,Arsenal
56,1,0,2022-02-10,Arsenal,Wolves,W,Arsenal
57,1,0,2022-02-19,Arsenal,Brentford,W,Arsenal
58,1,1,2022-02-24,Arsenal,Wolves,W,Arsenal
59,1,1,2022-03-06,Arsenal,Watford,W,Arsenal


In [46]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"]) # finding both the home and away team predictions and merging them 
merged.head()

Unnamed: 0,actual_x,prediction_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,prediction_y,team_y,opponent_y,result_y,new_team_y
0,0,1,2022-01-23,Arsenal,Burnley,D,Arsenal,0,0,Burnley,Arsenal,D,Burnley
1,1,0,2022-02-10,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
2,1,0,2022-02-19,Arsenal,Brentford,W,Arsenal,0,0,Brentford,Arsenal,L,Brentford
3,1,1,2022-02-24,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
4,1,1,2022-03-06,Arsenal,Watford,W,Arsenal,0,0,Watford,Arsenal,L,Watford


We can also use more seasons to the model and more variables to increse the precision, as well we could change our values of the Forest 