## Reading match data into DF

In [None]:
import pandas as pd

In [None]:
matches = pd.read_csv("matches.csv", index_col=0)

In [None]:
matches.head()

## Investigating missing data

In [None]:
matches.shape

In [None]:
# Expected number of rows, for comparison with matches.shape above.
# Presumably 38 matches per team x 20 teams x 2 seasons -- TODO confirm the factors.
38 * 20 * 2

In [None]:
matches["team"].value_counts()

In [None]:
# Show only Liverpool's rows. The comparison has to sit inside the indexing
# brackets so it acts as a boolean row mask.
# Observation from the original notebook: one season of Liverpool data is missing.
matches[matches["team"] == "Liverpool"]

## Cleaning our data for ML

In [None]:
matches.dtypes

In [None]:
# Convert the "date" column to datetimes so the .dt accessor and the date
# comparisons used later in the notebook work on it.
matches["date"] = pd.to_datetime(matches["date"]) 

In [None]:
matches

## Creating Predictors

In [None]:
# Give every distinct "venue" value an integer category code so it can be
# used as a numeric predictor.
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [None]:
# Give every distinct "opponent" value an integer category code so it can be
# used as a numeric predictor.
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [None]:
# Kick-off hour as an integer: drop everything from the first colon onward
# in the "time" text, then cast what is left to int.
matches["hour"] = (
    matches["time"]
    .str.replace(":.+", "", regex=True)
    .astype("int")
)

In [None]:
# Day of the week as an integer (pandas convention: Monday=0 ... Sunday=6).
matches["day_code"] = matches["date"].dt.dayofweek

In [None]:
# Binary label: 1 when the result is "W", 0 for every other result value.
matches["target"] = (matches["result"] == "W").astype("int")

In [None]:
matches

## Creating the Machine Learning Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# 50 trees; a node must hold at least 10 samples before it may be split;
# random_state fixes the seed so repeated runs build the same forest.
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [None]:
# Training set: matches strictly before 2022-01-01. The split is by time
# because the model may use past matches to predict later ones, but must
# never be trained on matches that come after the ones it predicts.
train = matches[matches["date"] < '2022-01-01']

In [None]:
# Test set: matches strictly after the 2022-01-01 cutoff. Both comparisons
# are strict, so a row equal to the cutoff lands in neither train nor test.
test = matches[matches["date"] > '2022-01-01'] 

In [None]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [None]:
rf.fit(train[predictors], train["target"])

In [None]:
preds = rf.predict(test[predictors]) 

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
acc = accuracy_score(test["target"], preds)

In [None]:
acc

In [None]:
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))

In [None]:
# Cross-tabulate actual labels against predictions. Label 0 means "result was
# not W" (target is result == "W"), so it is not only losses.
# Observation from the original notebook: class 0 was predicted more accurately
# than wins, and wins are the class we care about.
pd.crosstab(index=combined["actual"], columns = combined["prediction"])

In [None]:
from sklearn.metrics import precision_score

In [None]:
precision_score(test["target"], preds)

## Improving precision with rolling averages

In [None]:
# Group the rows by team so that one team's matches can be pulled out with get_group().
grouped_matches = matches.groupby("team")

In [None]:
group = grouped_matches.get_group("Manchester City")

In [None]:
group

In [None]:
def rolling_averages(group, cols, new_cols):
    """Add rolling means over the three preceding rows to one team's matches.

    Args:
        group: DataFrame holding a "date" column and every column in ``cols``.
        cols: Names of the columns to average.
        new_cols: Names for the rolling-mean columns, in the same order as
            ``cols``.

    Returns:
        A date-sorted copy of ``group`` with ``new_cols`` added and with the
        rows that have a missing value in any of ``new_cols`` removed.
    """
    group = group.sort_values("date")
    # closed="left" leaves the current row out of its own window, so each
    # row's average is built only from the rows that precede it.
    rolling_stats = group[cols].rolling(3, closed="left").mean()
    group[new_cols] = rolling_stats
    # Rows without three preceding rows have no full window and come out as NaN.
    return group.dropna(subset=new_cols)

In [None]:
# Stat columns to smooth, and the matching "<stat>_rolling" output column names.
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [stat + "_rolling" for stat in cols]

In [None]:
new_cols

In [None]:
rolling_averages(group, cols, new_cols)

In [None]:
# Compute the rolling features one team at a time, so a team's window is
# built only from that team's own matches.
matches_rolling = matches.groupby("team").apply(
    lambda team_matches: rolling_averages(team_matches, cols, new_cols)
)

In [None]:
# Remove the "team" level from the row index again.
# NOTE(review): the index-based merge later in the notebook assumes the
# remaining row labels are unique -- confirm against matches.csv; if labels
# repeat across teams, reset the index here.
matches_rolling = matches_rolling.droplevel('team')

In [None]:
matches_rolling

## Retraining our machine learning model

In [None]:
def make_predictions(data, predictors):
    """Fit the notebook's random forest on early matches and score it on later ones.

    Rows dated before 2022-01-01 are used for training and rows dated after
    it for testing. Both comparisons are strict, so a row equal to the cutoff
    is used for neither.

    Uses the module-level ``rf`` estimator (refitted in place) and
    ``precision_score``.

    Args:
        data: DataFrame with "date" and "target" columns plus every column
            named in ``predictors``.
        predictors: Names of the feature columns.

    Returns:
        A ``(combined, precision)`` tuple: ``combined`` is a DataFrame with
        "actual" and "predicted" columns indexed like the test rows, and
        ``precision`` is the precision of the predictions on the test rows.
    """
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["target"])
    # Predict from the same feature columns the model was fitted on,
    # not from the labels themselves.
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [None]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [None]:
precision

In [None]:
# Attach match details to every prediction row; both frames are joined on
# their row index.
combined = combined.merge(
    matches_rolling[["date", "team", "opponent", "result"]],
    left_index=True,
    right_index=True,
)

In [None]:
combined

## Combining Home and Away Predictions

In [None]:
class MissingDict(dict):
    """Dictionary whose lookups return the key itself when the key is absent."""

    def __missing__(self, key):
        return key


# Full club names and the shortened spelling each one is replaced with.
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}
mapping = MissingDict(map_values)

In [None]:
mapping["West Ham United"]

In [None]:
# Translate team names through the mapping; names without an entry are kept
# unchanged because MissingDict returns the key itself.
# Presumably this matches the spelling used in the "opponent" column -- verify.
combined["new_team"] = combined["team"].map(mapping)

In [None]:
combined

In [None]:
# Self-join the predictions: each row is matched with the row that has the
# same date and whose "opponent" equals this row's "new_team". Overlapping
# columns get the default _x (left) and _y (right) suffixes.
merged = pd.merge(
    combined,
    combined,
    left_on=["date", "new_team"],
    right_on=["date", "opponent"],
)

In [None]:
# Keep the pairs where the left-hand row is predicted as a win (1) and the
# right-hand row is not (0), then count the left-hand row's actual outcomes.
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()

In [None]:
# Final precision, worked out by hand. Presumably 27 correct out of the 40
# predictions counted by value_counts() above -- TODO confirm against a fresh run.
27 / 40