In [None]:
import pandas as pd

In [None]:
matches = pd.read_csv("matches.csv", index_col = 0)

In [None]:
matches.head()

In [None]:
matches.shape

In [None]:
matches["team"].value_counts()

In [None]:
matches["round"].value_counts()

In [None]:
matches.dtypes

In [None]:
matches["date"] = pd.to_datetime(matches["date"])

In [None]:
matches.dtypes

In [None]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [None]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [None]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

In [None]:
matches["day_code"] = matches["date"].dt.dayofweek

In [None]:
matches["target"] = (matches["result"] == "W").astype("int")

In [217]:
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2024-08-18,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,Chelsea,...,0.0,0,0,2024,Manchester City,0,6,16,6,1
2,2024-08-24,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,1.0,Ipswich Town,...,1.0,1,1,2024,Manchester City,1,10,15,5,1
3,2024-08-31,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,1.0,West Ham,...,1.0,0,0,2024,Manchester City,0,21,17,5,1
0,2024-08-17,15:00,Premier League,Matchweek 1,Sat,Away,W,3.0,0.0,Everton,...,0.0,0,0,2024,Brighton and Hove Albion,0,8,15,5,1
1,2024-08-24,12:30,Premier League,Matchweek 2,Sat,Home,W,2.0,1.0,Manchester Utd,...,1.0,0,0,2024,Brighton and Hove Albion,1,15,12,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2024-04-24,20:00,Premier League,Matchweek 29,Wed,Away,L,2.0,4.0,Manchester Utd,...,1.0,0,0,2023,Sheffield United,0,15,20,2,0
37,2024-04-27,15:00,Premier League,Matchweek 35,Sat,Away,L,1.0,5.0,Newcastle Utd,...,0.0,0,0,2023,Sheffield United,0,16,15,5,0
38,2024-05-04,15:00,Premier League,Matchweek 36,Sat,Home,L,1.0,3.0,Nott'ham Forest,...,0.0,1,1,2023,Sheffield United,1,17,15,5,0
39,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Away,L,0.0,1.0,Everton,...,0.0,0,0,2023,Sheffield United,0,8,15,5,0


In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [None]:
train = matches[matches["date"] <= '2024-01-01']

In [None]:
test = matches[matches["date"] > '2024-01-01']

In [None]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [None]:
rf.fit(train[predictors], train["target"])

In [None]:
RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [None]:
preds = rf.predict(test[predictors])

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
acc = accuracy_score(test["target"], preds)

In [None]:
acc

In [None]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))

In [None]:
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

In [None]:
from sklearn.metrics import precision_score

In [None]:
precision_score(test["target"], preds)

In [None]:
grouped_matches = matches.groupby("team")

In [None]:
group = grouped_matches.get_group("Manchester City")

In [None]:
group

In [None]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [None]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [None]:
new_cols

In [None]:
rolling_averages(group, cols, new_cols)

In [None]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [None]:
matches_rolling

In [None]:
matches_rolling = matches_rolling.droplevel('team')

In [None]:
matches_rolling

In [None]:
matches_rolling.index = range(matches_rolling.shape[0])

In [None]:
matches_rolling

In [None]:
def make_predictions(data, predictors):
    train = data[data["date"] <= '2024-01-01']
    test = data[data["date"] > '2024-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [None]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [None]:
precision

In [None]:
combined

In [None]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [None]:
combined

In [None]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves"
}

mapping = MissingDict(**map_values)

In [None]:
mapping["Arsenal"]

In [None]:
mapping["West Ham United"]

In [None]:
combined["new_team"] = combined["team"].map(mapping)

In [None]:
merged = combined.merge(combined, left_on = ["date", "new_team"], right_on=["date","opponent"])

In [216]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,1,1,2024-01-20,Arsenal,Crystal Palace,W,Arsenal,0,0,Crystal Palace,Arsenal,L,Crystal Palace
1,1,1,2024-01-30,Arsenal,Nott'ham Forest,W,Arsenal,0,0,Nottingham Forest,Arsenal,L,Nottingham Forest
2,1,1,2024-02-04,Arsenal,Liverpool,W,Arsenal,0,0,Liverpool,Arsenal,L,Liverpool
3,1,0,2024-02-11,Arsenal,West Ham,W,Arsenal,0,0,West Ham United,Arsenal,L,West Ham
4,1,1,2024-02-17,Arsenal,Burnley,W,Arsenal,0,0,Burnley,Arsenal,L,Burnley
...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,0,0,2024-05-11,Wolverhampton Wanderers,Crystal Palace,L,Wolves,1,0,Crystal Palace,Wolves,W,Crystal Palace
366,0,0,2024-05-19,Wolverhampton Wanderers,Liverpool,L,Wolves,1,1,Liverpool,Wolves,W,Liverpool
367,0,0,2024-08-17,Wolverhampton Wanderers,Arsenal,L,Wolves,1,0,Arsenal,Wolves,W,Arsenal
368,0,0,2024-08-25,Wolverhampton Wanderers,Chelsea,L,Wolves,1,0,Chelsea,Wolves,W,Chelsea


In [None]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()

In [215]:
last_date = matches_rolling["date"].max()

In [218]:
last_matches = group.sort_values("date").iloc[-3:]

rolling_stats = last_matches[cols].rolling(3, closed='left').mean().iloc[-1]

next_match_stats = rolling_stats.to_dict()


In [228]:
grouped_matches = matches.groupby("team")
group = grouped_matches.get_group("Southampton").sort_values("date")

rolling_stats = group[cols].rolling(3, closed='left').mean().iloc[-1]

next_match_data = {
    "date": ["2024-09-14"],
    "team": ["Southampton"],
    "venue_code": [1],
    "opp_code": [15],
    "hour": [14],
    "day_code": [5],
    "gf_rolling": [rolling_stats["gf"]],
    "ga_rolling": [rolling_stats["ga"]],
    "sh_rolling": [rolling_stats["sh"]],
    "sot_rolling": [rolling_stats["sot"]],
    "dist_rolling": [rolling_stats["dist"]],
    "fk_rolling": [rolling_stats["fk"]],
    "pk_rolling": [rolling_stats["pk"]],
    "pkatt_rolling": [rolling_stats["pkatt"]]
}

next_match_df = pd.DataFrame(next_match_data)


next_match_pred = rf.predict(next_match_df[predictors + new_cols])


if next_match_pred[0] == 1:
    print("Prediction: Win")
else:
    print("Prediction: Not Win (Loss/Draw)")


Prediction: Win


In [229]:
next_match_pred = rf.predict(next_match_df[predictors + new_cols])
next_match_proba = rf.predict_proba(next_match_df[predictors + new_cols])


if next_match_pred[0] == 1:
    print(f"Prediction: Win with {next_match_proba[0][1] * 100:.2f}% confidence")
else:
    print(f"Prediction: Not Win (Loss/Draw) with {next_match_proba[0][0] * 100:.2f}% confidence")


Prediction: Win with 57.36% confidence
