## Prediction on 2024/25 Season

### Imports

In [12]:
import json
import pickle

import numpy as np
import pandas as pd

### Prediction on 2024/25 Season

In [None]:
# Team lookup for the 2024/25 season. It is consumed further down as
# {shooting-data CSV file stem: team name}.
# The context manager closes the file handle as soon as the JSON is parsed
# (a bare open() inside json.load() leaves it to the garbage collector), and
# the explicit encoding avoids depending on the platform's locale default.
with open("../encoders/teams_2024.json", encoding="utf-8") as teams_file:
    teams_2024 = json.load(teams_file)


def rolling_stats(df, team_name):
    """Add rolling-average form columns to a single team's match table.

    The frame is modified in place and the same object is returned.

    Processing steps, in order:

    1. Rows without a ``Date`` are dropped.
    2. For each of ``GF, GA, Sh, SoT, PK, PKatt`` a ``<col>_rolling`` column
       is added holding the mean of the three rows *before* the current one
       (``closed="left"`` excludes the current row). Rows with fewer than
       three predecessors get NaN. Rows are used in their existing order;
       no sorting is done here.
    3. ``home_team`` / ``away_team`` are derived from ``Venue`` and
       ``Opponent``; ``Date`` is parsed as ``%Y-%m-%d``.
    4. The rolling values are copied into ``<col>_rolling_h`` for rows where
       the team played at home and ``<col>_rolling_a`` where it played away.
    5. ``week`` is parsed from the first number in ``Round`` and the
       venue-specific rolling columns are set to 0 for weeks 1-3, because
       form at the start of the season is unknown.

    Args:
        df: Match table with at least ``Date``, ``Venue``, ``Opponent``,
            ``Round`` and the six stat columns listed above.
        team_name: Name to record for the team this table belongs to.

    Returns:
        ``df`` itself. If ``Venue`` lacks either "Home" or "Away", or if
        ``Round`` is missing / not textual, an error is printed and the
        partially processed frame is returned without the later columns.
    """
    df.dropna(subset=["Date"], inplace=True)

    # Rolling averages over the previous three rows.
    cols = ["GF", "GA", "Sh", "SoT", "PK", "PKatt"]
    new_cols = [f"{c}_rolling" for c in cols]
    df[new_cols] = df[cols].rolling(3, closed="left").mean()

    # Row masks are stable from here on: no rows are added or removed and
    # "Venue" is never rewritten.
    is_home = df["Venue"] == "Home"
    is_away = df["Venue"] == "Away"

    df.loc[is_home, "home_team"] = team_name
    df.loc[is_home, "away_team"] = df["Opponent"]
    df.loc[is_away, "home_team"] = df["Opponent"]
    df.loc[is_away, "away_team"] = team_name
    df["Date"] = pd.to_datetime(df["Date"], format="%Y-%m-%d")

    # Both venues must be present, otherwise one set of columns below would
    # never be created.
    if not is_home.any() or not is_away.any():
        print("Error: 'Venue' column does not contain expected values.")
        return df

    home_cols = [f"{c}_rolling_h" for c in cols]
    away_cols = [f"{c}_rolling_a" for c in cols]

    # .values strips the column labels so the data is assigned positionally
    # into the differently named target columns.
    df.loc[is_home, home_cols] = df.loc[is_home, new_cols].values
    df.loc[is_away, away_cols] = df.loc[is_away, new_cols].values

    # "week" is required by the masking below. Without a usable "Round"
    # column, report the problem and stop (same handling as the Venue check)
    # instead of failing later with KeyError: 'week'.
    round_is_text = "Round" in df.columns and (
        df["Round"].dtype == "object" or pd.api.types.is_string_dtype(df["Round"])
    )
    if not round_is_text:
        print("Error: 'Round' column is missing or not in the expected format.")
        return df

    # Extract the numeric part from 'Round' and convert it to an integer.
    df["week"] = df["Round"].str.extract(r"(\d+)", expand=False).astype(int)

    # Weeks 1-3 have fewer than three earlier matches in the season, so the
    # rolling stats are set to 0 to avoid biasing them: form at the start of
    # the season is unknown.
    early_weeks = df["week"].isin([1, 2, 3])
    df.loc[early_weeks & is_home, home_cols] = 0
    df.loc[early_weeks & is_away, away_cols] = 0

    return df


def merge_rolling_stats(teams):
    """Combine every team's rolling stats into one table keyed by fixture.

    ``teams`` is iterated for its keys. Each key names a CSV file under
    ``../shooting_data_2024/`` and ``teams[key]`` is the team name handed to
    ``rolling_stats`` for that file.

    The per-team frames are stacked (original row indices kept) and then
    grouped by ``Date``, ``home_team`` and ``away_team``. ``first()`` keeps,
    per column, the first non-null value in each group, so rows describing
    the same fixture collapse into a single row.

    Returns:
        DataFrame with one row per (Date, home_team, away_team) combination.
    """
    per_team_frames = [
        rolling_stats(
            pd.read_csv("../shooting_data_2024/" + file_stem + ".csv"),
            teams[file_stem],
        )
        for file_stem in teams
    ]

    stacked = pd.concat(per_team_frames, ignore_index=False)
    by_fixture = stacked.groupby(["Date", "home_team", "away_team"], as_index=False)
    merged_df = by_fixture.first()
    # Disabled column drop, kept for reference:
    # merged_df = merged_df.drop(['G-xG', 'npxG', 'npxG/Sh', 'np:G-xG', 'xG', 'Match Report', 'Match Report.1'], axis=1)

    return merged_df


# Build the combined rolling-stats table for all 2024/25 teams and preview
# its first five rows as the cell output.
rolling_df = merge_rolling_stats(teams_2024)
rolling_df.head(n=5)

Unnamed: 0.1,Date,home_team,away_team,Unnamed: 0,Time,Round,Day,Venue,Result,GF,...,SoT_rolling_h,PK_rolling_h,PKatt_rolling_h,GF_rolling_a,GA_rolling_a,Sh_rolling_a,SoT_rolling_a,PK_rolling_a,PKatt_rolling_a,week
0,2024-08-16,Manchester Utd,Fulham,0,20:00,Matchweek 1,Fri,Away,L,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,2024-08-17,Arsenal,Wolves,0,15:00,Matchweek 1,Sat,Home,W,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2024-08-17,Everton,Brighton,0,15:00,Matchweek 1,Sat,Away,W,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,2024-08-17,Ipswich Town,Liverpool,0,12:30,Matchweek 1,Sat,Home,L,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,2024-08-17,Newcastle Utd,Southampton,0,15:00,Matchweek 1,Sat,Home,W,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [None]:
### Loading matches to predict
# Only the first 18 rows of the fixture list are used. .copy() detaches that
# slice from the frame returned by read_csv, so the column assignments below
# work on an independent DataFrame and do not raise SettingWithCopyWarning.
future_matches = pd.read_csv("../2024-25.csv", index_col=0).iloc[:18].copy()

### Cleaning matches to predict
future_matches["Date"] = pd.to_datetime(future_matches["Date"], format="%Y-%m-%d")
future_matches.dropna(subset=["Date"], inplace=True)
future_matches["week"] = future_matches["week"].astype(int)

### Encoding features
# Loading team encoder from file
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load encoder files that come from a trusted source.
with open("../encoders/team_encoder.pkl", "rb") as file:
    loaded_encoder = pickle.load(file)

# Keep readable team names (also used as merge keys below) next to the
# encoded values produced by the loaded encoder.
future_matches["home_team"] = future_matches["Home"]
future_matches["away_team"] = future_matches["Away"]
future_matches["home_team_encoded"] = loaded_encoder.transform(future_matches["Home"])
future_matches["away_team_encoded"] = loaded_encoder.transform(future_matches["Away"])

# Loading venue encoder from file (same trust caveat as above)
with open("../encoders/venue_encoder.pkl", "rb") as file:
    venue_encoder = pickle.load(file)

# Cleaning up mismatches and changes in Stadium names before encoding
venue_replacements = {
    "The American Express Stadium": "The American Express Community Stadium",
    "St Mary's Stadium": "St. Mary's Stadium",
}
future_matches["Venue"] = future_matches["Venue"].replace(venue_replacements)
future_matches["venue_code"] = venue_encoder.transform(
    future_matches["Venue"]
)  # Transform using the loaded encoder

# Encoding day and hour
# The hour is the first two characters of the kick-off time, e.g. "15:00" -> 15.
future_matches["hour"] = future_matches["Time"].str[:2].astype(int)
future_matches["day_code"] = future_matches[
    "Date"
].dt.dayofweek  # Gives each day of the week a code: Mon = 0, Tue = 1, ..., Sun = 6

# Adding FTHG and FTAG (Full-Time Home Goals and Full-Time Away Goals).
# str.split returns strings, so the parts are converted to numbers here.
# Comparing them as text would be lexicographic ("10" < "2"). Values that
# cannot be parsed become NaN.
score_parts = future_matches["Score"].str.split("–", expand=True)
future_matches[["FTHG", "FTAG"]] = score_parts.apply(pd.to_numeric, errors="coerce")
future_matches["season"] = "2024/25"
# NOTE(review): 11 is presumably the code the training pipeline assigns to
# the 2024/25 season - confirm against Training.ipynb.
future_matches["season_encoded"] = 11

# Merge with rolling stats
# Left join keeps every fixture even when no rolling stats match; columns
# that exist in both frames keep their name on the left and get a "_y"
# suffix on the rolling-stats side.
rolling_df = merge_rolling_stats(teams_2024)
future_matches = pd.merge(
    future_matches,
    rolling_df,
    how="left",
    on=["Day", "Date", "Time", "home_team", "away_team"],
    suffixes=("", "_y"),
)

### Model from Training.ipynb

### Features and labels

In [None]:
# NOTE(review): both lists are empty in this cell; presumably they should
# hold the feature / label column names used in Training.ipynb - confirm.
features = []
labels = []

X = future_matches[features]
y = future_matches[labels]  # Predicting home and away goals

# `model` is not defined in this cell; it must already exist in the session.
future_scores = model.predict(X)

# astype(int) truncates toward zero (e.g. 1.9 -> 1); it does not round.
future_scores = future_scores.astype(int)
future_matches["PredScore"] = [f"{h}–{a}" for h, a in future_scores]
# Use the integer predictions directly instead of re-parsing PredScore.
future_matches[["PredFTHG", "PredFTAG"]] = future_scores

# Actual result from the home side's point of view. The goal columns are
# coerced to numbers first: when they come straight from splitting "Score"
# they are strings, and string comparison is lexicographic ("10" < "2").
# NOTE: rows whose score is missing or unparseable compare False both ways
# and therefore fall through to "D".
home_goals = pd.to_numeric(future_matches["FTHG"], errors="coerce")
away_goals = pd.to_numeric(future_matches["FTAG"], errors="coerce")
future_matches["Result"] = np.where(
    home_goals > away_goals,
    "W",
    np.where(home_goals < away_goals, "L", "D"),
)

# Predicted result, also from the home side's point of view.
future_matches["PredResult"] = [
    "W" if h > a else "D" if h == a else "L" for h, a in future_scores
]

# Side-by-side view of actual vs predicted outcomes (cell output).
future_matches[
    [
        "Day",
        "Date",
        "Time",
        "Home",
        "Score",
        "PredScore",
        "Result",
        "PredResult",
        "Away",
        "Venue",
    ]
]