# 03 – Feature Engineering on `matches_long`

Goals:
- Load the dataset `matches_long_22_23.csv`
- Create basic target variables (points, goal difference)
- Build **rolling features** for each team
    - `rolling_home_goal_diff_5`, `rolling_away_goal_diff_5`, `rolling_goal_diff_5`
    - `rolling_xg_for_5`, `rolling_xg_against_5`, `rolling_xg_diff_5`
    - `rolling_home_points_5`, `rolling_away_points_5`, `rolling_points_5`
    - `strength_points_5`

- Save an enriched dataset for model training

In [117]:
import pandas as pd
import numpy as np

# Read the processed long-format match table from disk.
df = pd.read_csv("../data/processed/matches_long_22_23.csv")

# Order rows by team, then by date within each team, and renumber the index
# 0..n-1 so each team's rows are contiguous and in date order.
df = df.sort_values(by=["team", "date"])
df = df.reset_index(drop=True)


In [118]:
def get_points(row):
    """Return the league points the row's team earned in this match.

    ``row`` must support ``row["result"]`` and ``row["is_home"]``.
    ``result`` is compared against "H", "A" and "D"; ``is_home`` is compared
    against 1 (home) and 0 (away).

    Returns 3 when the result letter matches the side the team played on
    ("H" at home, "A" away), 1 for "D", and 0 in every other case.
    """
    outcome = row["result"]

    # A draw is worth one point no matter which side the team was on.
    if outcome == "D":
        return 1

    won_at_home = outcome == "H" and row["is_home"] == 1
    won_away = outcome == "A" and row["is_home"] == 0
    return 3 if won_at_home or won_away else 0

# Score every row with get_points; axis="columns" passes one row at a time.
points_by_row = df.apply(get_points, axis="columns")
df["points"] = points_by_row
df.columns
df.head()

Unnamed: 0,match_id,season,date,team,opponent,is_home,goals_for,goals_against,xg_for,xg_against,shots_on_target_for,shots_on_target_against,poss,prob_book_home,prob_book_draw,prob_book_away,result,points
0,1,2022-2023,2022-08-05,Arsenal,Crystal Palace,0,2,0,1.0,1.2,2,2,44,0.218795,0.266625,0.514581,A,3
1,16,2022-2023,2022-08-13,Arsenal,Leicester City,1,4,2,2.7,0.5,7,2,50,0.616659,0.222323,0.161018,H,3
2,23,2022-2023,2022-08-20,Arsenal,Bournemouth,0,3,0,1.3,0.3,6,1,57,0.121887,0.204302,0.673812,A,3
3,35,2022-2023,2022-08-27,Arsenal,Fulham,1,2,1,2.6,0.8,8,3,71,0.73085,0.167736,0.101414,H,3
4,49,2022-2023,2022-08-31,Arsenal,Aston Villa,1,2,1,2.4,0.4,8,3,59,0.647946,0.209616,0.142438,H,3


In [119]:
# Goal difference of each row, from that row's team's point of view.
df["goal_diff"] = df["goals_for"] - df["goals_against"]

WINDOW = 5  # number of previous matches in each rolling window

# Start from an all-NaN column; only home rows are filled in below.
df["rolling_home_goal_diff_5"] = np.nan

# Restrict to home fixtures, then let groupby do the per-team split.
home_mask = df["is_home"] == 1
home_goal_diff_by_team = df.loc[home_mask].groupby("team")["goal_diff"]

# shift(1) keeps a match out of its own window (no leakage); the mean stays
# NaN until WINDOW earlier home matches exist for the team.
df.loc[home_mask, "rolling_home_goal_diff_5"] = home_goal_diff_by_team.transform(
    lambda s: s.shift(1).rolling(WINDOW).mean()
)

# Quick check
df[["team", "date", "is_home", "goal_diff", "rolling_home_goal_diff_5"]].head(15)

Unnamed: 0,team,date,is_home,goal_diff,rolling_home_goal_diff_5
0,Arsenal,2022-08-05,0,2,
1,Arsenal,2022-08-13,1,2,
2,Arsenal,2022-08-20,0,3,
3,Arsenal,2022-08-27,1,1,
4,Arsenal,2022-08-31,1,1,
5,Arsenal,2022-09-04,0,-2,
6,Arsenal,2022-09-18,0,3,
7,Arsenal,2022-10-01,1,2,
8,Arsenal,2022-10-09,1,1,
9,Arsenal,2022-10-16,0,1,


In [120]:
# Start from an all-NaN column; only away rows are filled in below.
df["rolling_away_goal_diff_5"] = np.nan

# Restrict to away fixtures, then let groupby do the per-team split.
away_mask = df["is_home"] == 0
away_goal_diff_by_team = df.loc[away_mask].groupby("team")["goal_diff"]

# Lag by one away match so the current result is excluded, then average the
# WINDOW preceding away matches (NaN until that many exist).
df.loc[away_mask, "rolling_away_goal_diff_5"] = away_goal_diff_by_team.transform(
    lambda s: s.shift(1).rolling(WINDOW).mean()
)

# Quick check
df[["team", "date", "is_home", "goal_diff", "rolling_away_goal_diff_5"]].head(15)

Unnamed: 0,team,date,is_home,goal_diff,rolling_away_goal_diff_5
0,Arsenal,2022-08-05,0,2,
1,Arsenal,2022-08-13,1,2,
2,Arsenal,2022-08-20,0,3,
3,Arsenal,2022-08-27,1,1,
4,Arsenal,2022-08-31,1,1,
5,Arsenal,2022-09-04,0,-2,
6,Arsenal,2022-09-18,0,3,
7,Arsenal,2022-10-01,1,2,
8,Arsenal,2022-10-09,1,1,
9,Arsenal,2022-10-16,0,1,


In [121]:
WINDOW = 5

# Venue-agnostic form: mean goal difference over each team's WINDOW previous
# matches. The one-row shift keeps the current match out of its own feature.
goal_diff_by_team = df.groupby("team")["goal_diff"]
df["rolling_goal_diff_5"] = goal_diff_by_team.transform(
    lambda s: s.shift(1).rolling(WINDOW).mean()
)

df[["team", "date", "goal_diff", "rolling_goal_diff_5"]].head(15)

Unnamed: 0,team,date,goal_diff,rolling_goal_diff_5
0,Arsenal,2022-08-05,2,
1,Arsenal,2022-08-13,2,
2,Arsenal,2022-08-20,3,
3,Arsenal,2022-08-27,1,
4,Arsenal,2022-08-31,1,
5,Arsenal,2022-09-04,-2,1.8
6,Arsenal,2022-09-18,3,1.0
7,Arsenal,2022-10-01,2,1.2
8,Arsenal,2022-10-09,1,1.0
9,Arsenal,2022-10-16,1,1.0


In [122]:
WINDOW = 5  # how many earlier matches feed each average

# Group the xG-for values by team so that no window mixes two teams.
xg_for_by_team = df.groupby("team")["xg_for"]

# Lag by one match first, so a row's own xG never enters its feature, then
# average the WINDOW preceding matches (NaN until that many exist).
df["rolling_xg_for_5"] = xg_for_by_team.transform(
    lambda s: s.shift(1).rolling(WINDOW).mean()
)

# Quick check
df[["team", "date", "xg_for", "rolling_xg_for_5"]].head(10)

Unnamed: 0,team,date,xg_for,rolling_xg_for_5
0,Arsenal,2022-08-05,1.0,
1,Arsenal,2022-08-13,2.7,
2,Arsenal,2022-08-20,1.3,
3,Arsenal,2022-08-27,2.6,
4,Arsenal,2022-08-31,2.4,
5,Arsenal,2022-09-04,1.3,2.0
6,Arsenal,2022-09-18,1.5,2.06
7,Arsenal,2022-10-01,2.4,1.82
8,Arsenal,2022-10-09,2.7,2.04
9,Arsenal,2022-10-16,0.5,2.06


In [123]:
WINDOW = 5  # same window size

# Group the xG-against values by team so that no window mixes two teams.
xg_against_by_team = df.groupby("team")["xg_against"]

# Mean xG conceded over the WINDOW matches before the current one; shift(1)
# excludes the current match, and the value is NaN until WINDOW matches exist.
df["rolling_xg_against_5"] = xg_against_by_team.transform(
    lambda s: s.shift(1).rolling(WINDOW).mean()
)

# Quick check
df[["team", "date", "xg_against", "rolling_xg_against_5"]].head(10)

Unnamed: 0,team,date,xg_against,rolling_xg_against_5
0,Arsenal,2022-08-05,1.2,
1,Arsenal,2022-08-13,0.5,
2,Arsenal,2022-08-20,0.3,
3,Arsenal,2022-08-27,0.8,
4,Arsenal,2022-08-31,0.4,
5,Arsenal,2022-09-04,1.5,0.64
6,Arsenal,2022-09-18,0.5,0.7
7,Arsenal,2022-10-01,1.6,0.7
8,Arsenal,2022-10-09,1.1,0.96
9,Arsenal,2022-10-16,1.8,1.02


In [124]:
# Per-row expected-goals differential (xG for minus xG against).
df["xg_diff"] = df["xg_for"] - df["xg_against"]

# Window length, declared as in the other rolling-feature cells rather than a
# bare literal, so every rolling feature is driven by the same constant.
WINDOW = 5

# Mean xG differential over each team's WINDOW previous matches.
# shift(1) excludes the current match (no leakage); the value is NaN until
# WINDOW earlier matches exist for the team.
df["rolling_xg_diff_5"] = df.groupby("team")["xg_diff"].transform(
    lambda s: s.shift(1).rolling(WINDOW).mean()
)

# Quick check
df[["team", "date", "xg_diff", "rolling_xg_diff_5"]].head(10)

Unnamed: 0,team,date,xg_diff,rolling_xg_diff_5
0,Arsenal,2022-08-05,-0.2,
1,Arsenal,2022-08-13,2.2,
2,Arsenal,2022-08-20,1.0,
3,Arsenal,2022-08-27,1.8,
4,Arsenal,2022-08-31,2.0,
5,Arsenal,2022-09-04,-0.2,1.36
6,Arsenal,2022-09-18,1.0,1.36
7,Arsenal,2022-10-01,0.8,1.12
8,Arsenal,2022-10-09,1.6,1.08
9,Arsenal,2022-10-16,-1.3,1.04


In [125]:
# Window length, declared as in the other rolling-feature cells rather than a
# bare literal, so every rolling feature is driven by the same constant.
WINDOW = 5

# Start from an all-NaN column; only home rows are filled in below.
df["rolling_home_points_5"] = np.nan

# Restrict to home fixtures, then let groupby do the per-team split.
home_mask = df["is_home"] == 1
home_points_by_team = df.loc[home_mask].groupby("team")["points"]

# Points collected in the WINDOW home matches before the current one.
# shift(1) excludes the current match (no leakage); the sum is NaN until
# WINDOW earlier home matches exist for the team.
df.loc[home_mask, "rolling_home_points_5"] = home_points_by_team.transform(
    lambda s: s.shift(1).rolling(WINDOW).sum()
)

# Quick check
df[["team", "date", "is_home", "points", "rolling_home_points_5"]].head(15)

Unnamed: 0,team,date,is_home,points,rolling_home_points_5
0,Arsenal,2022-08-05,0,3,
1,Arsenal,2022-08-13,1,3,
2,Arsenal,2022-08-20,0,3,
3,Arsenal,2022-08-27,1,3,
4,Arsenal,2022-08-31,1,3,
5,Arsenal,2022-09-04,0,0,
6,Arsenal,2022-09-18,0,3,
7,Arsenal,2022-10-01,1,3,
8,Arsenal,2022-10-09,1,3,
9,Arsenal,2022-10-16,0,3,


In [126]:
# Window length, declared as in the other rolling-feature cells rather than a
# bare literal, so every rolling feature is driven by the same constant.
WINDOW = 5

# Start from an all-NaN column; only away rows are filled in below.
df["rolling_away_points_5"] = np.nan

# Restrict to away fixtures, then let groupby do the per-team split.
away_mask = df["is_home"] == 0
away_points_by_team = df.loc[away_mask].groupby("team")["points"]

# Points collected in the WINDOW away matches before the current one.
# shift(1) excludes the current match (no leakage); the sum is NaN until
# WINDOW earlier away matches exist for the team.
df.loc[away_mask, "rolling_away_points_5"] = away_points_by_team.transform(
    lambda s: s.shift(1).rolling(WINDOW).sum()
)

# Quick check
df[["team", "date", "is_home", "points", "rolling_away_points_5"]].head(15)

Unnamed: 0,team,date,is_home,points,rolling_away_points_5
0,Arsenal,2022-08-05,0,3,
1,Arsenal,2022-08-13,1,3,
2,Arsenal,2022-08-20,0,3,
3,Arsenal,2022-08-27,1,3,
4,Arsenal,2022-08-31,1,3,
5,Arsenal,2022-09-04,0,0,
6,Arsenal,2022-09-18,0,3,
7,Arsenal,2022-10-01,1,3,
8,Arsenal,2022-10-09,1,3,
9,Arsenal,2022-10-16,0,3,


In [127]:
# Window length, declared as in the other rolling-feature cells rather than a
# bare literal, so every rolling feature is driven by the same constant.
WINDOW = 5

# Total points over each team's WINDOW previous matches, home and away alike.
# shift(1) excludes the current match (no leakage); the sum is NaN until
# WINDOW earlier matches exist for the team.
df["rolling_points_5"] = df.groupby("team")["points"].transform(
    lambda s: s.shift(1).rolling(WINDOW).sum()
)

df[["team", "date", "points", "rolling_points_5"]].head(15)

Unnamed: 0,team,date,points,rolling_points_5
0,Arsenal,2022-08-05,3,
1,Arsenal,2022-08-13,3,
2,Arsenal,2022-08-20,3,
3,Arsenal,2022-08-27,3,
4,Arsenal,2022-08-31,3,
5,Arsenal,2022-09-04,0,15.0
6,Arsenal,2022-09-18,3,12.0
7,Arsenal,2022-10-01,3,12.0
8,Arsenal,2022-10-09,3,12.0
9,Arsenal,2022-10-16,3,12.0


In [128]:
# Venue-matched form: home rows take the home-only rolling points, every
# other row takes the away-only rolling points.
playing_at_home = df["is_home"] == 1
df["strength_points_5"] = df["rolling_home_points_5"].where(
    playing_at_home, df["rolling_away_points_5"]
)

preview_columns = [
    "team", "date", "is_home", "points",
    "rolling_home_points_5",
    "rolling_away_points_5",
    "strength_points_5",
]
df[preview_columns].head(15)

Unnamed: 0,team,date,is_home,points,rolling_home_points_5,rolling_away_points_5,strength_points_5
0,Arsenal,2022-08-05,0,3,,,
1,Arsenal,2022-08-13,1,3,,,
2,Arsenal,2022-08-20,0,3,,,
3,Arsenal,2022-08-27,1,3,,,
4,Arsenal,2022-08-31,1,3,,,
5,Arsenal,2022-09-04,0,0,,,
6,Arsenal,2022-09-18,0,3,,,
7,Arsenal,2022-10-01,1,3,,,
8,Arsenal,2022-10-09,1,3,,,
9,Arsenal,2022-10-16,0,3,,,
