# 06 - Add Elo feature to long features

Goals:
- Merge `elo_rating_22_23.csv` into `features_matches_long_22_23.csv` once and save the result, so the merge does not have to be repeated in later steps.

In [13]:
import pandas as pd
import numpy as np

# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

In [14]:
# Read the long-format feature table: every match appears twice, once per team.
long_features_path = "../data/processed/features_matches_long_22_23.csv"
df_long = pd.read_csv(long_features_path)
print("Long features shape:", df_long.shape)
df_long.head()

Long features shape: (760, 27)


Unnamed: 0,match_id,season,date,team,opponent,is_home,result,goals_for,goals_against,points,prob_book_home,prob_book_draw,prob_book_away,goal_diff,xg_diff,xg_for,xg_against,rolling_xg_for_5,rolling_xg_against_5,rolling_xg_diff_5,rolling_points_5,rolling_home_points_5,rolling_away_points_5,strength_points_5,rolling_goal_diff_5,rolling_home_goal_diff_5,rolling_away_goal_diff_5
0,1,2022-2023,2022-08-05,Arsenal,Crystal Palace,0,A,2,0,3,0.218795,0.266625,0.514581,2,-0.2,1.0,1.2,,,,,,,,,,
1,16,2022-2023,2022-08-13,Arsenal,Leicester City,1,H,4,2,3,0.616659,0.222323,0.161018,2,2.2,2.7,0.5,,,,,,,,,,
2,23,2022-2023,2022-08-20,Arsenal,Bournemouth,0,A,3,0,3,0.121887,0.204302,0.673812,3,1.0,1.3,0.3,,,,,,,,,,
3,35,2022-2023,2022-08-27,Arsenal,Fulham,1,H,2,1,3,0.73085,0.167736,0.101414,1,1.8,2.6,0.8,,,,,,,,,,
4,49,2022-2023,2022-08-31,Arsenal,Aston Villa,1,H,2,1,3,0.647946,0.209616,0.142438,1,2.0,2.4,0.4,,,,,,,,,,


In [15]:
# Read the Elo table (wide format: a single row per match holding both the
# home and the away rating). The "date" column is parsed while loading.
elo_ratings_path = "../data/processed/elo_rating_22_23.csv"
df_elo = pd.read_csv(elo_ratings_path, parse_dates=["date"])
print("Elo shape:", df_elo.shape)
df_elo.head()

Elo shape: (380, 8)


Unnamed: 0,match_id,date,home_team,away_team,result,elo_home_before,elo_away_before,elo_diff_home
0,1,2022-08-05,Crystal Palace,Arsenal,A,1500.0,1500.0,0.0
1,2,2022-08-06,Tottenham Hotspur,Southampton,H,1500.0,1500.0,0.0
2,3,2022-08-06,Fulham,Liverpool,D,1500.0,1500.0,0.0
3,4,2022-08-06,Bournemouth,Aston Villa,H,1500.0,1500.0,0.0
4,5,2022-08-06,Newcastle United,Nottingham Forest,H,1500.0,1500.0,0.0


In [16]:
# Elo must hold exactly one row per match_id; the merge later in this notebook
# is validated as many-to-one and relies on that.
duplicates_elo = df_elo["match_id"].duplicated().sum()
print("Number of duplicated match_id in Elo:", duplicates_elo)

# Report how many distinct matches each table contains.
print("Unique match_id in LONG:", df_long["match_id"].nunique())
print("Unique match_id in ELO :", df_elo["match_id"].nunique())

# Equal counts do not prove the keys are the same: compare the ids themselves.
# Any LONG match_id without an Elo row would silently become NaN in a left merge.
missing_in_elo = set(df_long["match_id"]) - set(df_elo["match_id"])

assert duplicates_elo == 0, f"{duplicates_elo} duplicated match_id values in Elo"
assert not missing_in_elo, (
    f"{len(missing_in_elo)} match_id values in LONG have no Elo row, "
    f"e.g. {sorted(missing_in_elo)[:5]}"
)

Number of duplicated match_id in Elo: 0
Unique match_id in LONG: 380
Unique match_id in ELO : 380


In [17]:
# Attach the per-match Elo columns to every team row of that match,
# joining on match_id.
elo_columns = ["match_id", "elo_home_before", "elo_away_before", "elo_diff_home"]

df_merged = pd.merge(
    df_long,
    df_elo[elo_columns],
    how="left",
    on="match_id",
    validate="many_to_one",  # several long rows per match_id -> a single Elo row
)

print("Merged shape:", df_merged.shape)

# Preview the identifying columns next to the freshly merged Elo columns.
merged_preview_columns = ["match_id", "team", "opponent", "is_home"] + elo_columns[1:]
df_merged[merged_preview_columns].head(10)

Merged shape: (760, 30)


Unnamed: 0,match_id,team,opponent,is_home,elo_home_before,elo_away_before,elo_diff_home
0,1,Arsenal,Crystal Palace,0,1500.0,1500.0,0.0
1,16,Arsenal,Leicester City,1,1515.328421,1497.171579,18.156841
2,23,Arsenal,Bournemouth,0,1500.192375,1524.387995,-24.19562
3,35,Arsenal,Fulham,1,1538.87863,1509.853905,29.024725
4,49,Arsenal,Aston Villa,1,1547.580022,1473.449924,74.130098
5,60,Arsenal,Manchester United,0,1516.028408,1554.871993,-38.843585
6,67,Arsenal,Brentford,0,1510.478673,1543.845827,-33.367154
7,68,Arsenal,Tottenham Hotspur,1,1558.013576,1552.23986,5.773716
8,85,Arsenal,Liverpool,1,1567.488821,1508.390265,59.098556
9,94,Arsenal,Leeds United,0,1484.212624,1575.235632,-91.023008


In [18]:
# Re-express the home/away Elo columns from the point of view of each row's team.

is_home_mask = df_merged["is_home"].eq(1)

# Own rating: keep the home Elo on home rows, take the away Elo everywhere else.
df_merged["elo_team_before"] = df_merged["elo_home_before"].where(
    is_home_mask, df_merged["elo_away_before"]
)

# Opponent rating: the mirror image of the selection above.
df_merged["elo_opponent_before"] = df_merged["elo_away_before"].where(
    is_home_mask, df_merged["elo_home_before"]
)

# Team rating minus opponent rating (positive when the row's team is rated higher).
df_merged["elo_diff_for_team"] = df_merged["elo_team_before"].sub(
    df_merged["elo_opponent_before"]
)

team_elo_preview_columns = [
    "date",
    "team",
    "opponent",
    "is_home",
    "elo_team_before",
    "elo_opponent_before",
    "elo_diff_for_team",
]
df_merged[team_elo_preview_columns].head(10)

Unnamed: 0,date,team,opponent,is_home,elo_team_before,elo_opponent_before,elo_diff_for_team
0,2022-08-05,Arsenal,Crystal Palace,0,1500.0,1500.0,0.0
1,2022-08-13,Arsenal,Leicester City,1,1515.328421,1497.171579,18.156841
2,2022-08-20,Arsenal,Bournemouth,0,1524.387995,1500.192375,24.19562
3,2022-08-27,Arsenal,Fulham,1,1538.87863,1509.853905,29.024725
4,2022-08-31,Arsenal,Aston Villa,1,1547.580022,1473.449924,74.130098
5,2022-09-04,Arsenal,Manchester United,0,1554.871993,1516.028408,38.843585
6,2022-09-18,Arsenal,Brentford,0,1543.845827,1510.478673,33.367154
7,2022-10-01,Arsenal,Tottenham Hotspur,1,1558.013576,1552.23986,5.773716
8,2022-10-09,Arsenal,Liverpool,1,1567.488821,1508.390265,59.098556
9,2022-10-16,Arsenal,Leeds United,0,1575.235632,1484.212624,91.023008


In [19]:
# Sanity check on the first match_id in the frame (deterministic, not random).
# The match should appear exactly twice, once from the home side and once from
# the away side, and the two team-perspective Elo differences should cancel out.
sample_match_id = df_merged["match_id"].iloc[0]
print("Sample match_id:", sample_match_id)

sample_rows = df_merged.loc[
    df_merged["match_id"] == sample_match_id,
    [
        "match_id",
        "team",
        "opponent",
        "is_home",
        "elo_team_before",
        "elo_opponent_before",
        "elo_diff_for_team",
    ],
]

assert len(sample_rows) == 2, (
    f"expected 2 rows for match_id {sample_match_id}, got {len(sample_rows)}"
)
assert set(sample_rows["is_home"]) == {0, 1}, "expected one home row and one away row"
# Tolerance rather than == 0 to stay safe against floating-point noise.
assert abs(sample_rows["elo_diff_for_team"].sum()) < 1e-9, (
    "home and away Elo differences should be opposite"
)

sample_rows

Sample match_id: 1


Unnamed: 0,match_id,team,opponent,is_home,elo_team_before,elo_opponent_before,elo_diff_for_team
0,1,Arsenal,Crystal Palace,0,1500.0,1500.0,0.0
228,1,Crystal Palace,Arsenal,1,1500.0,1500.0,0.0


In [20]:
# Persist the long+Elo feature table.
# OUTPUT_PATH is defined in this cell so that it runs on a fresh kernel; it was
# previously referenced in the print below without being defined anywhere in
# the notebook.
OUTPUT_PATH = "../data/processed/features_matches_long_elo_22_23.csv"
df_merged.to_csv(OUTPUT_PATH, index=False)
print(f"Saved long+Elo features to: {OUTPUT_PATH}")

Saved long+Elo features to: ../data/processed/features_matches_long_elo_22_23.csv


In [21]:
# Read the saved file back and preview a few rolling features next to the new
# team-perspective Elo columns.
saved_features_path = "../data/processed/features_matches_long_elo_22_23.csv"
df_final = pd.read_csv(saved_features_path)
print("Final long+Elo shape:", df_final.shape)

final_preview_columns = [
    "match_id",
    "date",
    "team",
    "opponent",
    "is_home",
    "rolling_xg_for_5",
    "rolling_points_5",
    "strength_points_5",
    "elo_team_before",
    "elo_diff_for_team",
]
df_final.loc[:, final_preview_columns].head(10)

Final long+Elo shape: (760, 33)


Unnamed: 0,match_id,date,team,opponent,is_home,rolling_xg_for_5,rolling_points_5,strength_points_5,elo_team_before,elo_diff_for_team
0,1,2022-08-05,Arsenal,Crystal Palace,0,,,,1500.0,0.0
1,16,2022-08-13,Arsenal,Leicester City,1,,,,1515.328421,18.156841
2,23,2022-08-20,Arsenal,Bournemouth,0,,,,1524.387995,24.19562
3,35,2022-08-27,Arsenal,Fulham,1,,,,1538.87863,29.024725
4,49,2022-08-31,Arsenal,Aston Villa,1,,,,1547.580022,74.130098
5,60,2022-09-04,Arsenal,Manchester United,0,2.0,15.0,,1554.871993,38.843585
6,67,2022-09-18,Arsenal,Brentford,0,2.06,12.0,,1543.845827,33.367154
7,68,2022-10-01,Arsenal,Tottenham Hotspur,1,1.82,12.0,,1558.013576,5.773716
8,85,2022-10-09,Arsenal,Liverpool,1,2.04,12.0,,1567.488821,59.098556
9,94,2022-10-16,Arsenal,Leeds United,0,2.06,12.0,,1575.235632,91.023008
