This notebook loads the raw football match dataset, which contains match logs with a variety of statistics from many leagues and seasons. Only the following is kept:

- Leagues: Bundesliga, English Premier League, La Liga, Ligue 1, and Serie A
- Season: 2023/2024
- Statistics: match score, expected goals (xG), ball possession, shots on target, corners, and total passes

The output is a clean DataFrame in which each row represents a single match and its statistics, ready for feature engineering.

In [None]:
import pandas as pd

In [2]:
# Read the complete raw match-log CSV into a DataFrame
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to a configurable data directory so the notebook runs elsewhere.
# NOTE(review): the saved output of this cell echoes the read_csv line the way
# a pandas warning does (possibly a mixed-dtype DtypeWarning) - confirm, and if
# so consider passing explicit dtypes.
raw_csv_path = r"C:\cross-league-match-outcome-drivers\data\Football.csv"
df = pd.read_csv(raw_csv_path)

# Cell output: (number of rows, number of columns) of the raw frame
df.shape


  df = pd.read_csv(r"C:\cross-league-match-outcome-drivers\data\Football.csv")


(95384, 91)

In [None]:
# Keep only the columns and rows needed for this analysis

# Columns to keep (everything not listed here is dropped); the list order is
# the column order of the resulting frame
cols = [
    'League', 'home_score', 'away_score', 'season_year',
    'expected_goals_xg_home', 'expected_goals_xg_host',
    'Ball_Possession_Home', 'Ball_Possession_Host',
    'Shots_on_Goal_Home', 'Shots_on_Goal_Host',
    'Corner_Kicks_Home', 'Corner_Kicks_Host',
    'Total_Passes_Home', 'Total_Passes_Host',
]
df = df[cols]

# Rows to keep: matches from the 2023/2024 season played in one of the five
# selected leagues (labels must match the dataset's spelling exactly)
df = df[
    (df["season_year"] == "2023/2024") &
    (df["League"].isin([
        "Premier-league",
        "Laliga",
        "Bundesliga",
        "Serie-a",
        "Ligue-1"
    ]))
]

# Preview the filtered frame; without a trailing expression the cell shows
# nothing on a fresh Restart & Run All
df.head(10)

Unnamed: 0,League,home_score,away_score,season_year,expected_goals_xg_home,expected_goals_xg_host,Ball_Possession_Home,Ball_Possession_Host,Shots_on_Goal_Home,Shots_on_Goal_Host,Corner_Kicks_Home,Corner_Kicks_Host,Total_Passes_Home,Total_Passes_Host
81,Bundesliga,0,4,2023/2024,1.94,2.76,42%,58%,5.0,8.0,14.0,8.0,363.0,703.0
82,Bundesliga,0,3,2023/2024,1.32,2.25,55%,45%,3.0,8.0,7.0,8.0,482.0,313.0
83,Bundesliga,2,1,2023/2024,2.86,0.77,62%,38%,8.0,2.0,5.0,4.0,728.0,382.0
84,Bundesliga,4,0,2023/2024,2.83,0.42,71%,29%,10.0,1.0,7.0,3.0,727.0,258.0
85,Bundesliga,2,2,2023/2024,2.02,1.73,46%,54%,6.0,5.0,8.0,3.0,432.0,520.0
86,Bundesliga,4,1,2023/2024,0.92,2.56,50%,50%,7.0,9.0,5.0,12.0,370.0,419.0
87,Bundesliga,4,2,2023/2024,2.21,1.59,48%,52%,9.0,4.0,6.0,5.0,469.0,509.0
88,Bundesliga,4,0,2023/2024,1.84,1.45,58%,42%,11.0,5.0,8.0,4.0,716.0,475.0
89,Bundesliga,2,1,2023/2024,2.68,1.04,43%,57%,5.0,5.0,3.0,7.0,303.0,517.0
90,Bundesliga,4,1,2023/2024,3.87,1.98,48%,52%,12.0,8.0,8.0,8.0,355.0,392.0


In [None]:
# Standardise the column names: "score" becomes "goals", statistic names are
# shortened to lowercase snake_case, and the "_Host"/"_host" suffix is
# relabelled as "_away". Columns not listed keep their current names.

df = df.rename(
    {
        "home_score": "home_goals",
        "away_score": "away_goals",
        "expected_goals_xg_home": "xg_home",
        "expected_goals_xg_host": "xg_away",
        "Ball_Possession_Home": "possession_home",
        "Ball_Possession_Host": "possession_away",
        "Shots_on_Goal_Home": "shots_on_target_home",
        "Shots_on_Goal_Host": "shots_on_target_away",
        "Corner_Kicks_Home": "corners_home",
        "Corner_Kicks_Host": "corners_away",
        "Total_Passes_Home": "passes_home",
        "Total_Passes_Host": "passes_away",
    },
    axis="columns",
)

# Preview the first five rows with the new names
df.head()

Unnamed: 0,League,home_goals,away_goals,season_year,xg_home,xg_away,possession_home,possession_away,shots_on_target_home,shots_on_target_away,corners_home,corners_away,passes_home,passes_away
81,Bundesliga,0,4,2023/2024,1.94,2.76,42%,58%,5.0,8.0,14.0,8.0,363.0,703.0
82,Bundesliga,0,3,2023/2024,1.32,2.25,55%,45%,3.0,8.0,7.0,8.0,482.0,313.0
83,Bundesliga,2,1,2023/2024,2.86,0.77,62%,38%,8.0,2.0,5.0,4.0,728.0,382.0
84,Bundesliga,4,0,2023/2024,2.83,0.42,71%,29%,10.0,1.0,7.0,3.0,727.0,258.0
85,Bundesliga,2,2,2023/2024,2.02,1.73,46%,54%,6.0,5.0,8.0,3.0,432.0,520.0
