## Cleaning passing

In [1]:
from fbref_utils import load_fbref_csv, common_cleaning
import pandas as pd

In [2]:
df_passing = load_fbref_csv("../uncleaned_data_csv/passing.csv")
df_passing = common_cleaning(df_passing)

In [3]:
df_passing.head()

26,Player,Nation,Position,Squad,League,Age,Born,90s,Cmp,Att,...,xAG,xA,A-xAG,KP,1/3,PPA,CrsPA,PrgP,Matches,Season
0,Imanol Agirretxe,ESP,FW,Real Sociedad,La Liga,30,1987,2.6,38,60,...,0.0,0.0,0.0,0,1,0,0,5,Matches,2017-2018
1,Sergio Agüero,ARG,FW,Manchester City,Premier League,29,1988,21.8,529,650,...,5.6,3.8,0.4,40,25,24,0,49,Matches,2017-2018
2,Ruben Aguilar,FRA,DF,Montpellier,Ligue 1,24,1993,27.3,1186,1545,...,3.7,2.5,-1.7,36,91,32,21,130,Matches,2017-2018
3,Matías Aguirregaray,URU,DF,Las Palmas,La Liga,28,1989,12.8,519,698,...,0.6,1.3,-0.6,5,42,18,5,57,Matches,2017-2018
4,Jean-Eudes Aholou,CIV,MF,Strasbourg,Ligue 1,23,1994,33.4,1162,1359,...,1.0,0.8,-1.0,13,101,5,0,131,Matches,2017-2018


In [4]:
df_passing.dtypes

26
Player        object
Nation      category
Position    category
Squad       category
League      category
Age            Int64
Born           Int64
90s           object
Cmp           object
Att           object
Cmp%          object
TotDist       object
PrgDist       object
Cmp           object
Att           object
Cmp%          object
Cmp           object
Att           object
Cmp%          object
Cmp           object
Att           object
Cmp%          object
Ast           object
xAG           object
xA            object
A-xAG         object
KP            object
1/3           object
PPA           object
CrsPA         object
PrgP          object
Matches       object
Season      category
dtype: object

In [5]:
list(df_passing.columns)

['Player',
 'Nation',
 'Position',
 'Squad',
 'League',
 'Age',
 'Born',
 '90s',
 'Cmp',
 'Att',
 'Cmp%',
 'TotDist',
 'PrgDist',
 'Cmp',
 'Att',
 'Cmp%',
 'Cmp',
 'Att',
 'Cmp%',
 'Cmp',
 'Att',
 'Cmp%',
 'Ast',
 'xAG',
 'xA',
 'A-xAG',
 'KP',
 '1/3',
 'PPA',
 'CrsPA',
 'PrgP',
 'Matches',
 'Season']

In [8]:
print(df_passing.columns.tolist())

['Player', 'Nation', 'Position', 'Squad', 'League', 'Age', 'Born', '90s', 'Cmp', 'Att', 'Cmp%', 'TotDist', 'PrgDist', 'Cmp', 'Att', 'Cmp%', 'Cmp', 'Att', 'Cmp%', 'Cmp', 'Att', 'Cmp%', 'Ast', 'xAG', 'xA', 'A-xAG', 'KP', '1/3', 'PPA', 'CrsPA', 'PrgP', 'Matches', 'Season']


In [14]:
def clean_passing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Passing-specific cleaning:
    - Disambiguate duplicate columns (Cmp, Att, Cmp%) by section (total, short, medium, long).
    - Rename columns into BigQuery-safe snake_case.
    - Convert to appropriate numeric dtypes.

    Parameters
    ----------
    df : pd.DataFrame
        Passing dataframe after common cleaning.

    Returns
    -------
    pd.DataFrame
        Cleaned dataframe ready for BigQuery/analysis.
    """

    # 1. Assign unique names to duplicates BEFORE renaming
    df.columns = [
        "Player", "Nation", "Position", "Squad", "League", "Age", "Born", "90s",
        "Cmp_total", "Att_total", "Cmp_pct_total", "TotDist", "PrgDist",
        "Cmp_short", "Att_short", "Cmp_pct_short",
        "Cmp_medium", "Att_medium", "Cmp_pct_medium",
        "Cmp_long", "Att_long", "Cmp_pct_long",
        "Ast", "xAG", "xA", "A-xAG", "KP", "final_third", "PPA", "CrsPA", "PrgP",
        "Matches", "Season"
    ]

    # 2. BigQuery-safe renaming (snake_case, no symbols)
    rename_map = {
        # Total
        "Cmp_total": "passes_completed_total",
        "Att_total": "passes_attempted_total",
        "Cmp_pct_total": "passes_completion_pct_total",
        "TotDist": "total_passing_distance",
        "PrgDist": "progressive_passing_distance",

        # Short
        "Cmp_short": "passes_completed_short",
        "Att_short": "passes_attempted_short",
        "Cmp_pct_short": "passes_completion_pct_short",

        # Medium
        "Cmp_medium": "passes_completed_medium",
        "Att_medium": "passes_attempted_medium",
        "Cmp_pct_medium": "passes_completion_pct_medium",

        # Long
        "Cmp_long": "passes_completed_long",
        "Att_long": "passes_attempted_long",
        "Cmp_pct_long": "passes_completion_pct_long",

        # Other
        "Ast": "assists",
        "xAG": "xag", #rename this
        "xA": "xa",
        "A-xAG": "assists_minus_xag",
        "KP": "key_passes",
        "final_third": "passes_into_final_third",
        "PPA": "passes_into_penalty_area",
        "CrsPA": "crosses_into_penalty_area",
        "PrgP": "progressive_passes",
        "Matches": "matches"
    }
    df = df.rename(columns=rename_map)

    # 3. Convert to numeric types
    int_cols = [
        "passes_completed_total", "passes_attempted_total",
        "passes_completed_short", "passes_attempted_short",
        "passes_completed_medium", "passes_attempted_medium",
        "passes_completed_long", "passes_attempted_long",
        "assists", "key_passes", "passes_into_final_third",
        "passes_into_penalty_area", "crosses_into_penalty_area",
        "progressive_passes", "matches"
    ]
    float_cols = [
        "passes_completion_pct_total", "total_passing_distance", "progressive_passing_distance",
        "passes_completion_pct_short", "passes_completion_pct_medium", "passes_completion_pct_long",
        "xag", "xa", "assists_minus_xag"
    ]

    for col in int_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

    for col in float_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("float")

    return df

In [15]:
df_passing_cleaned = clean_passing(df_passing)

In [16]:
df_passing_cleaned.head()

Unnamed: 0,Player,Nation,Position,Squad,League,Age,Born,90s,passes_completed_total,passes_attempted_total,...,xag,xa,assists_minus_xag,key_passes,passes_into_final_third,passes_into_penalty_area,crosses_into_penalty_area,progressive_passes,matches,Season
0,Imanol Agirretxe,ESP,FW,Real Sociedad,La Liga,30,1987,2.6,38,60,...,0.0,0.0,0.0,0,1,0,0,5,,2017-2018
1,Sergio Agüero,ARG,FW,Manchester City,Premier League,29,1988,21.8,529,650,...,5.6,3.8,0.4,40,25,24,0,49,,2017-2018
2,Ruben Aguilar,FRA,DF,Montpellier,Ligue 1,24,1993,27.3,1186,1545,...,3.7,2.5,-1.7,36,91,32,21,130,,2017-2018
3,Matías Aguirregaray,URU,DF,Las Palmas,La Liga,28,1989,12.8,519,698,...,0.6,1.3,-0.6,5,42,18,5,57,,2017-2018
4,Jean-Eudes Aholou,CIV,MF,Strasbourg,Ligue 1,23,1994,33.4,1162,1359,...,1.0,0.8,-1.0,13,101,5,0,131,,2017-2018


In [17]:
list(df_passing_cleaned.columns)

['Player',
 'Nation',
 'Position',
 'Squad',
 'League',
 'Age',
 'Born',
 '90s',
 'passes_completed_total',
 'passes_attempted_total',
 'passes_completion_pct_total',
 'total_passing_distance',
 'progressive_passing_distance',
 'passes_completed_short',
 'passes_attempted_short',
 'passes_completion_pct_short',
 'passes_completed_medium',
 'passes_attempted_medium',
 'passes_completion_pct_medium',
 'passes_completed_long',
 'passes_attempted_long',
 'passes_completion_pct_long',
 'assists',
 'xag',
 'xa',
 'assists_minus_xag',
 'key_passes',
 'passes_into_final_third',
 'passes_into_penalty_area',
 'crosses_into_penalty_area',
 'progressive_passes',
 'matches',
 'Season']