## Cleaning defending data

In [42]:
from fbref_utils import load_fbref_csv, common_cleaning
import pandas as pd1

In [43]:
df_defending = load_fbref_csv("../uncleaned_data_csv/defending.csv")
df_defending = common_cleaning(df_defending)

In [44]:
df_defending.head()

26,Player,Nation,Position,Squad,League,Age,Born,90s,Tkl,TklW,...,Lost,Blocks,Sh,Pass,Int,Tkl+Int,Clr,Err,Matches,Season
0,Imanol Agirretxe,ESP,FW,Real Sociedad,La Liga,30,1987,2.6,0,0,...,1,0,0,0,0,0,2,0,Matches,2017-2018
1,Sergio Agüero,ARG,FW,Manchester City,Premier League,29,1988,21.8,9,6,...,15,12,1,11,6,15,3,0,Matches,2017-2018
2,Ruben Aguilar,FRA,DF,Montpellier,Ligue 1,24,1993,27.3,91,50,...,35,25,3,22,37,128,90,0,Matches,2017-2018
3,Matías Aguirregaray,URU,DF,Las Palmas,La Liga,28,1989,12.8,26,20,...,20,13,1,12,24,50,25,0,Matches,2017-2018
4,Jean-Eudes Aholou,CIV,MF,Strasbourg,Ligue 1,23,1994,33.4,70,34,...,38,29,7,22,75,145,58,1,Matches,2017-2018


In [41]:
print("Matches" in df_defending.columns)
print(df_defending.columns[df_defending.columns.str.contains("Match", case=False)])


True
Index(['Matches'], dtype='object', name=26)


In [26]:
list(df_defending.columns)

['Player',
 'Nation',
 'Position',
 'Squad',
 'League',
 'Age',
 'Born',
 '90s',
 'Tkl',
 'TklW',
 'Def 3rd',
 'Mid 3rd',
 'Att 3rd',
 'Tkl',
 'Att',
 'Tkl%',
 'Lost',
 'Blocks',
 'Sh',
 'Pass',
 'Int',
 'Tkl+Int',
 'Clr',
 'Err',
 'Matches',
 'Season']

In [27]:
df_defending.dtypes

26
Player        object
Nation      category
Position    category
Squad       category
League      category
Age            Int64
Born           Int64
90s           object
Tkl           object
TklW          object
Def 3rd       object
Mid 3rd       object
Att 3rd       object
Tkl           object
Att           object
Tkl%          object
Lost          object
Blocks        object
Sh            object
Pass          object
Int           object
Tkl+Int       object
Clr           object
Err           object
Matches       object
Season      category
dtype: object

In [30]:
def clean_defending(df: pd.DataFrame) -> pd.DataFrame:
    """
    Defending-specific cleaning:
    - Resolve duplicate column names.
    - Rename into BigQuery-safe snake_case.
    - Convert to appropriate numeric dtypes.
    """

    # 1. Assign unique names to duplicates BEFORE renaming
    df.columns = [
        "Player", "Nation", "Position", "Squad", "League", "Age", "Born", "90s",
        "Tkl_tackles", "TklW", "Def 3rd", "Mid 3rd", "Att 3rd",
        "Tkl_challenges", "Att_challenges", "Tkl%_challenges", "Lost",
        "Blocks", "Sh_blocks", "Pass_blocks",
        "Int", "Tkl+Int", "Clr", "Err", "Season"
    ]

    # 2. Rename to BigQuery-safe snake_case
    rename_map = {
        # Tackles
        "Tkl_tackles": "tackles_total",
        "TklW": "tackles_won",
        "Def 3rd": "tackles_def_3rd",
        "Mid 3rd": "tackles_mid_3rd",
        "Att 3rd": "tackles_att_3rd",

        # Challenges
        "Tkl_challenges": "dribblers_tackled",
        "Att_challenges": "dribbles_challenged",
        "Tkl%_challenges": "dribblers_tackled_pct",
        "Lost": "challenges_lost",

        # Blocks
        "Blocks": "blocks_total",
        "Sh_blocks": "blocks_shots",
        "Pass_blocks": "blocks_passes",

        # Other
        "Int": "interceptions",
        "Tkl+Int": "tackles_plus_interceptions",
        "Clr": "clearances",
        "Err": "errors_leading_to_shot",
    }
    df = df.rename(columns=rename_map)

    # 3. Type conversions
    int_cols = [
        "tackles_total", "tackles_won", "tackles_def_3rd", "tackles_mid_3rd", "tackles_att_3rd",
        "dribblers_tackled", "dribbles_challenged", "challenges_lost",
        "blocks_total", "blocks_shots", "blocks_passes",
        "interceptions", "tackles_interceptions", "clearances", "errors_leading_to_shot"
    ]
    float_cols = ["dribblers_tackled_pct"]

    for col in int_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

    for col in float_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("float")

    return df


In [31]:
df_defending_cleaned = clean_defending(df_defending)

ValueError: Length mismatch: Expected axis has 26 elements, new values have 25 elements

In [None]:
df_defending_cleaned.head()

Unnamed: 0,Player,Nation,Position,Squad,League,Age,Born,90s,tackles_total,tackles_won,...,challenges_lost,blocks_total,blocks_shots,blocks_passes,interceptions,tackles_plus_interceptions,clearances,errors_leading_to_shot,matches,Season
0,Imanol Agirretxe,ESP,FW,Real Sociedad,La Liga,30,1987,2.6,0,0,...,1,0,0,0,0,0,2,0,,2017-2018
1,Sergio Agüero,ARG,FW,Manchester City,Premier League,29,1988,21.8,9,6,...,15,12,1,11,6,15,3,0,,2017-2018
2,Ruben Aguilar,FRA,DF,Montpellier,Ligue 1,24,1993,27.3,91,50,...,35,25,3,22,37,128,90,0,,2017-2018
3,Matías Aguirregaray,URU,DF,Las Palmas,La Liga,28,1989,12.8,26,20,...,20,13,1,12,24,50,25,0,,2017-2018
4,Jean-Eudes Aholou,CIV,MF,Strasbourg,Ligue 1,23,1994,33.4,70,34,...,38,29,7,22,75,145,58,1,,2017-2018


In [None]:
list(df_defending_cleaned.columns)

['Player',
 'Nation',
 'Position',
 'Squad',
 'League',
 'Age',
 'Born',
 '90s',
 'tackles_total',
 'tackles_won',
 'tackles_def_3rd',
 'tackles_mid_3rd',
 'tackles_att_3rd',
 'dribblers_tackled',
 'dribbles_challenged',
 'dribblers_tackled_pct',
 'challenges_lost',
 'blocks_total',
 'blocks_shots',
 'blocks_passes',
 'interceptions',
 'tackles_plus_interceptions',
 'clearances',
 'errors_leading_to_shot',
 'matches',
 'Season']

In [None]:
df_defending_cleaned.dtypes

Player                          object
Nation                        category
Position                      category
Squad                         category
League                        category
Age                              Int64
Born                             Int64
90s                             object
tackles_total                    Int64
tackles_won                      Int64
tackles_def_3rd                  Int64
tackles_mid_3rd                  Int64
tackles_att_3rd                  Int64
dribblers_tackled                Int64
dribbles_challenged              Int64
dribblers_tackled_pct          float64
challenges_lost                  Int64
blocks_total                     Int64
blocks_shots                     Int64
blocks_passes                    Int64
interceptions                    Int64
tackles_plus_interceptions      object
clearances                       Int64
errors_leading_to_shot           Int64
matches                          Int64
Season                   