## Where I will clean shooting

In [10]:
from fbref_utils import load_fbref_csv, common_cleaning
import pandas as pd

In [11]:
df_shooting = load_fbref_csv("../uncleaned_data_csv/shooting.csv")
df_shooting = common_cleaning(df_shooting)

In [12]:
df_shooting.head()

26,Player,Nation,Position,Squad,League,Age,Born,90s,Gls,Sh,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Season
0,Imanol Agirretxe,ESP,FW,Real Sociedad,La Liga,30,1987,2.6,0,6,...,10.4,0,0,0,1.2,1.2,0.19,-1.2,-1.2,2017-2018
1,Sergio Agüero,ARG,FW,Manchester City,Premier League,29,1988,21.8,21,90,...,13.9,1,4,4,16.2,13.1,0.15,4.8,3.9,2017-2018
2,Ruben Aguilar,FRA,DF,Montpellier,Ligue 1,24,1993,27.3,0,16,...,18.6,0,0,0,0.7,0.7,0.05,-0.7,-0.7,2017-2018
3,Matías Aguirregaray,URU,DF,Las Palmas,La Liga,28,1989,12.8,0,9,...,21.1,0,0,0,0.6,0.6,0.07,-0.6,-0.6,2017-2018
4,Jean-Eudes Aholou,CIV,MF,Strasbourg,Ligue 1,23,1994,33.4,5,39,...,23.8,0,0,0,2.2,2.2,0.06,2.8,2.8,2017-2018


In [13]:
list(df_shooting.columns)

['Player',
 'Nation',
 'Position',
 'Squad',
 'League',
 'Age',
 'Born',
 '90s',
 'Gls',
 'Sh',
 'SoT',
 'SoT%',
 'Sh/90',
 'SoT/90',
 'G/Sh',
 'G/SoT',
 'Dist',
 'FK',
 'PK',
 'PKatt',
 'xG',
 'npxG',
 'npxG/Sh',
 'G-xG',
 'np:G-xG',
 'Season']

In [14]:
df_shooting.dtypes

26
Player        object
Nation      category
Position    category
Squad       category
League      category
Age            Int64
Born           Int64
90s           object
Gls           object
Sh            object
SoT           object
SoT%          object
Sh/90         object
SoT/90        object
G/Sh          object
G/SoT         object
Dist          object
FK            object
PK            object
PKatt         object
xG            object
npxG          object
npxG/Sh       object
G-xG          object
np:G-xG       object
Season      category
dtype: object

In [15]:
def clean_shooting(df: pd.DataFrame) -> pd.DataFrame:
    """
    Shooting-specific cleaning:
    - Rename columns to BigQuery-safe snake_case.
    - Convert to appropriate numeric dtypes.
    """
    rename_map = {
        "Gls": "goals",
        "Sh": "shots_total",
        "SoT": "shots_on_target",
        "SoT%": "shots_on_target_pct",
        "Sh/90": "shots_total_per_90",
        "SoT/90": "shots_on_target_per_90",
        "G/Sh": "goals_per_shot",
        "G/SoT": "goals_per_shot_on_target",
        "Dist": "avg_shot_distance",
        "FK": "shots_from_free_kicks",
        "PK": "penalty_kicks_made",
        "PKatt": "penalty_kicks_attempted",
        "xG": "xg",
        "npxG": "npxg",
        "npxG/Sh": "npxg_per_shot",
        "G-xG": "goals_minus_xg",
        "np:G-xG": "nonpen_goals_minus_npxg",
        # Season already cleaned in common_cleaning
    }
    df = df.rename(columns=rename_map)

    # Convert to numeric (safe coercion)
    int_cols = [
        "goals", "shots_total", "shots_on_target",
        "shots_from_free_kicks", "penalty_kicks_made",
        "penalty_kicks_attempted"
    ]
    float_cols = [
        "shots_on_target_pct", "shots_total_per_90", "shots_on_target_per_90",
        "goals_per_shot", "goals_per_shot_on_target", "avg_shot_distance",
        "xg", "npxg", "npxg_per_shot", "goals_minus_xg", "nonpen_goals_minus_npxg"
    ]

    for col in int_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

    for col in float_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("float")

    return df

In [16]:
df_shooting_cleaned = clean_shooting(df_shooting)

In [17]:
list(df_shooting_cleaned.columns)

['Player',
 'Nation',
 'Position',
 'Squad',
 'League',
 'Age',
 'Born',
 '90s',
 'goals',
 'shots_total',
 'shots_on_target',
 'shots_on_target_pct',
 'shots_total_per_90',
 'shots_on_target_per_90',
 'goals_per_shot',
 'goals_per_shot_on_target',
 'avg_shot_distance',
 'shots_from_free_kicks',
 'penalty_kicks_made',
 'penalty_kicks_attempted',
 'xg',
 'npxg',
 'npxg_per_shot',
 'goals_minus_xg',
 'nonpen_goals_minus_npxg',
 'Season']

In [18]:
df_shooting_cleaned.dtypes

26
Player                        object
Nation                      category
Position                    category
Squad                       category
League                      category
Age                            Int64
Born                           Int64
90s                           object
goals                          Int64
shots_total                    Int64
shots_on_target                Int64
shots_on_target_pct          float64
shots_total_per_90           float64
shots_on_target_per_90       float64
goals_per_shot               float64
goals_per_shot_on_target     float64
avg_shot_distance            float64
shots_from_free_kicks          Int64
penalty_kicks_made             Int64
penalty_kicks_attempted        Int64
xg                           float64
npxg                         float64
npxg_per_shot                float64
goals_minus_xg               float64
nonpen_goals_minus_npxg      float64
Season                      category
dtype: object