## Some columns are shared by every CSV. Those shared columns are cleaned here,
## before each dataset goes through its own dataset-specific cleaning script.

In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
# First-pass read with pandas' default header handling (the first line of each
# CSV becomes the column labels) so the raw column layout can be inspected.
shooting_original = pd.read_csv('../uncleaned_data_csv/shooting.csv')
defending_original = pd.read_csv('../uncleaned_data_csv/defending.csv')
passing_original = pd.read_csv('../uncleaned_data_csv/passing.csv')
standard_original = pd.read_csv('../uncleaned_data_csv/standard.csv')

## Going through the head and columns of each dataset so I can track what needs to be changed

In [4]:
# Only the last expression of a cell is displayed, so the head() result below is
# computed but not shown; the visible output is the column list.
shooting_original.head()
list(shooting_original.columns)

['Unnamed: 0',
 'Unnamed: 1',
 'Unnamed: 2',
 'Unnamed: 3',
 'Rk',
 'Player',
 'Nation',
 'Position',
 'Squad',
 'Competition',
 'Age',
 'Year of birth',
 '90s Played',
 'Goals',
 'Shots Total',
 'Shots on Target',
 'Shots on Target %',
 'Shots Total/90',
 'Shots on target/90',
 'Goals/Shot',
 'Goals/Shot on Target',
 'Average Shot Distance',
 'Shots from Free Kicks',
 'Penalty Kicks Made',
 'Penalty Kicks Attempted',
 'xG: Expected Goals',
 'npxG: Non-Penalty xG',
 'Season']

In [5]:
# Only the last expression of a cell is displayed, so the head() result below is
# computed but not shown; the visible output is the column list.
defending_original.head()
list(defending_original.columns)

['Unnamed: 0',
 'Unnamed: 1',
 'Unnamed: 2',
 'Unnamed: 3',
 'Unnamed: 4',
 'Unnamed: 5',
 'Rk',
 'Player',
 'Nation',
 'Position',
 'Squad',
 'Competition',
 'Age',
 'Year of birth',
 '90s Played',
 'Tackles',
 'Tackles Won',
 'Tackles (Def 3rd)',
 'Tackles (Mid 3rd)',
 'Tackles (Att 3rd)',
 'Dribblers Tackled',
 'Dribbles Challenged',
 '% of Dribblers Tackled',
 'Challenges Lost',
 'Blocks',
 'Shots Blocked',
 'Season']

In [6]:
# Only the last expression of a cell is displayed, so the head() result below is
# computed but not shown; the visible output is the column list.
passing_original.head()
list(passing_original.columns)

['Unnamed: 0',
 'Unnamed: 1',
 'Unnamed: 2',
 'Unnamed: 3',
 'Unnamed: 4',
 'Unnamed: 5',
 'Unnamed: 6',
 'Unnamed: 7',
 'Unnamed: 8',
 'Rk',
 'Player',
 'Nation',
 'Position',
 'Squad',
 'Competition',
 'Age',
 'Year of birth',
 '90s Played',
 'Passes Completed',
 'Passes Attempted',
 'Pass Completion %',
 'Total Passing Distance',
 'Progressive Passing Distance',
 'Passes Completed (Short)',
 'Passes Attempted (Short)',
 'Pass Completion % (Short)',
 'Passes Completed (Medium)',
 'Passes Attempted (Medium)',
 'Pass Completion % (Medium)',
 'Passes Completed (Long)',
 'Passes Attempted (Long)',
 'Pass Completion % (Long)',
 'Assists',
 'Season']

In [7]:
# Only the last expression of a cell is displayed, so the head() result below is
# computed but not shown; the visible output is the column list.
standard_original.head()
list(standard_original.columns)

['Unnamed: 0',
 'Unnamed: 1',
 'Unnamed: 2',
 'Unnamed: 3',
 'Unnamed: 4',
 'Unnamed: 5',
 'Unnamed: 6',
 'Rk',
 'Player',
 'Nation',
 'Position',
 'Squad',
 'Competition',
 'Age',
 'Year of birth',
 'Matches Played',
 'Starts',
 'Minutes',
 '90s Played',
 'Goals',
 'Assists',
 'Goals + Assists',
 'Non-Penalty Goals',
 'Penalty Kicks Made',
 'Penalty Kicks Attempted',
 'Yellow Cards',
 'Red Cards',
 'xG: Expected Goals',
 'npxG: Non-Penalty xG',
 'xAG: Exp. Assisted Goals',
 'npxG + xAG',
 'Progressive Carries',
 'Progressive Passes',
 'Progressive Passes Rec',
 'Goals/90',
 'Assists/90',
 'Goals + Assists/90',
 'Non-Penalty Goals/90',
 'Season']

In [8]:
def load_fbref_csv(path, header_identifier="Rk", raw=False):
    """
    Load an FBref CSV and strip the header rows that are repeated inside the data.

    Workflow:
    - The file is always read with ``header=None`` so that every line, header
      lines included, arrives as an ordinary data row.
    - If raw=True: return that DataFrame exactly as read (for debugging/inspection).
    - If raw=False:
        * Find the first row whose first cell equals header_identifier (e.g., "Rk").
        * Use that row's values as the column labels.
        * Keep only the rows below it and drop every repeated header row.
        * Reset the index so it starts at 0,1,2...
        * Leave both axis names unset, so the position of the header row
          (e.g. '26') never shows up as a label when the frame is displayed.

    Parameters
    ----------
    path : str
        Path to the CSV file.
    header_identifier : str, optional
        Value in the first column that marks the true header row (default="Rk").
    raw : bool, optional
        If True, returns the unmodified raw DataFrame. Default=False.

    Returns
    -------
    pd.DataFrame
        Cleaned (or raw) DataFrame.

    Raises
    ------
    ValueError
        If no row has header_identifier in its first column.
    """
    # Always read with no header (since FBref headers are messy)
    df_raw = pd.read_csv(path, header=None)

    if raw:
        return df_raw

    # Positions of every row whose first cell is the header marker
    is_header_row = df_raw.iloc[:, 0] == header_identifier
    header_hits = df_raw.index[is_header_row.to_numpy()]
    if header_hits.empty:
        raise ValueError(f"No header row found with first column == '{header_identifier}' in {path}")
    header_row_idx = header_hits[0]

    # Take the labels as a plain list. Assigning the row Series itself would
    # carry its name (the row position) over as the column-axis name, which is
    # what rendered as a stray '26' above the index in displayed frames.
    header_values = df_raw.iloc[header_row_idx].tolist()

    # Data starts after the header row.
    # NOTE(review): every row above header_row_idx is discarded. If the file
    # holds player rows before its first marker row they are lost here --
    # confirm against the raw CSV (load with raw=True).
    df = df_raw.iloc[header_row_idx + 1:].copy()
    df.columns = header_values

    # Drop repeated headers inside the data
    df = df[df.iloc[:, 0] != header_identifier]

    # Fresh 0..n-1 index; the new default index has no name
    df = df.reset_index(drop=True)

    return df

In [9]:
# Reload with the header-aware loader; this rebinds shooting_original, replacing
# the frame read earlier with plain pd.read_csv.
shooting_original = load_fbref_csv('../uncleaned_data_csv/shooting.csv')
shooting_original.head()

26,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,90s,Gls,...,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Matches,2017-2018
0,26,Imanol Agirretxe,es ESP,FW,Real Sociedad,es La Liga,30,1987,2.6,0,...,0,0,0,1.2,1.2,0.19,-1.2,-1.2,Matches,2017-2018
1,27,Sergio Agüero,ar ARG,FW,Manchester City,eng Premier League,29,1988,21.8,21,...,1,4,4,16.2,13.1,0.15,4.8,3.9,Matches,2017-2018
2,28,Ruben Aguilar,fr FRA,DF,Montpellier,fr Ligue 1,24,1993,27.3,0,...,0,0,0,0.7,0.7,0.05,-0.7,-0.7,Matches,2017-2018
3,29,Matías Aguirregaray,uy URU,DF,Las Palmas,es La Liga,28,1989,12.8,0,...,0,0,0,0.6,0.6,0.07,-0.6,-0.6,Matches,2017-2018
4,30,Jean-Eudes Aholou,ci CIV,MF,Strasbourg,fr Ligue 1,23,1994,33.4,5,...,0,0,0,2.2,2.2,0.06,2.8,2.8,Matches,2017-2018


In [10]:
# Reload with the header-aware loader; this rebinds defending_original, replacing
# the frame read earlier with plain pd.read_csv.
defending_original = load_fbref_csv('../uncleaned_data_csv/defending.csv')
defending_original.head()

26,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,90s,Tkl,...,Lost,Blocks,Sh,Pass,Int,Tkl+Int,Clr,Err,Matches,2017-2018
0,26,Imanol Agirretxe,es ESP,FW,Real Sociedad,es La Liga,30,1987,2.6,0,...,1,0,0,0,0,0,2,0,Matches,2017-2018
1,27,Sergio Agüero,ar ARG,FW,Manchester City,eng Premier League,29,1988,21.8,9,...,15,12,1,11,6,15,3,0,Matches,2017-2018
2,28,Ruben Aguilar,fr FRA,DF,Montpellier,fr Ligue 1,24,1993,27.3,91,...,35,25,3,22,37,128,90,0,Matches,2017-2018
3,29,Matías Aguirregaray,uy URU,DF,Las Palmas,es La Liga,28,1989,12.8,26,...,20,13,1,12,24,50,25,0,Matches,2017-2018
4,30,Jean-Eudes Aholou,ci CIV,MF,Strasbourg,fr Ligue 1,23,1994,33.4,70,...,38,29,7,22,75,145,58,1,Matches,2017-2018


In [11]:
# Reload with the header-aware loader; this rebinds passing_original, replacing
# the frame read earlier with plain pd.read_csv.
passing_original = load_fbref_csv('../uncleaned_data_csv/passing.csv')
passing_original.head()

26,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,90s,Cmp,...,xAG,xA,A-xAG,KP,1/3,PPA,CrsPA,PrgP,Matches,2017-2018
0,26,Imanol Agirretxe,es ESP,FW,Real Sociedad,es La Liga,30,1987,2.6,38,...,0.0,0.0,0.0,0,1,0,0,5,Matches,2017-2018
1,27,Sergio Agüero,ar ARG,FW,Manchester City,eng Premier League,29,1988,21.8,529,...,5.6,3.8,0.4,40,25,24,0,49,Matches,2017-2018
2,28,Ruben Aguilar,fr FRA,DF,Montpellier,fr Ligue 1,24,1993,27.3,1186,...,3.7,2.5,-1.7,36,91,32,21,130,Matches,2017-2018
3,29,Matías Aguirregaray,uy URU,DF,Las Palmas,es La Liga,28,1989,12.8,519,...,0.6,1.3,-0.6,5,42,18,5,57,Matches,2017-2018
4,30,Jean-Eudes Aholou,ci CIV,MF,Strasbourg,fr Ligue 1,23,1994,33.4,1162,...,1.0,0.8,-1.0,13,101,5,0,131,Matches,2017-2018


In [12]:
# Reload with the header-aware loader; this rebinds standard_original, replacing
# the frame read earlier with plain pd.read_csv.
standard_original = load_fbref_csv('../uncleaned_data_csv/standard.csv')
standard_original.head()

26,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,G+A,G-PK,G+A-PK,xG,xAG,xG+xAG,npxG,npxG+xAG,Matches,2017-2018
0,26,Imanol Agirretxe,es ESP,FW,Real Sociedad,es La Liga,30,1987,11,2,...,0.0,0.0,0.0,0.45,0.0,0.45,0.45,0.45,Matches,2017-2018
1,27,Sergio Agüero,ar ARG,FW,Manchester City,eng Premier League,29,1988,25,22,...,1.24,0.78,1.05,0.75,0.26,1.0,0.6,0.86,Matches,2017-2018
2,28,Ruben Aguilar,fr FRA,DF,Montpellier,fr Ligue 1,24,1993,31,27,...,0.07,0.0,0.07,0.03,0.13,0.16,0.03,0.16,Matches,2017-2018
3,29,Matías Aguirregaray,uy URU,DF,Las Palmas,es La Liga,28,1989,15,14,...,0.0,0.0,0.0,0.05,0.05,0.1,0.05,0.1,Matches,2017-2018
4,30,Jean-Eudes Aholou,ci CIV,MF,Strasbourg,fr Ligue 1,23,1994,35,35,...,0.15,0.15,0.15,0.07,0.03,0.1,0.07,0.1,Matches,2017-2018


In [13]:
def common_cleaning(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the cleaning steps shared by all FBref stat datasets.

    The input frame is left untouched; a cleaned copy is returned.

    Steps:
    1. Drop the first column by position. For frames produced by
       ``load_fbref_csv`` with its default identifier this is the 'Rk' column.
       (The '26' seen in displayed frames is the name of the column axis, not
       a column, so it is not what this step removes.)
    2. Drop the 'Rk' column if still present (FBref row counter), and the
       'Matches' column if present.
    3. Clean Nation column: keep only the capitalized 3-letter code (e.g., 'es ESP' -> 'ESP').
    4. Rename 'Pos' -> 'Position' if present.
    5. Rename 'Comp' -> 'League', keeping only the league name (e.g., 'es La Liga' -> 'La Liga').
    6. Rename the season column (a label shaped like '2017-2018') to 'Season'.
    7. Convert Age/Born to nullable integers (Int64); unparseable values become <NA>.
    8. Cast Nation, League, Squad, Position and Season to category dtype.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe after loading with proper headers.

    Returns
    -------
    pd.DataFrame
        Cleaned dataframe with standardized columns and dtypes.
    """
    # 1. Drop the first column positionally. A label-based drop would also
    #    remove any later column sharing that label, and would fail on a frame
    #    with no columns. The copy keeps the caller's frame unmodified.
    df = df.iloc[:, 1:].copy()

    # 2 / 2.5. Drop the FBref row counter and the 'Matches' link column if present
    df = df.drop(columns=["Rk", "Matches"], errors="ignore")

    # 3. Clean Nation column (expand=False returns a Series rather than a
    #    one-column DataFrame)
    if "Nation" in df.columns:
        df["Nation"] = df["Nation"].astype(str).str.extract(r"([A-Z]{3})", expand=False)

    # 4. Rename 'Pos' -> 'Position'
    if "Pos" in df.columns:
        df = df.rename(columns={"Pos": "Position"})

    # 5. Clean Comp column (rename -> League); the pattern starts at the first
    #    capital letter, which skips the lowercase country prefix
    if "Comp" in df.columns:
        df = df.rename(columns={"Comp": "League"})
        df["League"] = df["League"].astype(str).str.extract(r"([A-Z][\w\s]+)", expand=False)

    # 6. Rename season column (only the first matching label is renamed)
    season_col = [col for col in df.columns if re.match(r"^\d{4}-\d{4}$", str(col))]
    if season_col:
        df = df.rename(columns={season_col[0]: "Season"})

    # 7. Convert Age and Born
    for col in ("Age", "Born"):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

    # 8. Cast categorical columns
    for col in ["Nation", "League", "Squad", "Position", "Season"]:
        if col in df.columns:
            df[col] = df[col].astype("category")

    return df


In [14]:
# Apply the shared cleaning steps; the result gets its own name so
# shooting_original stays available unchanged.
shooting_common_cleaned = common_cleaning(shooting_original)
shooting_common_cleaned.head()

26,Player,Nation,Position,Squad,League,Age,Born,90s,Gls,Sh,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Season
0,Imanol Agirretxe,ESP,FW,Real Sociedad,La Liga,30,1987,2.6,0,6,...,10.4,0,0,0,1.2,1.2,0.19,-1.2,-1.2,2017-2018
1,Sergio Agüero,ARG,FW,Manchester City,Premier League,29,1988,21.8,21,90,...,13.9,1,4,4,16.2,13.1,0.15,4.8,3.9,2017-2018
2,Ruben Aguilar,FRA,DF,Montpellier,Ligue 1,24,1993,27.3,0,16,...,18.6,0,0,0,0.7,0.7,0.05,-0.7,-0.7,2017-2018
3,Matías Aguirregaray,URU,DF,Las Palmas,La Liga,28,1989,12.8,0,9,...,21.1,0,0,0,0.6,0.6,0.07,-0.6,-0.6,2017-2018
4,Jean-Eudes Aholou,CIV,MF,Strasbourg,Ligue 1,23,1994,33.4,5,39,...,23.8,0,0,0,2.2,2.2,0.06,2.8,2.8,2017-2018


In [15]:
# Apply the shared cleaning steps; the result gets its own name so
# defending_original stays available unchanged.
defending_common_cleaned = common_cleaning(defending_original)
defending_common_cleaned.head()

26,Player,Nation,Position,Squad,League,Age,Born,90s,Tkl,TklW,...,Tkl%,Lost,Blocks,Sh,Pass,Int,Tkl+Int,Clr,Err,Season
0,Imanol Agirretxe,ESP,FW,Real Sociedad,La Liga,30,1987,2.6,0,0,...,0.0,1,0,0,0,0,0,2,0,2017-2018
1,Sergio Agüero,ARG,FW,Manchester City,Premier League,29,1988,21.8,9,6,...,11.8,15,12,1,11,6,15,3,0,2017-2018
2,Ruben Aguilar,FRA,DF,Montpellier,Ligue 1,24,1993,27.3,91,50,...,50.7,35,25,3,22,37,128,90,0,2017-2018
3,Matías Aguirregaray,URU,DF,Las Palmas,La Liga,28,1989,12.8,26,20,...,37.5,20,13,1,12,24,50,25,0,2017-2018
4,Jean-Eudes Aholou,CIV,MF,Strasbourg,Ligue 1,23,1994,33.4,70,34,...,30.9,38,29,7,22,75,145,58,1,2017-2018


In [16]:
# Apply the shared cleaning steps; the result gets its own name so
# passing_original stays available unchanged.
passing_common_cleaned = common_cleaning(passing_original)
passing_common_cleaned.head()

26,Player,Nation,Position,Squad,League,Age,Born,90s,Cmp,Att,...,Ast,xAG,xA,A-xAG,KP,1/3,PPA,CrsPA,PrgP,Season
0,Imanol Agirretxe,ESP,FW,Real Sociedad,La Liga,30,1987,2.6,38,60,...,0,0.0,0.0,0.0,0,1,0,0,5,2017-2018
1,Sergio Agüero,ARG,FW,Manchester City,Premier League,29,1988,21.8,529,650,...,6,5.6,3.8,0.4,40,25,24,0,49,2017-2018
2,Ruben Aguilar,FRA,DF,Montpellier,Ligue 1,24,1993,27.3,1186,1545,...,2,3.7,2.5,-1.7,36,91,32,21,130,2017-2018
3,Matías Aguirregaray,URU,DF,Las Palmas,La Liga,28,1989,12.8,519,698,...,0,0.6,1.3,-0.6,5,42,18,5,57,2017-2018
4,Jean-Eudes Aholou,CIV,MF,Strasbourg,Ligue 1,23,1994,33.4,1162,1359,...,0,1.0,0.8,-1.0,13,101,5,0,131,2017-2018


In [17]:
# Apply the shared cleaning steps; the result gets its own name so
# standard_original stays available unchanged.
standard_common_cleaned = common_cleaning(standard_original)
standard_common_cleaned.head()

26,Player,Nation,Position,Squad,League,Age,Born,MP,Starts,Min,...,Ast,G+A,G-PK,G+A-PK,xG,xAG,xG+xAG,npxG,npxG+xAG,Season
0,Imanol Agirretxe,ESP,FW,Real Sociedad,La Liga,30,1987,11,2,232,...,0.0,0.0,0.0,0.0,0.45,0.0,0.45,0.45,0.45,2017-2018
1,Sergio Agüero,ARG,FW,Manchester City,Premier League,29,1988,25,22,1963,...,0.28,1.24,0.78,1.05,0.75,0.26,1.0,0.6,0.86,2017-2018
2,Ruben Aguilar,FRA,DF,Montpellier,Ligue 1,24,1993,31,27,2459,...,0.07,0.07,0.0,0.07,0.03,0.13,0.16,0.03,0.16,2017-2018
3,Matías Aguirregaray,URU,DF,Las Palmas,La Liga,28,1989,15,14,1153,...,0.0,0.0,0.0,0.0,0.05,0.05,0.1,0.05,0.1,2017-2018
4,Jean-Eudes Aholou,CIV,MF,Strasbourg,Ligue 1,23,1994,35,35,3006,...,0.0,0.15,0.15,0.15,0.07,0.03,0.1,0.07,0.1,2017-2018


## Now to check datatypes and convert what I can in the common portion before getting to the specific portion

In [18]:
# Inspect dtypes after the shared cleaning; per the output below, the stat
# columns are still object and are left for the dataset-specific step.
shooting_common_cleaned.dtypes

26
Player        object
Nation      category
Position    category
Squad       category
League      category
Age            Int64
Born           Int64
90s           object
Gls           object
Sh            object
SoT           object
SoT%          object
Sh/90         object
SoT/90        object
G/Sh          object
G/SoT         object
Dist          object
FK            object
PK            object
PKatt         object
xG            object
npxG          object
npxG/Sh       object
G-xG          object
np:G-xG       object
Season      category
dtype: object