## Cleanign season stats

In [31]:
from fbref_utils import load_fbref_csv, common_cleaning
import pandas as pd

In [None]:
def load_squad_standard(path: str) -> pd.DataFrame:
    """
    Load squad-level standard stats (team stats).
    Simple loading function that:
    1. Reads the CSV
    2. Drops the first row (old headers)
    3. Assigns proper column names
    """
    # Read the CSV
    df = pd.read_csv(path, header=None)
    
    # Drop the first row (old headers)
    df = df.iloc[1:].reset_index(drop=True)
    
    # Assign proper descriptive column names
    df.columns = [
        "Rk", "Squad", "Competition", 
        "# of Players", "Average Age", "Possession",
        "Matches Played", "Starts", "Minutes", "90s Played",
        "Goals", "Assists", "Goals + Assists", "Non-Penalty Goals",
        "Penalty Kicks Made", "Penalty Kicks Attempted",
        "Yellow Cards", "Red Cards",
        "xG: Expected Goals", "npxG: Non-Penalty xG",
        "xAG: Exp. Assisted Goals", "npxG + xAG: Non-Penalty xG + Expected Assisted Goals",
        "Progressive Carries: Carries into Final 40%", "Progressive Passes: Passes into Final 40%",
        "Goals/90", "Assists/90", "Goals + Assists/90", "Non-Penalty Goals/90",
        "G+A-PK/90: Goals + Assists - Penalties per 90", 
        "xG/90: Expected Goals per 90",
        "xAG/90: Expected Assisted Goals per 90",
        "xG+xAG/90: Expected Goals + Assists per 90",
        "npxG/90: Non-Penalty Expected Goals per 90",
        "npxG+xAG/90: Non-Penalty Expected Goals + Expected Assists per 90",
        "Season"
    ]
    
    return df

In [None]:
def clean_squad_standard(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean squad-level standard stats (team stats):
    1. Drop rank column (no longer needed)
    2. Clean team and competition names
    3. Convert all numeric columns to proper types
    4. Convert categorical columns to category dtype
    """
    # Drop rank column
    df = df.drop(columns=["Rk"])
    
    # Clean team names and competition names
    df["Squad"] = df["Squad"].str.strip()
    df["Competition"] = df["Competition"].str.extract(r"([A-Z][\w\s]+)")  # Extracts just the league name
    
    # Convert numeric columns
    int_columns = [
        "# of Players", "Matches Played", "Starts", "Minutes",
        "Goals", "Assists", "Goals + Assists", "Non-Penalty Goals",
        "Penalty Kicks Made", "Penalty Kicks Attempted",
        "Yellow Cards", "Red Cards"
    ]
    
    float_columns = [
        "Average Age", "Possession", "90s Played",
        "xG: Expected Goals", "npxG: Non-Penalty xG",
        "xAG: Exp. Assisted Goals", "npxG + xAG: Non-Penalty xG + Expected Assisted Goals",
        "Progressive Carries: Carries into Final 40%", "Progressive Passes: Passes into Final 40%",
        "Goals/90", "Assists/90", "Goals + Assists/90", "Non-Penalty Goals/90",
        "G+A-PK/90: Goals + Assists - Penalties per 90",
        "xG/90: Expected Goals per 90", "xAG/90: Expected Assisted Goals per 90",
        "xG+xAG/90: Expected Goals + Assists per 90",
        "npxG/90: Non-Penalty Expected Goals per 90",
        "npxG+xAG/90: Non-Penalty Expected Goals + Expected Assists per 90"
    ]
    
    # Convert to proper types
    for col in int_columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")
        
    for col in float_columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("float")
    
    # Convert categorical columns
    categorical_columns = ["Squad", "Competition", "Season"]
    for col in categorical_columns:
        df[col] = df[col].astype("category")
    
    return df

In [None]:
# First load the data
df_seasons = load_squad_standard("../uncleaned_data_csv/seasons_stats.csv")

# Then clean it
df_seasons_clean = clean_squad_standard(df_seasons)

print("Data loaded and cleaned successfully!")
print(f"Final shape: {df_seasons_clean.shape}")

Comparison of columns:
--------------------------------------------------
Number of columns in data: 35
Number of columns we're trying to assign: 35

Side by side comparison:
--------------------------------------------------
    Index  Current Columns          First Row Values  \
0       0                0                       NaN   
1       1                1                       NaN   
2       2                2                       NaN   
3       3                3                       NaN   
4       4                4                       NaN   
5       5                5                       NaN   
6       6                6                        Rk   
7       7                7                     Squad   
8       8                8               Competition   
9       9                9              # of Players   
10     10               10               Average Age   
11     11               11                Possession   
12     12               12            Matches 

In [46]:
df_seasons.head(20)

Unnamed: 0,Rk,Squad,Competition,# of Players,Average Age,Possession,Matches Played,Starts,Minutes,90s Played,...,Assists/90,Goals + Assists/90,Non-Penalty Goals/90,G+A-PK/90: Goals + Assists - Penalties per 90,xG/90: Expected Goals per 90,xAG/90: Expected Assisted Goals per 90,xG+xAG/90: Expected Goals + Assists per 90,npxG/90: Non-Penalty Expected Goals per 90,npxG+xAG/90: Non-Penalty Expected Goals + Expected Assists per 90,Season
0,,,,,,,Rk,Squad,Competition,# of Players,...,npxG: Non-Penalty xG,xAG: Exp. Assisted Goals,npxG + xAG,Progressive Carries,Progressive Passes,Goals/90,Assists/90,Goals + Assists/90,Non-Penalty Goals/90,Season
1,1.0,Alavés,es La Liga,30.0,25.5,40.3,38,418,3420,38.0,...,0.82,1.87,1.03,1.84,1.01,0.73,1.74,0.95,1.68,2017-2018
2,2.0,Amiens,fr Ligue 1,30.0,27.5,43.3,38,418,3420,38.0,...,0.63,1.58,0.87,1.50,0.86,0.56,1.42,0.78,1.35,2017-2018
3,3.0,Angers,fr Ligue 1,27.0,27.1,45.1,38,418,3420,38.0,...,0.79,1.87,0.97,1.76,1.25,0.93,2.18,1.17,2.10,2017-2018
4,4.0,Arsenal,eng Premier League,30.0,26.8,61.4,38,418,3420,38.0,...,1.61,3.53,1.82,3.42,1.80,1.40,3.20,1.69,3.10,2017-2018
5,5.0,Atalanta,it Serie A,25.0,25.7,55.4,38,418,3420,38.0,...,1.00,2.50,1.37,2.37,1.69,1.18,2.86,1.50,2.68,2017-2018
6,6.0,Athletic Club,es La Liga,26.0,26.7,49.6,38,418,3420,38.0,...,0.66,1.68,0.87,1.53,1.31,0.89,2.19,1.15,2.04,2017-2018
7,7.0,Atlético Madrid,es La Liga,26.0,26.6,47.5,38,418,3420,38.0,...,1.08,2.58,1.39,2.47,1.32,0.96,2.28,1.22,2.18,2017-2018
8,8.0,Augsburg,de Bundesliga,27.0,26.6,45.7,34,374,3060,34.0,...,0.88,2.09,1.09,1.97,1.32,0.95,2.27,1.22,2.17,2017-2018
9,9.0,Barcelona,es La Liga,25.0,27.7,62.8,38,418,3420,38.0,...,1.95,4.47,2.45,4.39,2.20,1.70,3.90,2.11,3.81,2017-2018


In [23]:
list(df_seasons.columns)

['Rk',
 'Squad',
 'Competition',
 '# of Players',
 'Average Age',
 'Possession',
 'Matches Played',
 'Starts',
 'Minutes',
 '90s Played',
 'Goals',
 'Assists',
 'Goals + Assists',
 'Non-Penalty Goals',
 'Penalty Kicks Made',
 'Penalty Kicks Attempted',
 'Yellow Cards',
 'Red Cards',
 'xG: Expected Goals',
 'npxG: Non-Penalty xG',
 'xAG: Exp. Assisted Goals',
 'npxG + xAG',
 'Progressive Carries',
 'Progressive Passes',
 'Goals/90',
 'Assists/90',
 'Goals + Assists/90',
 'Non-Penalty Goals/90',
 'Season']

In [15]:
df_seasons.dtypes

0     float64
1      object
2      object
3     float64
4     float64
5     float64
6      object
7      object
8      object
9      object
10     object
11     object
12     object
13     object
14     object
15     object
16     object
17     object
18     object
19     object
20     object
21     object
22     object
23     object
24     object
25     object
26     object
27     object
28     object
29     object
30     object
31     object
32     object
33     object
34     object
dtype: object

In [7]:
def clean_squad_standard(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean squad-level standard stats (team stats):
    - Reset header row to proper columns (fixes Unnamed scraping issue).
    - Drop 'Rk' column.
    - Rename with descriptive, BigQuery-safe snake_case.
    - Convert numeric columns to appropriate dtypes.
    - Leave 'Season' untouched.
    """

    # 1. Force correct column order (ignore scraped junk headers)
    df.columns = [
        "Rk", "Squad", "Competition", "Players_used", "Average_age", "Possession",
        "Matches_played", "Starts", "Minutes", "Nineties_played",
        "Goals", "Assists", "Goals_plus_assists", "Nonpen_goals",
        "Penalty_kicks_made", "Penalty_kicks_attempted",
        "Yellow_cards", "Red_cards",
        "Expected_goals", "Nonpen_expected_goals",
        "Expected_assisted_goals", "Nonpen_expected_goals_plus_expected_assists",
        "Progressive_carries", "Progressive_passes",
        "Goals_per90", "Assists_per90", "Goals_plus_assists_per90",
        "Nonpen_goals_per90",
        "Season"
    ]

    # 2. Drop rank column
    df = df.drop(columns=["Rk"])

    # 3. Rename to BigQuery-safe descriptive snake_case
    rename_map = {
        "Squad": "squad",
        "Competition": "league",
        "Players_used": "players_used",
        "Average_age": "average_age",
        "Possession": "possession_pct",
        "Matches_played": "matches_played",
        "Starts": "matches_started",
        "Minutes": "minutes_played",
        "Nineties_played": "nineties_played",
        "Goals": "goals_total",
        "Assists": "assists_total",
        "Goals_plus_assists": "goals_plus_assists_total",
        "Nonpen_goals": "nonpen_goals_total",
        "Penalty_kicks_made": "penalty_kicks_made_total",
        "Penalty_kicks_attempted": "penalty_kicks_attempted_total",
        "Yellow_cards": "yellow_cards",
        "Red_cards": "red_cards",
        "Expected_goals": "expected_goals_total",
        "Nonpen_expected_goals": "nonpen_expected_goals_total",
        "Expected_assisted_goals": "expected_assisted_goals_total",
        "Nonpen_expected_goals_plus_expected_assists": "nonpen_expected_goals_plus_expected_assists_total",
        "Progressive_carries": "progressive_carries",
        "Progressive_passes": "progressive_passes",
        "Goals_per90": "goals_per_90",
        "Assists_per90": "assists_per_90",
        "Goals_plus_assists_per90": "goals_plus_assists_per_90",
        "Nonpen_goals_per90": "nonpen_goals_per_90"
        # 'Season' left unchanged
    }
    df = df.rename(columns=rename_map)

    # 4. Type conversions
    int_cols = [
        "players_used", "matches_played", "matches_started", "minutes_played",
        "goals_total", "assists_total", "goals_plus_assists_total",
        "nonpen_goals_total", "penalty_kicks_made_total", "penalty_kicks_attempted_total",
        "yellow_cards", "red_cards"
    ]

    float_cols = [
        "average_age", "possession_pct", "nineties_played",
        "expected_goals_total", "nonpen_expected_goals_total",
        "expected_assisted_goals_total", "nonpen_expected_goals_plus_expected_assists_total",
        "progressive_carries", "progressive_passes",
        "goals_per_90", "assists_per_90", "goals_plus_assists_per_90",
        "nonpen_goals_per_90"
    ]

    for col in int_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

    for col in float_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("float")

    # 5. Cast categorical for efficiency
    for cat_col in ["league", "squad"]:
        if cat_col in df.columns:
            df[cat_col] = df[cat_col].astype("category")

    return df

In [8]:
df_seasons_cleaned = clean_squad_standard(df_seasons)

ValueError: Length mismatch: Expected axis has 35 elements, new values have 29 elements