In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from io import StringIO
import time

In [None]:
#!pip install lxml
#!pip install bs4
#!pip install selenium


ERROR: Could not find a version that satisfies the requirement io (from versions: none)
ERROR: No matching distribution found for io


In [2]:
urls = {
    "standard": "https://fbref.com/en/comps/22/stats/Major-League-Soccer-Stats",
    "shooting": "https://fbref.com/en/comps/22/shooting/Major-League-Soccer-Stats",
    "passing": "https://fbref.com/en/comps/22/passing/Major-League-Soccer-Stats",
    "defense": "https://fbref.com/en/comps/22/defense/Major-League-Soccer-Stats",
    "possession": "https://fbref.com/en/comps/22/possession/Major-League-Soccer-Stats",
    "gca": "https://fbref.com/en/comps/22/gca/Major-League-Soccer-Stats"
}


In [3]:
chrome_options = Options()
chrome_options.add_argument("--headless")  # run without opening a window
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-extensions")

# --- Start Selenium ---
driver = webdriver.Chrome(options=chrome_options)
driver.set_page_load_timeout(120)  # wait up to 2 minutes
wait = WebDriverWait(driver, 30)
all_dfs = {}

for stat_type, url in urls.items():
    print(f"\nScraping {stat_type.upper()} page...")

    # Retry logic for page loading
    max_attempts = 5
    for attempt in range(max_attempts):
        try:
            driver.get(url)
            time.sleep(3)  # allow JS to load
            break
        except Exception as e:
            print(f"Attempt {attempt+1} failed: {e}. Retrying in 5s...")
            time.sleep(5)
    else:
        print(f"Failed to load {stat_type} page after multiple attempts")
        continue

    button_id = f"stats_{stat_type}_control"
    table_id = f"stats_{stat_type}"

    # --- Click the button if it exists ---
    try:
        button = wait.until(EC.element_to_be_clickable((By.ID, button_id)))
        driver.execute_script("arguments[0].scrollIntoView(true);", button)
        time.sleep(1)
        driver.execute_script("arguments[0].click();", button)
    except Exception:
        print(f"Button {button_id} not clickable, continuing...")

    time.sleep(2)  # wait for JS to reveal table

    # --- Always parse commented HTML first ---
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    table_html = [c for c in comments if table_id in c]

    if table_html:
        df = pd.read_html(str(table_html[0]))[0]
        print(f"Extracted {stat_type} table from commented HTML ({df.shape[0]} rows)")
    else:
        # fallback to visible table if no comment found
        try:
            df = pd.read_html(html)[-1]
            print(f"Using visible table for {stat_type} ({df.shape[0]} rows)")
        except Exception:
            print(f"Could not find table for {stat_type}")
            continue

    # Flatten multi-level columns
    df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in df.columns]

    # Clean player and team names
    if 'Player' in df.columns and 'Squad' in df.columns:
        df['Player'] = df['Player'].str.strip()
        df['Squad'] = df['Squad'].str.strip()
        df.drop_duplicates(subset=['Player', 'Squad'], inplace=True)

    # Store DataFrame
    all_dfs[stat_type] = df

    # Save individual CSV
    filename = f"mls_2025_players_{stat_type}.csv"
    df.to_csv(filename, index=False)
    print(f"Saved {filename} ({df.shape[0]} rows, {df.shape[1]} cols)")

    time.sleep(2)  # avoid rapid requests

driver.quit()



Scraping STANDARD page...


  df = pd.read_html(str(table_html[0]))[0]


Extracted standard table from commented HTML (917 rows)
Saved mls_2025_players_standard.csv (917 rows, 37 cols)

Scraping SHOOTING page...


  df = pd.read_html(str(table_html[0]))[0]


Extracted shooting table from commented HTML (917 rows)
Saved mls_2025_players_shooting.csv (917 rows, 26 cols)

Scraping PASSING page...


  df = pd.read_html(str(table_html[0]))[0]


Extracted passing table from commented HTML (917 rows)
Saved mls_2025_players_passing.csv (917 rows, 32 cols)

Scraping DEFENSE page...


  df = pd.read_html(str(table_html[0]))[0]


Extracted defense table from commented HTML (917 rows)
Saved mls_2025_players_defense.csv (917 rows, 25 cols)

Scraping POSSESSION page...


  df = pd.read_html(str(table_html[0]))[0]


Extracted possession table from commented HTML (917 rows)
Saved mls_2025_players_possession.csv (917 rows, 31 cols)

Scraping GCA page...


  df = pd.read_html(str(table_html[0]))[0]


Extracted gca table from commented HTML (917 rows)
Saved mls_2025_players_gca.csv (917 rows, 25 cols)


In [5]:
master = all_dfs['standard'].copy()

master['Unnamed: 1_level_0_Player'] = master['Unnamed: 1_level_0_Player'].str.strip()
master['Unnamed: 4_level_0_Squad'] = master['Unnamed: 4_level_0_Squad'].str.strip()
master = master.drop_duplicates(subset=['Unnamed: 1_level_0_Player', 'Unnamed: 4_level_0_Squad'])

for name, df in all_dfs.items():
    if name == 'standard':
        continue

    print(f"\nProcessing {name} table (original rows: {df.shape[0]})")

    # Clean Player and Squad
    df['Unnamed: 1_level_0_Player'] = df['Unnamed: 1_level_0_Player'].str.strip()
    df['Unnamed: 4_level_0_Squad'] = df['Unnamed: 4_level_0_Squad'].str.strip()

    # Drop duplicates
    df = df.drop_duplicates(subset=['Unnamed: 1_level_0_Player', 'Unnamed: 4_level_0_Squad'])
    print(f"{name} table after dropping duplicates: {df.shape[0]} rows")

    # Merge safely on Player + Squad
    master = master.merge(df, on=['Unnamed: 1_level_0_Player', 'Unnamed: 4_level_0_Squad'], how='left', suffixes=('', f'_{name}'))
    print(f"Merged {name}, current shape: {master.shape}")

master = master.loc[:, ~master.columns.duplicated()]

master.to_csv("mls_player_master.csv", index=False)
print(f"\nMaster dataset saved with {master.shape[0]} players and {master.shape[1]} columns")


Processing shooting table (original rows: 917)
shooting table after dropping duplicates: 883 rows
Merged shooting, current shape: (883, 61)

Processing passing table (original rows: 917)
passing table after dropping duplicates: 883 rows
Merged passing, current shape: (883, 91)

Processing defense table (original rows: 917)
defense table after dropping duplicates: 883 rows
Merged defense, current shape: (883, 114)

Processing possession table (original rows: 917)
possession table after dropping duplicates: 883 rows
Merged possession, current shape: (883, 143)

Processing gca table (original rows: 917)
gca table after dropping duplicates: 883 rows
Merged gca, current shape: (883, 166)

Master dataset saved with 883 players and 166 columns


In [8]:
master.columns.tolist()

['Unnamed: 0_level_0_Rk',
 'Unnamed: 1_level_0_Player',
 'Unnamed: 2_level_0_Nation',
 'Unnamed: 3_level_0_Pos',
 'Unnamed: 4_level_0_Squad',
 'Unnamed: 5_level_0_Age',
 'Unnamed: 6_level_0_Born',
 'Playing Time_MP',
 'Playing Time_Starts',
 'Playing Time_Min',
 'Playing Time_90s',
 'Performance_Gls',
 'Performance_Ast',
 'Performance_G+A',
 'Performance_G-PK',
 'Performance_PK',
 'Performance_PKatt',
 'Performance_CrdY',
 'Performance_CrdR',
 'Expected_xG',
 'Expected_npxG',
 'Expected_xAG',
 'Expected_npxG+xAG',
 'Progression_PrgC',
 'Progression_PrgP',
 'Progression_PrgR',
 'Per 90 Minutes_Gls',
 'Per 90 Minutes_Ast',
 'Per 90 Minutes_G+A',
 'Per 90 Minutes_G-PK',
 'Per 90 Minutes_G+A-PK',
 'Per 90 Minutes_xG',
 'Per 90 Minutes_xAG',
 'Per 90 Minutes_xG+xAG',
 'Per 90 Minutes_npxG',
 'Per 90 Minutes_npxG+xAG',
 'Unnamed: 36_level_0_Matches',
 'Unnamed: 0_level_0_Rk_shooting',
 'Unnamed: 2_level_0_Nation_shooting',
 'Unnamed: 3_level_0_Pos_shooting',
 'Unnamed: 5_level_0_Age_shooting

In [9]:
master.head()

Unnamed: 0,Unnamed: 0_level_0_Rk,Unnamed: 1_level_0_Player,Unnamed: 2_level_0_Nation,Unnamed: 3_level_0_Pos,Unnamed: 4_level_0_Squad,Unnamed: 5_level_0_Age,Unnamed: 6_level_0_Born,Playing Time_MP,Playing Time_Starts,Playing Time_Min,...,SCA Types_Def,GCA_GCA,GCA_GCA90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def,Unnamed: 24_level_0_Matches_gca
0,1,Paxten Aaronson,us USA,MF,Colorado Rapids,22-067,2003,7,6,572,...,0,1,0.16,1,0,0,0,0,0,Matches
1,2,Liel Abada,il ISR,"FW,MF",Charlotte,24-029,2001,32,17,1481,...,1,5,0.3,3,0,0,1,1,0,Matches
2,3,Wessam Abou Ali,ps PLE,FW,Columbus Crew,26-301,1999,5,4,303,...,0,2,0.6,1,0,0,0,1,0,Matches
3,4,Luis Abram,pe PER,DF,Atlanta Utd,29-247,1996,21,16,1493,...,0,0,0.0,0,0,0,0,0,0,Matches
4,5,Lalas Abubakar,gh GHA,DF,FC Dallas,30-311,1994,29,18,1656,...,1,1,0.05,1,0,0,0,0,0,Matches


In [21]:
# drop irrelevant columns
drop_cols = ['Unnamed: 0_level_0_Rk', 'Unnamed: 0_level_0_Rk', 'Unnamed: 5_level_0_Age','Unnamed: 6_level_0_Born','Playing Time_MP','Playing Time_Starts','Playing Time_Min',
             'Playing Time_90s','Unnamed: 36_level_0_Matches','Unnamed: 25_level_0_Matches','Unnamed: 0_level_0_Rk_shooting','Unnamed: 2_level_0_Nation_shooting','Unnamed: 3_level_0_Pos_shooting',
             'Unnamed: 5_level_0_Age_shooting','Unnamed: 6_level_0_Born_shooting', 'Unnamed: 0_level_0_Rk_passing','Unnamed: 2_level_0_Nation_passing','Unnamed: 3_level_0_Pos_passing','Unnamed: 5_level_0_Age_passing',
             'Unnamed: 6_level_0_Born_passing', 'Unnamed: 7_level_0_90s_passing','Unnamed: 31_level_0_Matches','Unnamed: 0_level_0_Rk_defense','Unnamed: 2_level_0_Nation_defense',
             'Unnamed: 3_level_0_Pos_defense','Unnamed: 5_level_0_Age_defense','Unnamed: 6_level_0_Born_defense','Unnamed: 7_level_0_90s_defense','Unnamed: 24_level_0_Matches',
             'Unnamed: 0_level_0_Rk_possession','Unnamed: 2_level_0_Nation_possession','Unnamed: 3_level_0_Pos_possession','Unnamed: 5_level_0_Age_possession','Unnamed: 6_level_0_Born_possession',
             'Unnamed: 7_level_0_90s_possession','Unnamed: 30_level_0_Matches','Unnamed: 0_level_0_Rk_gca','Unnamed: 2_level_0_Nation_gca','Unnamed: 3_level_0_Pos_gca','Unnamed: 5_level_0_Age_gca',
             'Unnamed: 6_level_0_Born_gca','Unnamed: 7_level_0_90s_gca', 'Unnamed: 24_level_0_Matches_gca', 'Unnamed: 7_level_0_90s',   ]

player_stats = master.drop(columns=drop_cols)

player_stats.head()

Unnamed: 0,Unnamed: 1_level_0_Player,Unnamed: 2_level_0_Nation,Unnamed: 3_level_0_Pos,Unnamed: 4_level_0_Squad,Performance_Gls,Performance_Ast,Performance_G+A,Performance_G-PK,Performance_PK,Performance_PKatt,...,SCA Types_Fld,SCA Types_Def,GCA_GCA,GCA_GCA90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def
0,Paxten Aaronson,us USA,MF,Colorado Rapids,1,0,1,1,0,0,...,4,0,1,0.16,1,0,0,0,0,0
1,Liel Abada,il ISR,"FW,MF",Charlotte,5,1,6,5,0,0,...,2,1,5,0.3,3,0,0,1,1,0
2,Wessam Abou Ali,ps PLE,FW,Columbus Crew,3,0,3,3,0,0,...,1,0,2,0.6,1,0,0,0,1,0
3,Luis Abram,pe PER,DF,Atlanta Utd,0,0,0,0,0,0,...,0,0,0,0.0,0,0,0,0,0,0
4,Lalas Abubakar,gh GHA,DF,FC Dallas,1,0,1,1,0,0,...,0,1,1,0.05,1,0,0,0,0,0


In [14]:
player_stats.describe()

Unnamed: 0,Unnamed: 1_level_0_Player,Unnamed: 2_level_0_Nation,Unnamed: 3_level_0_Pos,Unnamed: 4_level_0_Squad,Performance_Gls,Performance_Ast,Performance_G+A,Performance_G-PK,Performance_PK,Performance_PKatt,...,SCA Types_Fld,SCA Types_Def,GCA_GCA,GCA_GCA90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def
count,883,881,883,883,883,883,883,883,883,883,...,883,883,883,883.0,883,883,883,883,883,883
unique,851,87,11,31,23,17,30,22,8,10,...,18,6,26,103.0,20,8,6,7,9,4
top,Caden Clark,us USA,DF,Colorado Rapids,0,0,0,0,0,0,...,0,0,0,0.0,0,0,0,0,0,0
freq,2,320,248,34,467,464,338,469,824,817,...,492,628,315,315.0,361,794,780,718,765,853


In [23]:
player_stats['Unnamed: 2_level_0_Nation'] = player_stats['Unnamed: 2_level_0_Nation'].str[2:]

player_stats

Unnamed: 0,Unnamed: 1_level_0_Player,Unnamed: 2_level_0_Nation,Unnamed: 3_level_0_Pos,Unnamed: 4_level_0_Squad,Performance_Gls,Performance_Ast,Performance_G+A,Performance_G-PK,Performance_PK,Performance_PKatt,...,SCA Types_Fld,SCA Types_Def,GCA_GCA,GCA_GCA90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def
0,Paxten Aaronson,USA,MF,Colorado Rapids,1,0,1,1,0,0,...,4,0,1,0.16,1,0,0,0,0,0
1,Liel Abada,ISR,"FW,MF",Charlotte,5,1,6,5,0,0,...,2,1,5,0.30,3,0,0,1,1,0
2,Wessam Abou Ali,PLE,FW,Columbus Crew,3,0,3,3,0,0,...,1,0,2,0.60,1,0,0,0,1,0
3,Luis Abram,PER,DF,Atlanta Utd,0,0,0,0,0,0,...,0,0,0,0.00,0,0,0,0,0,0
4,Lalas Abubakar,GHA,DF,FC Dallas,1,0,1,1,0,0,...,0,1,1,0.05,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878,Walker Zimmerman,USA,DF,Nashville SC,0,2,2,0,0,0,...,1,0,2,0.12,1,0,0,1,0,0
879,Philip Zinckernagel,DEN,"FW,MF",Chicago Fire,15,12,27,15,0,0,...,6,3,21,0.75,8,6,3,2,0,2
880,Rida Zouhir,CAN,"MF,FW",D.C. United,0,0,0,0,0,0,...,0,0,2,0.54,2,0,0,0,0,0
881,Dario Župarić,BIH,DF,Portland Timbers,0,0,0,0,0,0,...,0,1,0,0.00,0,0,0,0,0,0


In [None]:


player_stats = player_stats.rename(columns={'Unnamed: 1_level_0_Player': 'Player', 'Unnamed: 2_level_0_Nation': 'Nation',
                                            'Unnamed: 4_level_0_Squad': 'Squad', 'Unnamed: 3_level_0_Pos': 'Position'})

player_stats

Unnamed: 0,Player,Nation,Position,Squad,Performance_Gls,Performance_Ast,Performance_G+A,Performance_G-PK,Performance_PK,Performance_PKatt,...,SCA Types_Fld,SCA Types_Def,GCA_GCA,GCA_GCA90,GCA Types_PassLive,GCA Types_PassDead,GCA Types_TO,GCA Types_Sh,GCA Types_Fld,GCA Types_Def
0,Paxten Aaronson,USA,MF,Colorado Rapids,1,0,1,1,0,0,...,4,0,1,0.16,1,0,0,0,0,0
1,Liel Abada,ISR,"FW,MF",Charlotte,5,1,6,5,0,0,...,2,1,5,0.30,3,0,0,1,1,0
2,Wessam Abou Ali,PLE,FW,Columbus Crew,3,0,3,3,0,0,...,1,0,2,0.60,1,0,0,0,1,0
3,Luis Abram,PER,DF,Atlanta Utd,0,0,0,0,0,0,...,0,0,0,0.00,0,0,0,0,0,0
4,Lalas Abubakar,GHA,DF,FC Dallas,1,0,1,1,0,0,...,0,1,1,0.05,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878,Walker Zimmerman,USA,DF,Nashville SC,0,2,2,0,0,0,...,1,0,2,0.12,1,0,0,1,0,0
879,Philip Zinckernagel,DEN,"FW,MF",Chicago Fire,15,12,27,15,0,0,...,6,3,21,0.75,8,6,3,2,0,2
880,Rida Zouhir,CAN,"MF,FW",D.C. United,0,0,0,0,0,0,...,0,0,2,0.54,2,0,0,0,0,0
881,Dario Župarić,BIH,DF,Portland Timbers,0,0,0,0,0,0,...,0,1,0,0.00,0,0,0,0,0,0


In [28]:
player_stats.columns.tolist()

['Player',
 'Nation',
 'Position',
 'Squad',
 'Performance_Gls',
 'Performance_Ast',
 'Performance_G+A',
 'Performance_G-PK',
 'Performance_PK',
 'Performance_PKatt',
 'Performance_CrdY',
 'Performance_CrdR',
 'Expected_xG',
 'Expected_npxG',
 'Expected_xAG',
 'Expected_npxG+xAG',
 'Progression_PrgC',
 'Progression_PrgP',
 'Progression_PrgR',
 'Per 90 Minutes_Gls',
 'Per 90 Minutes_Ast',
 'Per 90 Minutes_G+A',
 'Per 90 Minutes_G-PK',
 'Per 90 Minutes_G+A-PK',
 'Per 90 Minutes_xG',
 'Per 90 Minutes_xAG',
 'Per 90 Minutes_xG+xAG',
 'Per 90 Minutes_npxG',
 'Per 90 Minutes_npxG+xAG',
 'Standard_Gls',
 'Standard_Sh',
 'Standard_SoT',
 'Standard_SoT%',
 'Standard_Sh/90',
 'Standard_SoT/90',
 'Standard_G/Sh',
 'Standard_G/SoT',
 'Standard_Dist',
 'Standard_FK',
 'Standard_PK',
 'Standard_PKatt',
 'Expected_xG_shooting',
 'Expected_npxG_shooting',
 'Expected_npxG/Sh',
 'Expected_G-xG',
 'Expected_np:G-xG',
 'Total_Cmp',
 'Total_Att',
 'Total_Cmp%',
 'Total_TotDist',
 'Total_PrgDist',
 'Short_C

In [29]:
rename_dict = {
    # Performance
    'Performance_Gls': 'goals',
    'Performance_Ast': 'assists',
    'Performance_G+A': 'goals_assists',
    'Performance_G-PK': 'non_pk_goals',
    'Performance_PK': 'pks_made',
    'Performance_PKatt': 'pks_att',
    'Performance_CrdY': 'yellow_cards',
    'Performance_CrdR': 'red_cards',

    # Expected
    'Expected_xG': 'xg',
    'Expected_npxG': 'npxg',
    'Expected_xAG': 'xag',
    'Expected_npxG+xAG': 'npxg_xag',

    # Progression
    'Progression_PrgC': 'prog_carries',
    'Progression_PrgP': 'prog_passes',
    'Progression_PrgR': 'prog_receives',

    # Per 90
    'Per 90 Minutes_Gls': 'goals_per90',
    'Per 90 Minutes_Ast': 'assists_per90',
    'Per 90 Minutes_G+A': 'goals_assists_per90',
    'Per 90 Minutes_G-PK': 'non_pk_goals_per90',
    'Per 90 Minutes_G+A-PK': 'non_pk_goals_assists_per90',
    'Per 90 Minutes_xG': 'xg_per90',
    'Per 90 Minutes_xAG': 'xag_per90',
    'Per 90 Minutes_xG+xAG': 'xg_xag_per90',
    'Per 90 Minutes_npxG': 'npxg_per90',
    'Per 90 Minutes_npxG+xAG': 'npxg_xag_per90',

    # Shooting (Standard)
    'Standard_Gls': 'gls',
    'Standard_Sh': 'shots',
    'Standard_SoT': 'shots_on_target',
    'Standard_SoT%': 'sot_pct',
    'Standard_Sh/90': 'shots_per90',
    'Standard_SoT/90': 'sot_per90',
    'Standard_G/Sh': 'goals_per_shot',
    'Standard_G/SoT': 'goals_per_sot',
    'Standard_Dist': 'avg_shot_dist',
    'Standard_FK': 'fk_shots',
    'Standard_PK': 'pk_goals',
    'Standard_PKatt': 'pk_att',

    # Expected (shooting)
    'Expected_xG_shooting': 'xg_shoot',
    'Expected_npxG_shooting': 'npxg_shoot',
    'Expected_npxG/Sh': 'npxg_per_shot',
    'Expected_G-xG': 'g_minus_xg',
    'Expected_np:G-xG': 'npg_minus_xg',

    # Passing totals
    'Total_Cmp': 'passes_cmp',
    'Total_Att': 'passes_att',
    'Total_Cmp%': 'passes_cmp_pct',
    'Total_TotDist': 'pass_tot_dist',
    'Total_PrgDist': 'pass_prog_dist',

    # Short/Med/Long passing
    'Short_Cmp': 'short_cmp',
    'Short_Att': 'short_att',
    'Short_Cmp%': 'short_cmp_pct',
    'Medium_Cmp': 'med_cmp',
    'Medium_Att': 'med_att',
    'Medium_Cmp%': 'med_cmp_pct',
    'Long_Cmp': 'long_cmp',
    'Long_Att': 'long_att',
    'Long_Cmp%': 'long_cmp_pct',

    # Misc passing
    'Unnamed: 22_level_0_Ast': 'pass_ast',
    'Unnamed: 23_level_0_xAG': 'pass_xag',
    'Expected_xA': 'xA',
    'Expected_A-xAG': 'a_minus_xag',
    'Unnamed: 26_level_0_KP': 'key_passes',
    'Unnamed: 27_level_0_1/3': 'passes_final_third',
    'Unnamed: 28_level_0_PPA': 'ppa',
    'Unnamed: 29_level_0_CrsPA': 'crosses_pen_area',
    'Unnamed: 30_level_0_PrgP': 'prog_passes_2',

    # Defending - Tackles/Challenges
    'Tackles_Tkl': 'tackles',
    'Tackles_TklW': 'tackles_won',
    'Tackles_Def 3rd': 'tkl_def3',
    'Tackles_Mid 3rd': 'tkl_mid3',
    'Tackles_Att 3rd': 'tkl_att3',
    'Challenges_Tkl': 'challenges_tkl',
    'Challenges_Att': 'challenges_att',
    'Challenges_Tkl%': 'challenges_tkl_pct',
    'Challenges_Lost': 'challenges_lost',

    # Blocks
    'Blocks_Blocks': 'blocks',
    'Blocks_Sh': 'blocks_shots',
    'Blocks_Pass': 'blocks_passes',
    'Unnamed: 20_level_0_Int': 'interceptions',
    'Unnamed: 21_level_0_Tkl+Int': 'tkl_plus_int',
    'Unnamed: 22_level_0_Clr': 'clearances',
    'Unnamed: 23_level_0_Err': 'errors',

    # Possession - Touches, Carries, Take-Ons, Receiving
    'Touches_Touches': 'touches',
    'Touches_Def Pen': 'touches_def_pen',
    'Touches_Def 3rd': 'touches_def3',
    'Touches_Mid 3rd': 'touches_mid3',
    'Touches_Att 3rd': 'touches_att3',
    'Touches_Att Pen': 'touches_att_pen',
    'Touches_Live': 'touches_live',

    'Take-Ons_Att': 'takeons_att',
    'Take-Ons_Succ': 'takeons_succ',
    'Take-Ons_Succ%': 'takeons_succ_pct',
    'Take-Ons_Tkld': 'takeons_tkld',
    'Take-Ons_Tkld%': 'takeons_tkld_pct',

    'Carries_Carries': 'carries',
    'Carries_TotDist': 'carry_tot_dist',
    'Carries_PrgDist': 'carry_prog_dist',
    'Carries_PrgC': 'carry_prog_carries',
    'Carries_1/3': 'carry_final_third',
    'Carries_CPA': 'carry_pen_area',
    'Carries_Mis': 'carry_miscontrols',
    'Carries_Dis': 'carry_dispossessed',

    'Receiving_Rec': 'receiving',
    'Receiving_PrgR': 'prog_receives_2',

    # SCA / GCA (Shot and Goal Creating Actions)
    'SCA_SCA': 'sca',
    'SCA_SCA90': 'sca_per90',
    'SCA Types_PassLive': 'sca_pass_live',
    'SCA Types_PassDead': 'sca_pass_dead',
    'SCA Types_TO': 'sca_takeon',
    'SCA Types_Sh': 'sca_shot',
    'SCA Types_Fld': 'sca_fouled',
    'SCA Types_Def': 'sca_defense',

    'GCA_GCA': 'gca',
    'GCA_GCA90': 'gca_per90',
    'GCA Types_PassLive': 'gca_pass_live',
    'GCA Types_PassDead': 'gca_pass_dead',
    'GCA Types_TO': 'gca_takeon',
    'GCA Types_Sh': 'gca_shot',
    'GCA Types_Fld': 'gca_fouled',
    'GCA Types_Def': 'gca_defense'
}

# Apply the renaming
player_stats = player_stats.rename(columns=rename_dict)

In [31]:
player_stats.columns.tolist()

['Player',
 'Nation',
 'Position',
 'Squad',
 'goals',
 'assists',
 'goals_assists',
 'non_pk_goals',
 'pks_made',
 'pks_att',
 'yellow_cards',
 'red_cards',
 'xg',
 'npxg',
 'xag',
 'npxg_xag',
 'prog_carries',
 'prog_passes',
 'prog_receives',
 'goals_per90',
 'assists_per90',
 'goals_assists_per90',
 'non_pk_goals_per90',
 'non_pk_goals_assists_per90',
 'xg_per90',
 'xag_per90',
 'xg_xag_per90',
 'npxg_per90',
 'npxg_xag_per90',
 'gls',
 'shots',
 'shots_on_target',
 'sot_pct',
 'shots_per90',
 'sot_per90',
 'goals_per_shot',
 'goals_per_sot',
 'avg_shot_dist',
 'fk_shots',
 'pk_goals',
 'pk_att',
 'xg_shoot',
 'npxg_shoot',
 'npxg_per_shot',
 'g_minus_xg',
 'npg_minus_xg',
 'passes_cmp',
 'passes_att',
 'passes_cmp_pct',
 'pass_tot_dist',
 'pass_prog_dist',
 'short_cmp',
 'short_att',
 'short_cmp_pct',
 'med_cmp',
 'med_att',
 'med_cmp_pct',
 'long_cmp',
 'long_att',
 'long_cmp_pct',
 'pass_ast',
 'pass_xag',
 'xA',
 'a_minus_xag',
 'key_passes',
 'passes_final_third',
 'ppa',
 'cr

In [44]:
player_stats.isnull().sum()[player_stats.isnull().sum() > 0]

Nation                  2
sot_pct               165
goals_per_shot        165
goals_per_sot         275
avg_shot_dist         165
npxg_per_shot         165
passes_cmp_pct          7
short_cmp_pct          13
med_cmp_pct            19
long_cmp_pct           58
challenges_tkl_pct    110
takeons_succ_pct      137
takeons_tkld_pct      137
dtype: int64

In [46]:
player_stats.shape

(883, 123)

In [47]:
# Fill with 0 for performance ratios where NaN means "no attempts"
cols_fill_zero = [
    'sot_pct', 'goals_per_shot', 'goals_per_sot', 'avg_shot_dist', 'npxg_per_shot',
    'passes_cmp_pct', 'short_cmp_pct', 'med_cmp_pct', 'long_cmp_pct',
    'challenges_tkl_pct', 'takeons_succ_pct', 'takeons_tkld_pct'
]

player_stats[cols_fill_zero] = player_stats[cols_fill_zero].fillna(0)

In [49]:
player_stats.isnull().sum()

Player           0
Nation           2
Position         0
Squad            0
goals            0
                ..
gca_pass_dead    0
gca_takeon       0
gca_shot         0
gca_fouled       0
gca_defense      0
Length: 123, dtype: int64

In [50]:
player_stats = player_stats.drop(columns=['Nation'], errors='ignore')

In [52]:
player_stats.isnull().sum().sum()

np.int64(0)

# Feature Selection

# Feature Scaling

# Clustering Model

# Interpret and Label Clusters