In [13]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd

url = "https://fbref.com/en/comps/9/2024-2025/stats/2024-2025-Premier-League-Stats#stats_standard"
# Set up Selenium with ChromeDriver
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no browser UI)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

# Initialize WebDriver with automatic management (no need for Service or ChromeDriver path)
driver = webdriver.Chrome(options=chrome_options)
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

# Tìm các <tr> từ data-row="0" đến data-row="579"
rows = []
for i in range(0, 580):
    row = soup.find('tr', {'data-row': str(i)})
    if row:
        rows.append(row)

# Trích xuất dữ liệu
data = []
for row in rows:
    cells = row.find_all(['td', 'th'])
    row_data = [cell.get_text(strip=True) if cell.get_text(strip=True) else "N/a" for cell in cells]
    if row_data:
        data.append(row_data)

# Định nghĩa tiêu đề
headers = [
    'Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts', 'Min',
    '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'xG',
    'npxG', 'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR', 'Gls/90', 'Ast/90',
    'G+A/90', 'G-PK/90', 'G+A-PK/90', 'xG/90', 'xAG/90', 'xG+xAG/90', 'npxG/90',
    'npxG+xAG/90', 'Matches'
]

# Tạo DataFrame
if data:
    max_cols = max(len(row) for row in data)
    if len(headers) > max_cols:
        headers = headers[:max_cols]
    elif len(headers) < max_cols:
        headers = headers + [f'Col_{i}' for i in range(len(headers), max_cols)]

    df = pd.DataFrame(data, columns=headers)

    df['Min'] = pd.to_numeric(df['Min'].str.replace(',', ''), errors='coerce').fillna(0)
    df = df[df['Min'] > 90]
    df = df.sort_values(by='Player')
    df = df.fillna("N/a")

    print(df)
    df.to_csv('premier_league_player_stats_filtered.csv', index=False)

      Rk              Player  Nation    Pos            Squad     Age  Born  \
113  110     Aaron Cresswell  engENG     DF         West Ham  35-124  1989   
430  415      Aaron Ramsdale  engENG     GK      Southampton  26-339  1998   
553  533   Aaron Wan-Bissaka  engENG     DF         West Ham  27-143  1997   
147  143  Abdoulaye Doucouré   mlMLI     MF          Everton  32-107  1993   
284  275  Abdukodir Khusanov   uzUZB     DF  Manchester City  21-048  2004   
..   ...                 ...     ...    ...              ...     ...   ...   
504  486   Yukinari Sugawara   jpJPN  DF,MF      Southampton  24-294  2000   
59    58       Yves Bissouma   mlMLI     MF        Tottenham  28-231  1996   
365  352         Álex Moreno   esESP  DF,MF  Nott'ham Forest  31-314  1993   
222  215      İlkay Gündoğan   deGER     MF  Manchester City  34-176  1990   
175  170    Łukasz Fabiański   plPOL     GK         West Ham  40-000  1985   

     MP Starts     Min  ... Ast/90 G+A/90 G-PK/90 G+A-PK/90 xG/

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

# URLs to scrape
player_url = "https://fbref.com/en/comps/9/2024-2025/stats/2024-2025-Premier-League-Stats#stats_standard"
keeper_url = "https://fbref.com/en/comps/9/2024-2025/keepers/2024-2025-Premier-League-Stats#stats_keeper"

# Set up Selenium
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
driver = webdriver.Chrome(options=chrome_options)

try:
    # Function to scrape a table into a DataFrame
    def scrape_table(url, table_id, stat_columns, ordered_columns):
        driver.get(url)
        time.sleep(3)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        
        # Find table
        table = soup.find('table', id=table_id)
        headers = [th.get_text().strip() for th in table.find('thead').find_all('th')]
        
        # Find column indices
        col_indices = {}
        for stat, header in stat_columns.items():
            try:
                col_indices[stat] = headers.index(header)
            except ValueError:
                print(f"Warning: Column '{header}' not found for '{stat}'.")
        
        # Extract rows
        data = []
        for row in table.find('tbody').find_all('tr'):
            if row.find('th'):
                cells = row.find_all(['th', 'td'])
                row_data = {}
                for stat, idx in col_indices.items():
                    cell_text = cells[idx].get_text().strip()
                    if stat not in ['Player', 'Nation', 'Team', 'Position']:
                        cell_text = cell_text.replace(',', '')
                        try:
                            cell_text = float(cell_text) if cell_text else 0.0
                        except ValueError:
                            cell_text = 0.0
                    row_data[stat] = cell_text
                data.append(row_data)
        
        # Create DataFrame
        df = pd.DataFrame(data)
        
        # Clean data
        if 'Nation' in df.columns:
            df['Nation'] = df['Nation'].apply(lambda x: x[-3:] if isinstance(x, str) else x)
        if 'Age' in df.columns:
            df['Age'] = df['Age'].apply(lambda x: x.split('-')[0] if isinstance(x, str) and '-' in x else x)
        
        return df[ordered_columns]

    # Player stats columns
    player_stat_columns = {
        'Player': 'Player',
        'Nation': 'Nation',
        'Team': 'Squad',
        'Position': 'Pos',
        'Age': 'Age',
        'Matches Played': 'MP',
        'Starts': 'Starts',
        'Minutes': 'Min',
        'Goals': 'Gls',
        'Assists': 'Ast',
        'Yellow Cards': 'CrdY',
        'Red Cards': 'CrdR',
        'xG': 'xG',
        'xAG': 'xAG',
        'PrgC': 'PrgC',
        'PrgP': 'PrgP',
        'PrgR': 'PrgR',
        'Gls/90': 'Gls/90',  # Exact column names
        'Ast/90': 'Ast/90',
        'xG/90': 'xG/90',
        'xAG/90': 'xAG/90'
    }
    player_ordered_columns = [
        'Player', 'Nation', 'Team', 'Position', 'Age',
        'Matches Played', 'Starts', 'Minutes',
        'Goals', 'Assists', 'Yellow Cards', 'Red Cards',
        'xG', 'xAG', 'PrgC', 'PrgP', 'PrgR',
        'Gls/90', 'Ast/90', 'xG/90', 'xAG/90'
    ]

    # Keeper stats columns
    keeper_stat_columns = {
        'Player': 'Player',
        'Nation': 'Nation',
        'Team': 'Squad',
        'GA90': 'GA90',
        'Save%': 'Save%',
        'CS%': 'CS%',
        'PK Save%': 'PKSv%'
    }
    keeper_ordered_columns = ['Player', 'Nation', 'Team', 'GA90', 'Save%', 'CS%', 'PK Save%']

    # Scrape player stats
    print("Scraping player stats...")
    player_df = scrape_table(player_url, 'stats_standard', player_stat_columns, player_ordered_columns)

    # Scrape keeper stats
    print("Scraping goalkeeper stats...")
    keeper_df = scrape_table(keeper_url, 'stats_keeper', keeper_stat_columns, keeper_ordered_columns)

    # Merge DataFrames
    combined_df = pd.merge(
        player_df,
        keeper_df,
        on=['Player', 'Team', 'Nation'],
        how='left'
    )

    # Fill NaN for keeper columns
    combined_df[['GA90', 'Save%', 'CS%', 'PK Save%']] = combined_df[['GA90', 'Save%', 'CS%', 'PK Save%']].fillna(0.0)

    # Print first 5 rows
    print("\nCombined DataFrame (first 5 rows):")
    print(combined_df.head())

    # Save to CSV
    combined_df.to_csv('combined_premier_league_stats.csv', index=False)
    print("Data saved to 'combined_premier_league_stats.csv'")

finally:
    driver.quit()

Scraping player stats...
Headers for stats_standard: ['', 'Playing Time', 'Performance', 'Expected', 'Progression', 'Per 90 Minutes', '', 'Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts', 'Min', '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR', 'xG', 'npxG', 'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR', 'Gls', 'Ast', 'G+A', 'G-PK', 'G+A-PK', 'xG', 'xAG', 'xG+xAG', 'npxG', 'npxG+xAG', 'Matches']
Found 586 rows for stats_standard.
Player stats scraped. Rows: 586
Player HTML saved to 'player_rows.html'
Scraping goalkeeper stats...
Headers for stats_keeper: ['', 'Playing Time', 'Performance', 'Penalty Kicks', '', 'Rk', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', 'MP', 'Starts', 'Min', '90s', 'GA', 'GA90', 'SoTA', 'Saves', 'Save%', 'W', 'D', 'L', 'CS', 'CS%', 'PKatt', 'PKA', 'PKsv', 'PKm', 'Save%', 'Matches']
Found 43 rows for stats_keeper.
Goalkeeper stats scraped. Rows: 43
Keeper HTML saved to 'keeper_rows.html'

Combined Data (first 5 rows):
  

  combined_df[keeper_cols] = combined_df[keeper_cols].fillna(0.0)
