In [3]:
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from time import sleep, time
import pandas as pd
import warnings
import numpy as np
import time
from datetime import datetime
import json
import re
from bs4 import BeautifulSoup  # Import for HTML parsing

In [5]:
# approach 1 - XPaths
warnings.filterwarnings('ignore')

# Set up empty dataframe in a list for storage. Errors are set up to handle any matches that don't scrape.
dataframe = []
errors = []

match_ids = [116075]

for match_id in match_ids:
    base_url = f'https://www.premierleague.com/match/{match_id}'

    option = Options()
    option.headless = False  # Set to False for debugging
    driver = webdriver.Chrome(options=option)

    driver.get(base_url)

    try:
        # ✅ Click Cookie Popup
        try:
            cookie_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'Accept')]"))
            )
            cookie_button.click()
            print("✅ Cookie pop-up clicked.")
        except TimeoutException:
            print("⚠️ No cookie pop-up appeared.")

        # ✅ Click Stats Tab
        try:
            stats_tab = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='mainContent']//li[contains(.,'Stats')]"))
            )
            driver.execute_script("arguments[0].scrollIntoView();", stats_tab)  # Scroll to make it visible
            time.sleep(2)
            driver.execute_script("arguments[0].click();", stats_tab)  # Click via JavaScript
            print("✅ Clicked Stats tab via JavaScript.")
        except TimeoutException:
            print("❌ Stats tab not found.")

        # ✅ Extract Home & Away Teams
        try:
            print("📌 Debugging Team Name Extraction...")

            home_team_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='mainContent']//th[1]/a"))
            )
            away_team_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='mainContent']//th[3]/a"))
            )

            home_team = home_team_element.text.strip() or home_team_element.get_attribute("textContent").strip()
            away_team = away_team_element.text.strip() or away_team_element.get_attribute("textContent").strip()

            print(f"✅ Home Team: {home_team}")
            print(f"✅ Away Team: {away_team}")

        except TimeoutException:
            print("❌ Failed to extract team names.")
            home_team, away_team = "Unknown", "Unknown"

        # ✅ Wait for Stats Table to Load
        try:
            print("📌 Waiting for Stats Table to Load...")

            stats_table = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='mainContent']/div/section[2]/div[2]/div/div[2]/section[3]/div[2]/div[2]/table/tbody"))
            )
            driver.execute_script("arguments[0].scrollIntoView();", stats_table)  # Scroll to load content
            print("✅ Stats Table is now visible.")

            example = driver.find_elements(By.XPATH, "//*[@id='mainContent']/div/section[2]/div[2]/div/div[2]/section[3]/div[2]/div[2]/table/tbody/tr[1]/td[1]/p")

            print(example)

            # ✅ Locate all rows inside `<tbody>`
            rows = driver.find_elements(By.XPATH, "//*[@id='mainContent']/div/section[2]/div[2]/div/div[2]/section[3]/div[2]/div[2]/table/tbody/tr")

            if not rows:
                raise ValueError("❌ No rows found in stats table.")

            print(f"✅ Found {len(rows)} rows in the table.")

            # ✅ Create Dictionaries to Store Stats
            home_stats = {}
            away_stats = {}

            # ✅ Loop through each row to extract data
            for row in rows:
                cols = row.find_elements(By.TAG_NAME, "td")  # Find columns in the row
                if len(cols) == 3:
                    home_value = cols[0].find_element(By.TAG_NAME, "p").text.strip()  # Extract <p> value
                    stat_name = cols[1].find_element(By.TAG_NAME, "p").text.strip().replace(" ", "_").lower()  # Normalize stat name
                    away_value = cols[2].find_element(By.TAG_NAME, "p").text.strip()  # Extract <p> value

                    home_stats[stat_name] = home_value
                    away_stats[stat_name] = away_value

            # ✅ Define Important Stats (Set to 0 if Missing)
            stats_cols = ['possession_%', 'shots_on_target', 'shots', 'touches', 'passes',
                          'tackles', 'clearances', 'corners', 'offsides', 'yellow_cards',
                          'red_cards', 'fouls_conceded']

            for stat in stats_cols:
                home_stats.setdefault(stat, "0")
                away_stats.setdefault(stat, "0")

            print(f"✅ Successfully scraped stats for {home_team} vs {away_team}")
            print("📊 Match Stats:", home_stats, away_stats)

        except Exception as e:
            print(f"❌ Error extracting stats: {e}")
            errors.append(match_id)

    finally:
        driver.quit()  # Close the browser


✅ Cookie pop-up clicked.
✅ Clicked Stats tab via JavaScript.
📌 Debugging Team Name Extraction...
✅ Home Team: Spurs
✅ Away Team: Man Utd
📌 Waiting for Stats Table to Load...
✅ Stats Table is now visible.
[]
❌ Error extracting stats: ❌ No rows found in stats table.


In [None]:
# approach 2 - inspired approach from footballdotpy using pd.read_html(driver.page_source)

warnings.filterwarnings('ignore')

# Set up empty dataframe in a list for storage. Errors are set up to handle any matches that don't scrape.
dataframe = []
errors = []

match_ids = [116075]

for match_id in match_ids:
    base_url = f'https://www.premierleague.com/match/{match_id}'

    option = Options()
    option.headless = False  # Set to False for debugging
    driver = webdriver.Chrome(options=option)

    driver.get(base_url)

    try:
        # Cookies
        try:
            cookie_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'Accept')]"))
            )
            cookie_button.click()
            print("✅ Cookie pop-up clicked.")
        except:
            print("⚠️ No cookie pop-up appeared.")

        # Stats tab access
        try:
            stats_tab = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='mainContent']//li[contains(.,'Stats')]"))
            )
            driver.execute_script("arguments[0].scrollIntoView();", stats_tab)
            time.sleep(2)
            driver.execute_script("arguments[0].click();", stats_tab)
            print("✅ Clicked Stats tab via JavaScript.")
        except:
            print("❌ Stats tab not found.")

        # Home & Away Teams
        try:

            home_team_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='mainContent']//th[1]/a"))
            )
            away_team_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//*[@id='mainContent']//th[3]/a"))
            )

            home_team = home_team_element.text.strip() or home_team_element.get_attribute("textContent").strip()
            away_team = away_team_element.text.strip() or away_team_element.get_attribute("textContent").strip()

            print(f"✅ Home Team: {home_team}")
            print(f"✅ Away Team: {away_team}")

        except:
            print("❌ Failed to extract team names.")
            home_team, away_team = "Unknown", "Unknown"

        # Stats Table
        try:

            stats_page = pd.read_html(driver.page_source)
            print(f"✅ Found {len(stats_page)} tables on the page, but the most relevant is:")

            game_stats = stats_page[-1]  # Last table is usually the relevant one

            print(f"the Stats Table:\n{game_stats}")

            # ✅ Check column names
            print("📌 Table Columns:", game_stats.columns)

            # ✅ Create Empty Dicts for Home & Away Stats
            home_stats = {}
            away_stats = {}

            # ✅ Extract Data
            home_series = game_stats.iloc[:, 0]  # First column (Home team stats)
            stat_names = game_stats.iloc[:, 1]  # Middle column (Stat names)
            away_series = game_stats.iloc[:, 2]  # Third column (Away team stats)

            for home_value, stat_name, away_value in zip(home_series, stat_names, away_series):
                stat_name = stat_name.replace(' ', '_').lower()  # Convert stat name to lowercase with underscores
                home_stats[stat_name] = home_value
                away_stats[stat_name] = away_value

            # ✅ Define Stats We Need (Set to 0 if Missing)
            stats_cols = ['possession_%', 'shots_on_target', 'shots', 'touches', 'passes',
                          'tackles', 'clearances', 'corners', 'offsides', 'yellow_cards',
                          'red_cards', 'fouls_conceded']

            for stat in stats_cols:
                if stat not in home_stats:
                    home_stats[stat] = 0
                    away_stats[stat] = 0

            print(f"✅ Successfully scraped stats for {home_team} vs {away_team}")
            print("📊 Match Stats:", home_stats, away_stats)

        except Exception as e:
            print(f"❌ Error extracting stats: {e}")

    finally:
        driver.quit()  # Close the browser


✅ Cookie pop-up clicked.
✅ Clicked Stats tab via JavaScript.
✅ Home Team: Spurs
✅ Away Team: Man Utd
✅ Found 3 tables on the page, but the most relevant is:
the Stats Table:
Empty DataFrame
Columns: [Spurs, Unnamed: 1, Man Utd]
Index: []
📌 Table Columns: Index(['Spurs', 'Unnamed: 1', 'Man Utd'], dtype='object')
✅ Successfully scraped stats for Spurs vs Man Utd
📊 Match Stats: {'possession_%': 0, 'shots_on_target': 0, 'shots': 0, 'touches': 0, 'passes': 0, 'tackles': 0, 'clearances': 0, 'corners': 0, 'offsides': 0, 'yellow_cards': 0, 'red_cards': 0, 'fouls_conceded': 0} {'possession_%': 0, 'shots_on_target': 0, 'shots': 0, 'touches': 0, 'passes': 0, 'tackles': 0, 'clearances': 0, 'corners': 0, 'offsides': 0, 'yellow_cards': 0, 'red_cards': 0, 'fouls_conceded': 0}
