In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    TimeoutException,
    NoSuchElementException
)
import time
import logging
import pandas as pd

In [2]:
# Set root logger level higher than DEBUG
logging.basicConfig(level=logging.DEBUG)  # DEBUG, INFO, WARNING, ERROR, CRITICAL

# Specifically silence noisy loggers
logging.getLogger("selenium").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

In [3]:
def start_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in background
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    # If chromedriver is in PATH, you don’t need to specify it
    driver = webdriver.Chrome(options=chrome_options)

    return driver

In [4]:
def build_soccerway_url(country, league, season_start, season_end, stage_name, stage_id):
    """
    Build Soccerway matches URL for a given league and season.

    Args:
        country (str): Country name in lowercase (e.g., 'france')
        league (str): League name in lowercase with hyphens (e.g., 'ligue-1')
        season_start (int): Start year (e.g., 2024)
        season_end (int): End year (e.g., 2025)
        stage_name (str): Stage name in lowercase with hyphens (e.g., 'regular-season')
        stage_id (str): Stage ID (e.g., 'r81802')

    Returns:
        str: Constructed URL
    """
    return f"https://ca.soccerway.com/national/{country}/{league}/{season_start}{season_end}/{stage_name}/{stage_id}/matches/"


# Example usage:
url_ligue1 = build_soccerway_url("france", "ligue-1", 2024, 2025, "regular-season", "r81802")
url_premierleague = build_soccerway_url("england", "premier-league", 2024, 2025, "regular-season", "r74823")

print(url_ligue1)
print(url_premierleague)

https://ca.soccerway.com/national/france/ligue-1/20242025/regular-season/r81802/matches/
https://ca.soccerway.com/national/england/premier-league/20242025/regular-season/r74823/matches/


In [5]:
from datetime import datetime

def build_current_season_url(country, league, stage_name, stage_id):
    # year = datetime.now().year
    for year in range(2021, 2025):
        url = build_soccerway_url(country, league, year, year+1, stage_name, stage_id)
    return url

build_current_season_url("france", "ligue-1", "regular-season", "r81802")

'https://ca.soccerway.com/national/france/ligue-1/20242025/regular-season/r81802/matches/'

In [36]:
# def scrape_data():
url = "https://ca.soccerway.com/national/france/ligue-1/20242025/regular-season/r81802/matches/"
# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_experimental_option("detach", True)
driver = start_driver()
driver.get(url)

# Load all matches
for _ in range(40):
    try:
        # Load previous button
        btn = driver.find_element(By.XPATH, f"/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div/div[2]/div[1]/div/div/div/button")
        btn.click()
        time.sleep(2)
    except:
        break
    
# single match div
match = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div/div[2]/div[2]/div/div/div[3]/div/div/div/a")
# return match

In [7]:
print(match.text)
print(match.get_attribute('outerHTML'))


<a class="sc-22ef6ec-0 sc-a1a6abf-2 boVFdS gAkJuB" href="/matches/2025/05/17/france/ligue-1/nantes/montpellier/4373049/"></a>


In [8]:
href = match.get_attribute('href')
print(href)

https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nantes/montpellier/4373049/


In [9]:
matches = driver.find_elements(By.XPATH, "//a[contains(@href, '/match/')]")
print(matches)

[]


In [10]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(driver.page_source, 'html.parser')
all_links = soup.find_all('a', href=True)

matches = [a for a in all_links if '/matches/' and '/france/ligue-1' in a['href']]
matches

[<a class="sc-22ef6ec-0 boVFdS" href="/national/france/ligue-1/2025-2026/regular-season/2d58bc25-77ec-4425-bed7-30f1839e3f8f/" style="width: 100%;"><div class="sc-575aeb00-1 gTtAsv"><span class="sc-600113f9-0 bzoXQy"><span class="sc-600113f9-1 klwUeB"><img alt="" data-nimg="1" decoding="async" height="18" loading="lazy" src="https://static.soccerway.com/flags/svg/france.svg" style="color: transparent;" width="18"/></span></span><span class="sc-4e4c9eab-2 jgDxUJ label label">Ligue 1</span></div></a>,
 <a class="sc-22ef6ec-0 boVFdS" href="/national/france/ligue-1/20242025/regular-season/r81802/" style="width: 100%;"><div class="sc-94e50ec4-0 fxiSqU total"><div class="sc-94e50ec4-0 qzEgh sc-d7965a66-0 hxTmOI"><span class="sc-4e4c9eab-2 etMNoc label">Summary</span></div></div></a>,
 <a class="sc-22ef6ec-0 boVFdS" href="/national/france/ligue-1/20242025/regular-season/r81802/matches/" style="width: 100%;"><div class="sc-94e50ec4-0 fxiSqU total"><div class="sc-94e50ec4-0 qzEgh sc-d7965a66-0 

In [11]:
match_hrefs = set()
europe_ligues = ['france/ligue-1', 'germany/bundesliga']

def get_match_hrefs(years):
    all_hrefs = set()
    for year in years:
        for ligue in europe_ligues:
            match_links = driver.find_elements(By.CSS_SELECTOR, f"a[href*='{ligue}'][href*='/matches/{year}/']")
            hrefs = {a.get_attribute('href') for a in match_links}
            all_hrefs.update(hrefs)
    return all_hrefs

# Example: load matches for 2024 only
target_years = [2023, 2024, 2025]  # or [2023, 2024, 2025] if you want multiple
hrefs = get_match_hrefs(target_years)

match_hrefs.update(hrefs)

for href in hrefs:
    print(href)

print(f"Total unique matches collected: {len(match_hrefs)}")


https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/marseille/rennes/4373043/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lens/monaco/4373050/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nice/brest/4373045/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/strasbourg/le-havre/4373048/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/psg/auxerre/4373051/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lille/reims/4373047/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lyon/angers/4373044/
https://ca.soccerway.com/matches/2025/08/15/france/ligue-1/rennes/marseille/4685099/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/saint-tienne/toulouse/4373046/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nantes/montpellier/4373049/
Total unique matches collected: 10


In [12]:
print(match_hrefs)

{'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/marseille/rennes/4373043/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lens/monaco/4373050/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nice/brest/4373045/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/strasbourg/le-havre/4373048/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/psg/auxerre/4373051/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lille/reims/4373047/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lyon/angers/4373044/', 'https://ca.soccerway.com/matches/2025/08/15/france/ligue-1/rennes/marseille/4685099/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/saint-tienne/toulouse/4373046/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nantes/montpellier/4373049/'}


In [13]:
def get_teams(driver):
    try:
        team_a = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[1]/div/div/div/h1").text.strip()
        team_b = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[2]/div/div/div/h1").text.strip()
        return f"{team_a} vs {team_b}"
    except:
        return "Unknown vs Unknown"

In [14]:
get_teams(driver)

'Unknown vs Unknown'

In [15]:
def safe_click(xpath):
    """Wait and safely click an element via JavaScript if needed."""
    try:
        btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
        driver.execute_script("arguments[0].click();", btn)  # JS click bypasses overlays
        time.sleep(2)
        return True
    except Exception:
        return False

In [16]:
while True:
    # 1. Handle consent popup (only if it appears)
    try:
        consent_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Consent')]"))
        )
        consent_button.click()
        print("✅ Consent button clicked.")
    except:
        pass

    try:
        previous_count = len(match_hrefs)

        # 2. Ensure overlays are gone
        WebDriverWait(driver, 10).until(
            EC.invisibility_of_element_located((By.CLASS_NAME, "fc-dialog-overlay"))
        )

        # 3. Find and click 'Load Previous'
        load_prev_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//div[@class='block_match_list']//button"))
        )

        try:
            load_prev_btn.click()
        except:
            driver.execute_script("arguments[0].click();", load_prev_btn)  # fallback
        print("🔄 Clicked 'Load Previous'")

        # 4. Wait for new matches to appear
        WebDriverWait(driver, 10).until(
            lambda d: len(get_match_hrefs([2023, 2024, 2025])) > previous_count
        )

        # 5. Update match set
        new_hrefs = get_match_hrefs([2023, 2024, 2025])
        added = len(new_hrefs - match_hrefs)
        match_hrefs.update(new_hrefs)
        print(f"📈 Loaded {added} new matches. Total: {len(match_hrefs)}")

        # 6. Stop if no new matches are added
        if added == 0:
            print("⚠️ No new matches after click. Stopping.")
            break

    except Exception as e:
        print("❌ No more 'Load Previous' button or stopped due to error:", e)
        break

print(f"✅ Total matches found: {len(match_hrefs)}")
for href in sorted(match_hrefs):
    print(href)


✅ Consent button clicked.
❌ No more 'Load Previous' button or stopped due to error: Message: 
Stacktrace:
#0 0x5e3296f1d2ca <unknown>
#1 0x5e32969c4550 <unknown>
#2 0x5e3296a160f0 <unknown>
#3 0x5e3296a162e1 <unknown>
#4 0x5e3296a645e4 <unknown>
#5 0x5e3296a3bbed <unknown>
#6 0x5e3296a619e6 <unknown>
#7 0x5e3296a3b993 <unknown>
#8 0x5e3296a07d6b <unknown>
#9 0x5e3296a09141 <unknown>
#10 0x5e3296ee22ab <unknown>
#11 0x5e3296ee60b9 <unknown>
#12 0x5e3296ec9139 <unknown>
#13 0x5e3296ee6c68 <unknown>
#14 0x5e3296ead60f <unknown>
#15 0x5e3296f0b1f8 <unknown>
#16 0x5e3296f0b3d6 <unknown>
#17 0x5e3296f1c5e6 <unknown>
#18 0x792040c9caa4 <unknown>
#19 0x792040d29c3c <unknown>

✅ Total matches found: 10
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lens/monaco/4373050/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lille/reims/4373047/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lyon/angers/4373044/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-

In [17]:
def scrape_data(driver):
    try:
        consent_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Consent')]"))
        )
        consent_button.click()
        print("✅ Consent button clicked.")
    except:
        print("❌ No consent button found.")
    
    match_hrefs = set()
    # single match div
    # match = driver.find_element(By.XPATH, f"/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div/div[2]/div[2]/div/div/div[3]/div/div/div/a")
    while True:
    # 1. Handle consent popup (only if it appears)
        

        try:
            previous_count = len(match_hrefs)

            # 2. Ensure overlays are gone
            WebDriverWait(driver, 10).until(
                EC.invisibility_of_element_located((By.CLASS_NAME, "fc-dialog-overlay"))
            )

            # 3. Find and click 'Load Previous'
            load_prev_btn = "//button[contains(., 'Show previous')]"   # "//div[@class='block_match_list']//button"
            # load_prev_btn = safe_click(load_prev_btn)
            try:
                safe_click(load_prev_btn)
            except:
                driver.execute_script("arguments[0].click();", load_prev_btn)  # fallback
            print("🔄 Clicked 'Load Previous'")

            # 4. Wait for new matches to appear
            WebDriverWait(driver, 10).until(
                lambda d: len(get_match_hrefs([2023, 2024, 2025])) > previous_count
            )

            # 5. Update match set
            new_hrefs = get_match_hrefs([2023, 2024, 2025])
            before_count = len(match_hrefs)
            match_hrefs.update(new_hrefs)
            print(f"Collected so far: {len(match_hrefs)} matches")

            # 6. Stop if no new matches are added
            if len(match_hrefs) == before_count:
                print("⚠️ No new matches after click. Stopping.")
                break

        except Exception as e:
            print("❌ No more 'Load Previous' button or stopped due to error:", e)
            break
        
    return match_hrefs

match_links = scrape_data(driver)

❌ No consent button found.
🔄 Clicked 'Load Previous'
Collected so far: 10 matches
🔄 Clicked 'Load Previous'
❌ No more 'Load Previous' button or stopped due to error: Message: 



In [18]:
# while True:
#     try:
#         consent_button = WebDriverWait(driver, 5).until(
#                 EC.element_to_be_clickable((By.XPATH, "/html/body/div[8]/div[2]/div[2]/div[2]/button"))
#             )
#         consent_button.click()    # trigger consent button to remove modal
#         print("✅ Consent button clicked.")
#     except:
#         pass    # consent may have been handled
        
#     try:
#         previous_count = len(match_hrefs)
#         # remove the pop-up modal, cookie modal, consent button
#         WebDriverWait(driver, 15).until(
#             EC.invisibility_of_element_located((By.CLASS_NAME, "fc-dialog-overlay"))
#         )
#         load_prev_btn = WebDriverWait(driver, 15).until(
#             EC.element_to_be_clickable((By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div/div[2]/div[1]/div/div/div/button"))
#         )
#         load_prev_btn.click()   # trigger load previous button
#         # driver.execute_script("arguments[0].click();", load_prev_btn)
        
#         # Wait for new content to load
#         time.sleep(4)  # Or better: wait for a new match element to appear
#         # Get new hrefs
#         new_hrefs = get_match_hrefs([2023, 2024, 2025])
#         print(f"Loaded {len(new_hrefs - match_hrefs)} matches")
#         match_hrefs.update(new_hrefs)
#     except Exception as e:
#         print("No more 'Load Previous' button or loading stopped:", e)
#         break
    
# print(f"Total matches found for 2025 in france/ligue-1: {len(match_hrefs)}")
# for href in sorted(match_hrefs):
#     print(href)

In [19]:
def load_all_matches(driver, match_hrefs, get_match_hrefs, years=[2023, 2024, 2025]):
    """
    Continuously clicks 'Load Previous' until no new matches are found.
    
    Args:
        driver: Selenium WebDriver instance.
        match_hrefs: Set to store unique match hrefs.
        get_match_hrefs: Function that returns a set of match hrefs for given years.
        years: List of years to filter matches (default [2023, 2024, 2025]).

    Returns:
        match_hrefs: Final set of match hrefs.
    """
    while True:
        # 1. Handle consent popup (only if it appears)
        try:
            consent_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Consent')]"))
            )
            consent_button.click()
            print("✅ Consent button clicked.")
        except:
            pass

        try:
            previous_count = len(match_hrefs)

            # 2. Ensure overlays are gone
            WebDriverWait(driver, 10).until(
                EC.invisibility_of_element_located((By.CLASS_NAME, "fc-dialog-overlay"))
            )

            # 3. Find and click 'Load Previous'
            load_prev_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//div[@class='block_match_list']//button"))
            )

            try:
                load_prev_btn.click()
            except:
                driver.execute_script("arguments[0].click();", load_prev_btn)  # fallback
            print("🔄 Clicked 'Load Previous'")

            # 4. Wait for new matches to appear
            WebDriverWait(driver, 10).until(
                lambda d: len(get_match_hrefs(years)) > previous_count
            )

            # 5. Update match set
            new_hrefs = get_match_hrefs(years)
            added = len(new_hrefs - match_hrefs)
            match_hrefs.update(new_hrefs)
            print(f"📈 Loaded {added} new matches. Total: {len(match_hrefs)}")

            # 6. Stop if no new matches are added
            if added == 0:
                print("⚠️ No new matches after click. Stopping.")
                break

        except Exception as e:
            print("❌ No more 'Load Previous' button or stopped due to error:", e)
            break

    print(f"✅ Total matches found: {len(match_hrefs)}")
    for href in sorted(match_hrefs):
        print(href)

    return match_hrefs

In [20]:
def load_all_matches(driver, get_match_hrefs, years=[2023, 2024, 2025], match_hrefs=None):
    """
    Clicks 'Load Previous' until no more matches are found.
    
    Args:
        driver: Selenium WebDriver instance
        get_match_hrefs: Function returning a set of hrefs for given years
        years: List of years (default [2023, 2024, 2025])
        match_hrefs: Existing set of match hrefs; if None, will be initialized
    """
    if match_hrefs is None:
        match_hrefs = set(get_match_hrefs(years))  # start with what’s already loaded

    while True:
        # Handle consent popup (if it appears)
        try:
            consent_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Consent')]"))
            )
            consent_button.click()
            print("✅ Consent button clicked.")
        except:
            pass

        try:
            previous_count = len(match_hrefs)

            # Wait for overlays to disappear
            WebDriverWait(driver, 10).until(
                EC.invisibility_of_element_located((By.CLASS_NAME, "fc-dialog-overlay"))
            )

            # Find and click 'Load Previous'
            load_prev_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//div[@class='block_match_list']//button"))
            )
            try:
                load_prev_btn.click()
            except:
                driver.execute_script("arguments[0].click();", load_prev_btn)
            print("🔄 Clicked 'Load Previous'")

            # Wait for more matches
            WebDriverWait(driver, 10).until(
                lambda d: len(get_match_hrefs(years)) > previous_count
            )

            # Update matches
            new_hrefs = get_match_hrefs(years)
            added = len(new_hrefs - match_hrefs)
            match_hrefs.update(new_hrefs)
            print(f"📈 Loaded {added} new matches. Total: {len(match_hrefs)}")

            if added == 0:
                print("⚠️ No new matches after click. Stopping.")
                break

        except Exception as e:
            print("❌ No more 'Load Previous' button or stopped due to error:", e)
            break

    print(f"✅ Total matches found: {len(match_hrefs)}")
    for href in sorted(match_hrefs):
        print(href)

    return match_hrefs


In [21]:
match_hrefs

{'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lens/monaco/4373050/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lille/reims/4373047/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lyon/angers/4373044/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/marseille/rennes/4373043/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nantes/montpellier/4373049/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nice/brest/4373045/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/psg/auxerre/4373051/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/saint-tienne/toulouse/4373046/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/strasbourg/le-havre/4373048/',
 'https://ca.soccerway.com/matches/2025/08/15/france/ligue-1/rennes/marseille/4685099/'}

In [22]:
match_links = load_all_matches(driver, get_match_hrefs)

❌ No more 'Load Previous' button or stopped due to error: Message: 
Stacktrace:
#0 0x5e3296f1d2ca <unknown>
#1 0x5e32969c4550 <unknown>
#2 0x5e3296a160f0 <unknown>
#3 0x5e3296a162e1 <unknown>
#4 0x5e3296a645e4 <unknown>
#5 0x5e3296a3bbed <unknown>
#6 0x5e3296a619e6 <unknown>
#7 0x5e3296a3b993 <unknown>
#8 0x5e3296a07d6b <unknown>
#9 0x5e3296a09141 <unknown>
#10 0x5e3296ee22ab <unknown>
#11 0x5e3296ee60b9 <unknown>
#12 0x5e3296ec9139 <unknown>
#13 0x5e3296ee6c68 <unknown>
#14 0x5e3296ead60f <unknown>
#15 0x5e3296f0b1f8 <unknown>
#16 0x5e3296f0b3d6 <unknown>
#17 0x5e3296f1c5e6 <unknown>
#18 0x792040c9caa4 <unknown>
#19 0x792040d29c3c <unknown>

✅ Total matches found: 10
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lens/monaco/4373050/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lille/reims/4373047/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lyon/angers/4373044/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/marseille/rennes/4373043

In [23]:
match_links

{'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lens/monaco/4373050/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lille/reims/4373047/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lyon/angers/4373044/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/marseille/rennes/4373043/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nantes/montpellier/4373049/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nice/brest/4373045/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/psg/auxerre/4373051/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/saint-tienne/toulouse/4373046/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/strasbourg/le-havre/4373048/',
 'https://ca.soccerway.com/matches/2025/08/15/france/ligue-1/rennes/marseille/4685099/'}

In [24]:
def load_all_matches(driver, years=[2023, 2024, 2025]):
    """
    Continuously clicks 'Load Previous' until no new matches are found.
    Automatically handles consent popups and updates match_hrefs.
    
    Args:
        driver: Selenium WebDriver instance.
        years: List of years to filter matches (default [2023, 2024, 2025]).
    
    Returns:
        set: Final set of match hrefs.
    """
    match_hrefs = set(get_match_hrefs(years))  # start with what's already visible

    while True:
        # 1. Handle consent popup (only if it appears)
        try:
            consent_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Consent')]"))
            )
            consent_button.click()
            print("✅ Consent button clicked.")
        except:
            pass

        try:
            previous_count = len(match_hrefs)

            # 2. Ensure overlays are gone
            WebDriverWait(driver, 10).until(
                EC.invisibility_of_element_located((By.CLASS_NAME, "fc-dialog-overlay"))
            )

            # 3. Find and click 'Load Previous'
            load_prev_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//div[@class='block_match_list']//button"))
            )

            try:
                load_prev_btn.click()
            except:
                driver.execute_script("arguments[0].click();", load_prev_btn)  # fallback
            print("🔄 Clicked 'Load Previous'")

            # 4. Wait for new matches to appear
            WebDriverWait(driver, 10).until(
                lambda d: len(get_match_hrefs(years)) > previous_count
            )

            # 5. Update match set
            new_hrefs = get_match_hrefs(years)
            added = len(new_hrefs - match_hrefs)
            match_hrefs.update(new_hrefs)
            print(f"📈 Loaded {added} new matches. Total: {len(match_hrefs)}")

            # 6. Stop if no new matches are added
            if added == 0:
                print("⚠️ No new matches after click. Stopping.")
                break

        except Exception as e:
            print("❌ No more 'Load Previous' button or stopped due to error:", e)
            break

    print(f"✅ Total matches found: {len(match_hrefs)}")
    for href in sorted(match_hrefs):
        print(href)

    return match_hrefs

In [25]:
matches = load_all_matches(driver)
matches

❌ No more 'Load Previous' button or stopped due to error: Message: 
Stacktrace:
#0 0x5e3296f1d2ca <unknown>
#1 0x5e32969c4550 <unknown>
#2 0x5e3296a160f0 <unknown>
#3 0x5e3296a162e1 <unknown>
#4 0x5e3296a645e4 <unknown>
#5 0x5e3296a3bbed <unknown>
#6 0x5e3296a619e6 <unknown>
#7 0x5e3296a3b993 <unknown>
#8 0x5e3296a07d6b <unknown>
#9 0x5e3296a09141 <unknown>
#10 0x5e3296ee22ab <unknown>
#11 0x5e3296ee60b9 <unknown>
#12 0x5e3296ec9139 <unknown>
#13 0x5e3296ee6c68 <unknown>
#14 0x5e3296ead60f <unknown>
#15 0x5e3296f0b1f8 <unknown>
#16 0x5e3296f0b3d6 <unknown>
#17 0x5e3296f1c5e6 <unknown>
#18 0x792040c9caa4 <unknown>
#19 0x792040d29c3c <unknown>

✅ Total matches found: 10
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lens/monaco/4373050/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lille/reims/4373047/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lyon/angers/4373044/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/marseille/rennes/4373043

{'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lens/monaco/4373050/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lille/reims/4373047/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lyon/angers/4373044/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/marseille/rennes/4373043/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nantes/montpellier/4373049/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nice/brest/4373045/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/psg/auxerre/4373051/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/saint-tienne/toulouse/4373046/',
 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/strasbourg/le-havre/4373048/',
 'https://ca.soccerway.com/matches/2025/08/15/france/ligue-1/rennes/marseille/4685099/'}

In [15]:
driver.get("https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/strasbourg/le-havre/4373048/")
scored_minute = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[2]/div[2]")
# scored_minute = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[3]/div[2]")
# scored_minute = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[4]/div[2]")
# scored_minute = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[5]/div[2]")
# scored_minute = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[6]/div[2]/span/span")
print(scored_minute.text)
# print(type(scored_minute.text))

20'
1
0


In [95]:
xpaths = [
    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[2]/div[2]/span/span",
    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[3]/div[2]/span/span",
    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[4]/div[2]/span/span",
    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[5]/div[2]/span/span",
    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[6]/div[2]/span/span"
]

# === Extract and print scored minutes ===
minutes_list = []
for xpath in xpaths:
    try:
        scored_minute = driver.find_element(By.XPATH, xpath)
        scored_minute_text = scored_minute.text.strip()
        # print([scored_minute_text])
        if scored_minute_text:
            minutes_list.append(scored_minute_text)
            print(minutes_list)
    except Exception as e:
        print(f"Could not find element at {xpath} - {e}")

Could not find element at /html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[2]/div[2]/span/span - Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[2]/div[2]/span/span"}
  (Session info: chrome=138.0.7204.183); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception
Stacktrace:
#0 0x5e4a6fe512ca <unknown>
#1 0x5e4a6f8f8550 <unknown>
#2 0x5e4a6f94a0f0 <unknown>
#3 0x5e4a6f94a2e1 <unknown>
#4 0x5e4a6f9985e4 <unknown>
#5 0x5e4a6f96fbed <unknown>
#6 0x5e4a6f9959e6 <unknown>
#7 0x5e4a6f96f993 <unknown>
#8 0x5e4a6f93bd6b <unknown>
#9 0x5e4a6f93d141 <unknown>
#10 0x5e4a6fe162ab <unknown>
#11 0x5e4a6fe1a0b9 <unknown>
#12 0x5e4a6fdfd139 <unknown>
#13 0x5e4a6fe1ac68 <unknown>
#14 0x5e4a6fde160f <unknown>
#15 0x5e4a6fe3f1f8 <unknown>
#

In [96]:
type(scored_minute_text)

str

In [43]:
minutes_list

["55'", "72'"]

In [44]:
def parse_minute(minute_str):
    # Handle "90+3'" -> 93, "45'" -> 45
    minute_str = minute_str.replace("'", "").strip()
    if "+" in minute_str:
        base, extra = minute_str.split("+")
        return int(base) + int(extra)
    return int(minute_str)

parsed_minutes = [parse_minute(minute) for minute in minutes_list]
print(parsed_minutes)  # e.g., [20, 43, 53]

[55, 72]


In [None]:
import re

valid_minutes = [entry for entry in scored_minute.text if re.search(r"Scored minute: \d+'", entry)]
valid_minutes

In [76]:
valid_minutes = [entry for entry in scored_minute.text.split() if entry.isdigit() and 0 < int(entry) <= 70]
print("Valid minutes:", valid_minutes)

Valid minutes: []


In [99]:
def get_goal_minutes(goal_events_div_xpath):  # arg: goal_events_div_xpath
    try:
        team_div = driver.find_element(By.XPATH, goal_events_div_xpath)
        spans = team_div.find_elements(By.XPATH, ".//span")
        minutes = []
        for span in spans:
            text = span.text.strip()
            if "'" in text:
                try:
                    minute = int(text.replace("'", "").split('+')[0])
                    minutes.append(minute)
                except ValueError:
                    continue
        return minutes
    except NoSuchElementException:
        print(f"⚠️ No goal data found at {goal_events_div_xpath} for {driver.current_url}")
        return []

In [100]:
get_goal_minutes("/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[5]/div[2]")

⚠️ No goal data found at /html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[5]/div[2] for https://ca.soccerway.com/national/france/ligue-1/20242025/regular-season/r81802/matches/


[]

In [101]:
scored_minutes = []
index = 1

while True:
    try:
        xpath = f"(//div[contains(text(), 'Scored minute:')])[{index}]"
        element = driver.find_element(By.XPATH, xpath)
        text = element.text.strip()

        # Check if it's a valid minute using regex
        match = re.search(r"(\d+)'", text)
        if match:
            minute = int(match.group(1))
            scored_minutes.append(minute)
        else:
            # Blank or invalid format → end loop
            break

        index += 1
    except Exception:
        # Element not found or no more matches
        break

print("Valid Scored Minutes:", scored_minutes)

Valid Scored Minutes: []


In [42]:
zero_zero_count = 0
early_goals_count = 0
valid_scores = {'0 - 1', '1 - 0', '1 - 1', '0 - 2', '2 - 0', '1 - 2', '2 - 1'}
early_goal_teams = []
early_goal_matches = []
goal_count = 0

for href in match_hrefs:
    driver.get(href)
    time.sleep(2)
    
    try:
        # Handle consent popup
        consent_button = driver.find_element(By.XPATH, "/html/body/div[8]/div[2]/div[2]/div[2]/button")
        consent_button.click()
        time.sleep(1)
    except:
        pass  # Consent may have already been handled

    try:
        home_score_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/div/div/div[1]/div")
        away_score_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/div/div/div[3]/div")
        home_team_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[1]/div")
        away_team_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[2]/div")

        home_score = int(home_score_elem.text.strip())
        away_score = int(away_score_elem.text.strip())
        score = f"{home_score} - {away_score}"
        total_goals = home_score + away_score

        if score == "0 - 0":
            zero_zero_count += 1
            print(zero_zero_count)
            continue
        
        if score in valid_scores and 1 <= total_goals <= 3:
            # home_goals_minutes = get_goal_minutes("/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[1]")
            # away_goals_minutes = get_goal_minutes("/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[2]")
            xpaths = [
                "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[2]/div[2]/span/span",
                "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[3]/div[2]/span/span",
                "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[4]/div[2]/span/span",
                "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[5]/div[2]/span/span",
                "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[6]/div[2]/span/span"
            ]

            # === Extract and print scored minutes ===
            minutes_list = []
            for xpath in xpaths:
                try:
                    scored_minute = driver.find_element(By.XPATH, xpath)
                    scored_minute_text = scored_minute.text.strip()
                    # print([scored_minute_text])
                    if not scored_minute_text:
                        break  # Stop if no more scored minutes
                    minutes_list.append(scored_minute_text)
                    print(minutes_list)
                except Exception as e:
                    print(f"Could not find element at {xpath} - {e}")
                    
            parsed_minutes = [parse_minute(minute) for minute in minutes_list]  # e.g., [20, 43, 53]
            # print(parsed_minutes)  # e.g., [20, 43, 53]
            
            for minute in parsed_minutes:
                if minute <= 70:
                    goal_count += 1
                
                    # Check if goal_count is valid and all goals are after 70th minute
                if goal_count and 1 <= minute <= 90:
                    early_goals_count += 1
                    early_goal_teams.append(f"{home_team_elem.text.strip()} {int(home_score_elem.text.strip())} vs {int(away_score_elem.text.strip())} {away_team_elem.text.strip()}")

            # all_goals = sorted(home_goals_minutes + away_goals_minutes)

            # if all_goals and all(minute >= 70 for minute in all_goals):
            #     late_goals_count += 1
            #     late_goal_teams.append(f"{home_team_elem.text.strip()} vs {away_team_elem.text.strip()}")
            
            early_goal_matches.append({
                "zero_zero_count": zero_zero_count,
                "late_goals_count": early_goals_count,
                "late_goal_teams": early_goal_teams,
                "goal_count": goal_count
            })
            
            df_zero = pd.DataFrame([{"0-0 Count": zero_zero_count}])
            df_late = pd.DataFrame(early_goal_matches)

            df_zero.to_csv("data/zero_zero_matches.csv", index=False)
            df_late.to_csv("data/late_goal_matches.csv", index=False)

    except Exception as e:
        print(f"Error processing {href}: {e}")
        continue

print(f"\n✅ Total 0-0 games: {zero_zero_count}")
print(f"✅ Total 1-3 goal games with first goal after 70th minute: {early_goals_count}")
print("✅ Matches where all goals were after 70th minute:")
for match in early_goal_teams:
    print(f" - {match}")


["37'"]
["37'", "60'"]
["37'", "60'", "86'"]
Could not find element at /html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[5]/div[2]/span/span - Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[5]/div[2]/span/span"}
  (Session info: chrome=138.0.7204.183); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception
Stacktrace:
#0 0x5b8b1fa702ca <unknown>
#1 0x5b8b1f517550 <unknown>
#2 0x5b8b1f5690f0 <unknown>
#3 0x5b8b1f5692e1 <unknown>
#4 0x5b8b1f5b75e4 <unknown>
#5 0x5b8b1f58ebed <unknown>
#6 0x5b8b1f5b49e6 <unknown>
#7 0x5b8b1f58e993 <unknown>
#8 0x5b8b1f55ad6b <unknown>
#9 0x5b8b1f55c141 <unknown>
#10 0x5b8b1fa352ab <unknown>
#11 0x5b8b1fa390b9 <unknown>
#12 0x5b8b1fa1c139 <unknown>
#13 0x5b8b1fa39c68 <unknown>
#14 0x5b8b1fa0

In [26]:
def load_all_previous_data(driver, timeout=10):
    """Click the 'Load Previous' button until it disappears or becomes inactive."""
    while True:
        try:
            # Adjust based on real button class or text
            load_button = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Load Previous')]"))
            )
            driver.execute_script("arguments[0].click();", load_button)
            print("Clicked Load Previous")
            time.sleep(2)  # Let content load
        except (TimeoutException, NoSuchElementException):
            print("No more Load Previous button found.")
            break
        except ElementClickInterceptedException as e:
            print(f"Click intercepted: {e}")
            # Optionally scroll into view or dismiss overlays
            driver.execute_script("arguments[0].scrollIntoView();", load_button)
            time.sleep(1)

In [27]:
def load_all_matches_hrefs(driver, get_match_hrefs, years=[2023, 2024, 2025], match_hrefs=None):
    """
    Clicks 'Load Previous' until no more matches are found.
    
    Args:
        driver: Selenium WebDriver instance
        get_match_hrefs: Function returning a set of hrefs for given years
        years: List of years (default [2023, 2024, 2025])
        match_hrefs: Existing set of match hrefs; if None, will be initialized
    """
    if match_hrefs is None:
        match_hrefs = set(get_match_hrefs(years))  # start with what’s already loaded

    while True:
        # Handle consent popup (if it appears)
        try:
            consent_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Consent')]"))
            )
            consent_button.click()
            print("✅ Consent button clicked.")
        except:
            pass

        try:
            previous_count = len(match_hrefs)

            # Wait for overlays to disappear
            WebDriverWait(driver, 10).until(
                EC.invisibility_of_element_located((By.CLASS_NAME, "fc-dialog-overlay"))
            )

            # Find and click 'Load Previous'
            load_prev_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//div[@class='block_match_list']//button"))
            )
            try:
                load_prev_btn.click()
            except:
                driver.execute_script("arguments[0].click();", load_prev_btn)
            print("🔄 Clicked 'Load Previous'")

            # Wait for more matches
            WebDriverWait(driver, 10).until(
                lambda d: len(get_match_hrefs(years)) > previous_count
            )

            # Update matches
            new_hrefs = get_match_hrefs(years)
            added = len(new_hrefs - match_hrefs)
            match_hrefs.update(new_hrefs)
            print(f"📈 Loaded {added} new matches. Total: {len(match_hrefs)}")

            if added == 0:
                print("⚠️ No new matches after click. Stopping.")
                break

        except Exception as e:
            print("❌ No more 'Load Previous' button or stopped due to error:", e)
            break

    print(f"✅ Total matches found: {len(match_hrefs)}")
    for href in sorted(match_hrefs):
        print(href)

    return match_hrefs


In [28]:
def extract_match_data(driver, match_hrefs):
    zero_zero_count = 0
    early_goals_count = 0
    late_goals_count = 0
    valid_scores = {'0 - 1', '1 - 0', '1 - 1', '0 - 2', '2 - 0', '1 - 2', '2 - 1'}
    early_goal_teams = []
    early_goal_matches = []
    late_goal_teams = []
    late_goal_matches = []
    goal_count = 0

    for href in match_hrefs:
        driver.get(href)
        time.sleep(2)
        
        try:
            # Handle consent popup
            consent_button = driver.find_element(By.XPATH, "/html/body/div[8]/div[2]/div[2]/div[2]/button")
            consent_button.click()
            time.sleep(1)
        except:
            pass  # Consent may have already been handled

        try:
            home_score_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/div/div/div[1]/div")
            away_score_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/div/div/div[3]/div")
            home_team_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[1]/div")
            away_team_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[2]/div")

            home_score = int(home_score_elem.text.strip())
            away_score = int(away_score_elem.text.strip())
            score = f"{home_score} - {away_score}"
            total_goals = home_score + away_score

            if score == "0 - 0":
                zero_zero_count += 1
                print(zero_zero_count)
                continue
            
            if score in valid_scores and 1 <= total_goals <= 3:
                load_all_previous_data(driver)
                # home_goals_minutes = get_goal_minutes("/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[1]")
                # away_goals_minutes = get_goal_minutes("/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[2]")
                xpaths = [
                    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[2]/div[2]/span/span",
                    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[3]/div[2]/span/span",
                    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[4]/div[2]/span/span",
                    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[5]/div[2]/span/span",
                    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[6]/div[2]/span/span"
                ]

                # === Extract and print scored minutes ===
                minutes_list = []
                for xpath in xpaths:
                    try:
                        scored_minute = driver.find_element(By.XPATH, xpath)
                        scored_minute_text = scored_minute.text.strip()
                        # print([scored_minute_text])
                        if not scored_minute_text:
                            break  # Stop if no more scored minutes
                        minutes_list.append(scored_minute_text)
                        print(minutes_list)
                    except Exception as e:
                        print(f"Could not find element at {xpath} - {e}")
                        
                parsed_minutes = [parse_minute(minute) for minute in minutes_list]  # e.g., [20, 43, 53]
                # print(parsed_minutes)  # e.g., [20, 43, 53]
                
                for minute in parsed_minutes:
                    if minute <= 70:
                        goal_count += 1
                    
                        # Check if goal_count is valid and all goals are after 70th minute
                    if goal_count and 1 <= minute <= 90:
                        early_goals_count += 1
                        early_goal_teams.append(f"{home_team_elem.text.strip()} {int(home_score_elem.text.strip())} vs {int(away_score_elem.text.strip())} {away_team_elem.text.strip()}")

                # all_goals = sorted(home_goals_minutes + away_goals_minutes)

                # if all_goals and all(minute >= 70 for minute in all_goals):
                #     late_goals_count += 1
                #     late_goal_teams.append(f"{home_team_elem.text.strip()} vs {away_team_elem.text.strip()}")
                early_goal_matches.append({
                    "zero_zero_count": zero_zero_count,
                    "early_goals_count": early_goals_count,
                    "early_goal_teams": early_goal_teams,
                    "goal_count": goal_count
                })
                
                df_zero = pd.DataFrame([{"0-0 Count": zero_zero_count}])
                df_early = pd.DataFrame(early_goal_matches)

                df_zero.to_csv("data/zero_zero_matches.csv", index=False)
                df_early.to_csv("data/early_goal_matches.csv", index=False)
                
            if score not in valid_scores and total_goals > 3:
                load_all_previous_data(driver)
                
                all_goals = home_score + away_score
                late_goals_count += 1
                late_goal_teams.append(f"{home_team_elem.text.strip()} {int(home_score_elem.text.strip())} vs {int(away_score_elem.text.strip())} {away_team_elem.text.strip()}")
                
                late_goal_matches.append({
                    "late_goals_count": late_goals_count,
                    "late_goal_teams": late_goal_teams,
                    "home_score": home_score,
                    "away_score": away_score,
                    "all_goals": all_goals,
                })
                
                df_late = pd.DataFrame(late_goal_matches)
                df_late.to_csv("data/late_goal_matches.csv", index=False)
                
            complete_data = {
                "zero_zero_count": zero_zero_count,
                "early_goals_count": early_goals_count,
                "late_goals_count": late_goals_count,
                "goal_count": goal_count,
                "early_goal_teams": early_goal_teams,
                "late_goal_teams": late_goal_teams
            }
            df_complete = pd.DataFrame([complete_data])
            df_complete.to_csv("data/complete_match_data.csv", index=False)

        except Exception as e:
            print(f"Error processing {href}: {e}")
            continue
        
    return df_zero, df_early, df_late, df_complete

In [29]:
extract_match_data(driver, match_hrefs)

No more Load Previous button found.
No more Load Previous button found.
No more Load Previous button found.
No more Load Previous button found.
No more Load Previous button found.
No more Load Previous button found.
["37'"]
["37'", "60'"]
["37'", "60'", "86'"]
Could not find element at /html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[5]/div[2]/span/span - Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[5]/div[2]/span/span"}
  (Session info: chrome=138.0.7204.183); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception
Stacktrace:
#0 0x5e3296f1d2ca <unknown>
#1 0x5e32969c4550 <unknown>
#2 0x5e3296a160f0 <unknown>
#3 0x5e3296a162e1 <unknown>
#4 0x5e3296a645e4 <unknown>
#5 0x5e3296a3bbed <unknown>
#6 0x5e3296a619e6 <unknown

UnboundLocalError: cannot access local variable 'df_zero' where it is not associated with a value

In [76]:
df_zero, df_late = extract_match_data(driver, match_hrefs)

print(df_late)

["37'"]
["37'", "60'"]
["37'", "60'", "86'"]
Could not find element at /html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[5]/div[2]/span/span - Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[5]/div[2]/span/span"}
  (Session info: chrome=138.0.7204.183); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception
Stacktrace:
#0 0x55cd901972ca <unknown>
#1 0x55cd8fc3e550 <unknown>
#2 0x55cd8fc900f0 <unknown>
#3 0x55cd8fc902e1 <unknown>
#4 0x55cd8fcde5e4 <unknown>
#5 0x55cd8fcb5bed <unknown>
#6 0x55cd8fcdb9e6 <unknown>
#7 0x55cd8fcb5993 <unknown>
#8 0x55cd8fc81d6b <unknown>
#9 0x55cd8fc83141 <unknown>
#10 0x55cd9015c2ab <unknown>
#11 0x55cd901600b9 <unknown>
#12 0x55cd90143139 <unknown>
#13 0x55cd90160c68 <unknown>
#14 0x55cd9012

In [48]:
def extract_match_data(driver):
    # === Step 1: Get all match links ===
    match_hrefs = load_all_matches_hrefs(driver, get_match_hrefs)
    print(f"Found {len(match_hrefs)} matches to scrape.")

    zero_zero_count = 0
    early_goals_count = 0
    late_goals_count = 0
    valid_scores = {'0 - 1', '1 - 0', '1 - 1', '0 - 2', '2 - 0', '1 - 2', '2 - 1'}
    early_goal_teams = []
    early_goal_matches = []
    late_goal_teams = []
    late_goal_matches = []
    goal_count = 0

    df_zero = pd.DataFrame()
    df_early = pd.DataFrame()
    df_late = pd.DataFrame()
    df_complete = pd.DataFrame()

    for href in match_hrefs:
        driver.get(href)
        time.sleep(2)
        
        try:
            # Handle consent popup
            consent_button = driver.find_element(By.XPATH, "/html/body/div[8]/div[2]/div[2]/div[2]/button")
            consent_button.click()
            time.sleep(1)
        except:
            pass

        try:
            home_score_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/div/div/div[1]/div")
            away_score_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/div/div/div[3]/div")
            home_team_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[1]/div")
            away_team_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[2]/div")

            home_score = int(home_score_elem.text.strip())
            away_score = int(away_score_elem.text.strip())
            score = f"{home_score} - {away_score}"
            total_goals = home_score + away_score

            if score == "0 - 0":
                zero_zero_count += 1
                continue
            
            if score in valid_scores and 1 <= total_goals <= 3:
                load_all_previous_data(driver)

                xpaths = [
                    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[2]/div[2]/span/span",
                    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[3]/div[2]/span/span",
                    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[4]/div[2]/span/span",
                    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[5]/div[2]/span/span",
                    "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]/div[6]/div[2]/span/span"
                ]

                minutes_list = []
                for xpath in xpaths:
                    try:
                        scored_minute = driver.find_element(By.XPATH, xpath)
                        scored_minute_text = scored_minute.text.strip()
                        # print([scored_minute_text])
                        if not scored_minute_text:
                            break  # Stop if no more scored minutes
                        minutes_list.append(scored_minute_text)
                        print(minutes_list)
                    except Exception as e:
                        print(f"Could not find element at {xpath} - {e}")
                        
                parsed_minutes = [parse_minute(minute) for minute in minutes_list]
                
                for minute in parsed_minutes:
                    if minute <= 70:
                        goal_count += 1
                    if goal_count and 1 <= minute <= 90:
                        early_goals_count += 1
                        early_goal_teams.append(f"{home_team_elem.text.strip()} {home_score} vs {away_score} {away_team_elem.text.strip()}")

                early_goal_matches.append({
                    "zero_zero_count": zero_zero_count,
                    "early_goals_count": early_goals_count,
                    "early_goal_teams": early_goal_teams,
                    "goal_count": goal_count
                })
                
                df_zero = pd.DataFrame([{"0-0 Count": zero_zero_count}])
                df_early = pd.DataFrame(early_goal_matches)
                df_zero.to_csv("data/zero_zero_matches.csv", index=False)
                df_early.to_csv("data/early_goal_matches.csv", index=False)
                
            if score not in valid_scores and total_goals > 3:
                load_all_previous_data(driver)
                late_goals_count += 1
                late_goal_teams.append(f"{home_team_elem.text.strip()} {home_score} vs {away_score} {away_team_elem.text.strip()}")
                
                late_goal_matches.append({
                    "late_goals_count": late_goals_count,
                    "late_goal_teams": late_goal_teams,
                    "home_score": home_score,
                    "away_score": away_score,
                    "all_goals": total_goals,
                })
                
                df_late = pd.DataFrame(late_goal_matches)
                df_late.to_csv("data/late_goal_matches.csv", index=False)
                
            complete_data = {
                "home_team": home_team_elem.text.strip(),
                "away_team": away_team_elem.text.strip(),
                "home_score": home_score,
                "away_score": away_score,
                "score": score,
                "zero_zero_count": zero_zero_count,
                "early_goals_count": early_goals_count,
                "late_goals_count": late_goals_count,
                "goal_count": goal_count,
                "early_goal_teams": early_goal_teams,
                "late_goal_teams": late_goal_teams,
                "minutes_list": parsed_minutes,
                "first_goal_minute": min(parsed_minutes) if parsed_minutes else None,
                "last_goal_minute": max(parsed_minutes) if parsed_minutes else None,
                "match_link": href
            }
            
            df_complete = pd.concat([df_complete, pd.DataFrame([complete_data])], ignore_index=True)
            df_complete.to_csv("data/complete_match_data.csv", index=False)

        except Exception as e:
            print(f"Error processing {href}: {e}")
            continue
        
    return df_zero, df_early, df_late, df_complete


In [38]:
extract_match_data(driver)

✅ Consent button clicked.
❌ No more 'Load Previous' button or stopped due to error: Message: 
Stacktrace:
#0 0x5b8b1fa702ca <unknown>
#1 0x5b8b1f517550 <unknown>
#2 0x5b8b1f5690f0 <unknown>
#3 0x5b8b1f5692e1 <unknown>
#4 0x5b8b1f5b75e4 <unknown>
#5 0x5b8b1f58ebed <unknown>
#6 0x5b8b1f5b49e6 <unknown>
#7 0x5b8b1f58e993 <unknown>
#8 0x5b8b1f55ad6b <unknown>
#9 0x5b8b1f55c141 <unknown>
#10 0x5b8b1fa352ab <unknown>
#11 0x5b8b1fa390b9 <unknown>
#12 0x5b8b1fa1c139 <unknown>
#13 0x5b8b1fa39c68 <unknown>
#14 0x5b8b1fa0060f <unknown>
#15 0x5b8b1fa5e1f8 <unknown>
#16 0x5b8b1fa5e3d6 <unknown>
#17 0x5b8b1fa6f5e6 <unknown>
#18 0x7f149829caa4 <unknown>
#19 0x7f1498329c3c <unknown>

✅ Total matches found: 10
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lens/monaco/4373050/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lille/reims/4373047/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lyon/angers/4373044/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-

(Empty DataFrame
 Columns: []
 Index: [],
 Empty DataFrame
 Columns: []
 Index: [],
    late_goals_count                                    late_goal_teams  \
 0                 1  [Marseille 4 vs 2 Rennes, Lens 4 vs 0 Monaco, ...   
 1                 2  [Marseille 4 vs 2 Rennes, Lens 4 vs 0 Monaco, ...   
 2                 3  [Marseille 4 vs 2 Rennes, Lens 4 vs 0 Monaco, ...   
 3                 4  [Marseille 4 vs 2 Rennes, Lens 4 vs 0 Monaco, ...   
 4                 5  [Marseille 4 vs 2 Rennes, Lens 4 vs 0 Monaco, ...   
 5                 6  [Marseille 4 vs 2 Rennes, Lens 4 vs 0 Monaco, ...   
 
    home_score  away_score  all_goals  
 0           4           2          6  
 1           4           0          4  
 2           6           0          6  
 3           2           3          5  
 4           3           1          4  
 5           2           3          5  ,
 Empty DataFrame
 Columns: []
 Index: [])

In [49]:
df_zero, df_early, df_late, df_complete = extract_match_data(driver)
df_complete.shape

❌ No more 'Load Previous' button or stopped due to error: Message: 
Stacktrace:
#0 0x5b8b1fa702ca <unknown>
#1 0x5b8b1f517550 <unknown>
#2 0x5b8b1f5690f0 <unknown>
#3 0x5b8b1f5692e1 <unknown>
#4 0x5b8b1f5b75e4 <unknown>
#5 0x5b8b1f58ebed <unknown>
#6 0x5b8b1f5b49e6 <unknown>
#7 0x5b8b1f58e993 <unknown>
#8 0x5b8b1f55ad6b <unknown>
#9 0x5b8b1f55c141 <unknown>
#10 0x5b8b1fa352ab <unknown>
#11 0x5b8b1fa390b9 <unknown>
#12 0x5b8b1fa1c139 <unknown>
#13 0x5b8b1fa39c68 <unknown>
#14 0x5b8b1fa0060f <unknown>
#15 0x5b8b1fa5e1f8 <unknown>
#16 0x5b8b1fa5e3d6 <unknown>
#17 0x5b8b1fa6f5e6 <unknown>
#18 0x7f149829caa4 <unknown>
#19 0x7f1498329c3c <unknown>

✅ Total matches found: 19
https://ca.soccerway.com/matches/2025/03/07/france/ligue-1/toulouse/monaco/4372965/
https://ca.soccerway.com/matches/2025/03/09/france/ligue-1/nantes/strasbourg/4372964/
https://ca.soccerway.com/matches/2025/03/15/france/ligue-1/nantes/lille/4372978/
https://ca.soccerway.com/matches/2025/03/16/france/ligue-1/strasbourg/to

(11, 15)

In [51]:
type(df_complete)

pandas.core.frame.DataFrame

In [55]:
df_complete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   home_team          11 non-null     object
 1   away_team          11 non-null     object
 2   home_score         11 non-null     int64 
 3   away_score         11 non-null     int64 
 4   score              11 non-null     object
 5   zero_zero_count    11 non-null     int64 
 6   early_goals_count  11 non-null     int64 
 7   late_goals_count   11 non-null     int64 
 8   goal_count         11 non-null     int64 
 9   early_goal_teams   11 non-null     object
 10  late_goal_teams    11 non-null     object
 11  minutes_list       11 non-null     object
 12  first_goal_minute  4 non-null      object
 13  last_goal_minute   4 non-null      object
 14  match_link         11 non-null     object
dtypes: int64(6), object(9)
memory usage: 1.4+ KB


In [None]:
df_complete.head()

Unnamed: 0,home_team,away_team,home_score,away_score,score,zero_zero_count,early_goals_count,late_goals_count,goal_count,early_goal_teams,late_goal_teams,minutes_list,first_goal_minute,last_goal_minute,match_link
0,Rennes,Nantes,2,1,2 - 1,1,3,1,2,"[Rennes 2 vs 1 Nantes, Rennes 2 vs 1 Nantes, R...","[Marseille 3 vs 2 Toulouse, Le Havre 3 vs 2 Na...","[23, 54, 86]",23.0,86.0,https://ca.soccerway.com/matches/2025/04/18/fr...
1,Nantes,PSG,1,1,1 - 1,2,5,1,3,"[Rennes 2 vs 1 Nantes, Rennes 2 vs 1 Nantes, R...","[Marseille 3 vs 2 Toulouse, Le Havre 3 vs 2 Na...","[33, 83]",33.0,83.0,https://ca.soccerway.com/matches/2025/04/22/fr...
2,Nantes,Lille,1,0,1 - 0,3,5,1,3,"[Rennes 2 vs 1 Nantes, Rennes 2 vs 1 Nantes, R...","[Marseille 3 vs 2 Toulouse, Le Havre 3 vs 2 Na...",[],,,https://ca.soccerway.com/matches/2025/03/15/fr...
3,Toulouse,Monaco,1,1,1 - 1,3,5,1,3,"[Rennes 2 vs 1 Nantes, Rennes 2 vs 1 Nantes, R...","[Marseille 3 vs 2 Toulouse, Le Havre 3 vs 2 Na...",[],,,https://ca.soccerway.com/matches/2025/03/07/fr...
4,Reims,Toulouse,1,0,1 - 0,3,5,1,3,"[Rennes 2 vs 1 Nantes, Rennes 2 vs 1 Nantes, R...","[Marseille 3 vs 2 Toulouse, Le Havre 3 vs 2 Na...",[],,,https://ca.soccerway.com/matches/2025/04/20/fr...


In [52]:
df_complete.columns

Index(['home_team', 'away_team', 'home_score', 'away_score', 'score',
       'zero_zero_count', 'early_goals_count', 'late_goals_count',
       'goal_count', 'early_goal_teams', 'late_goal_teams', 'minutes_list',
       'first_goal_minute', 'last_goal_minute', 'match_link'],
      dtype='object')

In [54]:
df_complete['first_goal_minute']

0       23
1       33
2     None
3     None
4     None
5     None
6       21
7     None
8     None
9     None
10      11
Name: first_goal_minute, dtype: object

In [None]:
df_complete[
    (df_complete["first_goal_minute"] >= range(70, 90)) &
    (
        df_complete["home_team"].str.lower().str.contains(team_filter_late) |
        df_complete["away_team"].str.lower().str.contains(team_filter_late)
    )
]

In [53]:
df_early.head()

Unnamed: 0,zero_zero_count,early_goals_count,early_goal_teams,goal_count
0,1,3,"[Rennes 2 vs 1 Nantes, Rennes 2 vs 1 Nantes, R...",2
1,2,5,"[Rennes 2 vs 1 Nantes, Rennes 2 vs 1 Nantes, R...",3
2,3,5,"[Rennes 2 vs 1 Nantes, Rennes 2 vs 1 Nantes, R...",3
3,3,5,"[Rennes 2 vs 1 Nantes, Rennes 2 vs 1 Nantes, R...",3
4,3,5,"[Rennes 2 vs 1 Nantes, Rennes 2 vs 1 Nantes, R...",3


In [98]:
df_zero.columns.tolist()

['0-0 Count']

In [100]:
df_zero.keys()

Index(['0-0 Count'], dtype='object')

In [101]:
df_zero.columns = df_zero.columns.str.strip() \
    .str.lower().str.replace('-', '_').str.replace(' ', '_')
df_zero.columns

Index(['0_0_count'], dtype='object')

In [99]:
df_zero['0-0 Count']

0    0
Name: 0-0 Count, dtype: int64

In [96]:
df_zero.head()

Unnamed: 0,0-0 Count
0,0


In [102]:
df_early.head()

Unnamed: 0,zero_zero_count,early_goals_count,early_goal_teams,goal_count
0,0,0,[],0


In [51]:
def close_overlay_if_present():
    try:
        overlay = WebDriverWait(driver, 2).until(
            EC.presence_of_element_located((By.CLASS_NAME, "fc-dialog-overlay"))
        )
        # overlay = driver.find_element(By.XPATH, "/html/body/div[8]/div[2]/div[2]/div[2]/button")
        driver.execute_script("arguments[0].remove();", overlay)
        print("Overlay removed.")
    except:
        pass  # Overlay not present

In [54]:
close_overlay_if_present()

In [58]:
def remove_all_overlays():
    driver.execute_script("""
        document.querySelectorAll('.fc-dialog-overlay, .fc-dialog-container').forEach(el => el.remove());
    """)
    print("All overlays removed.")
    
remove_all_overlays()

All overlays removed.


In [None]:
def handle_consent_popup():
    try:
        # wait = WebDriverWait(driver, 10)
        # load_button = wait.until(
        #     EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Load Previous')]"))
        # )
        consent_popup = WebDriverWait(driver, 2).until(
            EC.presence_of_element_located((By.CLASS_NAME, "fc-consent-root"))
        )
        driver.execute_script("arguments[0].remove();", consent_popup)
        print("Consent popup removed.")
    except:
        print("No consent popup found.")

handle_consent_popup()

In [66]:
def remove_all_overlays():
    blocking_classes = [
        "fc-dialog-overlay",         # Consent
        "fc-consent-root",           # Cookie pop
        # "sc-94e50ec4-0",             # Custom site overlay
        # "sc-e15f478f-1",             # Another wrapper layer
    ]
    for cls in blocking_classes:
        try:
            overlays = driver.find_elements(By.CLASS_NAME, cls)
            for overlay in overlays:
                driver.execute_script("arguments[0].remove();", overlay)
                print(f"✅ Removed overlay: {cls}")
        except Exception as e:
            print(f"⚠️ Could not remove overlay {cls}: {e}")


In [78]:
def click_previous_button():
    
    handle_consent_popup()  # Ensure consent popup is handled
    remove_all_overlays()  # Ensure button isn't blocked

    try:
        prev_btn = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div/div[2]/div[1]/div/div/div/button"))
        )
        # prev_btn = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div/div[2]/div[1]/div/div/div/button")
        driver.execute_script("arguments[0].scrollIntoView(true);", prev_btn)
        time.sleep(0.5)  # Ensure visibility
        prev_btn.click()
        time.sleep(2)  # Wait for data to load
        return True
    except Exception as e:
        print("No more previous matches or error occurred:", e)
        return False

# Usage
while True:
    success = click_previous_button()
    if not success:
        break
    print(extract_match_data(driver, match_hrefs))


No consent popup found.
✅ Removed overlay: fc-dialog-overlay
✅ Removed overlay: fc-consent-root
No more previous matches or error occurred: Message: 
Stacktrace:
#0 0x55cd901972ca <unknown>
#1 0x55cd8fc3e550 <unknown>
#2 0x55cd8fc900f0 <unknown>
#3 0x55cd8fc902e1 <unknown>
#4 0x55cd8fcde5e4 <unknown>
#5 0x55cd8fcb5bed <unknown>
#6 0x55cd8fcdb9e6 <unknown>
#7 0x55cd8fcb5993 <unknown>
#8 0x55cd8fc81d6b <unknown>
#9 0x55cd8fc83141 <unknown>
#10 0x55cd9015c2ab <unknown>
#11 0x55cd901600b9 <unknown>
#12 0x55cd90143139 <unknown>
#13 0x55cd90160c68 <unknown>
#14 0x55cd9012760f <unknown>
#15 0x55cd901851f8 <unknown>
#16 0x55cd901853d6 <unknown>
#17 0x55cd901965e6 <unknown>
#18 0x7d080149caa4 <unknown>
#19 0x7d0801529c3c <unknown>



In [56]:
# extract_match_data(driver, match_hrefs)

# Repeat via Previous button
while True:
    try:
        prev_btn = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div/div[2]/div[1]/div/div/div/button")
        driver.execute_script("arguments[0].scrollIntoView(true);", prev_btn)
        time.sleep(0.5)  # Ensure visibility
        prev_btn.click()
        time.sleep(2)  # allow page to load/update

        extracted_data = extract_match_data(driver, match_hrefs)
        print(extracted_data)

    except Exception as e:
        print("No more previous matches or error occurred:", e)
        break

No more previous matches or error occurred: Message: element click intercepted: Element <button class="sc-41ba8c7a-0 dGqIGb undefined">...</button> is not clickable at point (518, 14). Other element would receive the click: <div class="fc-consent-root">...</div>
  (Session info: chrome=138.0.7204.183); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#elementclickinterceptedexception
Stacktrace:
#0 0x55cd901972ca <unknown>
#1 0x55cd8fc3e550 <unknown>
#2 0x55cd8fc96f5c <unknown>
#3 0x55cd8fc94dc2 <unknown>
#4 0x55cd8fc92462 <unknown>
#5 0x55cd8fc91b80 <unknown>
#6 0x55cd8fc842ea <unknown>
#7 0x55cd8fcb5bc2 <unknown>
#8 0x55cd8fc83c6a <unknown>
#9 0x55cd8fcb5d8e <unknown>
#10 0x55cd8fcdb9e6 <unknown>
#11 0x55cd8fcb5993 <unknown>
#12 0x55cd8fc81d6b <unknown>
#13 0x55cd8fc83141 <unknown>
#14 0x55cd9015c2ab <unknown>
#15 0x55cd901600b9 <unknown>
#16 0x55cd90143139 <unknown>
#17 0x55cd90160c68 <unknown>
#18 0x55cd9012760f <

In [42]:
driver.get(url)
prev_btn = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div/div[2]/div[1]/div/div/div/button")
print(prev_btn.text)

Load previous


In [72]:
[1,2,3] + [4,5,6]

[1, 2, 3, 4, 5, 6]

In [47]:
print(int(home_score.text.strip()) + int(away_score.text.strip()))

DEBUG:selenium.webdriver.remote.remote_connection:GET http://localhost:40069/session/9d086d71c326b352c92572dda3d12b61/element/f.01E220F94D55318FAFB30F863D80B09F.d.F2E77589189FA1545767DB1810D30491.e.10032/text {}
DEBUG:urllib3.connectionpool:http://localhost:40069 "GET /session/9d086d71c326b352c92572dda3d12b61/element/f.01E220F94D55318FAFB30F863D80B09F.d.F2E77589189FA1545767DB1810D30491.e.10032/text HTTP/1.1" 200 0
DEBUG:selenium.webdriver.remote.remote_connection:Remote response: status=200 | data={"value":"3"} | headers=HTTPHeaderDict({'Content-Length': '13', 'Content-Type': 'application/json; charset=utf-8', 'cache-control': 'no-cache'})
DEBUG:selenium.webdriver.remote.remote_connection:Finished Request
DEBUG:selenium.webdriver.remote.remote_connection:GET http://localhost:40069/session/9d086d71c326b352c92572dda3d12b61/element/f.01E220F94D55318FAFB30F863D80B09F.d.F2E77589189FA1545767DB1810D30491.e.10033/text {}
DEBUG:urllib3.connectionpool:http://localhost:40069 "GET /session/9d086d7

3


In [None]:
# body > div.sc-99237a02-0.emYNNA:nth-child(5) > div.sc-81220147-0.jXeDBc > div.sc-81220147-1.eIIZxi.sc-19cad2e4-0.kkiDdB > div.sc-81220147-2.fgqGfg.simple-grid-item:nth-child(1) > div.sc-94e50ec4-0.dwozFa > div.sc-9ab61def-0.fHlmFM.sc-b4561063-12.hMGjRc:nth-child(1) > div.sc-b4561063-2.dgmodi > div.sc-9ab61def-0.cvqIJV.sc-b4561063-0.bItxxz:nth-child(3) > div.sc-b4561063-1.fzYiaa

# body > div.sc-99237a02-0.emYNNA:nth-child(5) > div.sc-81220147-0.jXeDBc > div.sc-81220147-1.eIIZxi.sc-19cad2e4-0.kkiDdB > div.sc-81220147-2.fgqGfg.simple-grid-item:nth-child(1) > div.sc-94e50ec4-0.dwozFa > div.sc-9ab61def-0.fHlmFM.sc-b4561063-12.hMGjRc:nth-child(1) > div.sc-b4561063-2.dgmodi > div.sc-9ab61def-0.cvqIJV.sc-b4561063-0.bItxxz:nth-child(3) > div.sc-b4561063-1.fzYiaa > div.sc-b4561063-8.lndvMZ:nth-child(2) > div.sc-b4561063-9.ckSzrE:nth-child(2) > div.sc-94e50ec4-0.ksqFna:nth-child(1) > div.sc-4e4c9eab-1.fPbPfi.label.sc-b4561063-6.hJImxc

# body > div.sc-99237a02-0.emYNNA:nth-child(5) > div.sc-81220147-0.jXeDBc > div.sc-81220147-1.eIIZxi.sc-19cad2e4-0.kkiDdB > div.sc-81220147-2.fgqGfg.simple-grid-item:nth-child(1) > div.sc-94e50ec4-0.dwozFa > div.sc-9ab61def-0.fHlmFM.sc-b4561063-12.hMGjRc:nth-child(1) > div.sc-b4561063-2.dgmodi > div.sc-9ab61def-0.cvqIJV.sc-b4561063-0.bItxxz:nth-child(3) > div.sc-b4561063-1.fzYiaa > div.sc-b4561063-8.lndvMZ:nth-child(2) > div.sc-b4561063-9.ckSzrE:nth-child(2) > div.sc-94e50ec4-0.ksqFna:nth-child(3) > div.sc-4e4c9eab-1.fPbPfi.label.sc-b4561063-6.hJImxc

# team   /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[1]/div
# team   /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[2]/div
#        /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[1]/div

# team_name   /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[1]/div/div/div/h1
# team_name   /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[2]/div/div/div/h1

# team_badge  /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[1]/div/span/img
# team_badge  /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[2]/div/span/img


# goals  /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/div/div/div[1]/div
# goals  /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/div/div/div[3]/div


#        /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[1]
#        /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[2]


# /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[1]/div[1]/span
# /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[1]/div[2]/span
# /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[2]/div/span


# /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[1]/div[1]
# /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[1]

# events_div_xpath = "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div[2]/div/div[2]/div[1]"