In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
import logging
import pandas as pd

In [2]:
# Set root logger level higher than DEBUG
logging.basicConfig(level=logging.DEBUG)  # DEBUG, INFO, WARNING, ERROR, CRITICAL

# Specifically silence noisy loggers
logging.getLogger("selenium").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

In [3]:
def start_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in background
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    # If chromedriver is in PATH, you don’t need to specify it
    driver = webdriver.Chrome(options=chrome_options)

    return driver

In [4]:
# def scrape_data():
url = "https://ca.soccerway.com/national/france/ligue-1/20242025/regular-season/r81802/matches/"
# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_experimental_option("detach", True)
driver = start_driver()
driver.get(url)

# Load all matches
for _ in range(40):
    try:
        # Load previous button
        btn = driver.find_element(By.XPATH, f"/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div/div[2]/div[1]/div/div/div/button")
        btn.click()
        time.sleep(2)
    except:
        break
    
# single match div
match = driver.find_element(By.XPATH, f"/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div/div[2]/div[2]/div/div/div[3]/div/div/div/a")
# return match

In [8]:
print(match.text)
print(match.get_attribute('outerHTML'))


<a class="sc-22ef6ec-0 sc-a1a6abf-2 boVFdS gAkJuB" href="/matches/2025/05/17/france/ligue-1/nantes/montpellier/4373049/"></a>


In [9]:
href = match.get_attribute('href')
print(href)

https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nantes/montpellier/4373049/


In [10]:
matches = driver.find_elements(By.XPATH, "//a[contains(@href, '/match/')]")
print(matches)

[]


In [11]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(driver.page_source, 'html.parser')
all_links = soup.find_all('a', href=True)

matches = [a for a in all_links if '/matches/' and '/france/ligue-1' in a['href']]
matches

[<a class="sc-22ef6ec-0 boVFdS" href="/national/france/ligue-1/2025-2026/regular-season/2d58bc25-77ec-4425-bed7-30f1839e3f8f/" style="width: 100%;"><div class="sc-575aeb00-1 gTtAsv"><span class="sc-600113f9-0 bzoXQy"><span class="sc-600113f9-1 klwUeB"><img alt="" data-nimg="1" decoding="async" height="18" loading="lazy" src="https://static.soccerway.com/flags/svg/france.svg" style="color: transparent;" width="18"/></span></span><span class="sc-4e4c9eab-2 jgDxUJ label label">Ligue 1</span></div></a>,
 <a class="sc-22ef6ec-0 boVFdS" href="/national/france/ligue-1/20242025/regular-season/r81802/" style="width: 100%;"><div class="sc-94e50ec4-0 fxiSqU total"><div class="sc-94e50ec4-0 qzEgh sc-d7965a66-0 hxTmOI"><span class="sc-4e4c9eab-2 etMNoc label">Summary</span></div></div></a>,
 <a class="sc-22ef6ec-0 boVFdS" href="/national/france/ligue-1/20242025/regular-season/r81802/matches/" style="width: 100%;"><div class="sc-94e50ec4-0 fxiSqU total"><div class="sc-94e50ec4-0 qzEgh sc-d7965a66-0 

In [12]:
match_hrefs = set()
europe_ligues = ['france/ligue-1', 'germany/bundesliga']

def get_match_hrefs(years):
    all_hrefs = set()
    for year in years:
        for ligue in europe_ligues:
            match_links = driver.find_elements(By.CSS_SELECTOR, f"a[href*='{ligue}'][href*='/matches/{year}/']")
            hrefs = {a.get_attribute('href') for a in match_links}
            all_hrefs.update(hrefs)
    return all_hrefs

# Example: load matches for 2024 only
target_years = [2023, 2024, 2025]  # or [2023, 2024, 2025] if you want multiple
hrefs = get_match_hrefs(target_years)

match_hrefs.update(hrefs)

for href in hrefs:
    print(href)

print(f"Total unique matches collected: {len(match_hrefs)}")


https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nantes/montpellier/4373049/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lyon/angers/4373044/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nice/brest/4373045/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lille/reims/4373047/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lens/monaco/4373050/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/strasbourg/le-havre/4373048/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/saint-tienne/toulouse/4373046/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/psg/auxerre/4373051/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/marseille/rennes/4373043/
Total unique matches collected: 9


In [21]:
print(match_hrefs)

{'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/strasbourg/le-havre/4373048/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nice/brest/4373045/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lens/monaco/4373050/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/psg/auxerre/4373051/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lyon/angers/4373044/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/saint-tienne/toulouse/4373046/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/marseille/rennes/4373043/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lille/reims/4373047/', 'https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nantes/montpellier/4373049/'}


In [None]:
def get_teams(driver):
    try:
        team_a = driver.find_element(By.XPATH, "//div[@class='team team-a']//a").text.strip()
        team_b = driver.find_element(By.XPATH, "//div[@class='team team-b']//a").text.strip()
        return f"{team_a} vs {team_b}"
    except:
        return "Unknown vs Unknown"

In [13]:
while True:
    try:
        consent_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "/html/body/div[8]/div[2]/div[2]/div[2]/button"))
            )
        consent_button.click()    # trigger consent button to remove modal
        print("✅ Consent button clicked.")
    except:
        pass    # consent may have been handled
        
    try:
        previous_count = len(match_hrefs)
        # remove the pop-up modal, cookie modal, consent button
        WebDriverWait(driver, 15).until(
            EC.invisibility_of_element_located((By.CLASS_NAME, "fc-dialog-overlay"))
        )
        load_prev_btn = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[3]/div[2]/div/div/div/div[1]/div/div[2]/div[1]/div/div/div/button"))
        )
        load_prev_btn.click()   # trigger load previous button
        # driver.execute_script("arguments[0].click();", load_prev_btn)
        
        # Wait for new content to load
        time.sleep(4)  # Or better: wait for a new match element to appear
        # Get new hrefs
        new_hrefs = get_match_hrefs([2023, 2024, 2025])
        print(f"Loaded {len(new_hrefs - match_hrefs)} matches")
        match_hrefs.update(new_hrefs)
    except Exception as e:
        print("No more 'Load Previous' button or loading stopped:", e)
        break
    
print(f"Total matches found for 2025 in france/ligue-1: {len(match_hrefs)}")
for href in sorted(match_hrefs):
    print(href)

No more 'Load Previous' button or loading stopped: Message: 

Total matches found for 2025 in france/ligue-1: 9
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lens/monaco/4373050/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lille/reims/4373047/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/lyon/angers/4373044/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/marseille/rennes/4373043/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nantes/montpellier/4373049/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/nice/brest/4373045/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/psg/auxerre/4373051/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/saint-tienne/toulouse/4373046/
https://ca.soccerway.com/matches/2025/05/17/france/ligue-1/strasbourg/le-havre/4373048/


In [71]:
def get_goal_minutes(goal_events_div_xpath):  # arg: goal_events_div_xpath
    try:
        team_div = driver.find_element(By.XPATH, goal_events_div_xpath)
        spans = team_div.find_elements(By.XPATH, ".//span")
        minutes = []
        for span in spans:
            text = span.text.strip()
            if "'" in text:
                try:
                    minute = int(text.replace("'", "").split('+')[0])
                    minutes.append(minute)
                except ValueError:
                    continue
        return minutes
    except NoSuchElementException:
        return []

min(get_goal_minutes("/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[1]"))

18

In [73]:
zero_zero_count = 0
late_goals_count = 0
valid_scores = {'0 - 1', '1 - 0', '1 - 1', '0 - 2', '2 - 0', '1 - 2', '2 - 1'}
late_goal_teams = []

for href in match_hrefs:
    driver.get(href)
    time.sleep(2)
    
    try:
        # Handle consent popup
        consent_button = driver.find_element(By.XPATH, "/html/body/div[8]/div[2]/div[2]/div[2]/button")
        consent_button.click()
        time.sleep(1)
    except:
        pass  # Consent may have already been handled

    try:
        home_score_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/div/div/div[1]/div")
        away_score_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/div/div/div[3]/div")
        home_team_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[1]/div")
        away_team_elem = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[2]/div")

        home_score = int(home_score_elem.text.strip())
        away_score = int(away_score_elem.text.strip())
        score = f"{home_score} - {away_score}"
        total_goals = home_score + away_score

        if score == "0 - 0":
            zero_zero_count += 1
            continue

        if score in valid_scores and 1 <= total_goals <= 3:
            home_goals_minutes = get_goal_minutes("/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[1]")
            away_goals_minutes = get_goal_minutes("/html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[2]")

            all_goals = sorted(home_goals_minutes + away_goals_minutes)

            if all_goals and all(minute >= 70 for minute in all_goals):
                late_goals_count += 1
                late_goal_teams.append(f"{home_team_elem.text.strip()} vs {away_team_elem.text.strip()}")

    except Exception as e:
        print(f"Error processing {href}: {e}")
        continue

print(f"\n✅ Total 0-0 games: {zero_zero_count}")
print(f"✅ Total 1-3 goal games with first goal after 70th minute: {late_goals_count}")
print("✅ Matches where all goals were after 70th minute:")
for match in late_goal_teams:
    print(f" - {match}")



✅ Total 0-0 games: 0
✅ Total 1-3 goal games with first goal after 70th minute: 0
✅ Matches where all goals were after 70th minute:


In [72]:
[1,2,3] + [4,5,6]

[1, 2, 3, 4, 5, 6]

In [47]:
print(int(home_score.text.strip()) + int(away_score.text.strip()))

DEBUG:selenium.webdriver.remote.remote_connection:GET http://localhost:40069/session/9d086d71c326b352c92572dda3d12b61/element/f.01E220F94D55318FAFB30F863D80B09F.d.F2E77589189FA1545767DB1810D30491.e.10032/text {}
DEBUG:urllib3.connectionpool:http://localhost:40069 "GET /session/9d086d71c326b352c92572dda3d12b61/element/f.01E220F94D55318FAFB30F863D80B09F.d.F2E77589189FA1545767DB1810D30491.e.10032/text HTTP/1.1" 200 0
DEBUG:selenium.webdriver.remote.remote_connection:Remote response: status=200 | data={"value":"3"} | headers=HTTPHeaderDict({'Content-Length': '13', 'Content-Type': 'application/json; charset=utf-8', 'cache-control': 'no-cache'})
DEBUG:selenium.webdriver.remote.remote_connection:Finished Request
DEBUG:selenium.webdriver.remote.remote_connection:GET http://localhost:40069/session/9d086d71c326b352c92572dda3d12b61/element/f.01E220F94D55318FAFB30F863D80B09F.d.F2E77589189FA1545767DB1810D30491.e.10033/text {}
DEBUG:urllib3.connectionpool:http://localhost:40069 "GET /session/9d086d7

3


In [None]:
# body > div.sc-99237a02-0.emYNNA:nth-child(5) > div.sc-81220147-0.jXeDBc > div.sc-81220147-1.eIIZxi.sc-19cad2e4-0.kkiDdB > div.sc-81220147-2.fgqGfg.simple-grid-item:nth-child(1) > div.sc-94e50ec4-0.dwozFa > div.sc-9ab61def-0.fHlmFM.sc-b4561063-12.hMGjRc:nth-child(1) > div.sc-b4561063-2.dgmodi > div.sc-9ab61def-0.cvqIJV.sc-b4561063-0.bItxxz:nth-child(3) > div.sc-b4561063-1.fzYiaa

# body > div.sc-99237a02-0.emYNNA:nth-child(5) > div.sc-81220147-0.jXeDBc > div.sc-81220147-1.eIIZxi.sc-19cad2e4-0.kkiDdB > div.sc-81220147-2.fgqGfg.simple-grid-item:nth-child(1) > div.sc-94e50ec4-0.dwozFa > div.sc-9ab61def-0.fHlmFM.sc-b4561063-12.hMGjRc:nth-child(1) > div.sc-b4561063-2.dgmodi > div.sc-9ab61def-0.cvqIJV.sc-b4561063-0.bItxxz:nth-child(3) > div.sc-b4561063-1.fzYiaa > div.sc-b4561063-8.lndvMZ:nth-child(2) > div.sc-b4561063-9.ckSzrE:nth-child(2) > div.sc-94e50ec4-0.ksqFna:nth-child(1) > div.sc-4e4c9eab-1.fPbPfi.label.sc-b4561063-6.hJImxc

# body > div.sc-99237a02-0.emYNNA:nth-child(5) > div.sc-81220147-0.jXeDBc > div.sc-81220147-1.eIIZxi.sc-19cad2e4-0.kkiDdB > div.sc-81220147-2.fgqGfg.simple-grid-item:nth-child(1) > div.sc-94e50ec4-0.dwozFa > div.sc-9ab61def-0.fHlmFM.sc-b4561063-12.hMGjRc:nth-child(1) > div.sc-b4561063-2.dgmodi > div.sc-9ab61def-0.cvqIJV.sc-b4561063-0.bItxxz:nth-child(3) > div.sc-b4561063-1.fzYiaa > div.sc-b4561063-8.lndvMZ:nth-child(2) > div.sc-b4561063-9.ckSzrE:nth-child(2) > div.sc-94e50ec4-0.ksqFna:nth-child(3) > div.sc-4e4c9eab-1.fPbPfi.label.sc-b4561063-6.hJImxc

# team   /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[1]/div
# team   /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/a[2]/div

# goals  /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/div/div/div[1]/div
# goals  /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[3]/div/div/div/div[3]/div

# /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[1]/div[1]/span
# /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[1]/div[2]/span
# /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[2]/div/span


# /html/body/div[3]/div/div/div[1]/div/div[1]/div/div[5]/div[1]/div[1]
