Collection data Seriea

In [11]:
#! pip install webdriver_manager

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import os
from time import sleep

def setup_driver(headless=True):
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(service=service, options=options)
    return driver

def scrape_table(url, season, headless=True):
    driver = setup_driver(headless)
    driver.get(url)

    try:
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.NAME, "season")))
        select = Select(driver.find_element(By.NAME, "season"))
        select.select_by_value(season)
        sleep(5)  # Adjust based on observed page load times

        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "table")))
        html = driver.page_source
        tables = pd.read_html(html)
    except Exception as e:
        print(f"Error scraping season {season}: {e}")
        return None
    finally:
        driver.quit()
    return tables

def save_tables(tables, directory, season):
    """Save each table to a separate CSV file in the specified directory with season included in filename."""
    os.makedirs(directory, exist_ok=True)
    for index, table in enumerate(tables):
        file_path = os.path.join(directory, f"seriea-{season.replace('-', '_')}.csv")
        table.to_csv(file_path, index=False)
        print(f"Saved Table for {season} to {file_path}")

def scrape_years():
    url = 'https://www.legaseriea.it/en/serie-a/classifica'
    seasons = [f"{year}-{str(year+1)[-2:]}" for year in range(1986, 2024)]
    for season in seasons:
        print(f"Scraping data for season: {season}")
        tables = scrape_table(url, season, headless=False)
        if tables:
            save_tables(tables, "data/raw/seriea", season)
        else:
            print("No tables found for season:", season)

scrape_years()


Scraping data for season: 2016-17


  tables = pd.read_html(html)


Saved Table for 2016-17 to data/raw/seriea\seriea-2016_17.csv


Collection data bundesliga

In [12]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from io import StringIO
import pandas as pd
import os

def setup_driver(headless=True):
    """Set up Chrome WebDriver with options."""
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)
    options.add_argument("--disable-javascript")
    driver = webdriver.Chrome(service=service, options=options)
    driver.set_page_load_timeout(90)
    return driver

def scrape_table(url, season, headless=True):
    """Scrapes the table for the given season."""
    driver = setup_driver(headless)
    attempts = 0
    max_attempts = 3
    season_url = url.format(season=season.replace('/', '-'))  # Ensure URL format is correct

    while attempts < max_attempts:
        try:
            driver.get(season_url)
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "table")))
            html = driver.page_source
            tables = pd.read_html(StringIO(html))
            return tables
        except Exception as e:
            print(f"Attempt {attempts + 1}: Error during scraping {season}: {e}")
            attempts += 1
            driver.quit()
            driver = setup_driver(headless)
        finally:
            if attempts == max_attempts:
                driver.quit()

    return None

def save_tables(tables, directory, season):
    """Saves the scraped tables to a CSV file."""
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, f"bundesliga-{season.replace('-', '_')}.csv")
    tables[0].to_csv(file_path, index=False)
    print(f"Saved Table for {season} to {file_path}")

def scrape_years():
    """Initiates scraping for a range of seasons."""
    base_url = 'https://www.bundesliga.com/en/bundesliga/table/{season}'
    seasons = [f"{year}-{year+1}" for year in range(1963, 2024)]
    for season in seasons:
        print(f"Scraping data for season: {season}")
        tables = scrape_table(base_url, season, headless=False)
        if tables:
            save_tables(tables, "data/raw/bundesliga", season)
        else:
            print(f"No tables found for {season}.")

scrape_years()


Scraping data for season: 1963-1964
Saved Table for 1963-1964 to data/raw/bundesliga\bundesliga-1963_1964.csv
Scraping data for season: 1964-1965
Saved Table for 1964-1965 to data/raw/bundesliga\bundesliga-1964_1965.csv
Scraping data for season: 1965-1966
Saved Table for 1965-1966 to data/raw/bundesliga\bundesliga-1965_1966.csv
Scraping data for season: 1966-1967
Saved Table for 1966-1967 to data/raw/bundesliga\bundesliga-1966_1967.csv
Scraping data for season: 1967-1968
Saved Table for 1967-1968 to data/raw/bundesliga\bundesliga-1967_1968.csv
Scraping data for season: 1968-1969
Saved Table for 1968-1969 to data/raw/bundesliga\bundesliga-1968_1969.csv
Scraping data for season: 1969-1970
Saved Table for 1969-1970 to data/raw/bundesliga\bundesliga-1969_1970.csv
Scraping data for season: 1970-1971
Saved Table for 1970-1971 to data/raw/bundesliga\bundesliga-1970_1971.csv
Scraping data for season: 1971-1972
Saved Table for 1971-1972 to data/raw/bundesliga\bundesliga-1971_1972.csv
Scraping d

Collection data ligue1

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def setup_driver():
    """Set up and return a Selenium WebDriver."""
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in headless mode.
    options.add_argument('--no-sandbox')  # Bypass OS security model, required for Docker.
    options.add_argument('--disable-dev-shm-usage')  # Overcome limited resource problems.
    driver = webdriver.Chrome(service=service, options=options)
    return driver

def download_data_with_selenium(url, xpath):
    """Download data from the specified URL using Selenium and the specified XPath."""
    driver = setup_driver()
    driver.get(url)
    
    try:
        # Wait until the elements from the XPath are loaded
        WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, xpath)))
        
        # Collect data
        data = []
        elements = driver.find_elements(By.XPATH, xpath)
        for element in elements:
            data.append(element.text.split('\n'))

        # Create DataFrame
        if data:
            df = pd.DataFrame(data)
            return df
        else:
            return pd.DataFrame()
    finally:
        driver.quit()

def save_data(df, filename):
    """Save the DataFrame to CSV."""
    if not df.empty:
        file_path = f'data/raw/ligue1/{filename}.csv'
        df.to_csv(file_path, index=False)
        print(f'Successfully downloaded and saved data to {file_path}')
    else:
        print(f'No data to save for {filename}.')

def main():
    base_url = 'https://www.ligue1.com/ranking?seasonId='
    seasons = [f"{year}-{year+1}" for year in range(1993, 2024)]  # Adjust range as needed
    xpath = '//a[contains(@class, "GeneralStats-link")]'

    for season in seasons:
        url = f"{base_url}{season}&StatsActiveTab=0"
        filename = f'ligue1_{season}'
        try:
            print(f"Downloading data for season {season}...")
            df = download_data_with_selenium(url, xpath)
            save_data(df, filename)
        except Exception as e:
            print(f'Failed to download data for season {season}: {e}')

if __name__ == '__main__':
    main()


Downloading data for season 1993-1994...
Successfully downloaded and saved data to data/raw/ligue1/ligue1_1993-1994.csv
Downloading data for season 1994-1995...
Successfully downloaded and saved data to data/raw/ligue1/ligue1_1994-1995.csv
Downloading data for season 1995-1996...
Successfully downloaded and saved data to data/raw/ligue1/ligue1_1995-1996.csv
Downloading data for season 1996-1997...
Successfully downloaded and saved data to data/raw/ligue1/ligue1_1996-1997.csv
Downloading data for season 1997-1998...
Successfully downloaded and saved data to data/raw/ligue1/ligue1_1997-1998.csv
Downloading data for season 1998-1999...
Successfully downloaded and saved data to data/raw/ligue1/ligue1_1998-1999.csv
Downloading data for season 1999-2000...
Successfully downloaded and saved data to data/raw/ligue1/ligue1_1999-2000.csv
Downloading data for season 2000-2001...
Successfully downloaded and saved data to data/raw/ligue1/ligue1_2000-2001.csv
Downloading data for season 2001-2002...

Collection laliga

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from io import StringIO
import pandas as pd
import os

def setup_driver(headless=True):
    """Set up Chrome WebDriver with options."""
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)
    options.add_argument("--disable-javascript")
    driver = webdriver.Chrome(service=service, options=options)
    driver.set_page_load_timeout(90)
    return driver

def scrape_table(url, season, headless=True):
    """Scrapes the table for the given season."""
    driver = setup_driver(headless)
    attempts = 0
    max_attempts = 3
    season_url = url.format(season=season.replace('/', '-'))  # Ensure URL format is correct

    while attempts < max_attempts:
        try:
            driver.get(season_url)
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "table")))
            html = driver.page_source
            tables = pd.read_html(StringIO(html))
            return tables
        except Exception as e:
            print(f"Attempt {attempts + 1}: Error during scraping {season}: {e}")
            attempts += 1
            driver.quit()
            driver = setup_driver(headless)
        finally:
            if attempts == max_attempts:
                driver.quit()

    return None

def save_tables(tables, directory, season):
    """Saves the scraped tables to a CSV file."""
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, f"laliga-{season.replace('-', '_')}.csv")
    tables[0].to_csv(file_path, index=False)
    print(f"Saved Table for {season} to {file_path}")

def scrape_years():
    """Initiates scraping for a range of seasons."""
    base_url = 'https://www.footmercato.net/espagne/liga/{season}/classement'
    seasons = [f"{year}-{year+1}" for year in range(2003, 2024)]
    for season in seasons:
        print(f"Scraping data for season: {season}")
        tables = scrape_table(base_url, season, headless=False)
        if tables:
            save_tables(tables, "data/raw/laliga", season)
        else:
            print(f"No tables found for {season}.")

scrape_years()

Scraping data for season: 2003-2004
Saved Table for 2003-2004 to data/raw/laliga\laliga-2003_2004.csv
Scraping data for season: 2004-2005
Saved Table for 2004-2005 to data/raw/laliga\laliga-2004_2005.csv
Scraping data for season: 2005-2006
Saved Table for 2005-2006 to data/raw/laliga\laliga-2005_2006.csv
Scraping data for season: 2006-2007
Saved Table for 2006-2007 to data/raw/laliga\laliga-2006_2007.csv
Scraping data for season: 2007-2008
Saved Table for 2007-2008 to data/raw/laliga\laliga-2007_2008.csv
Scraping data for season: 2008-2009
Saved Table for 2008-2009 to data/raw/laliga\laliga-2008_2009.csv
Scraping data for season: 2009-2010
Saved Table for 2009-2010 to data/raw/laliga\laliga-2009_2010.csv
Scraping data for season: 2010-2011
Saved Table for 2010-2011 to data/raw/laliga\laliga-2010_2011.csv
Scraping data for season: 2011-2012
Saved Table for 2011-2012 to data/raw/laliga\laliga-2011_2012.csv
Scraping data for season: 2012-2013
Saved Table for 2012-2013 to data/raw/laliga\l

Collection premierleague


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from io import StringIO
import os
import time

# Setup environment for WebDriver
os.environ['WDM_LOCAL'] = '1'
os.environ['WDM_LOG_LEVEL'] = '0'
os.environ['WDM_CACHE_DIR'] = os.path.join(os.path.expanduser('~'), '.wdm')

def setup_driver(headless=True):
    manager = ChromeDriverManager()
    service = Service(manager.install())
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--verbose')
    options.add_argument('--log-path=chromedriver.log')
    driver = webdriver.Chrome(service=service, options=options)
    return driver

def dismiss_ads(driver):
    try:
        # Execute JavaScript to close ads
        driver.execute_script("""
            var closeButton = document.getElementById('advertClose');
            if (closeButton) {
                closeButton.click();
            }
        """)
    except Exception as e:
        print("Error dismissing ads: ", str(e))


def filter_by_season(driver, season_id):
    try:
        # Wait until the season dropdown is present
        season_dropdown = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".current[data-dropdown-current='compSeasons']"))
        )
        
        # Click on the season dropdown
        season_dropdown.click()
        time.sleep(5)
        # Find the dropdown options
        dropdown_options = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "ul[data-dropdown-list='compSeasons'] li"))
        )
        
        # Loop through the dropdown options
        for option in dropdown_options:
            # Check if the option's data-option-id matches the desired season_id
            if option.get_attribute("data-option-id") == season_id:
                # Click on the option
                option.click()
                break

         # Add a small delay before proceeding
        time.sleep(5)

    except Exception as e:
        print("Error clicking season element: ", str(e))

def scrape_table(url, season_id, headless=False):
    driver = setup_driver(headless)
    driver.get(url)
    try:
        dismiss_ads(driver)
        
        # Click on the dropdown to select the season
        filter_by_season(driver, season_id)  # Correction ici
        
        # Wait until the table is present
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "table"))
        )
        
        # Read the HTML content and parse it using pandas
        html = driver.page_source
        html_io = StringIO(html)
        tables = pd.read_html(html_io)
        
        if not tables:
            raise ValueError("No tables found on the page")
    except Exception as e:
        print(f"Error scraping season {season_id}: {e}")
        return None
    finally:
        driver.quit()
    return tables


def save_tables(tables, directory, season):
    if tables:
        os.makedirs(directory, exist_ok=True)  # Crée le répertoire s'il n'existe pas
        file_path = os.path.join(directory, f"premierleague-{season.replace('-', '_')}.csv")
        
        # Afficher le chemin pour le débogage
        print("File path:", file_path)
        
        tables[0].to_csv(file_path, index=False)  # Enregistre la première table
        print(f"Saved Table for {season} to {file_path}")



def scrape_years():
    url = 'https://www.premierleague.com/tables'
    season_ids = {
        "1992-93": "1" , "1993-94": "2","1994-95": "3","1995-96": "4", "1996-97": "5","1997-98": "6","1998-99": "7","1999-00": "8",   
        "2000-01": "9","2001-02": "10","2002-03": "11","2003-04": "12","2004-05": "13","2005-06": "14","2006-07": "15","2007-08": "16",
        "2008-09": "17","2009-10": "18","2010-11": "19","2011-12": "20", "2012-13": "21","2013-14": "22","2014-15": "27","2015-16": "42",
        "2016-17": "54","2017-18": "79","2018-19": "210","2019-20": "274","2020-21": "363","2021-22": "418","2022-23": "489","2023-24": "578"
    
    }
   
    for season, id in season_ids.items():
        print(f"Scraping data for season: {season}")
        tables = scrape_table(url, id, headless=False)
        if tables:
            save_tables(tables, "data/raw/premierleague", season)
        else:
            print(f"No tables found for season: {season}")

if __name__ == '__main__':
    scrape_years()



Scraping data for season: 1998-99
File path: data/raw/premierleague\premierleague-1998_99.csv
Saved Table for 1998-99 to data/raw/premierleague\premierleague-1998_99.csv
Scraping data for season: 2006-07
File path: data/raw/premierleague\premierleague-2006_07.csv
Saved Table for 2006-07 to data/raw/premierleague\premierleague-2006_07.csv
Scraping data for season: 2007-08
File path: data/raw/premierleague\premierleague-2007_08.csv
Saved Table for 2007-08 to data/raw/premierleague\premierleague-2007_08.csv
