In [1]:
import logging
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager



In [2]:
# Configure logging once for the notebook: INFO and above, with each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


In [3]:
# Transfermarkt page that the scraper loads (host and path kept as separate
# literals for readability; they concatenate into the exact original URL).
URL = (
    "https://www.transfermarkt.com"
    "/scorer/toptorschuetzen/statistik/2024/plus/1/galerie/0"
)

In [4]:
# Folder that receives the scraped CSV. It is created up front (including any
# missing parents) so the later write cannot fail on a missing path.
OUTPUT_DIR = "data/raw"
if not os.path.isdir(OUTPUT_DIR):
    # exist_ok=True keeps this safe if the directory appears between the
    # check and the call.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
# Function to set up Selenium WebDriver
def setup_driver():
    """Build and return a Chrome WebDriver configured through command-line switches.

    The driver executable path comes from ``ChromeDriverManager().install()``
    and is handed to Selenium through a ``Service`` object.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    # Switches are applied in this order, one add_argument call each.
    chrome_flags = (
        "--headless",  # Run in headless mode (no UI)
        "--disable-gpu",
        "--window-size=1920,1080",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )

In [6]:
# CSV column name -> XPath (relative to a table row) of the cell holding it.
# Insertion order defines the column order of the exported file.
_PLAYER_CELL_XPATHS = {
    "Name": ".//td[2]//a",
    "Age": ".//td[3]",
    "Club": ".//td[4]//a",
    "Matches": ".//td[5]",
    "Goals": ".//td[6]",
    "Market Value": ".//td[last()]",
}


def _cell_text(row, xpath):
    """Return the stripped text of the first element under ``row`` matching ``xpath``.

    Uses a single ``find_elements`` lookup, which yields an empty list when
    nothing matches, so a missing cell produces the placeholder ``"N/A"``
    instead of raising and costs only one WebDriver call.
    """
    matches = row.find_elements(By.XPATH, xpath)
    return matches[0].text.strip() if matches else "N/A"


# Function to scrape data using Selenium
def scrape_strikers():
    """Scrape the table at ``URL`` and save it as a CSV file in ``OUTPUT_DIR``.

    Steps:
        1. Start a browser via ``setup_driver()`` and load ``URL``.
        2. Wait up to 10 seconds for an element with class ``items``; if it
           never appears, log an error and return without writing a file.
        3. Read one record per ``<tr>`` of the ``items`` table, using the
           XPaths in ``_PLAYER_CELL_XPATHS``; rows that raise are logged and
           skipped.
        4. Write the records to ``transfermarkt_top_scorers.csv``.

    The browser is always shut down, including when an unexpected error
    escapes. If no rows were extracted, a warning is logged and a header-only
    CSV is written so the file remains readable.

    Returns:
        None.

    Raises:
        Any non-timeout WebDriver error raised while loading or reading the
        page propagates to the caller (after the browser has been closed).
    """
    driver = setup_driver()
    all_players = []
    try:
        logging.info("🚀 Launching browser and fetching Transfermarkt page...")
        driver.get(URL)

        # Wait until the table loads. Only a timeout means "table not found";
        # anything else (browser crash, Ctrl-C, ...) must not be mislabelled.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "items"))
            )
        except TimeoutException:
            logging.error("❌ Table not found. Exiting...")
            return
        logging.info("✅ Table loaded successfully!")

        # Extract rows from the table
        rows = driver.find_elements(By.XPATH, "//table[contains(@class, 'items')]/tbody/tr")

        for row in rows:
            try:
                player_data = {
                    column: _cell_text(row, xpath)
                    for column, xpath in _PLAYER_CELL_XPATHS.items()
                }
                all_players.append(player_data)
            except Exception as e:
                # Best effort: one unreadable row should not abort the scrape.
                logging.warning(f"⚠️ Skipping row due to error: {e}")
    finally:
        # Close browser session on every exit path so no Chrome process leaks.
        driver.quit()

    if not all_players:
        logging.warning("⚠️ No player rows were extracted; writing a header-only CSV.")

    # Convert to DataFrame; explicit columns keep the header even when empty.
    df_players = pd.DataFrame(all_players, columns=list(_PLAYER_CELL_XPATHS))

    # Save data to CSV
    output_file = os.path.join(OUTPUT_DIR, "transfermarkt_top_scorers.csv")
    df_players.to_csv(output_file, index=False)
    logging.info(f"✅ Data saved successfully in {output_file}")

In [7]:
# Run the scraper only when this code executes as the main program; the guard
# skips the call when __name__ is anything other than "__main__" (e.g. on import).
if __name__ == "__main__":
    scrape_strikers()

2025-03-26 18:28:26,709 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-26 18:28:26,760 - INFO - Get LATEST chromedriver version for google-chrome
2025-03-26 18:28:26,806 - INFO - Driver [/Users/hachikaruanyakwee/.wdm/drivers/chromedriver/mac64/134.0.6998.165/chromedriver-mac-arm64/chromedriver] found in cache
2025-03-26 18:28:27,953 - INFO - 🚀 Launching browser and fetching Transfermarkt page...
2025-03-26 18:28:29,729 - INFO - ✅ Table loaded successfully!
2025-03-26 18:28:31,416 - INFO - ✅ Data saved successfully in data/raw/transfermarkt_top_scorers.csv
