In [1]:
import os
from pathlib import Path

# Set working directory to the project root when running from 'notebooks/'.
# The step up to the parent happens only if the current directory is actually named
# 'notebooks'. Without that guard, re-running this cell after the chdir below would
# climb one directory higher on every execution and silently break relative paths.
notebook_dir = Path().resolve()
if notebook_dir.name.lower() == "notebooks":
    project_root = notebook_dir.parent
else:
    # Already at the project root (e.g. the cell was re-run): stay where we are.
    project_root = notebook_dir

os.chdir(project_root)
print(f"✅ Working directory set to: {project_root}")

✅ Working directory set to: F:\JCMDataCenter\Proyectos\Football_analysis


In [2]:
# 📦 Imports
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from pathlib import Path

In [3]:
# ⚙️ Configuration
# Location of the chromedriver binary. The CHROMEDRIVER_PATH environment variable
# overrides the default so the notebook can run on machines with a different layout;
# the default is the original hard-coded Windows path.
CHROMEDRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH", "C:/Windows/System32/chromedriver.exe"
)
# Seconds to wait after each page load before reading the page source.
SLEEP_TIME = 8

# List of players to scrape: display name plus the id used in the player's FBref URL.
players = [
    {"name": "Viktor Gyokeres", "id": "4d5a9185"}
]

# Output CSV file (path is relative to the current working directory).
output_dir = Path("data/raw/")
output_dir.mkdir(parents=True, exist_ok=True)
output_csv = output_dir / "matchlogs_fbref.csv"

In [4]:
from collections import Counter

# 🚀 Set up headless Chrome
# Build the Chrome options: run without a visible window and with GPU use disabled.
options = Options()
for chrome_flag in ("--headless", "--disable-gpu"):
    options.add_argument(chrome_flag)

# Start the browser through the chromedriver binary named in the configuration cell.
chrome_service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=chrome_service, options=options)

# Accumulates one dict per scraped match-log row, across all players and seasons.
all_matchlogs = list()

# 🔁 Helper: rename duplicated columns like 'Att', 'Cmp'
def disambiguate_columns(columns):
    """Return the column names with repeated names made unique.

    The first occurrence of a name is kept as-is; the n-th occurrence (n >= 2)
    becomes ``"<name>_<n>"``, e.g. ``["Cmp", "Att", "Cmp"]`` -> ``["Cmp", "Att", "Cmp_2"]``.

    If a generated name would clash with a name that is already in the output
    (for example the input already contains a literal ``"Att_2"``), the numeric
    suffix is bumped until the name is free. The result is therefore always free
    of duplicates, which matters because callers use it as dictionary keys and a
    repeated key would silently overwrite another column's value.

    Args:
        columns: Iterable of column-name strings, in table order.

    Returns:
        A list of unique names, same length and order as ``columns``.
    """
    counts = Counter()
    used = set()
    new_columns = []
    for col in columns:
        counts[col] += 1
        candidate = col if counts[col] == 1 else f"{col}_{counts[col]}"
        # Skip over suffixes already taken by an earlier (literal or generated) name.
        while candidate in used:
            counts[col] += 1
            candidate = f"{col}_{counts[col]}"
        used.add(candidate)
        new_columns.append(candidate)
    return new_columns

# 🔄 Loop through each player
# The whole scrape runs inside try/finally so the browser is always shut down,
# including when the cell is interrupted (KeyboardInterrupt is not an `Exception`
# and would otherwise skip the quit call and leave a Chrome process running).
try:
    for player in players:
        player_name = player["name"]
        player_id = player["id"]
        profile_url = f"https://fbref.com/en/players/{player_id}/{player_name.replace(' ', '-')}"

        print(f"\n🔎 Scraping match logs for: {player_name}")

        try:
            driver.get(profile_url)
            time.sleep(SLEEP_TIME)  # fixed wait before reading the page source
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Collect the per-season "summary" match-log links from the page's inner nav.
            # A set is used so that a link appearing several times is visited only once.
            nav_section = soup.find("div", id="inner_nav")
            matchlog_links = {
                a["href"] for a in nav_section.find_all("a", href=True)
                if "/matchlogs/" in a["href"] and "Match-Logs" in a["href"] and "summary" in a["href"].lower()
            } if nav_section else set()

            if not matchlog_links:
                print("⚠️ No match log links found.")
                continue

            # Sorted so seasons are visited in a deterministic order on every run
            # (plain set iteration order is arbitrary).
            for relative_url in sorted(matchlog_links):
                full_url = "https://fbref.com" + relative_url
                # The path segment right after "/matchlogs/" is used as the season label.
                season = relative_url.split("/matchlogs/")[1].split("/")[0]
                print(f"   📅 Season: {season} — {full_url}")

                try:
                    driver.get(full_url)
                    time.sleep(SLEEP_TIME)
                    season_soup = BeautifulSoup(driver.page_source, "html.parser")
                    table = season_soup.find("table", {"id": "matchlogs_all"})

                    if table:
                        # The last header row holds the per-column names.
                        header_row = table.find("thead").find_all("tr")[-1]
                        raw_columns = [th.text.strip() for th in header_row.find_all("th")]
                        columns = disambiguate_columns(raw_columns)  # ✅ Rename duplicated columns

                        for row in table.find("tbody").find_all("tr"):
                            # Skip header rows repeated inside the table body.
                            if "thead" in (row.get("class") or []):
                                continue
                            cells = row.find_all(["th", "td"])
                            values = [cell.text.strip() for cell in cells]
                            # NOTE(review): values are matched to columns by position, and zip
                            # stops at the shorter sequence. A row with merged cells would be
                            # misaligned - confirm whether the source table contains such rows.
                            row_data = dict(zip(columns, values))
                            row_data["season"] = season
                            row_data["player_name"] = player_name
                            row_data["player_id"] = player_id
                            all_matchlogs.append(row_data)

                    else:
                        print("   ⚠️ No matchlogs_all table found.")
                except Exception as e:
                    # Best effort: report the failed season and carry on with the next one.
                    print(f"   ❌ Error scraping season {season}: {e}")
        except Exception as e:
            # Best effort: report the failed player and carry on with the next one.
            print(f"❌ Error scraping player {player_name}: {e}")
finally:
    driver.quit()



🔎 Scraping match logs for: Viktor Gyokeres
   📅 Season: 2015 — https://fbref.com/en/players/4d5a9185/matchlogs/2015/summary/Viktor-Gyokeres-Match-Logs
   📅 Season: 2019-2020 — https://fbref.com/en/players/4d5a9185/matchlogs/2019-2020/summary/Viktor-Gyokeres-Match-Logs
   📅 Season: 2017-2018 — https://fbref.com/en/players/4d5a9185/matchlogs/2017-2018/summary/Viktor-Gyokeres-Match-Logs
   📅 Season: 2018-2019 — https://fbref.com/en/players/4d5a9185/matchlogs/2018-2019/summary/Viktor-Gyokeres-Match-Logs
   📅 Season: 2023-2024 — https://fbref.com/en/players/4d5a9185/matchlogs/2023-2024/summary/Viktor-Gyokeres-Match-Logs
   📅 Season: 2017 — https://fbref.com/en/players/4d5a9185/matchlogs/2017/summary/Viktor-Gyokeres-Match-Logs
   📅 Season: 2020-2021 — https://fbref.com/en/players/4d5a9185/matchlogs/2020-2021/summary/Viktor-Gyokeres-Match-Logs
   📅 Season: 2021-2022 — https://fbref.com/en/players/4d5a9185/matchlogs/2021-2022/summary/Viktor-Gyokeres-Match-Logs
   📅 Season: 2022-2023 — https:/

In [5]:
# 💾 Save all matchlogs to CSV
# Persist the scraped rows; when the CSV already exists, merge the new rows into it.
if all_matchlogs:
    df_matchlogs = pd.DataFrame(all_matchlogs)
    output_csv = Path("data/raw/matchlogs_fbref.csv")

    if output_csv.exists():
        # Load existing data as plain strings. keep_default_na=False keeps empty cells
        # as "" instead of NaN, so stored values compare equal to freshly scraped ones
        # (which are always strings) when duplicates are detected below.
        df_existing = pd.read_csv(output_csv, dtype=str, keep_default_na=False)

        # Append new rows, then remove duplicates (based on player_id + Date).
        # The first occurrence wins, i.e. rows already stored take precedence over
        # re-scraped rows with the same key.
        df_combined = pd.concat(
            [df_existing, df_matchlogs], ignore_index=True
        ).drop_duplicates(subset=["player_id", "Date"])

        # Save combined data
        df_combined.to_csv(output_csv, index=False)
        print(f"\n✅ New data appended to: {output_csv.resolve()} | Total rows: {len(df_combined)}")
    else:
        # Save as new file
        df_matchlogs.to_csv(output_csv, index=False)
        print(f"\n✅ Match logs saved to: {output_csv.resolve()}")
else:
    print("\n🚫 No data to save.")



✅ New data appended to: F:\JCMDataCenter\Proyectos\Football_analysis\data\raw\matchlogs_fbref.csv | Total rows: 527
