In [None]:
# 00 - Web Scraping & Ingestion
# This notebook collects football match data from Transfermarkt and stores it in Delta format for further processing.

In [None]:
# 1. Import Libraries
# Import all required libraries for web scraping, data handling, and Spark session.
import requests
from bs4 import BeautifulSoup
import time
import random
import pandas as pd
from pyspark.sql import SparkSession

In [None]:
# 2. Spark Session Initialization
# Reuse the active Spark session if one exists; otherwise create a new one.
# The resulting `spark` handle is used later in this notebook to turn the
# scraped pandas DataFrames into Spark DataFrames.
spark = SparkSession.builder.getOrCreate()

In [None]:
# 3. Configuration
# Define leagues, scraping parameters, and output paths.
# League registry, one row per competition:
#   (display name, competition code used in the URL, URL slug, division code written to the output)
_LEAGUE_TABLE = (
    ('Premier League', 'GB1', 'premier-league', 'E0'),
    ('Championship', 'GB2', 'championship', 'E1'),
    ('Bundesliga', 'L1', 'bundesliga', 'D1'),
    ('2. Bundesliga', 'L2', '2-bundesliga', 'D2'),
    ('Ligue 1', 'FR1', 'ligue-1', 'F1'),
    ('Ligue 2', 'FR2', 'ligue-2', 'F2'),
)
# Mapping of display name -> per-league settings, in the order listed above.
LEAGUES = {
    display_name: {'code': tm_code, 'url_slug': tm_slug, 'div_output': div_code}
    for display_name, tm_code, tm_slug, div_code in _LEAGUE_TABLE
}

# Site root that every fixture-page URL is built from.
BASE_URL = "https://www.transfermarkt.com"

# First and last season start years to scrape (both inclusive).
START_SEASON = 1993
END_SEASON = 2024

# Root folder under which each season's output is written (one `season=<year>` subfolder per season).
RAW_BASE_PATH = "/Volumes/football_matches_catalog/raw/raw_files"

# Pool of User-Agent strings; one is picked at random for each request.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
]

In [None]:
# 4. Helper Functions
# Utility functions for HTTP headers and date normalization.
def get_headers():
    """Return HTTP request headers with a User-Agent drawn at random from USER_AGENTS.

    The remaining headers (Accept, Accept-Language, Referer) are fixed.
    """
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    headers.update({
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.google.com/',
    })
    return headers

def normalize_date(date_str, season_year):
    """Expand a two-digit year in a three-part date string to four digits.

    The string is split on '/' (checked first) or '.'; the three parts are read
    as day, month, year. Day, month and separator are passed through untouched.

    Args:
        date_str: Date text such as "14/08/93" or "14.08.93". Empty values and
            the sentinel "Unknown" are returned as-is.
        season_year: Start year of the season the date belongs to.

    Returns:
        The date with a four-digit year, or `date_str` unchanged when it has no
        recognised separator, does not split into three parts, already has a
        year that is not two characters long, or cannot be parsed.
    """
    if not date_str or date_str == "Unknown":
        return date_str
    try:
        delimiter = next((mark for mark in ("/", ".") if mark in date_str), None)
        if delimiter is None:
            return date_str
        pieces = date_str.split(delimiter)
        if len(pieces) != 3:
            return date_str
        day, month, year = pieces
        if len(year) == 2:
            short_year = int(year)
            # August onwards falls in the season's start year, earlier months in the next one.
            expected_year = season_year if int(month) >= 8 else season_year + 1
            if expected_year % 100 == short_year:
                year = str(expected_year)
            else:
                # The season-based guess disagrees with the text: fall back to a
                # fixed pivot (00-49 -> 20xx, 50-99 -> 19xx).
                year = ("20" if short_year < 50 else "19") + year
        return delimiter.join((day, month, year))
    except Exception:
        # Best effort: anything unparseable is handed back untouched.
        return date_str

In [None]:
# 4b. Scraping Function
# Scrape all matches for a given league and season from Transfermarkt.
def _looks_like_date(text):
    """Return True when `text` contains at least one digit and a '/' or '.'."""
    return ("/" in text or "." in text) and any(ch.isdigit() for ch in text)


def _extract_row_date(cells):
    """Return the date token found in the first two cells of a row, or None if there is none."""
    for cell in cells[:2]:
        text = cell.get_text(separator=" ", strip=True)
        if len(text) > 5 and _looks_like_date(text):
            # The cell may hold more than the bare date (presumably a weekday
            # prefix -- TODO confirm against the live page); keep the first
            # date-like token and fall back to the whole text.
            for token in text.split(" "):
                if _looks_like_date(token):
                    return token
            return text
    return None


def _full_time_result(home_goals, away_goals):
    """Return 'H', 'A' or 'D' for the given goal strings, or 'NA' when they are not integers."""
    try:
        home, away = int(home_goals), int(away_goals)
    except (TypeError, ValueError):
        return 'NA'
    if home > away:
        return 'H'
    if away > home:
        return 'A'
    return 'D'


def scrape_season(league_name, league_info, season_year, timeout=10):
    """Scrape all matches for a league and season.

    Downloads the league's full-season fixture page and builds one record for
    every match-report link ('/spielbericht/') that sits in a table row with
    two identifiable teams and a 'home:away' score text.

    Args:
        league_name: Display name of the league; used only in log messages.
        league_info: Dict with the keys 'code', 'url_slug' and 'div_output'.
        season_year: Season start year (int). Used in the URL, stored in the
            'Season' field and passed to normalize_date.
        timeout: Timeout in seconds handed to requests.get. Defaults to 10.

    Returns:
        A list of dicts with the keys Match_ID, Div, Season, Date, HomeTeam,
        AwayTeam, FTHG, FTAG and FTR. FTHG/FTAG are kept as the scraped strings;
        FTR is 'H', 'A', 'D', or 'NA' when the goals are not integers. An empty
        list is returned when the HTTP request fails.
    """
    code = league_info['code']
    slug = league_info['url_slug']
    div_output = league_info['div_output']
    url = f"{BASE_URL}/{slug}/gesamtspielplan/wettbewerb/{code}/saison_id/{season_year}"
    print(f"Scraping {league_name} {season_year}-{season_year+1} from {url}...")
    try:
        response = requests.get(url, headers=get_headers(), timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    # A title without any fixture keyword hints that a different page came back.
    if soup.title and not any(k in soup.title.text.lower() for k in ["fixtures", "schedule", "spielplan"]):
        print(f"Warning: Page title '{soup.title.text}' might indicate issues.")

    matches = []
    seen_matches = set()
    # Rows without their own date reuse the most recent date seen above them.
    current_date = "Unknown"
    for link in soup.find_all('a', href=lambda x: x and '/spielbericht/' in x):
        try:
            match_id = link['href'].split('/')[-1]
            if match_id in seen_matches:
                continue
            row = link.find_parent('tr')
            if row is None:
                continue

            # Update the running date BEFORE any skip decision: a row that is
            # later rejected may still be the one carrying the date for the
            # rows that follow it.
            row_date = _extract_row_date(row.find_all('td'))
            if row_date is not None:
                current_date = row_date

            team_links = row.find_all('a', href=lambda x: x and '/verein/' in x)
            teams = [t.get_text(strip=True) for t in team_links if t.get_text(strip=True)]
            if len(teams) < 2:
                # Fall back to the alt text of the images in the row.
                teams_from_imgs = [img['alt'] for img in row.find_all('img', alt=True)]
                if len(teams_from_imgs) >= 2:
                    teams = teams_from_imgs[:2]
            if len(teams) < 2:
                continue
            home_team, away_team = teams[0], teams[1]

            # The link text must be exactly 'home:away'.
            score_parts = link.get_text(strip=True).split(':')
            if len(score_parts) != 2:
                continue
            fthg, ftag = (part.strip() for part in score_parts)

            matches.append({
                'Match_ID': match_id,
                'Div': div_output,
                'Season': season_year,
                'Date': normalize_date(current_date, season_year),
                'HomeTeam': home_team,
                'AwayTeam': away_team,
                'FTHG': fthg,
                'FTAG': ftag,
                'FTR': _full_time_result(fthg, ftag)
            })
            # Mark the match as seen only once it has been recorded, so an
            # unusable first link does not hide a later valid link to the
            # same match.
            seen_matches.add(match_id)
        except Exception as e:
            # Best effort: one malformed row must not abort the season, but
            # report it instead of dropping it silently.
            print(f"Skipping match link {link.get('href')}: {e}")
            continue
    return matches

In [None]:
# 5. Main Scraping Loop
# Loop through all seasons and leagues, scrape data, and save as Delta files.
# Seasons are handled one at a time: scrape every league, then write that
# season's combined rows to its own `season=<year>` folder.
all_seasons = list(range(START_SEASON, END_SEASON + 1))
for season in all_seasons:
    all_matches = []
    for league_name, league_info in LEAGUES.items():
        matches = scrape_season(league_name, league_info, season)
        all_matches.extend(matches or [])
        # Polite delay between requests
        time.sleep(random.uniform(1, 3))

    if not all_matches:
        print(f"No matches found for season {season}")
        continue

    # pandas builds the frame from the list of dicts; Spark then writes it out.
    pdf = pd.DataFrame(all_matches)
    sdf = spark.createDataFrame(pdf)
    output_path = f"{RAW_BASE_PATH}/season={season}"
    print(f"Writing season {season} to {output_path} as Delta...")
    # "overwrite" replaces whatever was previously stored at this season's path.
    sdf.write.format("delta").mode("overwrite").save(output_path)