In [1]:
import requests
from bs4 import BeautifulSoup
import os
import csv
import pandas as pd

In [2]:
"""
Get all the match data for each season from www.voetbalstats.nl
"""

import shutil  # used only to clear a directory that occupies a CSV output path

# Scrape one page per season, parse its matches into a DataFrame, and store
# both the raw page text and a processed CSV under per-year directories.
for year in range(1956, 2024):

    # Exclude covid season 2019/20 as it was aborted
    if year == 2019:
        continue

    # Get the URL of the season: ids count up from 2 (for 1956) and are
    # zero-padded to two digits in the query string.
    season_id = year - 1956 + 2
    season_id_str = f"{season_id:02d}"
    URL = "https://www.voetbalstats.nl/listjaarere.php?seizoenid=" + season_id_str

    # Between 1962 and 1965 there were only 16 teams instead of 18
    teams = 16 if 1962 <= year <= 1965 else 18

    # Number of matches expected per season: every team hosts every other
    # team once.
    expected_matches = teams * (teams - 1)

    # A timeout keeps one unresponsive request from hanging the whole run;
    # raise_for_status stops an HTTP error page from being parsed as data.
    page = requests.get(URL, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    td_elements = soup.find_all("td", class_="linkere")

    thuis_teams = []
    uit_teams = []
    datums = []
    uitslagen = []

    # Loop over all matching cells of the season page. The parity of the cell
    # index decides which list a value belongs to.
    for i, td in enumerate(td_elements):
        a_tag = td.find("a", href=True)
        if a_tag is None:
            continue

        href = a_tag["href"]
        text = a_tag.get_text().strip()
        is_odd = i % 2 != 0

        if href.startswith("listclubjaarere"):
            # Get the Home and Away teams of the match. Club links in the
            # first teams * 2 cells are ignored (presumably the standings
            # table at the top of the page - TODO confirm).
            if i > teams * 2:
                target = thuis_teams if is_odd else uit_teams
                if len(target) < expected_matches:
                    target.append(text)
        elif href.startswith("opstelere"):
            # Get the results and date of the match
            target = uitslagen if is_odd else datums
            if len(target) < expected_matches:
                target.append(text)

    # Sometimes the results and date are flipped: a value containing two
    # dashes is treated as a date, in which case home/away are flipped too.
    # The emptiness guard avoids an IndexError when nothing was parsed.
    if uitslagen and uitslagen[0].count("-") == 2:
        uitslagen, datums = datums, uitslagen
        thuis_teams, uit_teams = uit_teams, thuis_teams

    # Check for inconsistencies in the data. The lists are capped at
    # expected_matches, so a mismatch means at least one is too short and
    # building rows would fail; report the season and move on to the next.
    parsed_columns = (thuis_teams, uit_teams, datums, uitslagen)
    if any(len(column) != expected_matches for column in parsed_columns):
        print(year)
        print("Something went wrong")
        continue

    rows = []

    # Create the rows of the dataframe
    for datum, thuis, uit, score in zip(datums, thuis_teams, uit_teams, uitslagen):
        # The score text splits into three whitespace-separated tokens:
        # home goals, a separator, away goals.
        score_parts = score.split()
        thuis_doelpunten = int(score_parts[0])
        uit_doelpunten = int(score_parts[2])

        # Outcome from the home team's point of view
        if thuis_doelpunten > uit_doelpunten:
            uitslag = "Gewonnen"
        elif thuis_doelpunten == uit_doelpunten:
            uitslag = "Gelijk"
        else:
            uitslag = "Verloren"

        rows.append([datum, thuis, uit, thuis_doelpunten, uit_doelpunten, uitslag])

    # Create the dataframe
    df = pd.DataFrame(rows, columns=["Datum", "Thuis", "Uit", "Thuis Doelpunten", "Uit Doelpunten", "Uitslag"])

    # Define file paths
    raw_dir = f"../raw data/{year}"
    processed_dir = f"../processed data/{year}"
    file_path_raw = f"{raw_dir}/uitslagen{year}.txt"
    file_path_processed = f"{processed_dir}/uitslagen{year}.csv"

    # Ensure directories exist
    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(processed_dir, exist_ok=True)

    # Check and resolve conflicts for processed file
    if os.path.isfile(file_path_processed):
        os.remove(file_path_processed)  # Remove the conflicting file
    elif os.path.isdir(file_path_processed):
        shutil.rmtree(file_path_processed)  # Remove the conflicting directory

    # Write raw file
    with open(file_path_raw, "w", encoding="utf-8") as file:
        file.write(page.text)

    # Write processed CSV file
    df.to_csv(file_path_processed, index=False)

    print(year, "Succes")





1956 Succes
1957 Succes
1958 Succes
1959 Succes
1960 Succes
1961 Succes
1962 Succes
1963 Succes
1964 Succes
1965 Succes
1966 Succes
1967 Succes
1968 Succes
1969 Succes
1970 Succes
1971 Succes
1972 Succes
1973 Succes
1974 Succes
1975 Succes
1976 Succes
1977 Succes
1978 Succes
1979 Succes
1980 Succes
1981 Succes
1982 Succes
1983 Succes
1984 Succes
1985 Succes
1986 Succes
1987 Succes
1988 Succes
1989 Succes
1990 Succes
1991 Succes
1992 Succes
1993 Succes
1994 Succes
1995 Succes
1996 Succes
1997 Succes
1998 Succes
1999 Succes
2000 Succes
2001 Succes
2002 Succes
2003 Succes
2004 Succes
2005 Succes
2006 Succes
2007 Succes
2008 Succes
2009 Succes
2010 Succes
2011 Succes
2012 Succes
2013 Succes
2014 Succes
2015 Succes
2016 Succes
2017 Succes
2018 Succes
2020 Succes
2021 Succes
2022 Succes
2023 Succes
