In [None]:
import pdfplumber
import re
import csv

In [None]:
# Directory that holds both the input PDF and the generated CSV files.
# NOTE(review): this is a machine-specific absolute path; when running the
# notebook elsewhere, this is the single place that has to change.
DATA_DIR = '/Users/joathcarrera/Desktop/CSE115A/Personal'
# Path to the PDF file
pdf_path = f'{DATA_DIR}/Bundesliga_Calendar.pdf'
# Base path for the output CSV files. save_matchday_tables_to_csv appends
# "_matchday_<matchday>.csv" to this value, so the ".csv" below ends up in the
# middle of every output filename.
# NOTE(review): drop the extension here if that is not intended.
base_csv_path = f'{DATA_DIR}/bundesliga_2024-2025fixtures.csv'

In [None]:
def extract_tables_from_pdf(pdf_path):
    """Collect every table found in a PDF into one flat list.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``pdfplumber.open``.

    Returns:
        A list containing each item yielded by ``page.extract_tables()`` for
        every page, in page order. Page boundaries are not retained.
        (Presumably each table is a list of rows of cell strings/None, as
        pdfplumber documents it -- TODO confirm.)
    """
    collected = []
    # The context manager closes the PDF even if extraction fails part-way.
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            collected.extend(page.extract_tables())
    return collected

In [None]:
def clean_dates(text):
    """Normalise dotted dates inside ``text``.

    Two rewrites are applied, in this order:

    1. After any ``NN.NN.NNNN`` date, a following `` - `` plus exactly two
       word characters (e.g. the weekday tags ``- Sa`` / ``- Fr``) is removed.
    2. Every ``NN.NN.NNNN`` date has its first two fields swapped and the dots
       replaced by slashes (``23.08.2024`` -> ``08/23/2024``).

    Text that contains no such date is returned unchanged.
    """
    # Step 1: strip the two-character tag that trails a date.
    without_weekday = re.sub(r'(\d{2}\.\d{2}\.\d{4}) - \w{2}', r'\1', text)

    def _swap_fields(match):
        first, second, year = match.groups()
        return f"{second}/{first}/{year}"

    # Step 2: reorder the date fields and switch the separator.
    return re.sub(r'(\d{2})\.(\d{2})\.(\d{4})', _swap_fields, without_weekday)

In [None]:
def clean_and_split_tables(tables):
    """Group table rows by matchday after normalising their dates.

    Args:
        tables: Iterable of tables, each an iterable of rows; a row is a list
            of cells (strings, or None/empty for blank cells).

    Returns:
        A dict mapping the text of a row's third cell (index 2) to the list of
        cleaned rows carrying that value, in the order they were encountered.
        Only rows whose third cell starts with a digit are kept; the key is
        the whole cell text, not just the leading digits.
    """
    matchday_tables = {}
    for table in tables:
        for row in table:
            # Index 2 is read below, so a row needs at least three cells.
            # (The previous ``len(row) > 1`` guard let two-cell rows through
            # and crashed with IndexError.)
            if not row or len(row) < 3:
                continue
            # Clean dates in every non-empty cell; blank cells pass through.
            cleaned_row = [clean_dates(cell) if cell else cell for cell in row]
            matchday = cleaned_row[2]
            # Rows whose third cell does not start with a digit (headers,
            # separators, ...) are dropped.
            if matchday and re.match(r"\d+", matchday):
                matchday_tables.setdefault(matchday, []).append(cleaned_row)
    return matchday_tables

In [None]:
def filter_bundesliga_matches(matchday_tables, exclude_keywords=None):
    """Drop rows whose fifth cell names an excluded competition.

    A row is kept only if it has at least five cells, its fifth cell
    (index 4) is non-empty, and that cell contains none of the excluded
    keywords as a standalone token (case-insensitive).

    Two defects of the previous version are fixed here:

    * Upper-case keywords were compared against lower-cased text, so no row
      was ever excluded.
    * The guard was ``len(row) > 3`` although ``row[4]`` is read, so a
      four-cell row raised IndexError.

    Whole-token matching is used instead of a substring test because the
    keyword "A" would otherwise reject nearly every team name.
    NOTE(review): confirm that whole-token matching on column 4 is the
    intended rule for the source PDF.

    Args:
        matchday_tables: Dict mapping matchday -> list of rows.
        exclude_keywords: Optional iterable of keywords to reject. Defaults to
            the built-in set below. An empty iterable disables exclusion.

    Returns:
        A new dict with the surviving rows per matchday. Matchdays left with
        no rows are omitted.
    """
    if exclude_keywords is None:
        exclude_keywords = {"DFB", "DFL", "UECL", "A", "UCL", "UEL", "REL"}

    keywords = sorted(keyword for keyword in exclude_keywords if keyword)
    excluded_pattern = None
    if keywords:
        # (?<!\w) / (?!\w) demand that a keyword is not glued to other word
        # characters: "DFB-Pokal" and "A" match, "FC Augsburg" does not.
        alternatives = "|".join(re.escape(keyword) for keyword in keywords)
        excluded_pattern = re.compile(
            r"(?<!\w)(?:" + alternatives + r")(?!\w)", re.IGNORECASE
        )

    def _is_league_row(row):
        # Index 4 is inspected, so at least five cells are required.
        if len(row) <= 4 or not row[4]:
            return False
        if excluded_pattern is None:
            return True
        return excluded_pattern.search(row[4]) is None

    filtered_tables = {}
    for matchday, table in matchday_tables.items():
        filtered_table = [row for row in table if _is_league_row(row)]
        if filtered_table:
            filtered_tables[matchday] = filtered_table
    return filtered_tables

In [None]:
def save_matchday_tables_to_csv(matchday_tables, base_csv_path):
    """Write one CSV file per matchday.

    Each file is created at ``f"{base_csv_path}_matchday_{matchday}.csv"``
    (overwriting any existing file) and contains the header
    ``Date, Matchday, Home Team, Away Team`` followed by columns 0, 2, 4 and 5
    of every sufficiently long row.

    NOTE(review): if ``base_csv_path`` already ends in ".csv", the output
    names contain ".csv_matchday_"; the naming is left unchanged here so
    existing consumers of those files keep working.

    Args:
        matchday_tables: Dict mapping matchday -> list of rows.
        base_csv_path: Prefix used to build each output path.
    """
    header = ["Date", "Matchday", "Home Team", "Away Team"]
    unwanted_header = ["Datum", "Spieltag", "Heim", "Gast"]
    for matchday, table in matchday_tables.items():
        csv_path = f"{base_csv_path}_matchday_{matchday}.csv"
        # newline='' lets the csv module control line endings itself.
        with open(csv_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(header)
            for row in table:
                # row[5] is read below, so six cells are required. (The old
                # ``len(row) >= 5`` guard crashed on five-cell rows.)
                if len(row) < 6:
                    continue
                new_row = [row[0], row[2], row[4], row[5]]
                # Skip rows that merely repeat the PDF's own column header.
                if new_row != unwanted_header:
                    writer.writerow(new_row)

In [None]:
# Extract tables from PDF: every table found on every page of pdf_path is
# collected into one flat list, in page order.
tables = extract_tables_from_pdf(pdf_path)

In [None]:
# Clean and split tables into individual matchdays: the result is a dict keyed
# by the text of each row's third cell (the matchday), holding the rows with
# their dotted dates rewritten in slash-separated US order.
matchday_tables = clean_and_split_tables(tables)

In [None]:
# Filter out non-Bundesliga matches: rows rejected by filter_bundesliga_matches
# are dropped, and a matchday left with no rows is omitted from the result.
filtered_matchday_tables = filter_bundesliga_matches(matchday_tables)

In [None]:
 # Save matchday tables to CSV: one file is written per matchday, at
 # f"{base_csv_path}_matchday_{matchday}.csv" (existing files are overwritten).
save_matchday_tables_to_csv(filtered_matchday_tables, base_csv_path)

In [None]:
# Status message only: it is printed whenever this cell runs and does not
# itself check that any CSV file was written.
print("Filtered Bundesliga matchday tables have been saved to CSV files.")