## Goal: Predict Peak Times in the DHBW Cafeteria Using Calendar Data

This notebook integrates and analyzes external calendar data (e.g., public holidays, lecture schedules, exam periods) to improve the prediction of peak hours in the DHBW cafeteria. By identifying patterns in visitor behavior, we aim to support better planning for both cafeteria staff and students.


### Funktion scrape_rapla_calendar

Scrape the Rapla calendar HTML for a given user & file (course).\
Optionally filter calendar week.


In [None]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

def extract_fields(title_text: str) -> dict:
    fields = {
        "veranstaltungstitel": None,
        "sprache": None,
        "studiengang": None,
        "anzahl_studierende": None,
        "geplante_vorlesungsstunden": None,
        "planungsnotiz": None,
        "ressourcen": None,
        "personen": None,
    }

    # Wenn der Titel NICHT mit 'Lehrveranstaltung' beginnt → nimm erste Zeile als Veranstaltungstitel
    if not title_text.strip().lower().startswith("lehrveranstaltung"):
        first_line = title_text.strip().split("\n")[0]
        fields["veranstaltungstitel"] = first_line.strip()
        return fields

    # Andernfalls: Extrahiere strukturierte Informationen
    def extract(label):
        pattern = rf"{label}:\s*\n*(.*?)\n\n"
        match = re.search(pattern, title_text, re.DOTALL | re.IGNORECASE)
        return match.group(1).strip() if match else None

    fields["veranstaltungstitel"] = extract("Titel")
    fields["sprache"] = extract("Sprache")
    fields["studiengang"] = extract("Studiengang")
    fields["anzahl_studierende"] = extract("Anzahl Studierende")
    fields["geplante_vorlesungsstunden"] = extract("Gepl\. Vorlesungsstunden")
    fields["planungsnotiz"] = extract("Planungsnotiz")

    res_match = re.search(r"Ressourcen:\s*\n(.*?)\n\n", title_text, re.DOTALL)
    if res_match:
        ressourcen = res_match.group(1).strip().replace("\n", ",")
        fields["ressourcen"] = ressourcen

    personen_match = re.search(r"Personen:\s*\n(.*?)\n\n", title_text, re.DOTALL)
    if personen_match:
        personen_block = personen_match.group(1).strip()
        personen = list({line.strip() for line in personen_block.split("\n") if line.strip()})
        fields["personen"] = ", ".join(personen)

    return fields


def scrape_rapla_calendar(user: str, file: str, day: int = None, month: int = None, year: int = None):
    base_url = "https://rapla.dhbw-karlsruhe.de/rapla"
    params = {
        "page": "calendar",
        "user": user,
        "file": file
    }

    if day and month and year:
        params.update({
            "day": day,
            "month": month,
            "year": year
        })

    try:
        resp = requests.get(base_url, params=params)
        resp.raise_for_status()  # Raise an exception for bad status codes
        soup = BeautifulSoup(resp.text, "html.parser")
        text = soup.get_text(separator="\n")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return pd.DataFrame()  # Or raise the exception if you want the program to stop

    pattern = re.compile(
        r"(?P<start>\d{2}:\d{2})\s*-\s*(?P<end>\d{2}:\d{2})\s+"
        r"(?P<title>.*?)(?:\n|\s)+"
        r"(?P<weekday>Mo|Di|Mi|Do|Fr|Sa|So)\s+"
        r"(?P<date>\d{2}\.\d{2}\.\d{2})",
        re.DOTALL
    )

    events = []
    for m in pattern.finditer(text):
        try:
            dd, mm, yy = m.group("date").split(".")
            iso_date = f"20{yy}-{mm}-{dd}"
            title_raw = m.group("title").strip()

            extracted = extract_fields(title_raw)

            event = {
                "title": title_raw,
                "start": f"{iso_date} {m.group('start')}",
                "end": f"{iso_date} {m.group('end')}",
                "weekday": m.group("weekday"),
                "raw_date": m.group("date"),
            }
            event.update(extracted)
            events.append(event)
        except Exception as e:
            print(f"Error processing event: {e}, text: {m.group(0)}")
            continue  # Skip to the next event

    df = pd.DataFrame(events)

    if df.empty:
        print("Warning: No events found for this period.")
        return df  # Or handle as appropriate

    try:
        df["start"] = pd.to_datetime(df["start"])
        df["end"] = pd.to_datetime(df["end"])
        df["date"] = df["start"].dt.date
    except KeyError as e:
        print(f"KeyError: {e}.  Columns in DataFrame: {df.columns}")
        return df  # Or handle the error as needed

    return df

In [66]:
import datetime

def get_monday_of_week(year: int, week: int) -> datetime.date:
    # ISO Kalender: Montag als erster Tag der Woche
    # ISO-Woche 1: die Woche mit dem ersten Donnerstag
    return datetime.datetime.strptime(f'{year}-W{week:02d}-1', "%Y-W%W-%w").date()

def scrape_full_year(user: str, file: str, year: int = 2025):
    all_dfs = []
    max_week = 53  # max. KW, 2025 hat 53 Wochen laut ISO-Kalender

    for week in range(1, max_week + 1):
        try:
            monday = get_monday_of_week(year, week)
        except ValueError:
            # Manche Kombinationen können nicht existieren, z.B. KW 53 in manchen Jahren
            continue

        print(f"Scraping Woche {week}, Startdatum {monday}")
        df_week = scrape_rapla_calendar(
            user=user,
            file=file,
            day=monday.day,
            month=monday.month,
            year=monday.year,
        )
        all_dfs.append(df_week)

    # Alle DataFrames zusammenführen
    full_df = pd.concat(all_dfs, ignore_index=True)
    print(f"Gesamter Kalender für {year} gespeichert als kalender_{year}_gesamt.csv")
    return full_df


In [68]:
full_calendar_df = scrape_full_year(user="brockmans", file="WWI22B1", year=2025)
full_calendar_df

Scraping Woche 1, Startdatum 2025-01-06
Scraping Woche 2, Startdatum 2025-01-13
Scraping Woche 3, Startdatum 2025-01-20
Scraping Woche 4, Startdatum 2025-01-27
Scraping Woche 5, Startdatum 2025-02-03
Scraping Woche 6, Startdatum 2025-02-10


KeyError: 'start'