In [1]:
import requests
from bs4 import BeautifulSoup
import os
from enum import Enum

class NBVSOption(str, Enum):
    """Category codes sent as the "NBVS" filter values by `fetch_protocols`.

    Each member's value is the literal code string placed in the request
    payload. The trailing comments give the English meaning of the German
    category labels.
    """
    NRSITZ = "NRSITZ"  # National Council (Nationalrat) - plenary sittings
    BRSITZ = "BRSITZ"  # Federal Council (Bundesrat) - plenary sittings
    USA = "USA"        # Committees of inquiry
    BVSITZ = "BVSITZ"  # Federal Assembly (Bundesversammlung)
    AUS = "AUS"        # Committees
    EU = "EU"          # EU committees
    GFT = "GFT"        # Commemorative, ceremonial and memorial sittings
    PARL = "PARL"      # Youth and apprentices' parliament
    VER = "VER"        # Symposia and events
    ENQ = "ENQ"        # Parliamentary enquiries and enquiry commissions

def fetch_protocols(nbvs_list, start_date: str = None, end_date: str = None, timeout: float = 30):
    """Query the parlament.gv.at filter API and return the decoded JSON reply.

    Args:
        nbvs_list: A single NBVSOption, a single code string such as
            "NRSITZ", or an iterable of either.
        start_date: First day of the date range, given as the date part of an
            ISO 8601 timestamp (e.g. "2022-10-12"). Only used when
            ``end_date`` is given as well.
        end_date: Last day of the date range, same format as ``start_date``.
            If only one of the two dates is supplied, no date filter is sent
            (both payload entries stay ``None``).
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` would wait indefinitely on a stalled connection.

    Returns:
        The JSON body of the response as returned by ``response.json()``.

    Raises:
        ValueError: If an entry of ``nbvs_list`` is not a valid NBVSOption value.
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = "https://www.parlament.gv.at/Filter/api/filter/data/211?js=eval&showAll=true&export=true"
    headers = {
        "Content-Type": "application/json"
    }

    # A lone option (NBVSOption is a str subclass) or a lone code string is a
    # single filter value, not an iterable of characters.
    if isinstance(nbvs_list, str):
        nbvs_list = [nbvs_list]
    # NBVSOption(...) accepts both members and their string values and rejects
    # unknown codes early with a ValueError.
    nbvs_values = [NBVSOption(nbvs).value for nbvs in nbvs_list]

    start_iso = None
    end_iso = None
    if start_date and end_date:
        # Extend the plain dates to full timestamps covering the whole range.
        start_iso = f"{start_date}T00:00:00.000Z"
        end_iso = f"{end_date}T23:59:00.000Z"

    payload = {
        "NBVS": nbvs_values,
        "DATUM": [start_iso, end_iso]
    }
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()

# TODO: extend this function; for test purposes it currently downloads only
# the first protocol of the result set.
def download_html_protocol(data, folder=".", timeout=30):
    """Download the HTML protocol linked in the first record of ``data``.

    Args:
        data (dict): API response containing a "rows" list. The first row must
            be a list with at least 11 entries whose entry at index 10 is an
            HTML snippet containing the document links.
        folder (str): Directory the file is written to; created if missing.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str | None: Path of the saved file, or ``None`` when nothing was
        downloaded (no records, unexpected row format, or no ``.html`` link).

    Raises:
        requests.HTTPError: If the download answers with an error status.
    """
    # Local import keeps the block self-contained without touching the
    # notebook's import cell.
    from urllib.parse import urljoin, urlparse

    records = data.get("rows", [])
    if not records:
        print("No records found.")
        return None

    first = records[0]
    if not isinstance(first, list) or len(first) < 11:
        print("Unexpected data format.")
        return None

    html_snippet = first[10]
    if not isinstance(html_snippet, str):
        # The parser needs markup text; anything else is a malformed record.
        print("Unexpected data format.")
        return None

    soup = BeautifulSoup(html_snippet, "html.parser")
    # Find the first HTML link (not PDF)
    html_link = soup.find("a", href=lambda href: href and href.endswith(".html"))
    if not html_link:
        print("No HTML link found in the HTML snippet.")
        return None

    # urljoin resolves site-relative links against the host and leaves
    # absolute URLs untouched.
    file_url = urljoin("https://www.parlament.gv.at/", html_link["href"])
    print(f"Downloading HTML protocol from: {file_url}")

    # Derive the file name from the URL path only.
    filename = os.path.basename(urlparse(file_url).path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, filename)

    response = requests.get(file_url, timeout=timeout)
    response.raise_for_status()

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(response.text)
    print(f"Saved HTML to {filepath}")
    return filepath

# Cell driver: fetch the records for National Council plenary sittings
# (no date range is passed) and download the first record's HTML protocol
# into the default folder ".". `data` stays bound at notebook level.
if __name__ == "__main__":
    data = fetch_protocols(NBVSOption.NRSITZ)
    download_html_protocol(data)

Downloading HTML protocol from: https://www.parlament.gv.at/dokument/XXVII/NRSITZ/178/fnameorig_1547883.html
Saved HTML to ./fnameorig_1547883.html


In [7]:
import re

def extract_only_html_href_values(data):
    """
    Extracts only the values from 'href' attributes that end with '.html'.

    Every string cell of every row in ``data['rows']`` is scanned; non-string
    cells and rows that are not lists/tuples are ignored.

    Args:
        data (dict): The input data containing 'rows' with potential HTML strings.

    Returns:
        list: A list of all extracted .html href values, in order of appearance.
    """
    # [^"']* keeps the match inside a single attribute value. A lazy '.*?'
    # would run past the closing quote of a non-.html link (e.g. a PDF link)
    # and swallow everything up to the next '.html"'.
    href_html_regex = re.compile(r'href=["\']([^"\']*\.html)["\']')

    html_href_values = []
    # 'rows' may be missing or explicitly None.
    for row in data.get('rows') or []:
        if not isinstance(row, (list, tuple)):
            continue
        for item in row:
            if isinstance(item, str):
                html_href_values.extend(href_html_regex.findall(item))
    return html_href_values

In [9]:
from bs4 import BeautifulSoup
import pandas as pd
import re

def scrape_nationalrat_protocol(html_file_path):
    """
    Scrapes a stenographic protocol HTML file of the Austrian Nationalrat
    to extract speaker, political party, and speech text.

    A paragraph whose first ``<b>`` tag has non-empty text is treated as the
    start of a new speech. The bold text is the speaker label, and a
    parenthesised part of it, if any, is taken as the party. All following
    paragraphs are appended to that speech until the next speaker paragraph.

    NOTE(review): any paragraph whose first bold tag carries text counts as a
    speaker paragraph, so bold emphasis inside a speech may be mistaken for a
    new speaker. Confirm against the real documents.

    Args:
        html_file_path (str): The path to the HTML file (read as UTF-8).

    Returns:
        pandas.DataFrame: A DataFrame with columns 'Speaker', 'Political party', and 'Text'.
            The columns are present even when no speech was found.
    """
    columns = ['Speaker', 'Political party', 'Text']
    # DOTALL: the bold speaker label may be wrapped over several source lines.
    speaker_pattern = re.compile(r'<b>(.+?)</b>', re.DOTALL)
    party_pattern = re.compile(r'\(([^)]+)\)')
    # Paragraphs starting with these markers are not part of a speech.
    skipped_prefixes = ('*****', 'Die nächste Sitzung')

    def _text_of(tag):
        # The ' ' separator keeps words from adjacent inline tags apart;
        # split/join collapses line wraps and repeated blanks.
        return ' '.join(tag.get_text(' ', strip=True).split())

    with open(html_file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    data = []
    current_speaker = None
    current_party = None
    current_text = []

    def _flush():
        # Store the speech collected so far (if a speaker is active).
        if current_speaker:
            data.append({
                'Speaker': current_speaker,
                'Political party': current_party,
                'Text': ' '.join(current_text).strip()
            })

    for p in soup.find_all('p'):
        p_text = _text_of(p)
        b_tag = p.find('b')
        full_speaker_info = _text_of(b_tag) if b_tag else ''

        # An empty bold tag cannot name a speaker; treat such a paragraph as
        # ordinary text instead of silently ending the current speech.
        if full_speaker_info and speaker_pattern.match(str(b_tag)):
            _flush()

            party_match = party_pattern.search(full_speaker_info)
            if party_match:
                current_party = party_match.group(1).strip()
                name = full_speaker_info.replace(party_match.group(0), '')
            else:
                current_party = None
                name = full_speaker_info
            # Drop the colon that separates the label from the speech.
            name = ' '.join(name.split()).rstrip(' :')
            current_speaker = name or full_speaker_info

            # The rest of the paragraph is the beginning of the speech.
            first_text = p_text.replace(full_speaker_info, '', 1).lstrip(' :').strip()
            current_text = [first_text] if first_text else []
        elif current_speaker and p_text and not p_text.startswith(skipped_prefixes):
            # Continue collecting text for the current speaker.
            current_text.append(p_text)

    # Store the last speaker's speech after the loop.
    _flush()

    return pd.DataFrame(data, columns=columns)

In [11]:
# Parse the previously downloaded protocol and export the result for
# inspection. to_csv is called with its defaults here.
df = scrape_nationalrat_protocol("fnameorig_1547883.html")
df.to_csv("test_export.csv")

In [29]:
from bs4 import BeautifulSoup
import pandas as pd
import re

# Protocol file saved earlier by download_html_protocol (relative path).
file_path = 'fnameorig_1547883.html'


# Read the whole document as UTF-8 text; `html` is parsed further below.
with open(file_path, 'r', encoding='utf-8') as f:
    html = f.read()


def clean_interjections(text):
    """Remove parenthesised asides that mention an interjection keyword.

    A parenthesised group without nested parentheses is deleted when it
    contains, at a word boundary, one of the keywords listed in the pattern
    (e.g. "Abg.", "Beifall", "Ruf", "Glockenzeichen"). Other parentheses are
    left alone, and the surrounding whitespace is not touched.

    Args:
        text (str): Paragraph text.

    Returns:
        str: The text with matching parenthesised groups removed.
    """
    interjection = re.compile(
        r'\([^()]*\b(?:Abg\.|Zwischenruf|Zwischenrufe|Beifall|Ruf|Präsident(?:in)?|Glockenzeichen)[^()]*\)'
    )
    return interjection.sub('', text)

soup = BeautifulSoup(html, 'html.parser')
# Keep only <div> elements with a class of the form WordSection<number>.
divs = soup.find_all('div', class_=lambda c: c and re.match(r'^WordSection\d+$', c))

entries = []


def _store_speech(speaker, party, parts):
    """Append one finished speech to `entries`; empty speeches are skipped."""
    full_speech = ' '.join(parts).strip()
    if speaker and full_speech:
        entries.append({
            'speaker': speaker,
            'party': party,
            'speech': full_speech
        })


for div in divs:
    paragraphs = div.find_all('p')

    current_speaker = None
    current_party = None
    current_speech = []

    for p in paragraphs:
        # Collapse line wraps and repeated blanks left over from the markup.
        text = ' '.join(p.get_text(" ", strip=True).split())
        name_tag = p.find('a', href=re.compile(r'/WWER/'))

        # Identify speaker paragraph: a person link plus a colon after the label.
        if name_tag and ':' in text:
            # A new speaker starts: store the previous speech first so its
            # text is not attributed to the new speaker.
            _store_speech(current_speaker, current_party, current_speech)

            header, speech_start = text.split(':', 1)
            current_speaker = name_tag.get_text(strip=True)

            # Look for the party only in the label before the colon; a
            # parenthesised interjection in the speech is not a party.
            party_match = re.search(r'\(([^)]+)\)', header)
            current_party = party_match.group(1) if party_match else None

            cleaned = clean_interjections(speech_start).strip()
            current_speech = [cleaned] if cleaned else []
        elif current_speaker:
            # Continue speaker's speech (cleaned); skip fragments that are
            # empty once interjections are removed.
            cleaned = clean_interjections(text).strip()
            if cleaned:
                current_speech.append(cleaned)

    # Store the last speech of this section.
    _store_speech(current_speaker, current_party, current_speech)

# Convert to DataFrame; explicit columns keep the schema stable when no
# speech was found.
df = pd.DataFrame(entries, columns=['speaker', 'party', 'speech'])

df.to_csv("test_4.csv",index=False)


In [19]:
file_path = 'fnameorig_1547883.html'


with open(file_path, 'r', encoding='utf-8') as f:
    html_content = f.read()

soup = BeautifulSoup(html_content, 'html.parser')


def _is_section_from_three(css_class):
    """Return True for a class name 'WordSection<N>' with N >= 3."""
    match = re.match(r'^WordSection(\d+)$', css_class or '')
    return bool(match) and int(match.group(1)) >= 3


# Find all div tags whose class is WordSection<N> with N >= 3. The number is
# compared numerically: a character class such as [3-9]\d* only looks at the
# first digit and would skip sections 10-29, 100-299, and so on.
word_sections = soup.find_all('div', class_=_is_section_from_three)


word_sections

[<div class="WordSection3">
 <p class="MsoNormal"><b><span style="display:none"><!----></span>Präsident
 <a href="/WWER/PAD_88386/index.shtml">Mag. Wolfgang Sobotka</a><span style="display:none"><!--¦--></span>:</b> Meine sehr
 geehrten Damen und Herren, ich darf die 178. Sitzung des Nationalrates
 für <b><i>eröffnet</i></b> erklären. Ich darf Sie, werte
 Abgeordnete, recht herzlich begrüßen. Ich freue mich über die
 zahlreichen Besucherinnen und Besucher auf der Galerie und über die
 Anwesenheit der Damen und Herren von den Medien. Ich grüße die Damen
 und Herren, die die Sitzung heute zu Hause vor dem Bildschirm verfolgen.</p>
 <p class="MsoNormal">Mein besonderer Gruß gilt dem Herrn
 Bundespräsidenten, dem ich auch von dieser Stelle herzlich zu seiner
 Wiederwahl gratulieren darf. <i><span lang="DE">(Allgemeiner Beifall.)</span></i><span lang="DE"> </span></p>
 <p class="MsoNormal"><span lang="DE">Ich darf auch die Präsidentin des
 Rechnungshofes recht herzlich in unserer Mitte beg