In [2]:
import xml.etree.ElementTree as ET
import duckdb
import os
from duckdb import ConstraintException, FatalException
from pathlib import Path
from tqdm import tqdm

In [3]:
# Open the DuckDB database 'german-parliament' in read-write mode.
# Every function in this notebook uses this module-level connection.
con = duckdb.connect('german-parliament', read_only=False)

In [3]:
def create_db(reset_db=False):
    """
    Set up the DuckDB schema that stores the parsed plenary minutes.

    Creates, if they do not exist yet, the sequences 'party_id_seq', 'speaker_id_seq',
    'legislative_period_id_seq' and 'speech_id_seq' together with the tables
    'party', 'speaker', 'session' and 'speech', and commits afterwards.

    Args:
        reset_db (bool): If True, every known table and sequence is dropped first so
                          the schema is rebuilt from scratch. Defaults to False.

    Returns:
        None
    """
    if reset_db:
        # Referencing tables are dropped before the tables they point to
        # (speech -> speaker -> party), and all tables before the sequences
        # that their DEFAULT clauses use.
        for table_name in ("speech", "speaker", "party", "plenary_minute", "session"):
            con.execute(f"DROP TABLE IF EXISTS {table_name};")

        for sequence_name in (
            "party_id_seq",
            "speaker_id_seq",
            "legislative_period_id_seq",
            "session_id_seq",
            "speech_id_seq",
        ):
            con.execute(f"DROP SEQUENCE IF EXISTS {sequence_name};")

    # The statements are executed strictly in this order: each sequence precedes
    # the table whose DEFAULT uses it, and referenced tables precede referencing ones.
    schema_statements = (
        # Sequence that hands out party ids (acts like an auto-increment key).
        "CREATE SEQUENCE IF NOT EXISTS party_id_seq START 1;",
        # party: one row per party name.
        """
        CREATE TABLE IF NOT EXISTS party (
            id INTEGER DEFAULT nextval('party_id_seq') PRIMARY KEY,
            -- If we insert a party we just use DEFAULT to get the next value from the sequence.
            name VARCHAR UNIQUE NOT NULL
        );
        """,
        # Sequence that hands out the generated speaker ids (old_id).
        "CREATE SEQUENCE IF NOT EXISTS speaker_id_seq START 1;",
        # speaker: identified by (name, party_id); old_id is generated, new_id is optional.
        """
        CREATE TABLE IF NOT EXISTS speaker (
            old_id INTEGER DEFAULT nextval('speaker_id_seq') UNIQUE, -- old_id is used to identify the speaker in the XML file of period < 19
            -- If we insert a speaker we just use DEFAULT to get the next value from the sequence.
            new_id INTEGER UNIQUE, -- new_id is used to identify the speaker in the XML file of period 19 and later. This id is provided by the Bundestag.
            name VARCHAR NOT NULL,
            party_id INTEGER REFERENCES party(id) NOT NULL,
            PRIMARY KEY (name, party_id)
        );
        """,
        # Sequence that hands out the surrogate ids of the session table.
        "CREATE SEQUENCE IF NOT EXISTS legislative_period_id_seq START 1;",
        # session: one row per (legislative_period, session_no) with its date.
        """
        CREATE TABLE IF NOT EXISTS session (
            id INTEGER DEFAULT nextval('legislative_period_id_seq') UNIQUE NOT NULL,
            legislative_period INTEGER NOT NULL,
            session_no INTEGER NOT NULL,
            date DATE NOT NULL,
            primary key (legislative_period, session_no)
        );
        """,
        # Sequence that hands out the generated speech ids.
        "CREATE SEQUENCE IF NOT EXISTS speech_id_seq START 1;",
        # speech: the stored speech content plus links to its session and speaker.
        """
        CREATE TABLE IF NOT EXISTS speech (
            id VARCHAR DEFAULT nextval('speech_id_seq') PRIMARY KEY,
            -- id is a unique identifier for each speech, generated from the sequence for speeches for periods < 19.
            -- For newer periods the bundestag provides a unique id for each speech. Those begin with "ID"
            session_id INTEGER REFERENCES session(id) NOT NULL,
            speaker_old_id INTEGER REFERENCES speaker(old_id),
            speaker_new_id INTEGER REFERENCES speaker(new_id),
            role VARCHAR, -- Role of the speaker, e.g. 'mp'
            position VARCHAR, -- Position of the speaker, e.g. 'Präsident', 'Alterspräsident', etc. | In newer periods this is named Rolle!
            content TEXT NOT NULL
        );
        """,
    )
    for statement in schema_statements:
        con.execute(statement)

    con.commit()

In [4]:
def parse_xml(file_path):
    """
    Load an XML document from disk and hand back its root element.

    Failures are reported on stdout instead of being raised, so the caller has to
    check the result for None.

    Args:
        file_path (str): Location of the XML file to read.

    Returns:
        xml.etree.ElementTree.Element: Root element of the parsed document, or
                                         None when the file is missing or is not
                                         well-formed XML.
    """
    try:
        document_root = ET.parse(file_path).getroot()
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except ET.ParseError as parse_error:
        print(f"Error parsing XML file: {parse_error}")
    else:
        return document_root
    # Reached only after one of the handled errors above.
    return None

In [5]:
def insert_session(legislative_period, session, date):
    """ Stores one plenary session in the session table.

    The values are bound as query parameters. Because of ON CONFLICT DO NOTHING,
    a row that would violate a uniqueness constraint of the table is skipped
    silently instead of raising.

    Args:
        legislative_period: Value written to session.legislative_period.
        session: Value written to session.session_no.
        date: Value written to session.date.

    Returns:
        None
    """
    insert_statement = """
        INSERT INTO session (legislative_period, session_no, date)
        VALUES (?, ?,?)
        ON CONFLICT DO NOTHING;
    """
    # The tuple order has to follow the column list of the INSERT.
    con.execute(insert_statement, (legislative_period, session, date))


def insert_party(attributes):
    """ Registers the party named in attributes["party"] in the party table.

    Args:
        attributes: Mapping that provides the party name under the key "party".

    Returns:
        None
    """
    party_name = attributes["party"]
    # ON CONFLICT DO NOTHING keeps an already stored party name from raising
    # a constraint error on the UNIQUE name column.
    con.execute(
        "INSERT INTO party (name) VALUES (?) ON CONFLICT DO NOTHING;",
        (party_name,),
    )


def insert_speaker(attributes):
    """ Registers a speaker, linked to its party, in the speaker table.

    The party id is resolved inside the statement by looking the party name up in
    the party table. A conflicting row is skipped via ON CONFLICT DO NOTHING.

    Args:
        attributes: Mapping that provides the speaker name under "name" and the
            party name under "party".

    Returns:
        None
    """
    statement = """
        INSERT INTO speaker (name, party_id)
        VALUES (
            ?,                                    -- speaker’s full name
            (SELECT id FROM party WHERE name = ?) -- look-up party once, safely
        )
        ON CONFLICT DO NOTHING;
        """
    # First placeholder: speaker name, second placeholder: party name.
    bound_values = (attributes["name"], attributes["party"])
    con.execute(statement, bound_values)




def _fetch_single_value(sql, params, description):
    """ Runs a parameterised lookup and returns the first column of its first row.

    Args:
        sql (str): SELECT statement with '?' placeholders.
        params (tuple): Values bound to the placeholders, in order.
        description (str): Human-readable name of what is looked up (used in the error).

    Raises:
        LookupError: If the query returns no row.
    """
    row = con.execute(sql, params).fetchone()
    if row is None:
        raise LookupError(f"No {description} found for {params!r}")
    return row[0]


def insert_speech(speech, attributes, legislative_period, session):
    """ Stores one speech, linked to its session and speaker, in the speech table.

    All values are passed as bound parameters, so names containing quotes
    (e.g. "O'Brien") neither break the statement nor allow SQL injection.
    The speaker is referenced through speaker.old_id / speech.speaker_old_id,
    which are the columns defined by create_db (the speaker table has no 'id'
    column and the speech table has no 'speaker_id' column).

    Args:
        speech (xml.etree.ElementTree.Element): The speech element; it is serialised
            to a string and stored as the content. None is stored as an empty string.
        attributes: Mapping with the keys "party" and "name" (required) and
            "role" and "position" (optional).
        legislative_period: Legislative period of the session the speech belongs to.
        session: Session number of the session the speech belongs to.

    Raises:
        LookupError: If the party, speaker or session has not been inserted before.

    Returns:
        None
    """
    speech_string = ET.tostring(speech, encoding='unicode') if speech is not None else ''

    party_id = _fetch_single_value(
        "SELECT id FROM party WHERE name = ?;",
        (attributes["party"],),
        "party",
    )
    speaker_old_id = _fetch_single_value(
        "SELECT old_id FROM speaker WHERE name = ? AND party_id = ?;",
        (attributes["name"], party_id),
        "speaker",
    )
    # The explicit casts let the lookup accept the values both as integers and as
    # the numeric strings read from the XML (the columns are INTEGER).
    session_id = _fetch_single_value(
        """
        SELECT id FROM session
        WHERE legislative_period = CAST(? AS INTEGER)
          AND session_no = CAST(? AS INTEGER);
        """,
        (legislative_period, session),
        "session",
    )

    sql = """
    INSERT INTO speech
           (session_id, speaker_old_id, role, position, content)
    VALUES (?, ?, ?, ?, ?)
    ON CONFLICT DO NOTHING;
    """
    speaker_role = attributes.get("role")
    speaker_role = None if speaker_role == 'NA' else speaker_role  # Convert 'NA' to None for SQL NULL
    # Values to bind: the order must match the placeholders.
    params = (
        session_id,
        speaker_old_id,
        speaker_role,      # None becomes NULL in SQL
        attributes.get("position"),
        speech_string
    )

    con.execute(sql, params)




def parse_plenary_minute(root):
    """ Walks one plenary minute document and writes its session, parties, speakers and speeches to the database.

    Args:
        root (xml.etree.ElementTree.Element): Root element of the XML tree of a single plenary minute.

    Raises:
        Exception: If root is None.

    Returns:
        None

    """
    # Guard clause: nothing can be extracted without a document.
    if root is None:
        raise Exception("Missing XML root element.")

    # Header part: legislative period and session number live in titleStmt,
    # the date in publicationStmt.
    file_desc = root.find('teiHeader').find('fileDesc')
    title_stmt = file_desc.find('titleStmt')
    publication_stmt = file_desc.find('publicationStmt')

    legislative_period = title_stmt.find('legislativePeriod').text
    session = title_stmt.find('sessionNo').text
    date = publication_stmt.find('date').text

    # The session row has to exist before speeches can refer to it.
    insert_session(legislative_period, session, date)

    # Collect the 'sp' elements that are direct children of the 'div'
    # elements directly below text/body.
    body_divs = root.find('text').find('body').findall('div')
    speeches = [sp for div in body_divs for sp in div.findall('sp')]

    for speech in speeches:
        attributes = speech.attrib
        # Order matters: party first, then the speaker that references it,
        # then the speech that references both speaker and session.
        insert_party(attributes)
        insert_speaker(attributes)
        insert_speech(speech, attributes, legislative_period, session)

def parse_legislative_period(path):
    """ Parses every XML file below a directory and stores its content in the database.

    The directory is searched recursively. Only regular files with the suffix
    ".xml" are considered, so the progress bar total equals the number of files
    that are actually parsed. The files are processed in sorted path order, which
    keeps the processing order (and therefore the generated ids) independent of
    the order in which the file system happens to list the entries.

    Args:
        path (str): Directory that contains the plenary minute XML files of one
            legislative period.

    Returns:
        None
    """
    print(f"Parsing legislative period at path: {path}")
    # Materialise and sort the matches once so that len() is available for tqdm.
    xml_files = sorted(
        entry
        for entry in Path(path).rglob("*")
        if entry.suffix == ".xml" and entry.is_file()
    )
    # tqdm renders a progress bar while the files are processed.
    for file in tqdm(xml_files, total=len(xml_files), desc="Processing files"):
        root = parse_xml(file)
        parse_plenary_minute(root)

    print(f"Finished parsing legislative period at path: {path}")



def main():
    """ Rebuilds the database schema and imports the legislative periods 1 to 18.

    Each legislative period is imported inside its own transaction. A
    FatalException rolls the period back, is reported on stdout and the import
    continues with the next period. Any other error also rolls the open
    transaction back and is then re-raised, so no half-imported period is left
    behind in an open transaction.

    Returns:
        None
    """
    # Create database tables (True drops the existing tables and sequences first).
    create_db(True)
    # Parse each period between period 1 and 18.
    for period in range(1, 19):
        # :02d pads the number to two digits, e.g. 01, 02, ..., 10, 11, ...
        period_dir = f'../data/plenary_minutes/wahlperiode_{period:02d}'
        try:
            con.execute("BEGIN TRANSACTION;")
            parse_legislative_period(period_dir)
            con.execute("COMMIT;")
        except FatalException as e:
            con.execute("ROLLBACK;")
            print(f"Error processing legislative period {period}: {e}")
        except Exception:
            # Do not leave the transaction open; the error itself still propagates.
            con.execute("ROLLBACK;")
            raise


# Always close the connection, even when the import fails, so that reconnecting
# to the database does not cause problems afterwards.
# NOTE(review): cells further down use `con` again; the connection cell has to be
# re-run before executing them.
try:
    if __name__ == "__main__":
        main()
finally:
    con.close()

Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_01


Processing files: 100%|██████████| 282/282 [01:16<00:00,  3.68it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_01
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_02


Processing files: 100%|██████████| 227/227 [01:00<00:00,  3.76it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_02
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_03


Processing files: 100%|██████████| 168/168 [00:50<00:00,  3.33it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_03
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_04


Processing files: 100%|██████████| 198/198 [01:30<00:00,  2.18it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_04
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_05


Processing files: 100%|██████████| 247/247 [02:11<00:00,  1.88it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_05
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_06


Processing files: 100%|██████████| 199/199 [01:39<00:00,  2.00it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_06
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_07


Processing files: 100%|██████████| 259/259 [02:12<00:00,  1.96it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_07
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_08


Processing files: 100%|██████████| 230/230 [01:55<00:00,  1.99it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_08


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_09


Processing files: 100%|██████████| 142/142 [01:07<00:00,  2.12it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_09
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_10


Processing files: 100%|██████████| 256/256 [02:20<00:00,  1.83it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_10
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_11


Processing files: 100%|██████████| 236/236 [02:05<00:00,  1.88it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_11
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_12


Processing files: 100%|██████████| 243/243 [02:06<00:00,  1.92it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_12
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_13


Processing files: 100%|██████████| 248/248 [02:05<00:00,  1.97it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_13
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_14


Processing files: 100%|██████████| 253/253 [02:00<00:00,  2.10it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_14
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_15


Processing files: 100%|██████████| 183/183 [01:20<00:00,  2.26it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_15
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_16


Processing files: 100%|██████████| 233/233 [01:54<00:00,  2.03it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_16
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_17


Processing files: 100%|██████████| 253/253 [02:23<00:00,  1.77it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_17
Parsing legislative period at path: ../data/plenary_minutes/wahlperiode_18


Processing files: 100%|██████████| 245/245 [02:00<00:00,  2.03it/s]


Finished parsing legislative period at path: ../data/plenary_minutes/wahlperiode_18


In [12]:
# Show ten speeches whose role is not 'mp', skipping the first 10000 matches.
# NOTE(review): there is no ORDER BY, so which rows fall into this window is not
# guaranteed to be stable between runs.
# NOTE(review): the import cell closes `con`; re-run the connection cell first.
con.execute("select * from speech where role <> 'mp' limit 10 offset 10000").fetchdf()

Unnamed: 0,id,session_id,speaker_id,role,position,content
0,17983,151,59,presidency,Vizepräsident,"<sp who=""Schmid"" parliamentary_group=""NA"" role..."
1,17985,151,59,presidency,Vizepräsident,"<sp who=""Schmid"" parliamentary_group=""NA"" role..."
2,17986,152,1,presidency,Präsident,"<sp who=""Ehlers"" parliamentary_group=""NA"" role..."
3,17988,152,1,presidency,Präsident,"<sp who=""Ehlers"" parliamentary_group=""NA"" role..."
4,17990,152,1,presidency,Präsident,"<sp who=""Ehlers"" parliamentary_group=""NA"" role..."
5,17992,152,1,presidency,Präsident,"<sp who=""Ehlers"" parliamentary_group=""NA"" role..."
6,17993,153,59,presidency,Vizepräsident,"<sp who=""Schmid"" parliamentary_group=""NA"" role..."
7,17995,153,59,presidency,Vizepräsident,"<sp who=""Schmid"" parliamentary_group=""NA"" role..."
8,17996,153,384,government,Bundeskanzler,"<sp who=""Adenauer"" parliamentary_group=""NA"" ro..."
9,17997,153,59,presidency,Vizepräsident,"<sp who=""Schmid"" parliamentary_group=""NA"" role..."


In [29]:
# Close the DuckDB connection.
con.close()

In [None]:
# List every speaker together with the matching party row (join on party_id).
# NOTE(review): the cell directly above closes `con`; re-run the connection cell
# before executing this query.
con.execute("select * from speaker join party on speaker.party_id = party.id ;").fetchdf()