In [54]:
import duckdb as db
from pathlib import Path
import argparse
import pandas as pd
import json

In [76]:
def connect_to_db(db_path: "str | Path") -> db.DuckDBPyConnection:
    """
    Connect to a DuckDB database in read-write mode.

    Args:
        db_path (str | Path): The path to the DuckDB database file. Path-like
            values (``main`` builds its path with ``pathlib.Path``) are
            converted to ``str`` before being handed to DuckDB.

    Returns:
        duckdb.DuckDBPyConnection: A connection object to the DuckDB database.
    """
    # Normalise to str so both plain strings and pathlib.Path objects work.
    return db.connect(database=str(db_path), read_only=False)

def create_table(con:db.DuckDBPyConnection, reset_db:bool=False) -> None:
    """
    Create the annotated_paragraphs and annotations tables if they are missing.

    annotated_paragraphs.id references group_mention(id), so the group_mention
    table has to exist in the connected database before this is called.

    Args:
        con (duckdb.DuckDBPyConnection): Open connection to the target database.
        reset_db (bool): If True, both tables are dropped before being recreated.

    Returns:
        None
    """
    if reset_db:
        # annotations holds a foreign key to annotated_paragraphs, so the
        # referencing table must be dropped before the referenced one.
        con.execute("DROP TABLE IF EXISTS annotations")
        con.execute("DROP TABLE IF EXISTS annotated_paragraphs")

    sql = """
        CREATE TABLE IF NOT EXISTS annotated_paragraphs (
            id INTEGER PRIMARY KEY REFERENCES group_mention(id),
            group_text VARCHAR NOT NULL,
            inference_paragraph VARCHAR NOT NULL,
            adjusted_span BOOLEAN NOT NULL
            )
    """
    con.execute(sql)

    # The stance column needs an explicit data type; the CHECK constraint
    # alone does not declare one.
    sql = """
        CREATE TABLE IF NOT EXISTS annotations (
            annotation_id INTEGER NOT NULL,
            annotator VARCHAR(32) NOT NULL,
            annotated_paragraph_id INTEGER NOT NULL REFERENCES annotated_paragraphs(id),
            stance VARCHAR CHECK (stance IN ('favour', 'against', 'neither')),
            PRIMARY KEY(annotation_id, annotator)
            )
    """
    con.execute(sql)

In [77]:
def adjust_inference_para(paragraph:str, group_span_adjustment:list[dict]) -> "str | None":
    """
    Re-wrap an inference paragraph's <span> tags around an adjusted group span.

    Any existing <span>/</span> tags are stripped first; the offsets of the
    first adjustment are then applied to the tag-free text.

    Args:
        paragraph (str): Inference paragraph, possibly containing <span> tags.
        group_span_adjustment (list[dict]): Adjustment entries. Only the first
            entry is used; it must carry a 'globalOffsets' mapping with integer
            'start' and 'end' keys.

    Returns:
        str | None: The paragraph with <span>...</span> around
        text[start:end], or None (after printing a warning) when the offsets
        are missing or invalid.
    """
    # remove the span tags
    clean_paragraph = paragraph.replace("<span>", "").replace("</span>", "")

    # get placement of adjusted group span; an empty adjustment list or a
    # missing 'globalOffsets' entry is treated like invalid offsets below
    # instead of raising IndexError / AttributeError.
    first_adjustment = group_span_adjustment[0] if group_span_adjustment else {}
    offsets = first_adjustment.get('globalOffsets') or {}
    start = offsets.get("start")
    end = offsets.get("end")

    # input validation
    # ensures start and end are valid integers and in the correct order.
    if not (isinstance(start, int) and isinstance(end, int) and 0 <= start <= end <= len(clean_paragraph)):
        print(f"Warning: Invalid offsets provided. start={start}, end={end}, text_length={len(clean_paragraph)}")
        # Signal the failure to the caller; no paragraph can be built.
        return None

    # slice string, based on offsets to build new inference paragraph
    before_span = clean_paragraph[:start]
    span_content = clean_paragraph[start:end]
    after_span = clean_paragraph[end:]

    # construct the new string with the tags wrapped around the middle part.
    return f"{before_span}<span>{span_content}</span>{after_span}"

In [78]:
def process_primary_annotations(path:str, con:db.DuckDBPyConnection) -> None:
    """ Processes annotations from the primary annotator. This is used to
        build the annotated_paragraphs table, so only the inference paragraphs
        are used / adjusted.

    Every CSV row is inserted as one annotated_paragraphs row. When the
    'answer' column holds a span adjustment, the inference paragraph and the
    group text are replaced by their adjusted versions. Rows whose adjustment
    cannot be parsed or applied are skipped with a warning; the remaining rows
    are still processed.

    Args:
        path (str): The path to the annotation file (CSV with the columns
            'id', 'group_text', 'inference_paragraph' and 'answer').
        con (duckdb.DuckDBPyConnection): Open connection used for the inserts.

    Returns:
        None
    """
    insert_sql = """
        INSERT INTO annotated_paragraphs (id, group_text, inference_paragraph, adjusted_span)
             VALUES (?, ?, ?, ?)
          """

    # Read annotated data
    annotation_data = pd.read_csv(path)

    # Iterate over each annotated entry and add it to db
    for _, row in annotation_data.iterrows():
        paragraph_id = row['id']
        group_text = row['group_text']
        inference_paragraph = row['inference_paragraph']
        raw_adjustments = row['answer']
        # A non-NA 'answer' means the group span was adjusted.
        adjusted_span = not pd.isna(raw_adjustments)

        if adjusted_span:
            try:
                # Convert list string into list
                adjustments = json.loads(raw_adjustments)
                inference_paragraph = adjust_inference_para(inference_paragraph, adjustments)
                # Adjust group text, since the group span was adjusted
                group_text = adjustments[0].get('text')
            except (ValueError, TypeError, LookupError, AttributeError) as e:
                print(f"Warning: skipping paragraph {paragraph_id}: could not apply span adjustment ({e!r})")
                continue

            # Both columns are NOT NULL; a failed adjustment must not be stored.
            if inference_paragraph is None or group_text is None:
                print(f"Warning: skipping paragraph {paragraph_id}: span adjustment produced no usable text")
                continue

        # The insert happens once per row, inside the loop.
        con.execute(insert_sql, (paragraph_id, group_text, inference_paragraph, adjusted_span))

def process_annotations(path:str, annotator:str) -> None:
    """
    Process the annotations of a single annotator.

    Intended to fill the annotations table, where the annotations of each
    annotator are stored. The body is currently a placeholder: nothing is
    read or written yet.

    Args:
        path (str): Presumably the path to the annotator's annotation file.
        annotator (str): Presumably the annotator's name.

    Returns:
        None
    """
    return None


In [79]:
# Scratch run of the pipeline against a throw-away connection (empty database
# string) instead of the project database file.
# NOTE(review): this connection evidently has no group_mention table, so
# create_table fails on the REFERENCES group_mention(id) clause -- see the
# CatalogException captured below.
con = db.connect(database="", read_only=False)
# Drop and recreate both annotation tables.
create_table(con, reset_db=True)
# Annotation export of the primary annotator, relative to the notebook's working directory.
path = "../data/annotated_data/maris-2025-06-30-14-57-790f3829.csv"
# Fill annotated_paragraphs from that export.
process_primary_annotations(path, con)

CatalogException: Catalog Error: Table with name group_mention does not exist!
Did you mean "pg_enum"?

In [3]:
def main():
    """ Main function to process annotated data.

    Parses the command line, opens the project database under the user's home
    directory and makes sure the annotation tables exist. With --reset_db the
    tables are dropped and recreated. The connection is closed again before
    returning, even if table creation fails.

    Args:
        None

    Returns:
        None
    """

    # build argparser, to pass information whether to reset db or not when starting the script.
    # Arguments are parsed before the database is opened, so --help or a bad
    # argument does not leave a connection behind.
    parser = argparse.ArgumentParser(description="Process annotated data.")
    parser.add_argument(
        "--reset_db",
        action='store_true',
        help="If this flag is present, the annotations table will be reset."
    )
    args = parser.parse_args()

    # Get path to db
    home_dir = Path.home()
    db_path = home_dir / "stance-detection-german-llm" / "data" / "database" / "german-parliament.duckdb"

    # Get db connection; connect_to_db is declared to take a str path.
    con = connect_to_db(str(db_path))
    try:
        create_table(con, reset_db=args.reset_db)
    finally:
        con.close()




# Run main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

NameError: name 'main' is not defined