In [1]:
import duckdb as db
import pandas as pd
from datetime import datetime

In [2]:
# Connect to the DuckDB database file.
# Path is relative to the notebook's working directory; the connection is opened read-write.
DATABASE_PATH = '../data/database/german-parliament.duckdb'
con = db.connect(database=DATABASE_PATH, read_only=False)

In [4]:
def build_to_be_annotated_data(
    target_file: str,
    samples_per_label: int = 10,
    max_paragraph_length: int = 3000,
    connection=None,
) -> None:
    """Export a random sample of group mentions per label to a CSV file for annotation.

    For every distinct label in the ``group_mention`` table (except the excluded
    labels 'GPE' and 'EPOWN'), up to ``samples_per_label`` random rows of
    ``group_mention`` joined with ``speech`` are selected, restricted to rows whose
    ``paragraph`` is at most ``max_paragraph_length`` characters long. The samples
    of all labels are concatenated and written to ``target_file``.

    Args:
        target_file (str): The path to the CSV file where the data will be saved.
        samples_per_label (int): Maximum number of random rows to draw per label.
        max_paragraph_length (int): Upper bound on ``LENGTH(paragraph)``.
        connection: Connection object exposing ``execute(sql, params).fetchdf()``.
            Defaults to the notebook-level ``con``.

    Returns:
        None

    Raises:
        ValueError: If ``samples_per_label`` is negative, or if no label is left
            to sample from (nothing would be exported).
    """
    row_limit = int(samples_per_label)
    if row_limit < 0:
        raise ValueError("samples_per_label must be non-negative")

    # Fall back to the shared notebook connection only when none is injected.
    db_con = con if connection is None else connection

    print(f"Saving data to: {target_file}...")
    # TODO: normalize groups instead of ignoring them.
    excluded_labels = ('GPE', 'EPOWN')

    # Bind values as parameters instead of interpolating Python reprs into the SQL text.
    placeholders = ", ".join("?" for _ in excluded_labels)
    label_sql = (
        "SELECT DISTINCT label FROM group_mention "
        f"WHERE label NOT IN ({placeholders})"
    )
    labels = db_con.execute(label_sql, list(excluded_labels)).fetchdf()["label"].tolist()
    print(f"Current labels in Database: {labels}")

    if not labels:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError("No labels found in group_mention; nothing to export")

    # row_limit is a validated int, so embedding it in the statement is safe.
    sample_sql = f"""
        SELECT *
        FROM group_mention g
            JOIN speech s
            ON g.speech_id = s.id
        WHERE g.label = ?
            AND LENGTH(paragraph) <= ?
        ORDER BY RANDOM()
        LIMIT {row_limit}
    """
    # One sampled dataframe per label.
    samples = [
        db_con.execute(sample_sql, [label, int(max_paragraph_length)]).fetchdf()
        for label in labels
    ]

    data = pd.concat(samples, ignore_index=True)
    data.to_csv(target_file, index=False)
    print("Exported to be labeled data")

def main():
    """Build a timestamped annotation CSV in the current working directory."""
    # Avoid spaces and colons in the timestamp: colons are not allowed in
    # Windows file names, so the file name stays portable across platforms.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    build_to_be_annotated_data(f'annotation_data_{timestamp}.csv')

if __name__ == "__main__":
    # Inside a notebook kernel __name__ is "__main__", so this branch always runs.
    # The shared connection `con` is deliberately NOT closed here: a later cell
    # still queries it, and the notebook's final cell closes it.
    main()


Saving data to: annotation_data_2025-06-19 18:01:35.csv...
Current labels in Database: ['PETH', 'PFUNK', 'EOWIRT', 'EOMIL', 'EPREL', 'EPPOL', 'EPMOV', 'EOFINANZ', 'PGEN', 'EPMEDIA', 'EPSCI', 'EOREL', 'EPMIL', 'EPKULT', 'EOPOL', 'EOSCI', 'PSOZ', 'EOMOV', 'EONGO', 'EPWIRT', 'PNAT', 'PAGE', 'EOMEDIA', 'EPNGO']
Exported to be labeled data


In [4]:
# Number of group mentions per label, most frequent first.
label_counts_sql = "select label, count(*) as count from group_mention group by label order by count desc"
con.execute(label_counts_sql).fetchdf()

Unnamed: 0,label,count
0,EOPOL,8744
1,EPPOL,5891
2,GPE,4078
3,PFUNK,3767
4,PNAT,607
5,PAGE,587
6,PGEN,474
7,EOMIL,404
8,EOWIRT,335
9,PETH,286


In [6]:
# Close the database connection that was opened at the top of the notebook.
con.close()