In [1]:
import duckdb as db
import re

In [2]:
# Connect to the DuckDB database file (opened in read-write mode)
con = db.connect(
    database='../data/database/german-parliament.duckdb',
    read_only=False,  # read-write access, as in the original call
)

In [8]:
def create_highlighted_paragraph(paragraph: str, target: str, ignore_case: bool = False):
    """
    Wrap ``paragraph`` in a styled ``<div>`` and highlight each occurrence of ``target``.

    Matching is whitespace-insensitive: all whitespace is removed from
    ``target`` and any amount of whitespace (including none) is allowed between
    each pair of its remaining characters. A target is therefore still found
    when the paragraph splits it differently across spaces or line breaks.

    Args:
        paragraph: Text to render. It is inserted into the HTML unchanged
            (no HTML escaping is applied).
        target: Text to highlight. If it is empty or contains only whitespace,
            nothing is highlighted.
        ignore_case: When True, the target is matched case-insensitively.

    Returns:
        An HTML string: the paragraph inside a ``<div>`` carrying the base font
        style, with every match wrapped in a ``<span>`` carrying the highlight
        style. The matched text itself is kept exactly as it appears in
        ``paragraph``.
    """
    # Base style for the whole paragraph; the highlight span inherits font
    # family and size from it.
    paragraph_base_style = "font-family: 'Figtree Sans Serif', sans-serif; font-size: 16px;"

    # Styles specific to the highlight itself (background, weight, padding).
    highlight_specific_style = "background-color: #FFFF81; font-weight: normal; padding: 0.1em 0;"

    # Normalize the target by removing all whitespace.
    essential_target_chars = re.sub(r'\s', '', target)

    if essential_target_chars:
        # One escaped literal per character, joined by optional whitespace.
        pattern_str = r'\s*'.join(re.escape(char) for char in essential_target_chars)
        sub_flags = re.IGNORECASE if ignore_case else 0

        def _wrap_match(match):
            # Re-emit the text exactly as matched in the paragraph.
            return f'<span style="{highlight_specific_style}">{match.group(0)}</span>'

        highlighted_content = re.sub(pattern_str, _wrap_match, paragraph, flags=sub_flags)
    else:
        # An empty pattern would match at every position and insert an empty
        # highlight span between all characters, so skip highlighting.
        highlighted_content = paragraph

    return f'<div style="{paragraph_base_style}">{highlighted_content}</div>'

def build_to_be_annotated_data(target_file:str):
    """
    Export a random sample of group mentions, with highlighted HTML, to a CSV file.

    Queries up to 500 randomly ordered rows from the ``group_mention`` table
    through the module-level DuckDB connection ``con``, skipping rows whose
    ``label`` is listed in ``ignore_groups``. A ``formatted_paragraph`` column is
    added by passing each row's ``paragraph`` and ``group_text`` to
    ``create_highlighted_paragraph``, and the frame is written to ``target_file``.

    Args:
        target_file: Path of the CSV file to write (written without the index).
    """
    print(f"Saving data to: {target_file}...")
    ignore_groups = "('EPPOL','EOPOL','GPE','EOWIRT','EOSCI','EOFINANZ','EONGO', 'EOMEDIA', 'EOMIL')"
    # NOTE(review): no seed is set before ORDER BY random(), so the sample is
    # presumably different on every run — confirm whether that is intended.
    data = con.execute(f"select * from group_mention where label NOT IN {ignore_groups} order by random() LIMIT 500").fetchdf()
    # Built as a plain list (instead of DataFrame.apply) so the column is also
    # created when the query returns no rows, keeping the CSV schema stable.
    data['formatted_paragraph'] = [
        create_highlighted_paragraph(paragraph, group_text)
        for paragraph, group_text in zip(data['paragraph'], data['group_text'])
    ]
    data.to_csv(target_file, index=False)
    # f-string prefix added: the original printed the literal "{target_file}".
    print(f"Exported to be labeled data to: {target_file}")

def main():
    """Write the annotation sample to 'test_data_to_be_annotated.csv'."""
    output_csv = 'test_data_to_be_annotated.csv'
    build_to_be_annotated_data(output_csv)


# Run the export only when this code is executed as the main module.
if __name__ == "__main__":
    main()


Saving data to: test_data_to_be_annotated.csv...
Exported to be labeled data to: {target_file}


In [6]:
# Count the rows in the group_mention table; the result is shown as the cell output.
con.sql("select count(*) from group_mention")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│         4551 │
└──────────────┘

In [17]:
# Close the DuckDB connection opened near the top of the notebook.
con.close()