In [None]:
import duckdb as db
import re

In [None]:
 # Open the DuckDB database file in read-write mode (read_only=False).
 # `con` is the module-level connection queried by build_to_be_annotated_data().
con = db.connect(database='../data/database/german-parliament.duckdb', read_only=False)

In [None]:
def create_highlighted_paragraph(paragraph: str, group_text: str, ignore_case: bool = False) -> str:
    """Wrap a paragraph in a styled HTML <div> and highlight every occurrence of the group mention.

    Matching ignores whitespace differences: all whitespace is stripped from
    ``group_text`` and any amount of whitespace (including none) is allowed
    between its remaining characters when searching ``paragraph``. This lets a
    mention match even if it is broken across lines or spaced differently.

    The paragraph text is inserted into the HTML as-is; it is not HTML-escaped.

    Args:
        paragraph (str): The paragraph text where the group mention should be highlighted.
        group_text (str): The text of the group mention to highlight in the paragraph.
            If it is empty or consists only of whitespace, nothing is highlighted.
        ignore_case (bool): If True, the search for group_text will be case-insensitive.

    Returns:
        str: An HTML string representing the paragraph with highlighted group mentions.
    """
    # Style of the wrapping <div>; highlight spans inherit font family and size from it.
    paragraph_base_style = "font-family: 'Figtree Sans Serif', sans-serif; font-size: 16px;"
    # Styles specific to the highlight itself (background, font-weight, padding).
    highlight_specific_style = "background-color: #FFFF81; font-weight: normal; padding: 0.1em 0;"

    # Normalize the group_text by removing all whitespace.
    essential_group_chars = re.sub(r'\s', '', group_text)

    if essential_group_chars:
        # Escape each character literally and allow flexible whitespace between them.
        pattern_str = r'\s*'.join(re.escape(char) for char in essential_group_chars)
        sub_flags = re.IGNORECASE if ignore_case else 0

        def _wrap_match(match):
            # Re-emit the text exactly as it appears in the paragraph, wrapped in the highlight span.
            return f'<span style="{highlight_specific_style}">{match.group(0)}</span>'

        highlighted_content = re.sub(pattern_str, _wrap_match, paragraph, flags=sub_flags)
    else:
        # An empty pattern would match at every position and insert an empty
        # highlight span between all characters, so leave the text untouched.
        highlighted_content = paragraph

    return f'<div style="{paragraph_base_style}">{highlighted_content}</div>'

def build_to_be_annotated_data(target_file: str) -> None:
    """Build a CSV file with paragraphs and group mentions to be annotated.

    Selects up to 500 random rows from the ``group_mention`` table whose
    ``label`` is not in the ignore list, using the module-level connection
    ``con``. If any rows are returned, a ``formatted_paragraph`` column is
    added that contains the paragraph as HTML with the group mention
    highlighted. If the query returns no rows, the CSV is still written but
    without the ``formatted_paragraph`` column.

    Args:
        target_file (str): The path to the CSV file where the data will be saved.

    Returns:
        None
    """
    print(f"Saving data to: {target_file}...")
    # Labels excluded from the sample; a fixed constant, so inlining it into the SQL is safe.
    ignore_groups = "('EPPOL','EOPOL','GPE','EOWIRT','EOSCI','EOFINANZ','EONGO', 'EOMEDIA', 'EOMIL')"
    data = con.execute(f"select * from group_mention where label NOT IN {ignore_groups} order by random() LIMIT 500").fetchdf()
    if not data.empty:
        data['formatted_paragraph'] = data.apply(
            lambda row: create_highlighted_paragraph(row['paragraph'], row['group_text']),
            axis=1,  # Apply function to each row
        )
    data.to_csv(target_file, index=False)
    print(f"Exported to be labeled data to: {target_file}")

def main():
    """Entry point: export the annotation sample to the default CSV file."""
    output_csv = 'test_data_to_be_annotated.csv'
    build_to_be_annotated_data(output_csv)

# Run the export only when this module is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
