In [None]:
# Preprocess the poetic texts: normalize orthographic variants and condense verses into sentences.
# Code mostly reused from ....

from typing import Dict, List
import re
import csv

# Initialize text
def load_text(filename: str) -> list:
    """
    Reads a UTF-8 text file and splits every line at tab characters.

    Parameters:
        filename (str): Path to the file to read.

    Returns:
        list: One list of tab-separated fields per line. Line endings are
        not removed, so the last field of a line keeps its newline.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        # Iterating the handle yields the same lines as readlines()
        return [raw_line.split("\t") for raw_line in handle]

def tokenizer(text: str) -> list:
    """
    Splits text into word tokens and single punctuation tokens.

    Parameters:
        text (str): The text to tokenize.

    Returns:
        list: Tokens in order of appearance. Whitespace is dropped.
    """
    # A token is either a run of word characters or exactly one character
    # that is neither a word character nor whitespace.
    token_pattern = re.compile(r'\w+|[^\w\s]', re.UNICODE)
    return token_pattern.findall(text)

def read_in_text(input_text: str) -> List[List[str]]:
    """
    Loads a tab-separated UTF-8 file into memory.

    Parameters:
        input_text (str): Path to the input text file.

    Returns:
        List[List[str]]: One list of fields per row of the file. Quote
        characters are treated as ordinary text (csv.QUOTE_NONE).
    """
    rows = []
    with open(input_text, "r", newline='', encoding='utf-8') as samplefile:
        for row in csv.reader(samplefile, delimiter="\t", quoting=csv.QUOTE_NONE):
            rows.append(row)
    return rows

# Unassimilated prefix -> assimilated spelling. Only lower-case keys are
# needed because tokens are lower-cased before the prefix comparison.
_ASSIMILATED_PREFIXES = {
    'adt': 'att', 'adp': 'app', 'adc': 'acc', 'adg': 'agg', 'adf': 'aff',
    'adl': 'all', 'adr': 'arr', 'ads': 'ass', 'adqu': 'acqu',
    'inm': 'imm', 'inl': 'ill', 'inr': 'irr', 'inb': 'imb',
    'conm': 'comm', 'conl': 'coll', 'conr': 'corr', 'conb': 'comb',
    'conp': 'comp',
}


def transform_token(token: str) -> str:
    """
    Transforms a token based on predefined prefix rules.

    The first character of the token is always kept as written, so an
    initial capital survives. If the whole matched prefix is upper case,
    the replacement letters are upper-cased as well (ADTULIT > ATTULIT).

    Parameters:
        token (str): The token to transform.

    Returns:
        str: The transformed token, or the token unchanged if no prefix
        rule applies.
    """
    lowered = token.lower()
    for prefix, replacement in _ASSIMILATED_PREFIXES.items():
        if not lowered.startswith(prefix):
            continue
        if prefix == 'ads':
            if len(token) <= 3:
                # Bare 'ads' has no following letter to decide the rule
                return token
            if token[3].lower() not in 'aeiou':
                # Before a consonant only the 'd' is dropped
                # (eg. adstringit > astringit)
                return token[0] + token[2:]
            # Before a vowel fall through to full assimilation
            # (eg. adserit > asserit)
        tail = replacement[1:]
        if token[:len(prefix)].isupper():
            tail = tail.upper()
        return token[0] + tail + token[len(prefix):]
    return token

def assimilate(my_list: List[List[str]]) -> List[List[str]]:
    """
    Applies assimilation rules to each sentence in the list.

    Every run of word characters in the text column (index 1) is passed
    through transform_token and replaced in place. Only whole words are
    rewritten; a token that also occurs inside a longer word no longer
    changes that longer word. Rows with fewer than two columns are left
    untouched.

    Parameters:
        my_list (List[List[str]]): Rows of [ID, text]; modified in place.

    Returns:
        List[List[str]]: The same list object with assimilated texts.
    """
    word_pattern = re.compile(r'\w+')
    for row in my_list:
        if len(row) < 2:
            continue
        row[1] = word_pattern.sub(
            lambda match: transform_token(match.group()), row[1]
        )
    return my_list

def normalize_quotation_marks(text_list: List[List[str]]) -> List[List[str]]:
    """
    Map every quotation-mark variant in the text column to a plain apostrophe.

    Args:
    - text_list: A list of lists, where each inner list contains an ID and a text string.

    Returns:
    - A new list of [ID, text] pairs in which straight double quotes and the
      typographic quotes ” “ „ ’ ‘ have been replaced by '.
    """
    quote_pattern = re.compile("[\"”“„’‘]")
    normalized = []
    for row in text_list:
        normalized.append([row[0], quote_pattern.sub("\'", row[1])])
    return normalized

# Connect -que and -ve with their preceding words
def remove_whitespace_before_connectors(text_list: List[List[str]]) -> List[List[str]]:
    """
    Remove the space before a detached 'que', 've' or 'ue' in each text string.

    The connector is only joined to the preceding word when it is directly
    followed by a space, comma, period, exclamation mark or question mark.

    Args:
    - text_list: A list of lists, where each inner list contains an ID and a text string.

    Returns:
    - A new list of lists with the whitespace removed before connectors in the text strings.
    """
    # Raw string: the original non-raw literal contained the invalid escape
    # '\.', which newer Python versions warn about.
    connector_pattern = re.compile(r" (que|ve|ue)([ ,.!?])")
    return [[row[0], connector_pattern.sub(r"\1\2", row[1])] for row in text_list]

# Remove digits (left over chapter numbers) at the start of each unit
def remove_left_over_counts(text_list: List[List[str]]) -> List[List[str]]:
    """
    Remove digits (leftover chapter numbers) at the start of each text string.

    One to three leading digits are removed, together with a directly
    following period if there is one. Digits elsewhere are kept.

    Args:
    - text_list: A list of lists, where each inner list contains an ID and a text string.

    Returns:
    - A new list of lists with the leading digits removed from the text strings.
    """
    # Raw string: '\A', '\d' and '\.' are regex escapes, not Python string
    # escapes, and must not be interpreted by the string literal.
    leading_count = re.compile(r'\A\d{1,3}\.?')
    return [[row[0], leading_count.sub("", row[1])] for row in text_list]
    
def strip_whitespaces(text_list: List[List[str]]) -> List[List[str]]:
    """
    Trim surrounding whitespace from the text string of every row.

    Args:
        text_list: A list of sublists, each containing an ID and a text string.

    Returns:
        A new list of [ID, text] pairs whose texts have no leading or trailing whitespace.
    """
    trimmed_rows = []
    for row in text_list:
        identifier, content = row[0], row[1]
        trimmed_rows.append([identifier, content.strip()])
    return trimmed_rows

def separate_text_at_ellipsis(text_list: List[List[str]]) -> List[List[str]]:
    """
    Split each text after its first ellipsis ('...'), moving the remainder into a new row directly behind it. An apostrophe immediately after the ellipsis ("...'") stays with the first part.

    The new row reuses the ID of the row it was split from and is itself examined on the next pass, so texts with several ellipses are split repeatedly. The list is modified in place.

    Args:
        text_list: A list of sublists, each containing an ID and a text string.

    Returns:
        The same list, extended by the rows created for text following an ellipsis.
    """
    position = 0
    while position < len(text_list):
        row = text_list[position]
        position += 1
        start = row[1].find('...')
        if start == -1:
            continue
        cut = start + 3
        # Keep a closing apostrophe attached to the ellipsis
        if row[1][cut:cut + 1] == "'":
            cut += 1
        head, tail = row[1][:cut], row[1][cut:]
        text_list.insert(position, [row[0], tail])
        row[1] = head
    return text_list

def separate_text_at_punctuation_marks(text_list: List[List[str]]) -> List[List[str]]:
    """
    Split each text at the earliest of '.', '?', '!' or ": '", moving the remainder into a new row directly behind it.

    The cut is made two characters past the start of the mark, so the mark and the character after it stay with the first part. If the first period of a text begins an ellipsis ('...'), periods are ignored for that text. The new row reuses the ID of the row it was split from and is examined on the next pass. The list is modified in place.

    Args:
        text_list: A list of sublists, each containing an ID and a text string.

    Returns:
        The same list, extended by the rows created for text following the punctuation marks.
    """
    marks = ('.', '?', '!', ': \'')
    position = 0
    while position < len(text_list):
        row = text_list[position]
        position += 1
        content = row[1]
        hits = [content.find(mark) for mark in marks]
        if all(hit == -1 for hit in hits):
            continue
        # Marks that are absent (or a period that opens an ellipsis) get a
        # position beyond the end of the text so they never win the minimum.
        beyond_end = len(content) + 2
        if hits[0] != -1 and content[hits[0]:hits[0] + 3] == '...':
            hits[0] = beyond_end
        candidates = [beyond_end if hit == -1 else hit for hit in hits]
        cut = min(candidates) + 2
        text_list.insert(position, [row[0], content[cut:]])
        row[1] = content[:cut]
    return text_list

def cleanup(input_list: List[List[str]]) -> List[List[str]]:
    """
    Drop rows without usable text and strip leading and trailing whitespaces from the remaining text strings.

    A row is dropped when it has fewer than two elements, when its text is at most one character long (this also removes lone leftover marks such as '/'), or when its text consists of whitespace only.

    Args:
        input_list: A list of sublists, each containing an ID and a text string.

    Returns:
        A new list of [ID, text] pairs with stripped, non-empty text strings.
    """
    cleaned = []
    for row in input_list:
        if len(row) < 2 or len(row[1]) <= 1:
            continue
        stripped = row[1].strip()
        # Whitespace-only texts longer than one character would otherwise
        # survive as empty strings and break later first/last-character checks.
        if stripped:
            cleaned.append([row[0], stripped])
    return cleaned

def apply_condition_and_action(text_list: List[List[str]], condition_func: callable, action_func: callable) -> List[List[str]]:
    """
    Walk over all pairs of neighbouring rows and run an action wherever a condition holds.

    Args:
    - text_list: A list of lists, each containing an ID and a text string.
    - condition_func: Called with the current row and the row after it; a truthy result triggers the action.
    - action_func: Called with the index of the current row and the whole list; expected to modify the list in place.

    Returns:
    - The same list object after all pairs have been visited.
    """
    index = 0
    # The length is re-read on every pass because the action receives the
    # list itself and may change it.
    while index + 1 < len(text_list):
        current_row = text_list[index]
        following_row = text_list[index + 1]
        if condition_func(current_row, following_row):
            action_func(index, text_list)
        index = index + 1
    return text_list

def mark_verse_endings(text_list: List[List[str]]) -> List[List[str]]:
    """
    Tag the end of every text string with the verse marker ' /'.

    Args:
        text_list: A list of sublists, each containing an ID and a text string representing a verse.

    Returns:
        A new list of [ID, text] pairs in which every text string ends with ' /'.
    """
    marked_rows = []
    for row in text_list:
        marked_rows.append([row[0], row[1] + ' /'])
    return marked_rows

def remove_left_over_marks(text_list: List[List[str]]) -> List[List[str]]:
    """
    Drop a leading '/ ' from every text string that starts with it.

    Args:
        text_list: A list of sublists, each containing an ID and a text string.

    Returns:
        A new list of [ID, text] pairs; texts that began with '/ ' lose those two characters, all others are unchanged.
    """
    result = []
    for row in text_list:
        content = row[1]
        if content.startswith('/ '):
            content = content[2:]
        result.append([row[0], content])
    return result

def condition_contract_verse_endings(sublist: List[str], next_sublist: List[str]) -> bool:
    """
    Tell whether the current text ends with the verse marker ' /' without ending in ': /'.

    Args:
        sublist: The current sublist containing an ID and a text string representing a verse.
        next_sublist: The subsequent sublist; not inspected by this condition.

    Returns:
        True if the current text ends with ' /' but not with ': /', False otherwise.
    """
    verse = sublist[1]
    if verse.endswith(': /'):
        return False
    return verse.endswith(' /')

def condition_insertions_in_brackets(sublist: List[str], next_sublist: List[str]) -> bool:
    """
    Tell whether the first '(' in the current text is left unclosed, i.e. no ')' occurs anywhere after it.

    Args:
    - sublist: A list containing an ID and a text string, where the condition is checked.
    - next_sublist: The subsequent list; not inspected by this condition.

    Returns:
    - True if the text contains '(' and no ')' follows its first occurrence, False otherwise.
    """
    content = sublist[1]
    opening = content.find('(')
    if opening == -1:
        return False
    return content.find(')', opening) == -1

def condition_after_interjections(sublist: List[str], next_sublist: List[str]) -> bool:
    """
    Tell whether the current text string ends with one of the interjection patterns ' a!' or ' o!'.

    Args:
        sublist: The current sublist containing an ID and a text string.
        next_sublist: The subsequent sublist; not inspected by this condition.

    Returns:
        True if the current text ends with ' a!' or ' o!', False otherwise.
    """
    interjection_endings = (' a!', ' o!')
    return sublist[1].endswith(interjection_endings)

def condition_contract_insertions(sublist: List[str], next_sublist: List[str]) -> bool:
    """
    Determine if the next text string starts with the insertion marker '—'.

    An empty next text string yields False instead of raising an IndexError.

    Args:
        sublist: The current sublist; not inspected by this condition.
        next_sublist: The subsequent sublist containing an identifier and a text string to check for the insertion pattern.

    Returns:
        A boolean indicating whether the next text string starts with '—'.
    """
    return next_sublist[1].startswith('—')

def condition_direct_speeches_1(sublist: List[str], next_sublist: List[str]) -> bool:
    """
    Check for a pattern indicating an inserted direct speech: the current text string ends with an apostrophe and the next text string starts with 'inquit', 'ait', 'dixit' or 'dicit'.

    An empty current text string yields False instead of raising an IndexError.

    Args:
        sublist: The current sublist containing an ID and a text string.
        next_sublist: The subsequent sublist containing an ID and a text string.

    Returns:
        A boolean indicating the presence of the direct speech pattern across the current and next text strings.
    """
    speech_verbs = ('inquit', 'ait', 'dixit', 'dicit')
    return sublist[1].endswith('\'') and next_sublist[1].startswith(speech_verbs)

def condition_direct_speeches_2(sublist: List[str], next_sublist: List[str]) -> bool:
    """
    Check for a pattern indicating an inserted direct speech: the current text string ends with an apostrophe and the next text string starts with ', inquit', ', ait', ', dixit' or ', dicit'.

    An empty current text string yields False instead of raising an IndexError.

    Args:
        sublist: The current sublist containing an ID and a text string.
        next_sublist: The subsequent sublist containing an ID and a text string.

    Returns:
        A boolean indicating the presence of the direct speech pattern with a preceding comma across the current and next text strings.
    """
    speech_verbs = (', inquit', ', ait', ', dixit', ', dicit')
    return sublist[1].endswith('\'') and next_sublist[1].startswith(speech_verbs)

def action_contract_next(k: int, text_list: List[List[str]]) -> None:
    """
    Merge the row at index k into the row that follows it.

    The following row is replaced by a new [ID, text] pair that carries the ID of row k and the text of row k, a space, and the text of the following row. The text of row k is then set to the empty string.

    Args:
        k: The index of the current sublist in the text_list.
        text_list: A list of sublists, each containing an ID and a text string.

    Returns:
        None. The function modifies the list in place.
    """
    current_row = text_list[k]
    following_row = text_list[k + 1]
    merged_text = current_row[1] + ' ' + following_row[1]
    text_list[k + 1] = [current_row[0], merged_text]
    current_row[1] = ''

# Text endings (name and date abbreviations, citation marks) after which a
# sentence split is undone again.
_ABBREVIATION_SUFFIXES = (
    'A.', ' a.', ' b.', 'C.', ' c.', 'D.', ' d.', 'E.', ' e.', ' f.', 'H.',
    'L.', 'M.', ' m.', 'N.', 'P.', ' p.', 'Q.', ' q.', 'S.', 'T.', ' t.',
    ' v.', 'An.', ' an.', 'Cn.', ' in.', ' ex.', ' ut.', 'M\'.', 'R\'.',
    ' pl.', 'Sp.', 'Ti.', ' tr.', 'Id.', 'App.', 'Ser.', 'Sex.', 'Tib.',
    ' cos.', 'Cos.', 'Kal.', ' kal.', ' med.', 'Med.', 'Non.', ' non.',
    ' scr.', 'Scr.', ' vid.', 'Vid.', 'Mart.', 'Apr.', 'Mai.', 'Iun.',
    'Quint.', 'Sext.', 'Sept.', 'Oct.', 'Nov.', 'Dec.', 'Ian.', 'Febr.',
    ' coss.', 'Coss.', 'fort.', ' prid.', 'Prid.',
    '.,', '?)', 'frg.', 'Schol.', 'Cus.',
)


def contract_after_personal_names(text_list: List[List[str]]) -> List[List[str]]:
    """
    Contracts text after personal names, dates, and common abbreviations: if a text ends with one of the known abbreviation patterns, it is prepended (with a space) to the text of the following row.

    The merged row takes the ID of the current row, and the current row is emptied to []. The last row is never merged because it has no successor, and rows with fewer than two elements are skipped. The list is modified in place.

    Args:
        text_list: A list of sublists, each containing an identifier and a text string.

    Returns:
        The modified text list after the contraction process.
    """
    position = 0
    # Stop before the last row: there is nothing to merge it into.
    while position < len(text_list) - 1:
        row = text_list[position]
        if len(row) > 1 and row[1].endswith(_ABBREVIATION_SUFFIXES):
            text_list[position + 1] = [row[0], row[1] + ' ' + text_list[position + 1][1]]
            row.clear()
        position += 1
    return text_list

def write_output(text_list: List[List[str]], output_path: str) -> None:
    """
    Saves the rows of text_list as a tab-separated UTF-8 file.

    Rows with fewer than two elements are left out. Fields that need
    quoting are quoted with '|'.

    Args:
        text_list: A list of sublists, each containing an identifier and a text string.
        output_path: The file path where the output txt file will be written.
    """
    with open(output_path, 'w', newline='', encoding='utf-8') as target:
        tsv_writer = csv.writer(target, delimiter='\t', quotechar='|')
        tsv_writer.writerows(row for row in text_list if len(row) > 1)

def assimilation(input_text: str, output_path: str) -> List[List[str]]:
    """
    Processes the input text through assimilation steps.

    The file is read with read_in_text, passed through assimilate, and the
    result is written with write_output when an output path is given.

    Args:
        input_text: The path to the input text file to be assimilated.
        output_path: The file path where the assimilated text will be written. A falsy value (None or '') skips writing.

    Returns:
        A list of sublists, each containing an ID and the assimilated text string.
    """
    text_list = read_in_text(input_text)
    assimilated_list = assimilate(text_list)
    # Previously output_path was accepted but ignored; honour it like the
    # phrasing functions do.
    if output_path:
        write_output(assimilated_list, output_path)
    return assimilated_list

def phrasing_prose(text_list: List[List[str]], output_path: str) -> List[List[str]]:
    """
    Turns prose rows into sentence rows and writes them to a file.

    The rows are first normalised (quotation marks, detached connectors, leading counts, surrounding whitespace), then split at ellipses and punctuation marks and cleaned up. Afterwards a series of conditions re-joins rows that were split too eagerly; each joining pass is followed by a cleanup. Finally rows ending in abbreviations are joined with their successor and the result is written with write_output.

    Args:
        text_list: A list of sublists, each containing an ID and a text string.
        output_path: The file path where the processed text will be written.

    Returns:
        The processed list of text elements after applying all normalization and cleanup steps.
    """
    preparation_steps = (
        normalize_quotation_marks,
        remove_whitespace_before_connectors,
        remove_left_over_counts,
        strip_whitespaces,
        separate_text_at_ellipsis,
        separate_text_at_punctuation_marks,
        cleanup,
    )
    for step in preparation_steps:
        text_list = step(text_list)

    # Order matters: each condition sees the rows produced by the previous pass.
    joining_conditions = (
        condition_insertions_in_brackets,
        condition_after_interjections,
        condition_contract_insertions,
        condition_direct_speeches_1,
        condition_direct_speeches_2,
    )
    for condition in joining_conditions:
        text_list = apply_condition_and_action(text_list, condition, action_contract_next)
        text_list = cleanup(text_list)

    text_list = cleanup(contract_after_personal_names(text_list))
    write_output(text_list, output_path)
    return text_list

def phrasing_poetry(text_list: List[List[str]], output_path: str) -> List[List[str]]:
    """
    Turns verse rows into sentence rows and writes them to a file.

    Every verse is first tagged with the verse marker, then the rows are normalised (quotation marks, detached connectors, leading counts, surrounding whitespace) and split at ellipses and punctuation marks. A series of conditions then joins rows again, starting with rows that end in a verse marker; each joining pass is followed by a cleanup. Finally rows ending in abbreviations are joined with their successor, leading leftover marks are removed and the result is written with write_output.

    Args:
        text_list: A list of sublists, each containing an ID and a text string.
        output_path: The file path where the processed poetry text will be written.

    Returns:
        The processed list of poetry text elements after applying all normalization, cleanup,
        and verse ending marking steps.
    """
    preparation_steps = (
        mark_verse_endings,
        normalize_quotation_marks,
        remove_whitespace_before_connectors,
        remove_left_over_counts,
        strip_whitespaces,
        separate_text_at_ellipsis,
        separate_text_at_punctuation_marks,
    )
    for step in preparation_steps:
        text_list = step(text_list)

    # Order matters: each condition sees the rows produced by the previous pass.
    joining_conditions = (
        condition_contract_verse_endings,
        condition_insertions_in_brackets,
        condition_after_interjections,
        condition_contract_insertions,
        condition_direct_speeches_1,
        condition_direct_speeches_2,
    )
    for condition in joining_conditions:
        text_list = apply_condition_and_action(text_list, condition, action_contract_next)
        text_list = cleanup(text_list)

    text_list = cleanup(contract_after_personal_names(text_list))
    text_list = remove_left_over_marks(text_list)
    write_output(text_list, output_path)
    return text_list

# Run the poetry pipeline for every work: read ../texts/<work>.txt and
# write the result to ../texts/<work>_clean.txt.
works = ["aeneid", "pharsalia", "thebaid", "argonautica", "punica"]
for work in works:
    source_path = f"../texts/{work}.txt"
    target_path = f"../texts/{work}_clean.txt"
    text = read_in_text(source_path)
    cleaned_text = phrasing_poetry(text, output_path=target_path)
