In [100]:
import pandas as pd
import re
import os
from ast import literal_eval

# --- Patterns used to pull labels out of bracketed / parenthesised model output ---

# Earlier ACD variant, kept for reference:
#REGEX_ASPECTS_ACD = r"\[([^,\]]+)(?:,\s*([^,\]]+))*\]"
# ACD: captures everything between "[" and the next "]" (at least one character).
REGEX_ASPECTS_ACD = r'\[([^\]]+)\]'
# ACSD triples of the form (aspect, label, "phrase"):
#   captures the first field; note there is no optional whitespace between the closing quote and ")".
REGEX_ASPECTS_ACSD = r"\(([^,]+),[^,]+,\s*\"[^\"]*\"\)"
#   captures the second field (the label).
REGEX_LABELS_ACSD = r"\([^,]+,\s*([^,]+)\s*,\s*\"[^\"]*\"\s*\)"
#   captures the text inside the double quotes of the third field.
REGEX_PHRASES_ACSD = r"\([^,]+,\s*[^,]+\s*,\s*\"([^\"]*)\"\s*\)"
# ACSA: captures the run of non-comma, non-whitespace characters directly after "(".
# (Not referenced by the functions visible in this notebook section.)
REGEX_ASPECTS_ACSA = r'\(([^,\s]+)'
# Earlier ACSA label variant, kept for reference:
# REGEX_LABELS_ACSA = r'\([^,]+,\s*([^,\s]+)'
# ACSA pairs of the form (aspect, label): group 1 is the first field, group 2 the second.
REGEX_LABELS_ACSA = r'\(([^,]+),\s*([^)]+)\)'
# A parenthesised group that contains no nested parentheses (non-greedy, at least one character).
REGEX_PAIRS_ACSA_ACSD = r'\([^()]+?\)'
# E2E: captures the content of a non-empty double-quoted string.
REGEX_PHRASES_E2E = r'"([^"]+)"'
# E2E: captures the upper-case label in ("phrase", LABEL).
REGEX_POLARITIES_E2E = r'\(\s*"[^"]*"\s*,\s*([A-Z]+)\s*\)'
# The three polarity label strings.
POLARITIES = ['POSITIVE', 'NEUTRAL', 'NEGATIVE']

def safe_recursive_pattern(depth, max_depth):
    """Build a regex source string that matches a parenthesised group.

    Double-quoted strings (with backslash escapes) inside the group are treated
    as opaque, so parentheses that appear inside quotes do not end the group.
    Each level below ``max_depth - 1`` additionally allows one more level of
    nested ``( ... )`` groups.

    Args:
        depth: Current nesting level; callers start at 0.
        max_depth: Level at which nesting stops.

    Returns:
        str: Regex source without capturing groups. For ``depth >= max_depth``
        this is the bare "content" pattern (quoted strings or non-parenthesis
        characters, repeated); otherwise it is wrapped in literal ``\\(`` ... ``\\)``.
    """
    quoted_content = r'"(?:[^"\\]|\\.)*"'  # A double-quoted string, honouring backslash escapes.
    atom = rf'{quoted_content}|[^()]'      # One quoted string or one non-parenthesis character.

    if depth >= max_depth:
        # Base case: content only, no surrounding parentheses.
        # ">=" (instead of "==") keeps a depth that starts past max_depth from recursing forever.
        return rf'(?:{atom})*'

    if depth + 1 >= max_depth:
        # Innermost parenthesised level. The previous implementation embedded the
        # base-case "(?:atom)*" as a third alternative inside this starred group.
        # That alternative matches nothing the other two cannot already match, but
        # the resulting "(a|a*)*" shape backtracks exponentially on an unclosed "(".
        return rf'\((?:{atom})*\)'

    # Outer levels: nested groups start with "(", which no atom can match, so the
    # alternation stays unambiguous.
    return rf'\((?:{atom}|{safe_recursive_pattern(depth + 1, max_depth)})*\)'

def extract_valid_e2e_tuples(text):
    """Collect every well-formed ``("Phrase", LABEL)`` tuple found in ``text``.

    A tuple is well-formed when it is parenthesised, its first element is a
    double-quoted string without inner quotes, and its second element is one of
    POSITIVE, NEGATIVE or NEUTRAL. Anything else in ``text`` is ignored.

    Returns:
        list[tuple[str, str]]: ``(phrase, label)`` pairs in order of appearance.
    """
    tuple_regex = r'\(\s*"([^"]*)"\s*,\s*(POSITIVE|NEGATIVE|NEUTRAL)\s*\)'
    # Two capture groups -> findall yields (phrase, label) tuples directly.
    return re.findall(tuple_regex, text)

def extractAspects(output, task, cot = False, evaluation = False):
    """Parse label structures for ``task`` out of a text ``output``.

    Args:
        output: Text containing the bracketed / parenthesised labels.
        task: One of ``'acd'``, ``'acsa'``, ``'e2e'`` or ``'acsd'``.
        cot: Together with ``evaluation``, enables stripping of the text that
            precedes a known marker phrase.
        evaluation: See ``cot``; stripping only happens when both are truthy.

    Returns:
        * ``'acd'``  -> list of strings: the content of the first ``[...]`` split on ``', '``.
        * ``'acsa'`` -> list of ``[field1, field2]`` lists, one per ``(a, b)`` group.
        * ``'e2e'``  -> whatever ``extract_valid_e2e_tuples`` returns.
        * ``'acsd'`` -> list of ``[field1, field2, phrase]`` lists, one per
          ``(a, b, "phrase")`` group matching all three ACSD patterns.
        * any other task -> ``None`` (the function falls through).
    """
    def strip_cot_output(output, keywords):
        # Use the first marker (in list order) that occurs in the text and keep the
        # piece that follows its first occurrence (up to a second occurrence, if any).
        marker = next((candidate for candidate in keywords if candidate in output), None)
        if marker is None:
            return output
        return output.split(marker)[1]

    if cot and evaluation:
        keywords = [
            'folgenden Aspekt-Sentiment-Paar:', 'folgenden Aspekt-Sentiment-Paaren:',
            'the following aspect-sentiment-pair:', 'the following aspect-sentiment-pairs:',
            'folgenden Aspekt-Sentiment-Phrasen-Tripeln:', 'folgenden Aspekt-Sentiment-Phrasen-Tripel:',
            'the following aspect-sentiment-phrase-triple:', 'the following aspect-sentiment-phrase-triples:',
            'the following phrase-polarity-tuple:','the following phrase-polarity-tuples:'
        ]
        output = strip_cot_output(output, keywords)

    if task == 'acd':
        bracketed = re.findall(REGEX_ASPECTS_ACD, output)
        if not bracketed:
            return []
        # Only the first bracketed list is used.
        return bracketed[0].split(', ')

    if task == 'acsa':
        parsed_pairs = []
        for candidate in re.findall(REGEX_PAIRS_ACSA_ACSD, output):
            hit = re.search(REGEX_LABELS_ACSA, candidate)
            if hit:
                parsed_pairs.append([hit[1], hit[2]])
        return parsed_pairs

    if task == 'e2e':
        return extract_valid_e2e_tuples(output)

    if task == 'acsd':
        max_depth = 1
        group_regex = safe_recursive_pattern(0, max_depth)
        parsed_triples = []
        for candidate in re.findall(group_regex, output):
            aspect_hit = re.search(REGEX_ASPECTS_ACSD, candidate)
            label_hit = re.search(REGEX_LABELS_ACSD, candidate)
            phrase_hit = re.search(REGEX_PHRASES_ACSD, candidate)
            # A group is kept only when all three patterns match it.
            if aspect_hit and label_hit and phrase_hit:
                parsed_triples.append([aspect_hit[1], label_hit[1], phrase_hit[1]])
        return parsed_triples

In [106]:
import pandas as pd
import re

# Example record showing one sentence with its labels. Each label is a
# [category, polarity, phrase] list. This variable is illustrative only: it is
# not referenced by the functions or the conversion loop visible in this notebook.
data = [
    {
        "id": 5,
        "text": "My wife had the fried shrimp which are huge and loved it.",
        "labels": [
            ["FOOD#STYLE_OPTIONS", "POSITIVE", "fried shrimp"],
            ["FOOD#QUALITY", "POSITIVE", "fried shrimp"],
            ["FOOD#STYLE_OPTIONS", "NEGATIVE", "NULL"]  # The phrase "NULL" stands for an implicit target
        ]
    }
]

def format_text(text):
    """Normalise ``text`` into a single-space separated token string.

    ``remove_special_characters`` is applied first; afterwards a fixed sequence
    of regex substitutions is run in order and the result is stripped.

    Returns:
        str: The normalised text with no leading/trailing whitespace and no
        runs of more than one space.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        # Pad these punctuation characters (including the double quote) with spaces.
        (r'([(".,!?;:/)])', r" \1 "),
        # Delete quotation marks and the ellipsis character entirely.
        (r'(["„“…])', r''),
        # Put a space in front of every apostrophe (I'm -> I 'm).
        (r'([\'])', r' \1'),
        # Pad hyphens with spaces.
        (r'([-])', r' \1 '),
        # Turn every single whitespace character into a plain space.
        (r'([\s\s])', r' '),
        # Contraction splitting. Every "'" already has a space in front of it at this
        # point, so this rule can only fire for the typographic apostrophe.
        (r"\b(I|You|We|They|He|She|It|Don|Didn|Doesn|Can|Couldn|Wouldn|Shouldn|Won|Would|Wasn|Aren|Ain|Isn|Hasn|Haven|Weren|Mightn|Mustn)('|’)(m|t|ll|ve|re|s|d)\b", r"\1 \2\3"),
        # Collapse runs of whitespace into one space.
        (r"\s+", " "),
    )

    formatted = remove_special_characters(text)
    for pattern, replacement in substitutions:
        formatted = re.sub(pattern, replacement, formatted)
    return formatted.strip()


# Column names, in order, of the table that formatData builds: one row per
# (label, candidate polarity) combination.
columns = [
    "sentence_id", "sentence", "target", "category", "polarity",
    "category_polarity", "entailed", "start", "end"
]

def tokenize(sentence):
    """Split ``sentence`` on single space characters and return the pieces as a list.

    The separator is an explicit ``' '``, so consecutive spaces produce
    empty-string tokens and other whitespace is not treated as a separator.
    """
    words = sentence.split(' ')
    return words

def get_word_indices(sentence, target):
    """Locate ``target`` as a contiguous token run inside ``sentence``.

    ``sentence`` is tokenised with ``tokenize``; ``target`` is split on
    whitespace. The first position where the target tokens occur wins.

    Args:
        sentence: Sentence text to search in.
        target: Phrase to look for.

    Returns:
        tuple[int, int]: 0-based ``(first, last)`` token indices, both inclusive.
        ``(0, 0)`` is returned when the target is empty or does not occur.
        Note that ``(0, 0)`` is also the genuine result for a one-token target
        at the very start of the sentence, so callers that need to tell the two
        apart have to check the tokens themselves.
    """
    tokens = tokenize(sentence)
    target_tokens = target.split()
    width = len(target_tokens)

    # An empty target would otherwise "match" the empty slice at index 0 and
    # yield the invalid span (0, -1).
    if width == 0:
        return 0, 0

    for i in range(len(tokens) - width + 1):
        if tokens[i:i + width] == target_tokens:
            return i, i + width - 1
    return 0, 0  # Default for implicit cases or no match

def remove_special_characters(text):
    """Delete every character of ``text`` that is not on the allow-list.

    Kept: ASCII letters and digits, the German letters ä ö ü ß Ä Ö Ü, the space
    character, and the punctuation ``" . , ! ? ; : ' -``. Everything else is
    removed without a replacement; that includes tabs and newlines, brackets,
    slashes and typographic characters such as "…".
    """
    # Negated character class: matches exactly the characters to drop.
    return re.sub(r"[^a-zA-Z0-9äöüßÄÖÜ \".,!?;:'-]", '', text)
    
def _locate_target(sentence, target):
    """Return the shifted (start, end) span of ``target`` in ``sentence``, or (0, 0) if it is absent.

    ``get_word_indices`` answers (0, 0) both for "not found" and for a one-token
    hit at the start of the sentence, so the hit is verified against the tokens
    before the +1 / +2 shift is applied.
    """
    target_tokens = target.split()
    first, last = get_word_indices(sentence, target)
    if not target_tokens or tokenize(sentence)[first:last + 1] != target_tokens:
        return 0, 0
    # Same shift as before: first index + 1, last (inclusive) index + 2.
    return first + 1, last + 2


def formatData(df):
    """Expand every labelled sentence of ``df`` into one row per candidate polarity.

    For each row of ``df`` the ``"text"`` value is normalised with
    ``format_text`` and the ``"labels_phrases"`` strings are joined into one
    bracketed string and parsed with ``extractAspects(..., 'acsd')``. Every
    resulting (category, polarity, target) label produces three output rows,
    one each for "positive", "negative" and "neutral".

    Args:
        df: DataFrame with a ``"text"`` column and a ``"labels_phrases"`` column
            holding an iterable of label strings per row. The index value is
            written to ``sentence_id``.

    Returns:
        pandas.DataFrame: Columns as listed in ``columns``. ``entailed`` is
        ``"yes"`` only on the row whose polarity equals the label's polarity.
        ``start`` / ``end`` are non-zero only on an entailed row whose target is
        not ``"NULL"`` and was actually found in the normalised sentence; in
        every other case both are 0.
    """
    output = []

    # Process each row in the dataset
    for sentence_id, row in df.iterrows():
        sentence = format_text(row["text"])  # Format the sentence
        labels = extractAspects('[' + ', '.join(row["labels_phrases"]) + ']', 'acsd')

        for category, polarity, target in labels:
            category_clean = category.lower().replace("#", " ")

            if target == "NULL":
                # Implicit target: nothing to locate.
                entailed_span = (0, 0)
            else:
                # Normalise the target exactly like the sentence so their tokens
                # line up, and do it once instead of once per polarity.
                target = format_text(target)
                # A target missing from the sentence yields (0, 0) instead of
                # pointing at the first token.
                entailed_span = _locate_target(sentence, target)

            # Generate rows for each polarity type
            for pol in ["positive", "negative", "neutral"]:
                entailed = "yes" if pol.upper() == polarity else "no"
                start, end = entailed_span if entailed == "yes" else (0, 0)

                output.append([
                    f"{sentence_id}",
                    sentence,
                    target,
                    category_clean,
                    pol,
                    f"{category_clean} {pol}",
                    entailed,
                    start,
                    end
                ])

    return pd.DataFrame(output, columns=columns)


In [107]:
###
#  Normal Dataset
###
def _convert_tsv(input_dir, output_dir, stem):
    """Read ``<input_dir><stem>.tsv``, reshape it with formatData and write ``<output_dir><stem>.txt``.

    The ``labels`` and ``labels_phrases`` columns are parsed with
    ``literal_eval`` and the ``id`` column becomes the index. The result is
    written tab-separated, UTF-8 encoded and without the index column.

    Returns:
        pandas.DataFrame: The reshaped frame that was written.
    """
    raw = pd.read_csv(input_dir + f'{stem}.tsv', sep = '\t', converters = {
                                                          'labels': literal_eval,
                                                          'labels_phrases': literal_eval}).set_index('id')
    # formatData already returns a DataFrame with the expected columns.
    converted = formatData(raw)
    converted.to_csv(output_dir + f'{stem}.txt', encoding = 'utf-8', index = False, sep = '\t')
    return converted


for DATA_PATH in ['','_e2e']:
    for DATASET in ['GERestaurant']:
        for LR_SETTING in ['_500', '_1000', '_full']:
            # Per-split test/train files.
            for SPLIT in [1,2,3,4,5]:

                input_path = f'../../data{DATA_PATH}/{DATASET}/split_{SPLIT}/'
                path = f'data{DATA_PATH}/{DATASET}/split_{SPLIT}/'

                os.makedirs(path, exist_ok= True)

                df_eval = _convert_tsv(input_path, path, f'test{LR_SETTING}')
                df_train = _convert_tsv(input_path, path, f'train{LR_SETTING}')

            # Dataset-level val/train files (once per LR_SETTING, after all splits).
            input_path = f'../../data{DATA_PATH}/{DATASET}/'
            path = f'data{DATA_PATH}/{DATASET}/'

            os.makedirs(path, exist_ok= True)

            df_eval = _convert_tsv(input_path, path, f'val{LR_SETTING}')
            df_train = _convert_tsv(input_path, path, f'train{LR_SETTING}')