In [4]:
import pandas as pd
import re
import os
from ast import literal_eval

# Captures the contents of a bracketed list, e.g. 'a, b' from '[a, b]'.
REGEX_ASPECTS_ACD = r'\[([^\]]+)\]'
# Captures the first field of a (first, second, "quoted") triple.
# NOTE(review): unlike the two triple patterns below, this one allows no
# whitespace between the closing quote and ')' - confirm that is intended.
REGEX_ASPECTS_ACSD = r"\(([^,]+),[^,]+,\s*\"[^\"]*\"\)"
# Captures the second field of a (first, second, "quoted") triple; the capture
# can include trailing whitespace because [^,]+ is greedy.
REGEX_LABELS_ACSD = r"\([^,]+,\s*([^,]+)\s*,\s*\"[^\"]*\"\s*\)"
# Captures the text between the double quotes of the third field of a triple.
REGEX_PHRASES_ACSD = r"\([^,]+,\s*[^,]+\s*,\s*\"([^\"]*)\"\s*\)"
# Captures both fields of a two-element group '(first, second)'.
REGEX_LABELS_ACSA = r'\(([^,]+),\s*([^)]+)\)'
# Matches one parenthesised group that contains no nested parentheses.
REGEX_PAIRS_ACSA_ACSD = r'\([^()]+?\)'

def safe_recursive_pattern(depth, max_depth):
    """Build the source of a regex that matches one parenthesised group.

    Double-quoted strings (backslash escapes included) are tried as a single
    unit before individual characters, so parentheses inside a quoted string
    do not open or close a group. The pattern contains no capturing groups,
    which means ``re.findall`` returns the whole matched group text.

    Args:
        depth: Current nesting level of the builder; the caller in this file
            passes 0.
        max_depth: Level at which the builder stops nesting. With
            ``depth=0`` the resulting pattern accepts up to ``max_depth - 1``
            levels of parentheses nested inside the outer group.

    Returns:
        str: Regex source. For ``depth >= max_depth`` this is the
        parenthesis-free filler pattern ``(?:"..."|[^()])*``.
    """
    quoted_content = r'"(?:[^"\\]|\\.)*"'  # A double-quoted string; escapes such as \" stay inside it.

    # '>=' instead of '==' so a call that starts beyond max_depth terminates
    # instead of recursing until RecursionError.
    if depth >= max_depth:
        # Base case: filler that may contain quoted strings but no parentheses.
        return rf'(?:{quoted_content}|[^()])*'

    # Same alternatives, in the same priority order, as before.
    alternatives = [quoted_content, r'[^()]']

    # Only add a nested-group alternative when the next level really is a
    # parenthesised group. The base-case filler is deliberately NOT embedded:
    # it is itself starred and matches nothing the two alternatives above do
    # not already match, but inside the outer '*' it formed (?:A|B|(?:A|B)*)*,
    # which backtracks exponentially on an unclosed '('.
    if depth + 1 < max_depth:
        alternatives.append(safe_recursive_pattern(depth + 1, max_depth))

    body = '|'.join(alternatives)
    return rf'\((?:{body})*\)'

def extract_valid_e2e_tuples(text):
    """Collect every well-formed ("Phrase", LABEL) tuple found in ``text``.

    A tuple is well-formed when it consists of a double-quoted phrase, a comma
    and one of the upper-case labels POSITIVE, NEGATIVE or NEUTRAL, wrapped in
    parentheses; whitespace around the individual parts is tolerated.

    Returns:
        list[tuple[str, str]]: ``(phrase, label)`` pairs in order of appearance.
    """
    tuple_regex = re.compile(r'\(\s*"([^"]*)"\s*,\s*(POSITIVE|NEGATIVE|NEUTRAL)\s*\)')

    # Walk over the matches and pair up the two captured groups.
    return [(hit.group(1), hit.group(2)) for hit in tuple_regex.finditer(text)]

def extractAspects(output, task, cot = False, evaluation = False):
    """Parse a text (model output or serialized labels) into structured items.

    Args:
        output: Text to parse.
        task: Selects the parser:
            'acd'  - items of the first bracketed list, split on ', ';
            'acsa' - ``[first, second]`` for every '(first, second)' group;
            'e2e'  - ``(phrase, label)`` tuples, see extract_valid_e2e_tuples;
            'tasd' or 'acsd' - ``[aspect, polarity, phrase]`` for every
                     '(aspect, polarity, "phrase")' group.
        cot: Together with ``evaluation``, enables stripping of the text that
            precedes the answer marker.
        evaluation: See ``cot``; stripping only happens when both are true.

    Returns:
        list: Parsed items (empty when nothing matches), or ``None`` when
        ``task`` is not one of the names listed above.
    """
    def strip_cot_output(output, keywords):
        # The first marker (in list order) that occurs in the text wins; the
        # segment right after its first occurrence is kept.
        for keyword in keywords:
            if keyword in output:
                return output.split(keyword)[1]
        return output

    if cot and evaluation:
        keywords = [
            'folgenden Aspekt-Sentiment-Paar:', 'folgenden Aspekt-Sentiment-Paaren:',
            'the following aspect-sentiment-pair:', 'the following aspect-sentiment-pairs:',
            'folgenden Aspekt-Sentiment-Phrasen-Tripeln:', 'folgenden Aspekt-Sentiment-Phrasen-Tripel:',
            'the following aspect-sentiment-phrase-triple:', 'the following aspect-sentiment-phrase-triples:',
            'the following phrase-polarity-tuple:','the following phrase-polarity-tuples:'
        ]
        output = strip_cot_output(output, keywords)

    if task == 'acd':
        matches = re.compile(REGEX_ASPECTS_ACD).findall(output)
        # Only the first bracketed list is considered.
        return matches[0].split(', ') if matches else []

    if task == 'acsa':
        pattern_pairs = re.compile(REGEX_PAIRS_ACSA_ACSD)
        pattern_lab = re.compile(REGEX_LABELS_ACSA)

        # Groups that do not look like '(first, second)' are skipped.
        return [
            [m[1], m[2]]
            for pair in pattern_pairs.findall(output)
            if (m := pattern_lab.search(pair))
        ]

    if task == 'e2e':
        return extract_valid_e2e_tuples(output)

    # 'acsd' is accepted as an alias of 'tasd': formatLabels in this file
    # requests the triple parser under that name, and it used to fall through
    # to an implicit None.
    if task in ('tasd', 'acsd'):
        max_depth = 1
        pattern_targets = re.compile(safe_recursive_pattern(0, max_depth))

        pattern_asp = re.compile(REGEX_ASPECTS_ACSD)
        pattern_pol = re.compile(REGEX_LABELS_ACSD)
        pattern_phrase = re.compile(REGEX_PHRASES_ACSD)

        triples = []
        for candidate in pattern_targets.findall(output):
            # Run every pattern once and reuse the match objects.
            asp = pattern_asp.search(candidate)
            pol = pattern_pol.search(candidate)
            phrase = pattern_phrase.search(candidate)
            # A candidate is kept only when all three parts were found.
            if asp and pol and phrase:
                triples.append([asp[1], pol[1], phrase[1]])
        return triples

    # Unknown task names keep returning None, as before.
    return None

def formatLabels(labels):
    """Serialize label triples into a list-of-dicts string.

    Args:
        labels: Iterable of strings, each expected to look like
            '(aspect, polarity, "phrase")'. Entries that the triple parser of
            extractAspects does not recognise are dropped.

    Returns:
        str: Text of the form
        ``[{'term': 'phrase', 'polarity': 'positive'}, ...]`` (``[]`` when no
        triple was recognised). The polarity is lower-cased, the phrase 'NULL'
        is replaced by 'noaspectterm', and a phrase containing an apostrophe
        is wrapped in double instead of single quotes.
    """
    # Use the 'tasd' task name, which extractAspects dispatches to its
    # (aspect, polarity, "phrase") triple parser.
    triples = extractAspects('[' + ', '.join(labels) + ']', 'tasd')

    entries = []
    for _aspect, polarity, term in triples:
        sentiment = polarity.lower()
        # Pick the quote character that does not clash with the phrase.
        phrase = f"'{term}'" if "'" not in term else f'"{term}"'
        # 'NULL' marks a label without an explicit aspect term.
        phrase = phrase if phrase != "'NULL'" else "'noaspectterm'"
        entries.append(f"{{'term': {phrase}, 'polarity': '{sentiment}'}}")

    return f"[{', '.join(entries)}]"

In [7]:
###
#  Normal Dataset
###
# Both label columns are parsed with ast.literal_eval when a file is read.
LABEL_CONVERTERS = {'labels': literal_eval, 'labels_phrases': literal_eval}


def _read_labelled_tsv(tsv_path):
    """Read one tab-separated dataset file, parse its label columns, index by 'id'."""
    return pd.read_csv(tsv_path, sep = '\t', converters = LABEL_CONVERTERS).set_index('id')


def _to_aspect_term_frame(df_raw):
    """Return a new frame with 'raw_text' and formatLabels-formatted 'aspectTerms'."""
    # rename/assign build a fresh frame, so no column is ever set on a slice
    # of df_raw (which is what triggers pandas' SettingWithCopyWarning).
    return (
        df_raw[['text', 'labels_phrases']]
        .rename(columns = {'text': 'raw_text', 'labels_phrases': 'aspectTerms'})
        .assign(aspectTerms = lambda frame: frame['aspectTerms'].apply(formatLabels))
    )


def _convert_eval_and_train(input_dir, output_dir, eval_name, lr_setting):
    """Convert '<eval_name><lr_setting>.tsv' and 'train<lr_setting>.tsv' to CSV.

    Both inputs are read from ``input_dir`` before ``output_dir`` is created,
    then each converted frame is written to ``output_dir`` under the same base
    name with a '.csv' extension.

    Returns:
        tuple: ``(eval_frame, train_frame)`` as written to disk.
    """
    eval_raw = _read_labelled_tsv(input_dir + f'{eval_name}{lr_setting}.tsv')
    train_raw = _read_labelled_tsv(input_dir + f'train{lr_setting}.tsv')

    os.makedirs(output_dir, exist_ok = True)

    eval_out = _to_aspect_term_frame(eval_raw)
    eval_out.to_csv(output_dir + f'{eval_name}{lr_setting}.csv', encoding = 'utf-8')

    train_out = _to_aspect_term_frame(train_raw)
    train_out.to_csv(output_dir + f'train{lr_setting}.csv', encoding = 'utf-8')

    return eval_out, train_out


for DATA_PATH in ['', '_e2e']:
    for DATASET in ['rest-16', 'GERestaurant']:
        for LR_SETTING in ['_500', '_1000', '_full']:
            # Per-split test/train files.
            for SPLIT in [1,2,3,4,5]:
                input_path = f'../../data{DATA_PATH}/{DATASET}/split_{SPLIT}/'
                path = f'data{DATA_PATH}/{DATASET}/split_{SPLIT}/'
                df_eval, df_train = _convert_eval_and_train(input_path, path, 'test', LR_SETTING)

            # Dataset-level val/train files (no split sub-directory).
            input_path = f'../../data{DATA_PATH}/{DATASET}/'
            path = f'data{DATA_PATH}/{DATASET}/'
            df_eval, df_train = _convert_eval_and_train(input_path, path, 'val', LR_SETTING)