In [82]:
import pandas as pd
import re
import os
from ast import literal_eval

# Regex sources used by extractAspects, grouped by task.
# ACD: the content of a [...] group.
REGEX_ASPECTS_ACD = r'\[([^\]]+)\]'
# ACSD: the three patterns below all match a (first, second, "third") tuple and
# differ only in which of the three fields they capture.
REGEX_ASPECTS_ACSD = r"\(([^,]+),[^,]+,\s*\"[^\"]*\"\)"
REGEX_LABELS_ACSD = r"\([^,]+,\s*([^,]+)\s*,\s*\"[^\"]*\"\s*\)"
# extractAspects references REGEX_PHRASES_ACSD for task == 'acsd'; without a
# definition that branch raises NameError on a fresh kernel.
# NOTE(review): pattern written by analogy with the two ACSD patterns above
# (captures the quoted third field) -- confirm against the original definition.
REGEX_PHRASES_ACSD = r"\([^,]+,\s*[^,]+\s*,\s*\"([^\"]*)\"\s*\)"
# ACSA: first field of a pair / both fields of a pair.
REGEX_ASPECTS_ACSA = r'\(([^,\s]+)'
REGEX_LABELS_ACSA = r'\(([^,]+),\s*([^)]+)\)'
# A parenthesised group that contains no nested parentheses.
REGEX_PAIRS_ACSA_ACSD = r'\([^()]+?\)'

def safe_recursive_pattern(depth, max_depth):
    """Build the regex source for a parenthesised group with bounded nesting.

    The returned pattern contains only non-capturing groups. Inside the
    parentheses it accepts double-quoted strings (with backslash escapes, and
    possibly containing parentheses), any character other than ``(`` / ``)``,
    and -- while more than one level remains -- a nested parenthesised group.

    Args:
        depth: Current nesting level; callers start at 0.
        max_depth: Level at which the recursion stops. With ``depth == 0`` the
            pattern accepts ``max_depth - 1`` levels of nested parentheses.

    Returns:
        Regex source as ``str``. When ``depth == max_depth`` this is the bare
        (un-parenthesised) content pattern.

    Raises:
        ValueError: If ``depth`` is greater than ``max_depth`` (this previously
            recursed until RecursionError).
    """
    if depth > max_depth:
        raise ValueError(f'depth ({depth}) must not exceed max_depth ({max_depth})')

    quoted_content = r'"(?:[^"\\]|\\.)*"'
    content_atom = rf'(?:{quoted_content}|[^()])'

    if depth == max_depth:
        return rf'{content_atom}*'

    if depth + 1 == max_depth:
        # The leaf pattern is itself a `*` group over the same atoms. Nesting
        # it inside this level's `*` group (shape `(?:x|(?:x)*)*`) matches
        # nothing new but makes the regex backtrack exponentially when a "("
        # is never closed, so the innermost level is emitted without it.
        return rf'\({content_atom}*\)'

    nested_group = safe_recursive_pattern(depth + 1, max_depth)
    return rf'\((?:{quoted_content}|[^()]|{nested_group})*\)'

def extract_valid_e2e_tuples(text):
    """Collect every well-formed ("phrase", POLARITY) tuple found in ``text``.

    A tuple is recognised only when the phrase is double-quoted and the
    polarity is exactly POSITIVE, NEGATIVE or NEUTRAL (upper case).

    Returns:
        A list of ``(phrase, polarity)`` string tuples in order of appearance;
        an empty list when nothing matches.
    """
    tuple_regex = r'\(\s*"([^"]*)"\s*,\s*(POSITIVE|NEGATIVE|NEUTRAL)\s*\)'
    return [hit.groups() for hit in re.finditer(tuple_regex, text)]

def extractAspects(output, task, cot = False, evaluation = False):
    """Parse the label tuples for one task out of a text.

    Args:
        output: Text to parse.
        task: 'acd', 'acsa', 'e2e' or 'acsd'.
        cot: When both ``cot`` and ``evaluation`` are true, everything up to
            and including the first answer-marker phrase found in ``output``
            is discarded before parsing (``cot`` presumably stands for
            chain-of-thought -- confirm).
        evaluation: See ``cot``.

    Returns:
        * 'acd':  the first ``[...]`` group split on ``', '`` (list of str),
          or ``[]`` when there is no such group.
        * 'acsa': list of ``[first_field, second_field]`` for every
          parenthesised pair.
        * 'e2e':  list of ``(phrase, polarity)`` tuples.
        * 'acsd': list of ``[aspect, polarity, phrase]`` for every
          parenthesised group that matches all three ACSD patterns.
        * any other task: ``None``.
    """
    def strip_cot_output(output, keywords):
        # Keywords are tried in list order; the first one present wins.
        for keyword in keywords:
            if keyword in output:
                # Split once only: a plain split(keyword)[1] drops everything
                # after a second occurrence of the marker phrase.
                return output.split(keyword, 1)[1]
        return output

    if cot and evaluation:
        keywords = [
            'folgenden Aspekt-Sentiment-Paar:', 'folgenden Aspekt-Sentiment-Paaren:',
            'the following aspect-sentiment-pair:', 'the following aspect-sentiment-pairs:',
            'folgenden Aspekt-Sentiment-Phrasen-Tripeln:', 'folgenden Aspekt-Sentiment-Phrasen-Tripel:',
            'the following aspect-sentiment-phrase-triple:', 'the following aspect-sentiment-phrase-triples:',
            'the following phrase-polarity-tuple:','the following phrase-polarity-tuples:'
        ]
        output = strip_cot_output(output, keywords)

    if task == 'acd':
        matches = re.findall(REGEX_ASPECTS_ACD, output)
        # Only the first bracketed group is used.
        return matches[0].split(', ') if matches else []

    if task == 'acsa':
        pattern_lab = re.compile(REGEX_LABELS_ACSA)
        pairs = re.findall(REGEX_PAIRS_ACSA_ACSD, output)
        return [[m[1], m[2]] for pair in pairs if (m := pattern_lab.search(pair))]

    if task == 'e2e':
        return extract_valid_e2e_tuples(output)

    if task == 'acsd':
        max_depth = 1
        candidates = re.findall(safe_recursive_pattern(0, max_depth), output)

        pattern_asp = re.compile(REGEX_ASPECTS_ACSD)
        pattern_pol = re.compile(REGEX_LABELS_ACSD)
        pattern_phrase = re.compile(REGEX_PHRASES_ACSD)

        triples = []
        for candidate in candidates:
            # Search each pattern once and reuse the match objects.
            asp_match = pattern_asp.search(candidate)
            pol_match = pattern_pol.search(candidate)
            phrase_match = pattern_phrase.search(candidate)
            # Candidates that fail any of the three patterns are skipped.
            if asp_match and pol_match and phrase_match:
                triples.append([asp_match[1], pol_match[1], phrase_match[1]])
        return triples

    # Unknown task: no value is returned (None), as before.
    return None


In [83]:
def formatText(text):
    """Normalise a sentence or phrase for the output files.

    Characters outside the whitelist are dropped via
    ``remove_special_characters``; then a fixed sequence of regex rewrites is
    applied in order, and the result is whitespace-collapsed and stripped.
    """
    rewrite_rules = (
        # Put a space in front of brackets, slashes and punctuation marks.
        (r'([(".,!?;:/)])', r" \1"),
        # Drop quotation marks and ellipsis characters.
        (r'(["„“…])', r''),
        # Put a space in front of apostrophes.
        (r'([\'])', r' \1'),
        # `[\s\s]` is a character class, so every single whitespace character
        # is turned into a plain space.
        (r'([\s\s])', r' '),
        # Separate English contractions from their pronoun / auxiliary.
        # NOTE(review): every apostrophe already has a space in front of it at
        # this point, so this rule looks like it can never match -- confirm
        # before removing.
        (r"\b(I|You|We|They|He|She|It|Don|Didn|Doesn|Can|Couldn|Wouldn|Shouldn|Won|Would|Wasn|Aren|Ain|Isn|Hasn|Haven|Weren|Mightn|Mustn)('|’)(m|t|ll|ve|re|s|d)\b", r"\1 \2\3"),
        # Collapse runs of whitespace.
        (r"\s+", " "),
    )

    cleaned = remove_special_characters(text)
    for regex, replacement in rewrite_rules:
        cleaned = re.sub(regex, replacement, cleaned)
    return cleaned.strip()

def formatData(row):
    """Render one row as ``<sentence>####[(phrase, 'category', 'sentiment'), ...]``.

    Reads ``row['text']`` and ``row['labels_phrases']``. The label strings are
    joined into one bracketed string and parsed with
    ``extractAspects(..., 'acsd')``, so labels that parser rejects are left out.
    """
    sentence = row['text']
    serialized_labels = '[' + ', '.join(row['labels_phrases']) + ']'

    rendered = []
    for aspect, polarity, raw_phrase in extractAspects(serialized_labels, 'acsd'):
        category = aspect.lower().replace('#', ' ')
        sentiment = polarity.lower()

        if raw_phrase == 'NULL':
            phrase = "'NULL'"
        else:
            cleaned = formatText(raw_phrase)
            # Use double quotes only when the raw phrase contains an
            # apostrophe; otherwise single quotes.
            if "'" in raw_phrase:
                phrase = f'"{cleaned}"'
            else:
                phrase = f"'{cleaned}'"

        rendered.append(f"({phrase}, '{category}', '{sentiment}')")

    return formatText(sentence) + '####[' + ', '.join(rendered) + ']'

def remove_special_characters(text):
    """Drop every character that is not on the whitelist.

    Kept: ASCII letters and digits, German umlauts and ß, the space character
    and the punctuation ``" . , ! ? ; : ' ( ) / -``.

    Args:
        text: Input string.

    Returns:
        ``text`` with all other characters removed.
    """
    # The hyphen sits at the end of the class so it is a literal. Written as
    # `'-(` it formed a range (U+0027..U+0028) and "-" itself was stripped.
    disallowed_characters = re.compile(r"[^a-zA-Z0-9äöüßÄÖÜ \".,!?;:'()/-]")
    return disallowed_characters.sub('', text)

In [84]:
###
#  Normal Dataset
###

# For every dataset / setting / split combination: read the test and train TSV
# files, render each row with formatData and write one sample per line.

# Both label columns are stored as Python literals in the TSV files.
LABEL_CONVERTERS = {'labels': literal_eval, 'labels_phrases': literal_eval}

for DATASET in ['rest-16', 'GERestaurant']:
    for LR_SETTING in ['_500', '_1000', '_full']:
        for SPLIT in [1,2,3,4,5]:

            input_path = f'../../data/{DATASET}/split_{SPLIT}/'
            path = f'data/tasd/{DATASET}/split_{SPLIT}/'

            df_eval = pd.read_csv(input_path + f'test{LR_SETTING}.tsv', sep = '\t',
                                  converters = LABEL_CONVERTERS).set_index('id')
            df_train = pd.read_csv(input_path + f'train{LR_SETTING}.tsv', sep = '\t',
                                   converters = LABEL_CONVERTERS).set_index('id')

            os.makedirs(path, exist_ok= True)

            # Write with an explicit encoding: the default for open() depends
            # on the platform locale, and the samples can contain umlauts.
            eval_samples = df_eval.apply(formatData, axis = 1)
            with open(path +  f'test{LR_SETTING}.txt', 'w', encoding = 'utf-8') as file:
                for sample in eval_samples:
                    file.write(sample + '\n')

            train_samples = df_train.apply(formatData, axis = 1)
            with open(path +  f'train{LR_SETTING}.txt', 'w', encoding = 'utf-8') as file:
                for sample in train_samples:
                    file.write(sample + '\n')