In [1]:
import pandas as pd
import re
from ast import literal_eval

# Captures everything between the first "[" and the next "]".
REGEX_ASPECTS_ACD = r'\[([^\]]+)\]'
# Captures the run of non-comma, non-whitespace characters right after "(".
# Not referenced by the code visible in this notebook section.
REGEX_ASPECTS_ACSA = r'\(([^,\s]+)'
# Captures "(<group 1>, <group 2>)": text before the first comma, then the rest up to ")".
REGEX_LABELS_ACSA = r'\(([^,]+),\s*([^)]+)\)'
# Matches one parenthesised group that contains no nested parentheses.
REGEX_PAIRS_ACSA_ACSD = r'\([^()]+?\)'
# Polarity vocabulary; not referenced by the code visible in this notebook section.
POLARITIES = ['POSITIVE', 'NEUTRAL', 'NEGATIVE']

def extractAspects(output, task):
    """Parse aspect information out of a textual ``output``.

    Parameters
    ----------
    output : str
        Text to scan.
    task : str
        ``'acd'``  -> return the content of the first ``[...]`` group, split on ``', '``.
        ``'acsa'`` -> return one ``[first, second]`` list per ``(first, second)`` group.

    Returns
    -------
    list or None
        * ``'acd'``: list of strings, ``[]`` when no bracketed group exists.
        * ``'acsa'``: list of two-element lists; a parenthesised group that does not
          contain a comma-separated pair contributes an empty list ``[]`` so the
          number of entries still equals the number of groups found.
        * any other ``task``: ``None``.
    """
    if task == 'acd':
        bracket_contents = re.findall(REGEX_ASPECTS_ACD, output)
        if not bracket_contents:
            return []
        # Only the first bracketed group is considered.
        return bracket_contents[0].split(', ')

    if task == 'acsa':
        parsed = []
        for candidate in re.findall(REGEX_PAIRS_ACSA_ACSD, output):
            hit = re.search(REGEX_LABELS_ACSA, candidate)
            # Keep a placeholder for groups that are not a "(a, b)" pair.
            parsed.append([hit.group(1), hit.group(2)] if hit else [])
        return parsed

    # Unknown task: nothing is extracted.
    return None

def formatLabels(labels):
    """Render aspect/polarity labels as tab-separated ``aspect#sentiment`` tokens.

    Parameters
    ----------
    labels : list, tuple or str
        Either an already-parsed sequence of ``(aspect, polarity, ...)`` items
        (only the first two elements of each item are used), or any other object
        whose ``str()`` contains ``(aspect, polarity)`` groups, which is then
        parsed with ``extractAspects(..., 'acsa')``.

    Returns
    -------
    str
        Tokens of the form ``aspect#sentiment`` joined by tabs, where sentiment is
        ``1`` for ``POSITIVE``, ``0`` for ``NEUTRAL`` and ``-1`` for anything else.
        An empty string is returned when there are no labels.
    """
    def _unquote(token):
        # str() of a tuple keeps the repr quotes around each element, and the
        # regex captures them verbatim; drop them together with stray whitespace
        # so that "'POSITIVE'" compares equal to 'POSITIVE'.
        return token.strip().strip('\'"').strip()

    structured = isinstance(labels, (list, tuple)) and all(
        isinstance(item, (list, tuple)) and len(item) >= 2 for item in labels
    )
    if structured:
        # Already parsed: use the values directly instead of round-tripping
        # through str() and a regex.
        pairs = [(str(item[0]), str(item[1]).strip()) for item in labels]
    else:
        pairs = [
            (_unquote(pair[0]), _unquote(pair[1]))
            for pair in extractAspects(str(labels), 'acsa')
            # extractAspects emits [] for groups it could not split; skip them
            # rather than failing on pair[1].
            if len(pair) >= 2
        ]

    tokens = []
    for aspect, polarity in pairs:
        sentiment = 1 if polarity == 'POSITIVE' else 0 if polarity == 'NEUTRAL' else -1
        tokens.append(f'{aspect}#{sentiment}')
    return '\t'.join(tokens)



In [3]:
# Convert the test/train TSV of every (setting, split, dataset) combination into
# header-less "text<TAB>aspect#sentiment[<TAB>aspect#sentiment...]" lines.
#
# NOTE(review): `input_path` and `path` point at the same directory, so every
# source file is overwritten in place by a file without the `id`/`text`/`labels`
# header. Re-running this cell on already converted files will therefore not
# find the expected columns -- keep a copy of the raw data or change `path`.
for LR_SETTING in ['_500', '_1000', '_full']:
    for SPLIT in [1,2,3,4,5]:
        for DATASET in ['rest-16', 'GERestaurant']:

            input_path = f'./{DATASET}/split_{SPLIT}/'
            path = f'{DATASET}/split_{SPLIT}/'

            # The label columns are stored as Python literals and parsed on load.
            label_converters = {'labels': literal_eval,
                                'labels_phrases': literal_eval}

            # Both inputs are read before anything is written.
            df_eval = pd.read_csv(input_path + f'test{LR_SETTING}.tsv', sep = '\t',
                                  converters = label_converters).set_index('id')
            df_train = pd.read_csv(input_path + f'train{LR_SETTING}.tsv', sep = '\t',
                                   converters = label_converters).set_index('id')

            df_eval = df_eval[['text','labels']]
            df_train = df_train[['text','labels']]

            for stem, frame in (('test', df_eval), ('train', df_train)):
                # pandas decodes the input as UTF-8; write UTF-8 explicitly so
                # non-ASCII text does not depend on the platform's locale encoding.
                with open(path + f'{stem}{LR_SETTING}.tsv', "w", encoding = "utf-8") as writer:
                    for text, labels in zip(frame['text'], frame['labels']):
                        writer.write("%s\t%s\n" % (text, formatLabels(labels)))

In [4]:
# Convert the val/train TSV of every (setting, dataset) combination into
# header-less "text<TAB>aspect#sentiment[<TAB>aspect#sentiment...]" lines.
#
# NOTE(review): `input_path` and `path` point at the same directory, so every
# source file is overwritten in place by a file without the `id`/`text`/`labels`
# header. Re-running this cell on already converted files will therefore not
# find the expected columns -- keep a copy of the raw data or change `path`.
for LR_SETTING in ['_500', '_1000', '_full']:
    for DATASET in ['rest-16', 'GERestaurant']:

        input_path = f'./{DATASET}/'
        path = f'{DATASET}/'

        # The label columns are stored as Python literals and parsed on load.
        label_converters = {'labels': literal_eval,
                            'labels_phrases': literal_eval}

        # Both inputs are read before anything is written.
        df_eval = pd.read_csv(input_path + f'val{LR_SETTING}.tsv', sep = '\t',
                              converters = label_converters).set_index('id')
        df_train = pd.read_csv(input_path + f'train{LR_SETTING}.tsv', sep = '\t',
                               converters = label_converters).set_index('id')

        df_eval = df_eval[['text','labels']]
        df_train = df_train[['text','labels']]

        for stem, frame in (('val', df_eval), ('train', df_train)):
            # pandas decodes the input as UTF-8; write UTF-8 explicitly so
            # non-ASCII text does not depend on the platform's locale encoding.
            with open(path + f'{stem}{LR_SETTING}.tsv', "w", encoding = "utf-8") as writer:
                for text, labels in zip(frame['text'], frame['labels']):
                    writer.write("%s\t%s\n" % (text, formatLabels(labels)))