In [49]:
import xml.etree.ElementTree as ET
import xml.dom.minidom
import json
import re

In [50]:
def xml_to_dict_list(file_path: str) -> list:

    """
    Parses a Rest15/16-style XML file and returns one dictionary per annotated sentence.

    Args:
    - file_path (str): path of the XML file to parse

    Returns:
    - list: dictionaries in the following schema (populated with example); sentences
      without any <Opinion> element are left out

    transformed_dict = {"text": "Der Burger hat gut geschmeckt",
            "tags": [
                {"start": 4, "end": 10, "label": "FOOD", "polarity": "positive", "type": "label-explicit"},
            ]}

    Raises:
    - KeyError: if an opinion lacks a required attribute or its category has no
      entry in the label mapping

    - The aspect labels will be adjusted according to the labels used in the GERestaurant paper
    - Explicit and implicit label types are added as Rest15/16 datasets do not provide these labels
    - The dictionaries will be passed to the XML-Tagger which will turn the dictionaries into xml-format for the TASD-Task
    """

    root = ET.parse(file_path).getroot()

    # Labels mapped to the labels of the GERestaurant Paper
    label_map = {
            "PRICES": "PRICE",
            "RESTAURANT" : "GENERAL-IMPRESSION",
            "LOCATION" : "AMBIENCE",
            "DRINKS" : "FOOD",
            "SERVICE" : "SERVICE",
            "AMBIENCE": "AMBIENCE",
            "FOOD" : "FOOD"
        }

    # Transform the data
    transformed_data = []

    for sentence in root.findall(".//sentence"):
        text = sentence.find("text").text
        tags = []
        for opinion in sentence.findall(".//Opinion"):
            attributes = opinion.attrib

            # A category looks like "ENTITY#ATTRIBUTE"; any part equal to "PRICES"
            # wins, otherwise the leading entity (e.g. "FOOD") is used.
            category_parts = attributes["category"].split("#")
            original_label = "PRICES" if "PRICES" in category_parts else category_parts[0]

            # mapping categories to GERestaurant
            try:
                ger_label = label_map[original_label]
            except KeyError:
                raise KeyError(
                    f"Unmapped aspect label {original_label!r} "
                    f"(category {attributes['category']!r})"
                ) from None

            tags.append({
                "start": int(attributes["from"]),
                "end": int(attributes["to"]),
                "label": ger_label,
                "polarity": attributes["polarity"],
                # A "NULL" target means the aspect is not mentioned literally in the text
                "type": "label-implicit" if attributes["target"] == "NULL" else "label-explicit",
            })

        # ignore sentences that do not have any opinion tags
        if tags:
            transformed_data.append({"text": text, "tags": tags})

    return transformed_data

In [51]:
def dict_list_to_json(list_dict: list, file_name: str) -> None:
    """Dump ``list_dict`` as JSON (2-space indent) into ``<file_name>.json``.

    Args:
    - list_dict (list): JSON-serialisable data to store
    - file_name (str): target path without extension; ".json" is appended
    """
    target_path = f"{file_name}.json"

    with open(target_path, mode="w", encoding='utf-8') as out_handle:
        json.dump(list_dict, out_handle, indent=2)

In [52]:
# Author: Nils Hellwig

from collections import Counter
import uuid
import re


def convert_ner_to_xml(ner_dict):
    """
    Render one annotated sentence as text with inline <aspect-term> markup.

    Args:
    ner_dict (dict): needs the keys 'text' (str) and 'tags' (list of dicts, each
        with 'start', 'end', 'label', 'polarity' and 'type').

    Returns:
    str: the text where every 'label-explicit' tag wraps text[start:end] in an
        <aspect-term aspect=... polarity=...> element and every 'label-implicit'
        tag is appended after the text as an empty element. Tags of any other
        type are ignored.
    """
    text = ner_dict['text']
    tags = ner_dict['tags']
    tag_positions = []

    for tag in tags:
        start = tag['start']
        end = tag['end']
        label = tag['label']
        polarity = tag['polarity']
        tag_type = tag['type']

        opening = f'<aspect-term aspect="{label}" polarity="{polarity}">'

        if tag_type == 'label-explicit':
            # Sort key is (position, rank, -end):
            #  * rank: at the same position a closing tag (0) must end up in front
            #    of an opening tag (1); otherwise a span ending where the next one
            #    starts would swallow its neighbour.
            #  * -end: of two spans opening at the same position the longer one
            #    must open first so that it becomes the outer element.
            tag_positions.append(((start, 1, -end), opening))
            tag_positions.append(((end, 0, 0), '</aspect-term>'))

        # Add implicit labels
        # Author: Lucas Müller
        elif tag_type == 'label-implicit':
            # One past the end of the text, so implicit elements always follow a
            # closing tag that sits exactly at the end of the text.
            end_text = len(text) + 1
            tag_positions.append(((end_text, 0, 0), opening + '</aspect-term>'))

    # Insert from right to left so earlier character offsets stay valid. Items
    # inserted later at the same offset land in front of earlier ones, which is
    # why the key above is ordered "opening before closing" under reverse sort.
    tag_positions.sort(reverse=True, key=lambda item: item[0])

    xml_text = list(text)
    for (position, _rank, _neg_end), markup in tag_positions:
        xml_text.insert(position, markup)

    return ''.join(xml_text)


def remove_xml_tags_from_string(str):
    """Return ``str`` with every ``<...>`` run removed (shortest match, within one line).

    The pattern also removes an empty ``<>`` pair; because ``.`` does not match a
    newline, a tag that spans several lines is left untouched.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept because
    # callers may pass it by keyword.
    angle_bracket_run = re.compile(r'<.*?>')
    return angle_bracket_run.sub('', str)


def remove_xml_tags(input_string):
    """Return ``input_string`` without ``<...>`` tags that have at least one character inside.

    An empty ``<>`` pair is kept; a tag may span line breaks because the pattern
    accepts any character except ``>`` between the brackets.
    """
    # Splitting on the tag pattern and gluing the pieces back together drops the tags.
    pieces = re.split(r'<[^>]+>', input_string)
    return ''.join(pieces)


def extract_aspect_polarity(xml_string):
    """Read the first ``aspect="..."`` and ``polarity="..."`` values from ``xml_string``.

    Returns:
    tuple: ``(aspect, polarity)``; an entry is ``None`` when the attribute is
        absent or its value is empty.
    """
    values = []
    for attribute in ("aspect", "polarity"):
        hit = re.search(attribute + r'="([^"]+)"', xml_string)
        values.append(hit.group(1) if hit else None)

    return tuple(values)


def get_explicit_aspects(tags, predicted_text):
    """
    Strip <aspect-term> elements out of ``predicted_text`` one at a time and record them.

    On every pass the element whose opening tag has no further ``<aspect-term``
    after it on the same line is replaced by its plain text, and a dictionary
    describing it is appended to ``tags``. The offsets stored are character
    positions in the text with all markup removed.

    Args:
    tags (list): list the tag dictionaries are appended to (modified in place).
    predicted_text (str): text that may contain <aspect-term> markup.

    Returns:
    tuple: ``(tags, predicted_text)`` once no element matches any more.
    """
    element_pattern = r"<aspect-term(?!.*<aspect-term).*?<\/aspect-term>"

    while True:
        hit = re.search(element_pattern, predicted_text)
        if hit is None:
            return tags, predicted_text

        markup = hit.group()
        markup_from, markup_to = hit.start(), hit.end()

        # aspect / polarity come from the attributes of the matched element
        aspect, polarity = extract_aspect_polarity(markup)

        # offsets are measured on the text with all markup stripped
        plain = remove_xml_tags(markup)
        plain_from = len(remove_xml_tags(predicted_text[:markup_from]))
        plain_to = plain_from + len(plain)

        # swap the element for its plain text before looking for the next one
        predicted_text = predicted_text[:markup_from] + plain + predicted_text[markup_to:]

        tags.append({
            "text": plain,
            "start": plain_from,
            "end": plain_to,
            "tag_with_polarity": aspect+"-"+polarity,
            "tag_with_polarity_and_type": aspect+"-"+polarity+"-explicit",
            "type": "label-explicit",
            "label": aspect,
            "polarity": polarity,
        })


def check_difference_between_tags_in_synth_text_and_label(label, tags_synth):
    """
    Compare the aspect-polarity pairs of a label with those found in a synthesised text.

    Args:
    label (list of tuples): The aspect-polarity pairs in the label.
    tags_synth (list of tuples): The aspect-polarity pairs in the synthesised text.

    Returns:
    tuple: A tuple containing two lists:
        - Pairs that occur more often in the label than in the synthesised text;
          each pair is repeated once per missing occurrence.
        - Pairs that occur more often in the synthesised text than in the label;
          each such pair is listed once, however large the surplus is.
    """
    wanted = Counter(label)
    produced = Counter(tags_synth)

    # Pairs the synthesised text is short of, repeated per missing occurrence
    missing_in_synth = []
    for pair, wanted_count in wanted.items():
        shortfall = wanted_count - produced[pair]
        if shortfall > 0:
            missing_in_synth.extend([pair] * shortfall)

    # Pairs the synthesised text has too many of (reported once each)
    surplus_in_synth = [pair for pair in produced if produced[pair] > wanted[pair]]

    return missing_in_synth, surplus_in_synth

In [53]:
def validate_tagged_reviews(list_of_reviews: list) -> list:
    """Validation of review sentences via RegEx
    Args:
    - list_of_reviews (list): list of dictionaries containing reviews

    Returns:
    - list: the tagged sentences produced by convert_ner_to_xml, in input order,
      up to (not including) the first review that cannot be converted or that
      contains no valid aspect-term element. That review is reported on stdout
      and processing stops there.
    """

    # RegEx pattern for TASD-XML-Tags. It has to be defined here: without it the
    # lookup below raises NameError, which the broad handler used to swallow, so
    # the function always returned an empty list.
    pattern = re.compile(
        r"<aspect-term\s+aspect=\"(?:GENERAL-IMPRESSION|FOOD|AMBIENCE|SERVICE|PRICE)\"\s+polarity=\"(?:negative|positive|neutral)\">.*?</aspect-term>"
    )

    list_sent = []

    for review in list_of_reviews:
        try:
            tasd_sent = convert_ner_to_xml(review)
        except Exception as e:
            # Conversion problems (e.g. a missing key) are reported, not raised
            print(f"Invalid Sentence caught: {e}")
            break

        if not pattern.search(tasd_sent):
            print(f"Invalid Sentence caught: Sentence found not valid: {tasd_sent}")
            break

        list_sent.append(tasd_sent)

    return list_sent

In [54]:
def string_to_xml(list_of_sentences: list, file_name: str) -> None:
    """Converts List of strings to XML-file
    Args:
    - list_of_sentences (list): List containing validated sentences (text with
      inline markup); each one is wrapped in a <sentence> element
    - file_name (str): file name for saving file, without extension (".xml" is appended)

    Raises:
    - xml.etree.ElementTree.ParseError: if the assembled document is not well-formed
    """

    sentence_tag_start = '<sentence>'
    sentence_tag_end = '</sentence>'

    xml_string_file = ''.join(
        sentence_tag_start + sentence + sentence_tag_end
        for sentence in list_of_sentences
    )
    xml_string_file = "<?xml version='1.0' encoding='utf-8'?>" + "<Reviews>" + xml_string_file + "</Reviews>"

    # replace ampersand due to validation conflicts
    xml_string_file = xml_string_file.replace('&', "'")

    # Parsing validates the document; nothing is written when it is malformed.
    root = ET.fromstring(xml_string_file)

    # Prettify in memory instead of writing the file, reading it back and
    # writing it again. No debug copy is left in the working directory.
    xml_doc = xml.dom.minidom.parseString(ET.tostring(root, encoding="unicode"))
    pretty_xml = xml_doc.toprettyxml(indent="  ")

    with open(f"{file_name}.xml", "w", encoding='utf-8') as file:
        file.write(pretty_xml)

In [55]:
# xml_file_path_rest15 = "..\\data\\original_data\\test\\ABSA15_Restaurants_Test.xml"

In [56]:
# Locations of the original SemEval restaurant files, relative to this notebook.
# Forward slashes are used because Python's file APIs accept them on Windows
# and POSIX alike, whereas backslash separators only resolve on Windows.
xml_file_path_train15 = "../data/original_data/train/ABSA-15_Restaurants_Train_Final.xml"
xml_file_path_train16 = "../data/original_data/train/ABSA16_Restaurants_Train_SB1_v2.xml"

xml_file_path_test16 = "../data/original_data/test/EN_REST16_SB1_TEST_GOLD.xml"
xml_file_path_test15 = "../data/original_data/test/ABSA15_Restaurants_Test.xml"

In [57]:
# Output file name (without extension) -> input XML file it is generated from
file_paths = dict(
    TASD_REST15_TRAIN=xml_file_path_train15,
    TASD_REST16_TRAIN=xml_file_path_train16,
    TASD_REST15_TEST=xml_file_path_test15,
    TASD_REST16_TEST=xml_file_path_test16,
)

In [58]:

# For every dataset: parse the original XML, render each annotated sentence as
# inline-tagged text and store the result as "<file_name>.xml".
for file_name, file_path in file_paths.items():
    transformed_data_rest = xml_to_dict_list(file_path)
    list_sent = [convert_ner_to_xml(review) for review in transformed_data_rest]

    string_to_xml(list_sent, file_name)
# print(f"Number of valid reviews for Rest15: {len(list_sent)}") # 1120 valid reviews
  

1120
