# XML to Dictionary
- @Lucas Müller
- Conversion of the Rest15/16 dataset from XML into Python dictionaries with a predefined schema, so that they can be transformed back into XML using the code provided by @Nils Hellwig

In [2]:
import xmltodict
import json
from pprint import pprint

In [3]:
from dataclasses import dataclass, asdict, field
from typing import List, Dict, Union

In [4]:
@dataclass
class Review:
    """Schema for one annotated sentence.

    Instances are populated first and then turned into plain dictionaries
    with ``dataclasses.asdict``.

    Attributes:
        text: The annotated sentence.
        tags: One dictionary per opinion found in the XML file. The
            transformation code in this notebook fills each dictionary with
            the keys ``start`` and ``end`` (span offsets), ``polarity``,
            ``label`` (aspect category) and ``type``. The dataclass itself
            does not validate the keys.
    """

    # The annotated sentence
    text: str
    # Opinion tags; values may be integers because of the span offsets
    tags: List[Dict[str, Union[str, int]]]

Load the trial Rest15/16 dataset and convert it to a dictionary

In [5]:

# Trial file of the Rest15/16 data (Windows-style path, relative to the working directory)
xml_file_path = "Data\\restaurants_trial_english_sl.xml"

# Read the whole XML file into memory as UTF-8 text
with open(xml_file_path, "r", encoding="utf-8") as file:
    xml_content = file.read()

# Convert the XML text into nested Python dictionaries/lists
xml_dict = xmltodict.parse(xml_content)

In [6]:
# Show the parsed structure to see how reviews, sentences and opinions are nested
print(xml_dict)

{'Reviews': {'Review': [{'@rid': '1090587', 'sentences': {'sentence': [{'@id': '1090587:0', 'text': 'Just went here for my girlfriends 23rd bday.'}, {'@id': '1090587:1', 'text': "If you've ever been along the river in Weehawken you have an idea of the top of view the chart house has to offer.", 'Opinions': {'Opinion': {'@target': 'view', '@category': 'LOCATION#GENERAL', '@polarity': 'positive', '@from': '80', '@to': '84'}}}, {'@id': '1090587:2', 'text': 'Add to that great service and great food at a reasonable price and you have yourself the beginning of a great evening.', 'Opinions': {'Opinion': [{'@target': 'service', '@category': 'SERVICE#GENERAL', '@polarity': 'positive', '@from': '18', '@to': '25'}, {'@target': 'food', '@category': 'FOOD#QUALITY', '@polarity': 'positive', '@from': '36', '@to': '40'}, {'@target': 'food', '@category': 'FOOD#PRICES', '@polarity': 'positive', '@from': '36', '@to': '40'}]}}, {'@id': '1090587:3', 'text': 'The lava cake dessert was incredible and I recom

In [75]:
# Mapping of the labels, Rest15/16 : GERestaurant
# NOTE(review): categories are split at '#' and only the first part is looked
# up. In the sample data shown in this notebook "PRICES" only occurs after the
# '#', so the "PRICES" entry is presumably never hit and e.g. "FOOD#PRICES"
# becomes "FOOD" - confirm whether price opinions should map to "PRICE".
_REST_TO_GERESTAURANT_LABELS = {
    "PRICES": "PRICE",
    "RESTAURANT": "GENERAL-IMPRESSION",
    "LOCATION": "AMBIENCE",
    "DRINKS": "FOOD",
    "SERVICE": "SERVICE",
    "AMBIENCE": "AMBIENCE",
    "FOOD": "FOOD",
}


def _as_list(node):
    """Return an xmltodict child node as a list (None -> [], single item -> [item])."""
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]


def _opinion_to_tag(opinion):
    """Convert one Rest15/16 ``Opinion`` dictionary into a tag dictionary."""
    # Rest15/16 uses the literal target "NULL" for opinions without a text span
    if opinion['@target'] != 'NULL':
        aspect_type = "label-explicit"
    else:
        aspect_type = "label-implicit"

    # Categories look like "ENTITY#ATTRIBUTE"; only the part before '#' is mapped
    entity = opinion['@category'].split('#')[0]
    label = _REST_TO_GERESTAURANT_LABELS[entity]

    polarity = opinion['@polarity']
    start = int(opinion['@from'])  # offsets arrive as strings
    end = int(opinion['@to'])

    return {"start": start, "end": end, "label": label, "polarity": polarity, "type": aspect_type}


def xml_dict_transformation(xml_dict):
    """Transform a parsed Rest15/16 XML dictionary into a list of tagged sentences.

    Each returned item follows this schema (populated with an example):

    transformed_dict = {"text": "Der Burger hat gut geschmeckt",
            "tags": [
                {"start": 0, "end": 0, "label": "food", "polarity": "positive", "type": "label-explicit"},
            ]}

    - The aspect labels are adjusted to the labels used in the GERestaurant paper
    - Explicit and implicit label types are added, as the Rest15/16 datasets do not provide them:
      an opinion whose target is "NULL" is implicit, every other opinion is explicit
    - The dictionaries are passed on to the XML tagger, which turns them into XML for the TASD task

    xmltodict returns a single dictionary instead of a list when an element
    occurs only once, and None for an empty element. Reviews, sentences and
    opinions are therefore normalised to lists first, so that reviews with a
    single sentence and files with a single review are processed as well.

    Args:
        xml_dict: Result of ``xmltodict.parse`` for a Rest15/16 file, i.e. a
            dictionary with the root key ``'Reviews'``.

    Returns:
        A list of dictionaries with the keys ``'text'`` and ``'tags'``, one per
        sentence that has at least one opinion. Sentences without opinions are
        left out.

    Raises:
        KeyError: If the ``'Reviews'`` root, a review's ``'sentences'`` key, a
            sentence's ``'text'`` or an opinion attribute is missing, or if an
            opinion category is not part of the label mapping.
    """
    # List of transformed dictionaries
    reviews_list = []

    root = xml_dict['Reviews'] or {}
    for review in _as_list(root.get('Review')):

        sentences = review['sentences'] or {}
        for sent in _as_list(sentences.get('sentence')):

            # A sentence element without attributes or children is parsed as a
            # plain string and cannot carry opinions
            if not isinstance(sent, dict):
                continue

            # Dismiss sentences without opinion tags (missing or empty element)
            opinions = _as_list((sent.get('Opinions') or {}).get('Opinion'))
            if not opinions:
                continue

            text = sent["text"]
            tags_list = [_opinion_to_tag(opinion) for opinion in opinions]

            # Populate the schema and store it as a plain dictionary
            review_class = Review(text, tags=tags_list)
            reviews_list.append(asdict(review_class))

    return reviews_list


In [8]:
# Run the transformation on the parsed trial data and display the resulting list
xml_dict_transformation(xml_dict)

[{'text': "If you've ever been along the river in Weehawken you have an idea of the top of view the chart house has to offer.",
  'tags': [{'start': 80,
    'end': 84,
    'label': 'AMBIENCE',
    'polarity': 'positive',
    'type': 'label-explicit'}]},
 {'text': 'Add to that great service and great food at a reasonable price and you have yourself the beginning of a great evening.',
  'tags': [{'start': 18,
    'end': 25,
    'label': 'SERVICE',
    'polarity': 'positive',
    'type': 'label-explicit'},
   {'start': 36,
    'end': 40,
    'label': 'FOOD',
    'polarity': 'positive',
    'type': 'label-explicit'},
   {'start': 36,
    'end': 40,
    'label': 'FOOD',
    'polarity': 'positive',
    'type': 'label-explicit'}]},
 {'text': 'The lava cake dessert was incredible and I recommend it.',
  'tags': [{'start': 4,
    'end': 21,
    'label': 'FOOD',
    'polarity': 'positive',
    'type': 'label-explicit'}]},
 {'text': 'Pizza here is consistently good.',
  'tags': [{'start': 0,
    

# XML Tagger
- @Nils Hellwig

In [9]:
from collections import Counter
import uuid
import re


def convert_ner_to_xml(ner_dict):
    """Render a tagged sentence as text with inline ``<aspect-term>`` markup.

    Args:
        ner_dict: Dictionary with the keys ``'text'`` (plain sentence) and
            ``'tags'`` (list of dictionaries with ``'start'``, ``'end'``,
            ``'label'``, ``'polarity'`` and ``'type'``). Offsets are expected
            to lie within ``0..len(text)``.

    Returns:
        The sentence in which every tag of type ``'label-explicit'`` is wrapped
        in ``<aspect-term aspect="..." polarity="...">...</aspect-term>``.
        Tags of any other type are ignored. The text itself is not XML-escaped.

    Markup that falls on the same offset is ordered so that the result stays
    well nested: closing tags come before opening tags, and of several spans
    starting at the same offset the longer one is opened first. Spans with
    identical boundaries are nested with the later tag outermost. Spans that
    cross each other cannot be represented and are not detected.
    """
    text = ner_dict['text']
    closing = '</aspect-term>'

    # Each event is (offset, rank, tie-breakers..., markup); rank 0 = closing tag
    events = []
    for index, tag in enumerate(ner_dict['tags']):
        start = tag['start']
        end = tag['end']
        label = tag['label']
        polarity = tag['polarity']

        if tag['type'] != 'label-explicit':
            continue

        opening = f'<aspect-term aspect="{label}" polarity="{polarity}">'
        if start == end:
            # Empty span: emit the pair as one unit so it cannot close before it opens
            events.append((start, 1, -end, -index, opening + closing))
            continue
        events.append((start, 1, -end, -index, opening))
        events.append((end, 0, 0, 0, closing))

    events.sort(key=lambda event: event[:4])

    # Build the result in a single left-to-right pass over the text
    pieces = []
    cursor = 0
    for position, _, _, _, markup in events:
        pieces.append(text[cursor:position])
        pieces.append(markup)
        cursor = position
    pieces.append(text[cursor:])

    return ''.join(pieces)


def remove_xml_tags_from_string(str):
    """Return ``str`` with every ``<...>`` sequence on a single line removed.

    The match is non-greedy and may be empty, so ``<>`` is removed as well.
    Because ``.`` does not match a newline, a tag that spans several lines is
    left untouched.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', str)


def remove_xml_tags(input_string):
    """Return ``input_string`` without its ``<...>`` tags.

    A tag is ``<`` followed by at least one character other than ``>`` and a
    closing ``>``. Tags may span several lines; an empty ``<>`` is kept.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', input_string)


def extract_aspect_polarity(xml_string):
    """Read the ``aspect`` and ``polarity`` attribute values from a tag string.

    Returns:
        A tuple ``(aspect, polarity)`` holding the first non-empty value of
        each attribute found in ``xml_string``; an entry is ``None`` when the
        attribute is absent or empty.
    """
    values = []
    for attribute_pattern in (r'aspect="([^"]+)"', r'polarity="([^"]+)"'):
        hit = re.search(attribute_pattern, xml_string)
        values.append(None if hit is None else hit.group(1))

    aspect, polarity = values
    return aspect, polarity


def get_explicit_aspects(tags, predicted_text):
    """Strip ``<aspect-term>`` markup from a text and collect the annotated spans.

    The text is processed one element at a time, starting with the last
    ``<aspect-term`` opening (the negative lookahead rejects any opening that
    is followed by another one; ``.`` does not cross line breaks). Each element
    is replaced by its plain content and described by a dictionary appended to
    ``tags``, so the tags are collected from the end of the text to its start.

    Args:
        tags: List that receives the tag dictionaries; it is extended in place.
        predicted_text: Text containing inline ``<aspect-term>`` elements.

    Returns:
        A tuple ``(tags, predicted_text)`` with the extended list and the text
        once no further element is found. ``start`` and ``end`` of each tag
        are offsets into the text with the markup removed.

    Raises:
        TypeError: If an element lacks the ``aspect`` or ``polarity``
            attribute, because ``None`` cannot be concatenated to a string.
    """
    last_element_pattern = r"<aspect-term(?!.*<aspect-term).*?<\/aspect-term>"

    while True:
        # 1. find the next element to resolve
        hit = re.search(last_element_pattern, predicted_text)
        if hit is None:
            return tags, predicted_text  # nothing left to strip

        # 2. element and its position in the text that still has markup
        markup = hit.group()
        markup_start, markup_end = hit.span()

        # 3. attributes of the element
        aspect, polarity = extract_aspect_polarity(markup)

        # 4. content and its position in the markup-free text
        inner_text = remove_xml_tags(markup)
        plain_start = len(remove_xml_tags(predicted_text[:markup_start]))
        plain_end = plain_start + len(inner_text)

        # 5. replace the element by its plain content
        predicted_text = predicted_text[:markup_start] + inner_text + predicted_text[markup_end:]

        # 6. record the tag
        aspect_polarity = aspect + "-" + polarity
        tags.append({
            "text": inner_text,
            "start": plain_start,
            "end": plain_end,
            "tag_with_polarity": aspect_polarity,
            "tag_with_polarity_and_type": aspect_polarity + "-explicit",
            "type": "label-explicit",
            "label": aspect,
            "polarity": polarity,
        })


def check_difference_between_tags_in_synth_text_and_label(label, tags_synth):
    """
    Compare the aspect-polarity pairs of the label with those of the synthesised text.

    Args:
    label (list of tuples): The aspect-polarity pairs in the label.
    tags_synth (list of tuples): The aspect-polarity pairs in the synthesised text.

    Returns:
    tuple: A tuple containing two lists:
        - Pairs that occur more often in the label than in the synthesised text;
          each pair is repeated once per missing occurrence.
        - Pairs that occur more often in the synthesised text than in the label;
          each pair is listed only once, however large the surplus is.
    """
    expected = Counter(label)
    produced = Counter(tags_synth)

    # Counter subtraction keeps only positive differences, in first-seen order
    missing_in_synth = list((expected - produced).elements())
    surplus_in_synth = list(produced - expected)

    return missing_in_synth, surplus_in_synth

In [None]:
# Transform the parsed XML into the tag schema (despite its name, ner_dict holds a list of dictionaries)
ner_dict = xml_dict_transformation(xml_dict)
# Use the first transformed sentence as a working example
example = ner_dict[0]
# Re-insert the explicit aspect annotations as inline <aspect-term> markup
xml = convert_ner_to_xml(example)

print(xml)

41
If you've ever been along the river in Weehawken you have an idea of the top of <aspect-term aspect="AMBIENCE" polarity="positive">view</aspect-term> the chart house has to offer.


# Translations
- 1. Locally run gemma 7b
- 2. GoogleTranslate

In [13]:
# PromptLoader

class PromptLoader():

    """Builds alternative translation prompts around one example text so that they can be compared, e.g. to prevent hallucinations such as unwanted characters."""

    def __init__(self, prompt_example):
        # Text that is interpolated into every prompt built by this instance
        self.prompt_example = prompt_example

    def load_basic_prompt(self):
        """Return the basic English-to-German translation prompt for the stored example.

        The prompt is a multi-line literal, so the line breaks and the leading
        spaces of its continuation lines are part of the returned string.
        """

        return f"""Translate the following restaurant review to german. Maintain aspect and sentiment annotations. Text in english: {self.prompt_example}
        Text in german: 
        """

    def load_guard_rail_prompt(self):
        """Return a single-line prompt that additionally asks to keep the XML tags and to return only the translation.

        The example is followed by a literal backslash as a separator.
        """
        return f"""Translate the following restaurant review to German. The string contains xml-Tags that need to be maintained after the translation. Return only the translated string. Here is the following review in english: {self.prompt_example} \\ Translated review in german: """
        

In [14]:
# Hand-written prompt around the tagged example sentence (ollama_prompt is reassigned further below)
ollama_prompt = f"""Translate the following restaurant review to German. The string contains xml-Tags that need to be maintained after the translation.
Text English: {xml} Text German:"""

print(ollama_prompt)

Translate the following restaurant review to German. The string contains xml-Tags that need to be maintained after the translation.
Text English: If you've ever been along the river in Weehawken you have an idea of the top of <aspect-term aspect="AMBIENCE" polarity="positive">view</aspect-term> the chart house has to offer. Text German:


In [15]:
# Build the guard-rail prompt for the tagged example and use it as the prompt sent to the model
pl = PromptLoader(prompt_example=xml)
ollama_prompt = pl.load_guard_rail_prompt()
print(ollama_prompt)

Translate the following restaurant review to German. The string contains xml-Tags that need to be maintained after the translation. Return only the translated string. Here is the following review in english: If you've ever been along the river in Weehawken you have an idea of the top of <aspect-term aspect="AMBIENCE" polarity="positive">view</aspect-term> the chart house has to offer. \ Translated review in german: 


In [16]:
from ollama import chat
from ollama import ChatResponse



# Send the prompt as a single user message to the gemma:7b model
response: ChatResponse = chat(model='gemma:7b', messages=[
  {
    'role': 'user',
    'content': ollama_prompt,
  },
])
# The reply text can be read by key, response['message']['content'],
# or through the fields of the response object as done here
print(response.message.content)

Wenn Sie jemals entlang des Flusses in Weehawken waren, wissen Sie, was die Chart House zu bieten hat: die beste <aspect-term aspect="AMBIENCE" polarity="positive">Ansicht</aspect-term> an der ganzen Reihe.


In [11]:
from deep_translator import GoogleTranslator

# Translate the tagged example from English to German with Google Translate for comparison
google_translated = GoogleTranslator(source='en', target='de').translate(text=xml)
print(google_translated)

Wenn Sie schon einmal am Fluss in Weehawken waren, haben Sie eine Vorstellung von der besten <aspect-term aspect="AMBIENCE" polarity="positive">Aussicht</aspect-term>, die das Kartenhaus zu bieten hat.


In [175]:
# Notes:
# gemma:7b makes grammatical errors; in some cases the prompt cue "Text German" is carried over into the answer (hallucination)

# Dataset - Full TASD Tagging with Google Translator
- Translation of the Training Dataset Rest16 and Rest15


In [155]:
# Training files of Rest16 and Rest15 (Windows-style paths); only the Rest16 file is read in this cell
xml_file_path16 = "data\\ABSA16_Restaurants_Train_SB1_v2.xml"
xml_file_path15 = "data\\ABSA-15_Restaurants_Train_Final.xml"

# Read the whole XML file into memory as UTF-8 text
with open(xml_file_path16, "r", encoding="utf-8") as file:
    xml_content = file.read()

# Convert the XML text into nested Python dictionaries/lists (replaces the trial data in xml_dict)
xml_dict = xmltodict.parse(xml_content)

In [156]:
# Display the parsed training data
xml_dict

{'Reviews': {'Review': [{'@rid': '1004293',
    'sentences': {'sentence': [{'@id': '1004293:0',
       'text': 'Judging from previous posts this used to be a good place, but not any longer.',
       'Opinions': {'Opinion': {'@target': 'place',
         '@category': 'RESTAURANT#GENERAL',
         '@polarity': 'negative',
         '@from': '51',
         '@to': '56'}}},
      {'@id': '1004293:1',
       'text': 'We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.',
       'Opinions': {'Opinion': {'@target': 'staff',
         '@category': 'SERVICE#GENERAL',
         '@polarity': 'negative',
         '@from': '75',
         '@to': '80'}}},
      {'@id': '1004293:2',
       'text': 'They never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.',
       'Opinions': {'Opinion': {'@target': 'NULL',
         '@category': 'SERVICE#GENERAL',
         '@p

In [157]:
# Transform the parsed training data into the tag schema (ner_dict is a list of dictionaries)
ner_dict = xml_dict_transformation(xml_dict)
# Check the first sentence as inline <aspect-term> markup
example = ner_dict[0]
xml = convert_ner_to_xml(example)
xml

'Judging from previous posts this used to be a good <aspect-term aspect="GENERAL-IMPRESSION" polarity="negative">place</aspect-term>, but not any longer.'

In [158]:
# Collect the TASD-tagged version of every sentence that carries at least one explicit aspect
list_sent = []
for review in ner_dict:
    tasd_sent = convert_ner_to_xml(review)
    # Sentences with only implicit aspects get no <aspect-term> markup and are skipped.
    # The test looks for the markup itself: the former `'<' and '>' not in tasd_sent`
    # only evaluated `'>' not in tasd_sent`, because the non-empty string '<' is always true.
    if '<aspect-term' not in tasd_sent:
        continue
    list_sent.append(tasd_sent)

print(len(list_sent))


1232


In [159]:
def sentence_xml_tagging(tasd_sentence: str) -> str:
    """Wrap a TASD-tagged sentence in a ``<sentence>`` element and return it."""
    return ''.join(('<sentence>', tasd_sentence, '</sentence>'))

In [160]:
# Wrap every filtered TASD sentence in a <sentence> element
final_list = []
for sent in list_sent:
    tagged = sentence_xml_tagging(sent)
    final_list.append(tagged)

# Concatenate all sentence elements without a separator
xml_file_tasd = ''.join(final_list)


In [161]:
# Enclose all sentence elements in a single <Reviews> root element
xml_file_tasd = '<Reviews>' + xml_file_tasd + '</Reviews>'

In [162]:
# Parse the assembled string to check that it is well-formed XML and wrap the root in an ElementTree
import xml.etree.ElementTree as ET
tree = ET.ElementTree(ET.fromstring(xml_file_tasd))

In [163]:
# Write the TASD-tagged sentences to an XML file in the working directory
tree.write('rest16_TASD.xml')

In [164]:
import json


# Store all transformed sentences (including those with only implicit aspects) as JSON
with open("rest16.json", "w") as outf:
    json.dump(ner_dict, outf)