In [17]:
# Libs
from deep_translator import GoogleTranslator
import xml.dom.minidom
import xml.etree.ElementTree as ET
import re

In [18]:
# TASD
# File names of the TASD XML datasets. They are relative paths, so they are
# resolved against the notebook's current working directory.

# Training splits
TASD_REST16 = "TASD_REST16_TRAIN.xml"
TASD_REST15 = "TASD_REST15_TRAIN.xml"

# Test splits
TASD_REST16_TEST = "TASD_REST16_TEST.xml"
TASD_REST15_TEST = "TASD_REST15_TEST.xml"


In [19]:
def sentence_extraction(file_path: str) -> list:
    """Extracts the inner content of every <sentence> element of a file

    The file is scanned with a regular expression instead of an XML parser,
    so each returned string keeps any nested markup exactly as written.
    Only opening tags spelled exactly ``<sentence>`` (no attributes) match.

    Args:
    - file_path (str): path of the XML file to read

    Returns:
    - list: sentence contents in document order (empty if nothing matches)
    """
    # Decode explicitly as UTF-8 rather than with the platform default, which
    # differs between systems; this matches the UTF-8 output written elsewhere
    # in this notebook.
    with open(file_path, 'r', encoding='utf-8') as file:
        xml_string = file.read()

    # DOTALL lets a sentence span several lines; the lazy quantifier stops at
    # the first closing tag so neighbouring sentences are not merged.
    return re.findall(r'<sentence>(.*?)</sentence>', xml_string, re.DOTALL)
        



In [20]:
def string_to_xml(list_of_sentences: list, file_name: str) -> None:
    """Converts a list of strings to a pretty-printed XML file

    Every string is wrapped in <sentence>...</sentence>, all of them are put
    under a single <Reviews> root, and the document is saved to
    "<file_name>.xml" with a two-space indent.

    Args:
    - list_of_sentences (list): sentence strings; each one is inserted
      verbatim, so markup inside it must be well-formed XML (for example a
      literal ampersand has to be written as an entity)
    - file_name (str): output path without extension; ".xml" is appended

    Raises:
    - xml.etree.ElementTree.ParseError: if the assembled document is not
      well-formed XML
    """

    sentence_tag_start = '<sentence>'
    sentence_tag_end = '</sentence>'

    tagged_sentences = ''.join(
        sentence_tag_start + sentence + sentence_tag_end
        for sentence in list_of_sentences
    )
    xml_string_file = "<?xml version='1.0' encoding='utf-8'?>" + "<Reviews>" + tagged_sentences + "</Reviews>"

    # Parsing validates the assembled document before anything is written.
    root = ET.fromstring(xml_string_file)

    # Serialise and prettify in memory: the file is written exactly once, so a
    # failure in the pretty-printing step cannot leave a half-finished file.
    serialized = ET.tostring(root, encoding='unicode')
    pretty_xml = xml.dom.minidom.parseString(serialized).toprettyxml(indent="  ")

    with open(f"{file_name}.xml", "w", encoding='utf-8') as file:
        file.write(pretty_xml)

In [21]:
def translate(sentence: str) -> str:
    """Translates one English sentence to German with GoogleTranslator

    Args:
    - sentence (str): text to translate

    Returns:
    - str: the translation with leading and trailing whitespace removed
    """
    en_to_de = GoogleTranslator(source='en', target='de')
    german_text = en_to_de.translate(text=sentence)
    return german_text.strip()

In [22]:
def validation(sentence: str) -> str:
    """Checks that a sentence still contains a TASD aspect-term tag

    Two tag forms are accepted, tried in this order: an explicit
    <aspect-term ...>...</aspect-term> pair and an implicit self-closing
    <aspect-term .../>. The aspect and polarity values must come from the
    fixed sets listed in the patterns.

    Args:
    - sentence (str): sentence to check

    Returns:
    - str: the unchanged sentence when a tag is found; otherwise the sentence
      is printed under an "Invalid" banner and None is returned
    """
    # (status message, regex) pairs; the first pattern that matches wins.
    tag_checks = (
        (
            "Valid - explicit sentence",
            r"<aspect-term\s+aspect=\"(?:GENERAL-IMPRESSION|FOOD|AMBIENCE|SERVICE|PRICE)\"\s+polarity=\"(?:negative|positive|neutral)\">.*?</aspect-term>",
        ),
        (
            "Valid - implicit sentence",
            r"<aspect-term\s+aspect=\"(?:GENERAL-IMPRESSION|FOOD|AMBIENCE|SERVICE|PRICE)\"\s+polarity=\"(?:negative|positive|neutral)\"/>",
        ),
    )

    for status_message, tag_pattern in tag_checks:
        if re.search(tag_pattern, sentence) is not None:
            print(status_message)
            return sentence

    # No recognised tag: report the sentence and signal failure with None.
    print("-------------------------Invalid--------------------------")
    print(sentence)
    return None

    

In [25]:
# Translation driver (no aspect-term tag validation is performed in this step)
def google_translation(extracted_sent: list, file_name: str) -> None:
    """Translates sentences with Google Translate and saves them as XML

    Every entry of `extracted_sent` is passed to translate() and the collected
    translations are handed to string_to_xml(). The aspect-term tags are not
    validated here; translations are stored exactly as returned.

    Args:
    - extracted_sent (list): sentence strings to translate
    - file_name (str): output name, passed through to string_to_xml()
    """

    sent = []

    for idx, match in enumerate(extracted_sent, start=1):
        # translate() already returns whitespace-stripped text.
        translated = translate(match)
        print(f"Sentence: {idx}")
        sent.append(translated)

    print(f"Number of sentences: {len(extracted_sent)}")
    # NOTE: the label is kept for output compatibility; since nothing is
    # filtered, this is simply the number of translated sentences.
    print(f"Number of validated sentences: {len(sent)}")

    string_to_xml(sent, file_name)



In [26]:
# Extract sentences
# Read the <sentence> entries of the REST15 test file, translate them and
# write the result to "TASD_TEST15_GOOGLE_TRANSLATED_TESTFORMAT.xml"
# (string_to_xml appends the ".xml" extension).
# NOTE(review): every sentence goes through translate(), presumably one
# network request each - confirm before re-running this cell.
list_sent = sentence_extraction(TASD_REST15_TEST)
google_translation(list_sent, "TASD_TEST15_GOOGLE_TRANSLATED_TESTFORMAT")

Sentence: 1
Sentence: 2
Sentence: 3
Sentence: 4
Sentence: 5
Sentence: 6
Sentence: 7
Sentence: 8
Sentence: 9
Sentence: 10
Sentence: 11
Sentence: 12
Sentence: 13
Sentence: 14
Sentence: 15
Sentence: 16
Sentence: 17
Sentence: 18
Sentence: 19
Sentence: 20
Sentence: 21
Sentence: 22
Sentence: 23
Sentence: 24
Sentence: 25
Sentence: 26
Sentence: 27
Sentence: 28
Sentence: 29
Sentence: 30
Sentence: 31
Sentence: 32
Sentence: 33
Sentence: 34
Sentence: 35
Sentence: 36
Sentence: 37
Sentence: 38
Sentence: 39
Sentence: 40
Sentence: 41
Sentence: 42
Sentence: 43
Sentence: 44
Sentence: 45
Sentence: 46
Sentence: 47
Sentence: 48
Sentence: 49
Sentence: 50
Sentence: 51
Sentence: 52
Sentence: 53
Sentence: 54
Sentence: 55
Sentence: 56
Sentence: 57
Sentence: 58
Sentence: 59
Sentence: 60
Sentence: 61
Sentence: 62
Sentence: 63
Sentence: 64
Sentence: 65
Sentence: 66
Sentence: 67
Sentence: 68
Sentence: 69
Sentence: 70
Sentence: 71
Sentence: 72
Sentence: 73
Sentence: 74
Sentence: 75
Sentence: 76
Sentence: 77
Sentence

In [1]:
# ACSA Translations