# WebNLG-EN Translation

The English WebNLG lexicalisations were translated into Italian using the DeepL API.

In [8]:
import os
import deepl 
import sentencepiece as spm
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import xml.dom.minidom as minidom

# DeepL API key. Read from the DEEPL_AUTH_KEY environment variable so the
# secret does not have to be stored in the notebook; falls back to the empty
# string when the variable is unset.
DEEPL_AUTH_KEY = os.environ.get("DEEPL_AUTH_KEY", "")

In [9]:
def remove_empty_lines(node):
    """Recursively drop whitespace-only text nodes below ``node``.

    ``node`` is a minidom node; it is modified in place. Text nodes that
    contain anything other than whitespace are left untouched.
    """
    # Iterate over a copy because children are removed during the walk.
    for kid in list(node.childNodes):
        kind = kid.nodeType
        if kind == kid.ELEMENT_NODE:
            remove_empty_lines(kid)
            continue
        if kind == kid.TEXT_NODE and not kid.nodeValue.strip():
            node.removeChild(kid)

def prettify(root):
    """Return ``root`` (an ElementTree element) as indented XML text.

    The element is serialised, re-parsed with minidom, stripped of
    whitespace-only text nodes and pretty-printed with a two-space indent.
    Lines that are empty or whitespace-only are dropped from the result.
    """
    document = minidom.parseString(ET.tostring(root, encoding='utf-8'))
    remove_empty_lines(document)
    pretty = document.toprettyxml(indent="  ")
    # Keep only the lines that carry content (removes the blank lines).
    kept = [row for row in pretty.split("\n") if row.strip()]
    return "\n".join(kept)

def save(root, url):
    """Write the pretty-printed XML of ``root`` to the file at ``url`` (UTF-8)."""
    # Build the text before opening so a formatting error never truncates the file.
    xml_text = prettify(root)
    with open(url, mode='w', encoding='utf-8') as handle:
        handle.write(xml_text)

## Train and Dev sets translation

In [6]:
# Translate every English <lex> of the WebNLG train/dev files into Italian with
# DeepL and write the bilingual XML under the parallel "it" directory tree.
translator = deepl.Translator(DEEPL_AUTH_KEY)
triple_numbers = ["1", "2", "3", "4", "5", "6", "7"]
dataset_types = ["train", "dev"]

count = 0  # English <lex> elements processed
words = 0  # NOTE: despite the name, this sums len(text), i.e. characters

files = [
    "Airport.xml",
    "Artist.xml",
    "Astronaut.xml",
    "Athlete.xml",
    "Building.xml",
    "CelestialBody.xml",
    "City.xml",
    "ComicsCharacter.xml",
    "Company.xml",
    "Food.xml",
    "MeanOfTransportation.xml",
    "Monument.xml",
    "Politician.xml",
    "SportsTeam.xml",
    "University.xml",
    "WrittenWork.xml",
]

# Built with os.path.join so the relative paths also resolve on non-Windows hosts.
webnlg_root = os.path.join("..", "..", "..", "WebNLG")

for dataset_type in dataset_types:
    for triple_number in triple_numbers:
        folder = triple_number + "triples"
        path = os.path.join(webnlg_root, "en", dataset_type, folder)
        out_dir = os.path.join(webnlg_root, "it", dataset_type, folder)

        for file_name in os.listdir(path):
            url = os.path.join(path, file_name)
            if file_name not in files or not os.path.isfile(url):
                continue

            # Create the target folder before any API call so a missing
            # directory cannot discard an already-translated file.
            os.makedirs(out_dir, exist_ok=True)

            root = ET.parse(url).getroot()

            for entry in root.iter('entry'):
                # Snapshot the English lexicalisations first; the Italian
                # <lex> elements are appended to the same entry below.
                lexs = []
                for lex in entry.iter('lex'):
                    lex.set('lang', 'en')
                    lexs.append((lex.get('comment'), lex.get('lid'), lex.text))
                    count += 1
                    # lex.text is None for an empty element.
                    words += len(lex.text or "")

                for comment, idl, text in lexs:
                    new_lex = ET.SubElement(entry, 'lex')
                    # Attributes missing in the source stay missing: a None
                    # attribute value cannot be serialised by ElementTree.
                    for attr, value in (('comment', comment), ('lid', idl)):
                        if value is not None:
                            new_lex.set(attr, value)
                    new_lex.set('lang', 'it')
                    if text:
                        new_lex.text = translator.translate_text(text, target_lang="it").text
                    else:
                        # Nothing to translate; keep the element empty without an API call.
                        new_lex.text = text

            save(root, os.path.join(out_dir, file_name))

words

146502

## Test set translation

In [10]:
# Translate every English <lex> of the WebNLG test files into Italian with
# DeepL and write the bilingual XML under the parallel "it" directory.
#
# This cell reuses the `translator` object created in the train/dev cell.
# To run it on its own, uncomment the next line:
# translator = deepl.Translator(DEEPL_AUTH_KEY)
dataset_types = ["test"]

count = 0  # English <lex> elements processed
words = 0  # NOTE: despite the name, this sums len(text), i.e. characters

files = [
    "rdf-to-text-generation-test-data-with-refs-en.xml",
    "semantic-parsing-test-data-with-refs-en.xml",
]

# Built with os.path.join so the relative paths also resolve on non-Windows hosts.
webnlg_root = os.path.join("..", "..", "..", "WebNLG")

for dataset_type in dataset_types:
    path = os.path.join(webnlg_root, "en", dataset_type)
    out_dir = os.path.join(webnlg_root, "it", dataset_type)

    for file_name in os.listdir(path):
        url = os.path.join(path, file_name)
        if file_name not in files or not os.path.isfile(url):
            continue

        # Create the target folder before any API call so a missing
        # directory cannot discard an already-translated file.
        os.makedirs(out_dir, exist_ok=True)

        root = ET.parse(url).getroot()

        for entry in root.iter('entry'):
            # Snapshot the English lexicalisations first; the Italian
            # <lex> elements are appended to the same entry below.
            lexs = []
            for lex in entry.iter('lex'):
                lex.set('lang', 'en')
                lexs.append((lex.get('comment'), lex.get('lid'), lex.text))
                count += 1
                # lex.text is None for an empty element.
                words += len(lex.text or "")

            for comment, idl, text in lexs:
                new_lex = ET.SubElement(entry, 'lex')
                # Attributes missing in the source stay missing: a None
                # attribute value cannot be serialised by ElementTree.
                for attr, value in (('comment', comment), ('lid', idl)):
                    if value is not None:
                        new_lex.set(attr, value)
                new_lex.set('lang', 'it')
                if text:
                    new_lex.text = translator.translate_text(text, target_lang="it").text
                else:
                    # Nothing to translate; keep the element empty without an API call.
                    new_lex.text = text

        save(root, os.path.join(out_dir, file_name))

words

949185