Reformats the WebNLG 3.0 dataset to work with the model.

In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET

# Directory holding the training XML files; a later cell searches it
# recursively for "*.xml".
data_dir = "data/webnlg/train"

# Empty placeholder frame; none of the other visible cells reference it.
entire_df = pd.DataFrame()

In [2]:
def extract_data(filename: str) -> pd.DataFrame:
    """Flatten one WebNLG XML file into (text, linearised triples) rows.

    For each entry under the first child of the XML root, every ``<mtriple>``
    inside ``<modifiedtripleset>`` is linearised and the pieces are joined
    into one string. Each ``<lex>`` child of the entry then yields one row
    pairing its text with that string.

    A ``<lex>`` whose ``comment`` attribute is anything other than ``"good"``
    (including a missing attribute, reported as ``None``) is printed as a
    warning but is still included in the result.

    Args:
        filename: Location of the XML file; passed straight to ``ET.parse``.

    Returns:
        A DataFrame with columns ``"NL"`` (the ``<lex>`` text) and
        ``"Triples"`` (the linearised triple string), one row per ``<lex>``.
        If the file yields no rows, the frame is empty but still has both
        columns.
    """
    tree = ET.parse(filename)
    root = tree.getroot()

    # Two parallel column lists; a single DataFrame is built at the end
    # instead of one small frame per entry.
    nl_column = []
    triples_column = []

    for entry in root[0]:
        # Linearise every triple of this entry.
        triple_parts = []
        for mtriple in entry.find("modifiedtripleset"):
            triple_str = mtriple.text

            # Only the first two " | " separators are rewritten: the first
            # becomes "<R>" and the second becomes "<S>".
            triple_str = triple_str.replace(" | ", "<R>", 1)
            triple_str = triple_str.replace(" | ", "<S>", 1)

            # "<T>" marks the start of each triple.
            triple_parts.append("<T>" + triple_str)
        full_triples_str = "".join(triple_parts)

        # One output row per natural-language realisation of the entry.
        for nl in entry.findall("lex"):
            if nl.get("comment") != "good":
                print(f"NG!! {nl.get('comment')} -> {nl.text}")

            nl_column.append(nl.text)
            triples_column.append(full_triples_str)

    return pd.DataFrame(
        {
            "NL": nl_column,
            "Triples": triples_column,
        }
    )


In [3]:
from pathlib import Path

# train
# Parse every XML file found recursively under data_dir and stack the
# resulting frames into one. The files are visited in sorted path order so
# that the row order of merged_df (and of any file written from it) is the
# same on every run, rather than following filesystem listing order.
data_dfs = [
    extract_data(file.resolve())
    for file in sorted(Path(data_dir).rglob('*.xml'))
]

merged_df = pd.concat(data_dfs, ignore_index=True)

NG!! toFix -> The main ethnic group in Japan is the Japanese.
NG!! toFix -> Ballistic is a fictional comic superhero who is sometimes known as Kelvin Mao.
NG!! toFix -> The city of Aarhus is governed by Magistrates.
NG!! toFix -> William Anders, who was from the United States, was actually born in British Hong Kong.


In [4]:
# Write the merged training rows as a tab-separated UTF-8 file, with a
# header row and without the DataFrame index column.
merged_df.to_csv(
    "data/webnlg_train.csv",
    sep="\t",
    index=False,
    header=True,
    encoding="utf-8",
)

In [5]:
# test
# The two test-set XML files are parsed in this order, stacked into a single
# frame, and written as a tab-separated UTF-8 file with a header row and no
# index column.
test_files = (
    "data/webnlg/test/semantic-parsing-test-data-with-refs-en.xml",
    "data/webnlg/test/rdf-to-text-generation-test-data-with-refs-en.xml",
)
test_list = [extract_data(test_file) for test_file in test_files]
test_df = pd.concat(test_list, ignore_index=True)
test_df.to_csv(
    "data/webnlg_test.csv",
    sep="\t",
    index=False,
    header=True,
    encoding="utf-8",
)

NG!! None -> Turn Me On is a 35.1 minute long album produced by Wharton Tiers that was followed by the album entitled Take it Off.
NG!! None -> The location of Trane is Swords, Dublin.
NG!! None -> The Ciudad Ayala city, a part of Morelos with population density and population of 1604.0 and 1,777,539 respectively, has a UTC offset of -6. The government type of Ciudad Ayala is council-manager government and City Manager is one of the leaders.
NG!! None -> The 17068.8 millimeter long ALCO RS-3 has a diesel-electric transmission.
NG!! None -> Alan B. Miller Hall, in Virginia, USA, was designed by Robert A.M. Stern. The address of the hall is "101 Ukrop Way" and the current tenants are the Mason School of Business.
NG!! None -> Liselotte Grschebina was born in Karlsruhe and died in Israel. Ethnic groups in Israel include Arabs.
NG!! None -> It’s Great to Be Young is a film edited by Max Benedict.
NG!! None -> Nurhan Atasoy was born in Turkey led by the President.
NG!! None -> Agremiação Sp