# PEA Extraction (Post-Editing Actions)

Based on Blain et al. 2011

In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
import xml.dom.minidom as minidom
import spacy
import language_tool_python

# Load the spaCy pipeline named "it_core_news_sm"; later cells call it on each
# sentence and read the resulting `.ents`.
nlp = spacy.load("it_core_news_sm")
# LanguageTool checker configured with the "it-IT" language code; later cells
# call `tool.check(text)` to obtain rule matches.
tool = language_tool_python.LanguageTool("it-IT")

## Extraction of MT sentences from WebNLG-IT

In [2]:
# Root of the Italian WebNLG files, relative to the notebook. Built with
# os.path.join so the same cell works on Windows and POSIX systems.
WEBNLG_IT_DIR = os.path.join("..", "..", "WebNLG", "it")

triple_numbers = ["1", "2", "3", "4", "5", "6", "7"]
dataset_types = ["test", "dev", "train"]


def _read_italian_pairs(directory):
    """Collect (triple, sentence) pairs from every XML file directly in `directory`.

    For each <entry> element, the text of the first <mtriple> found under its
    <modifiedtripleset> is paired with the text of every <lex> element whose
    `lang` attribute is "it".

    Parameters
    ----------
    directory : str
        Folder to scan; sub-directories are ignored.

    Returns
    -------
    list[tuple]
        Pairs in the order the files are returned by `os.listdir` and the
        entries appear inside each file.
    """
    pairs = []
    # NOTE: os.listdir order is kept as-is (not sorted) so that sentence
    # indices stay aligned with any output produced from earlier runs.
    for file_name in os.listdir(directory):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            continue

        root = ET.parse(file_path).getroot()
        for entry in root.iter('entry'):
            # Only the first triple of the modified triple set is kept.
            first_triple = entry.find('modifiedtripleset').find('mtriple')

            for lex in entry.iter('lex'):
                if lex.get('lang') == "it":
                    pairs.append((first_triple.text, lex.text))
    return pairs


dataset = []
for dataset_type in dataset_types:
    if dataset_type == "test":
        # The test split is read from a single flat folder.
        dataset.extend(_read_italian_pairs(os.path.join(WEBNLG_IT_DIR, "test")))
    else:
        # The other splits are read from one "<n>triples" sub-folder per size.
        for triple_number in triple_numbers:
            split_dir = os.path.join(WEBNLG_IT_DIR, dataset_type, triple_number + "triples")
            dataset.extend(_read_italian_pairs(split_dir))

print(len(dataset))

47195


## Opening the KB containing false positives

In [81]:
# Read false-positive.csv into the list `ner`: one entry per line of the file,
# with leading/trailing whitespace removed.
with open('false-positive.csv', 'r', encoding="utf8") as f:
    ner = [line.strip() for line in f]

## Error Detection Loop

It uses a combination of:

- LanguageTool (via `language_tool_python`)
- spaCy
- `false-positive.csv`

The file 'errors.txt' was created by analyzing the output generated by this code.

In [82]:
# Report every LanguageTool match that is neither a sentence-start
# capitalisation warning, nor a token of a spaCy named entity in the same
# sentence, nor an entry of the false-positive list `ner`.

# Set copy of `ner`, built once, so the membership test inside the loop does
# not scan the whole list for every match.
known_false_positives = set(ner)

j = 0  # running number of reported errors
for i, (triple, text) in enumerate(dataset):
    # Lower-cased, space-separated tokens of every named entity in the sentence.
    nes = []
    for ent in nlp(text).ents:
        nes.extend(ent.text.lower().split(" "))

    for match in tool.check(text):
        if match.ruleId == "UPPERCASE_SENTENCE_START":
            continue

        # Substring of the sentence that the rule matched.
        error_text = text[match.offset:match.offset + match.errorLength]
        flagged = error_text.lower()
        if flagged in nes or flagged in known_false_positives:
            continue

        print('Errore n. ' + str(j))
        print(flagged, nes)
        print("Errors found for sentence " + str(i))
        print(f"Rule ID: {match.ruleId}")
        print(f"Error: {match.message}")
        print(f"Suggested correction: {match.replacements}")
        print(f"Error: {error_text}")
        print(f"Relative sentence: {text}")
        print(f"Relative triple: {triple}")
        print(f"ne: {nes}")
        print('\n')
        j += 1

print(j)

Errore n. 0
council-manager ['ciudad', 'ayala', 'city', 'manager']
Errors found for sentence 12
Rule ID: MORFOLOGIK_RULE_IT_IT
Error: Trovato un probabile errore di battitura.
Suggested correction: []
Error: council-manager
Relative sentence: La città di Ciudad Ayala, con una popolazione di 1.777.539 abitanti, è guidata da un City Manager, con un sistema di governo di tipo "council-manager". La densità di popolazione è di 1604 abitanti.
Relative triple: Ciudad_Ayala | populationMetro | 1777539
ne: ['ciudad', 'ayala', 'city', 'manager']


Errore n. 1
council-manager ['ciudad', 'ayala', 'city', 'manager']
Errors found for sentence 103
Rule ID: MORFOLOGIK_RULE_IT_IT
Error: Trovato un probabile errore di battitura.
Suggested correction: []
Error: council-manager
Relative sentence: Ciudad Ayala è una città metropolitana che ha una popolazione di 1.777.539 abitanti con una densità abitativa di 1604,0 e rientra nel fuso orario UTC offset -6. Uno dei leader di Ciudad Ayala è il City Manager, c