## Newsela

In [None]:
import pandas as pd
# Read the tab-separated Newsela sentence alignments. Column names are supplied
# explicitly, lines pandas cannot parse are skipped, and rows with any missing
# field are dropped afterwards.
# NOTE(review): pandas' default quote handling is in effect here; if sentences
# contain unbalanced double quotes, rows may be merged or skipped — TODO confirm.
newsela_align = pd.read_csv(
    'data/newsela_data_share-20150302/newsela_articles_20150302.aligned.sents.txt',
    sep='\t',
    names=['DOC', 'V_normal', 'V_simple', 'normal_phrase', 'simple_phrase'],
    on_bad_lines='skip',
)
newsela_align = newsela_align.dropna()
newsela_align

In [None]:
# Merge every simple sentence that is aligned to the same
# (normal version, simple version, document, normal sentence) key into one
# space-joined string. Joining directly inside the aggregation avoids the
# deprecated DataFrame.applymap and the intermediate tuple -> list conversion.
newsela_align = (
    newsela_align
    .groupby(['V_normal', 'V_simple', 'DOC', 'normal_phrase'])['simple_phrase']
    .agg(' '.join)
    .reset_index()
)
# Keep only the pairs whose normal side is version 'V0'.
newsela_align = newsela_align[newsela_align.V_normal == 'V0']
newsela_align

In [None]:
import random

# Print ten randomly chosen sentence pairs as a quick sanity check.
for _ in range(10):
    # randrange excludes its upper bound. random.randint(0, len(...)) includes
    # it, so it could return len(newsela_align) and make .iloc raise IndexError.
    index = random.randrange(len(newsela_align))
    sampled_pair = newsela_align.iloc[index]
    print("**")
    print("Normal phrase:", sampled_pair['normal_phrase'])
    print("Simple phrase:", sampled_pair['simple_phrase'])

In [None]:
# Persist the aligned sentence pairs as CSV, without writing the index column.
newsela_align.to_csv('data/newsela_sent_aligned_V0.csv', index=False)

### Mask named entities (NEs)

In [6]:
import pandas as pd

# Reload the saved sentence pairs and keep an independent copy of the five
# alignment columns, in a fixed order.
newsela = (
    pd.read_csv("data/newsela_sent_aligned_V0.csv")
    .loc[:, ['V_normal', 'V_simple', 'DOC', 'normal_phrase', 'simple_phrase']]
    .copy()
)

In [None]:
from tqdm import tqdm
import spacy

# Load the spaCy pipeline with the listed components disabled; the masking code
# below only reads each token's text and entity type (token.ent_type_).
nlp = spacy.load("en_core_web_md", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
def replace_ne(index: int, column: str, doc, phrase):
    """Write an entity-masked version of ``doc`` into ``newsela.at[index, column]``.

    Every token whose entity type is one of the maskable labels is replaced by
    the literal ``NAME`` (one ``NAME`` per token, so a multi-token entity yields
    several); all other tokens keep their text. Tokens are joined with single
    spaces.

    Args:
        index: Row label in the module-level ``newsela`` frame.
        column: Name of the column that receives the masked sentence.
        doc: Iterable of tokens exposing ``text`` and ``ent_type_``.
        phrase: Original sentence text; not used by the function.
    """
    maskable_labels = ['EVENT', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'NORP', 'ORG', 'PERSON', 'PRODUCT', 'WORK_OF_ART']
    masked_tokens = []
    for token in doc:
        if token.ent_type_ in maskable_labels:
            masked_tokens.append("NAME")
        else:
            masked_tokens.append(token.text)
    newsela.at[index, column] = ' '.join(masked_tokens)

# Mask one text column at a time. The sentences are streamed through nlp.pipe
# so spaCy can process them in batches instead of one nlp(...) call per row,
# and plain lists are iterated so the frame is not walked with iterrows()
# while replace_ne is writing new columns into it.
row_labels = newsela.index.tolist()
for source_column in ('normal_phrase', 'simple_phrase'):
    phrases = newsela[source_column].tolist()
    docs = tqdm(nlp.pipe(phrases), total=len(phrases), desc=source_column)
    # docs comes first in zip so the progress bar is exhausted (and closed) cleanly.
    for doc, i, phrase in zip(docs, row_labels, phrases):
        replace_ne(i, source_column + '_ne', doc, phrase)

newsela

In [14]:
# Persist the frame, including the entity-masked columns, without the index column.
newsela.to_csv("data/newsela_sent_aligned_entities_masked.csv", index=False)

## Italian (IT): Teacher/Terence

In [None]:
import xml.etree.ElementTree as ET
# Read one Teacher file and wrap its content in a synthetic <foo> root element
# before parsing — presumably because the file has no single root element.
with open('data/CORPORA_TEXT_SIMP/Teacher/1_anna_frank_last_senza_ann.txt', encoding='utf-8') as f:
    content = f.read()
tree = ET.fromstring(f'<foo>{content}</foo>')

In [None]:
# The two <doc> elements are taken, in order, as the original and the
# simplified version of the text.
frase_norm, frase_simp = tree.findall('doc')
# One row per simplified sentence; the aligned original text is accumulated below.
tmp = pd.DataFrame({'normal_phrase': "", 'simple_phrase': [f.text for f in frase_simp.findall('frase')]})
for norm_fras in frase_norm.findall('frase'):
    # 'frase_all' holds the 1-based position(s) of the aligned simplified
    # sentence(s). As in load_data, an empty value means "not aligned" and
    # several positions may be separated by ';' — int() on the raw attribute
    # would crash on both.
    aligned_positions = norm_fras.get('frase_all')
    if len(aligned_positions) > 0:
        for simp_index in aligned_positions.split(';'):
            tmp.loc[int(simp_index) - 1, "normal_phrase"] += norm_fras.text
tmp


In [None]:
from xml.etree.ElementTree import ParseError
import pandas as pd
import xml.etree.ElementTree as ET
import os

# Directory of the Teacher corpus (a stray leading quote inside the string literal was removed).
data_path = "data/CORPORA_TEXT_SIMP/Teacher/"
def _align_sentences(tree, align_phrase):
    """Return one row per original sentence with its space-joined simplified text."""
    docs = tree.findall('doc')
    if len(docs) > 1:
        frase_norm, frase_simp = docs
    else:
        frase_norm = tree.find('*/originale')
        frase_simp = tree.find('*/semplificato')
    # One row per simplified sentence; aligned original text is accumulated into it.
    tmp = pd.DataFrame({'normal_phrase': "", 'simple_phrase': [f.text for f in frase_simp.findall('frase')]})
    for norm_fras in frase_norm.findall('frase'):
        alignment = norm_fras.get(align_phrase)
        if len(alignment) > 0:
            # The attribute lists 1-based positions of simplified sentences, ';'-separated.
            for simp_index in alignment.split(';'):
                tmp.loc[int(simp_index) - 1, "normal_phrase"] += norm_fras.text
    # Simplified sentences sharing the same original text are merged into one string.
    return tmp.groupby('normal_phrase')['simple_phrase'].agg(' '.join).reset_index()


def load_data(data_path, align_phrase, source):
    """Build a sentence-aligned corpus from every ``.txt`` file in a directory.

    Each file is wrapped in a synthetic ``<foo>`` root and parsed as XML. The
    original and simplified sentences come either from two ``<doc>`` elements
    or from ``originale`` / ``semplificato`` elements. For each file, the
    non-comment lines of the sibling ``.ann`` file are counted and divided by
    the number of aligned pairs to give ``simp_ops``.

    Args:
        data_path: Directory to scan; a trailing ``/`` is added if missing.
        align_phrase: Attribute on original ``<frase>`` elements that holds the
            ``;``-separated 1-based positions of the aligned simplified sentences.
        source: Value stored in the ``source`` column of every returned row.

    Returns:
        DataFrame with columns ``normal_phrase``, ``simple_phrase``,
        ``simp_ops``, ``doc`` and ``source``. If no file contributes rows, an
        empty frame with only the ``source`` column is returned.

    Raises:
        FileNotFoundError: If the ``.ann`` file of a parsable ``.txt`` is missing.
    """
    data_path += '/' if not data_path.endswith('/') else ''
    print('*', data_path)
    # Collect per-file frames and concatenate once instead of growing a frame in the loop.
    frames = []
    for filename in os.listdir(data_path):
        if not filename.endswith('.txt'):
            continue
        print('**', filename)
        with open(data_path + filename, 'r', encoding='utf-8') as f:
            content = f.read()
        try:
            tree = ET.fromstring('<foo>' + content + '</foo>')
        except ParseError:
            print('\tcould not parse')
            continue
        tmp = _align_sentences(tree, align_phrase)

        # The annotation file shares the base name of the text file.
        annotation_path = data_path + os.path.splitext(filename)[0] + '.ann'
        with open(annotation_path, 'r', encoding='utf-8') as f:
            annotations = [line for line in f if not line.startswith('#')]
        tmp = tmp[(tmp.normal_phrase.str.len() > 0) & (tmp.simple_phrase.str.len() > 0)]
        if tmp.empty:
            # Nothing aligned in this file: skip it rather than divide by zero below.
            print('\tno aligned sentence pairs')
            continue
        # assign() returns a new frame, so nothing is written into a filtered slice.
        frames.append(tmp.assign(simp_ops=len(annotations) / len(tmp), doc=filename))
    corpus = pd.concat(frames) if frames else pd.DataFrame()
    corpus['source'] = source
    return corpus

# Combine the Teacher corpus with every Terence sub-corpus (one directory each).
# NOTE(review): Terence is read with the attribute 'frase_al' while Teacher uses
# 'frase_all' — presumably this matches the corpus markup; verify.
terence_root = 'data/CORPORA_TEXT_SIMP/Terence/'
corpus_text_simp = pd.concat(
    [load_data('data/CORPORA_TEXT_SIMP/Teacher/', 'frase_all', 'Teacher')]
    + [
        load_data(terence_root + c, 'frase_al', 'Terence')
        for c in os.listdir(terence_root)
        # load_data lists its argument as a directory, so stray files
        # directly under Terence/ would make it crash.
        if os.path.isdir(terence_root + c)
    ]
)
corpus_text_simp

In [None]:
# Drop rows with any missing value, then write the corpus without the index column.
corpus_text_simp.dropna().to_csv('data/corpus_simp_it.csv', index=False)

## Simpitiki

In [None]:
# ET is imported here as well so this section does not depend on an earlier
# section having been run first.
import xml.etree.ElementTree as ET

import pandas as pd

# Collect the text before and after each simplification, plus its 'type'
# attribute with surrounding whitespace removed.
tree = ET.parse('data/simpitiki-v2.xml')
normal_phrases = []
simple_phrases = []
types = []
for simplification in tree.findall('*/simplification'):
    normal_phrases.append(simplification.find('before').text)
    simple_phrases.append(simplification.find('after').text)
    types.append(simplification.get('type').strip())
simpitiki = pd.DataFrame({'normal_phrase': normal_phrases, 'simple_phrase': simple_phrases, 'type': types})
simpitiki

In [None]:
# Write the extracted simplification pairs as CSV, without the index column.
simpitiki.to_csv('data/simpitiki.csv', index=False)

In [None]:
# Show only the rows whose type code (a string) is in the selected set.
# NOTE(review): the meaning of each code is not visible here — presumably
# defined by the Simpitiki annotation scheme; confirm before changing the list.
relevant_simplification_types = [
    "1", "2", "3",
    "32", "33", "34", "35", "36", "37",
]
simpitiki.loc[simpitiki['type'].isin(relevant_simplification_types)]