### Load all examples
 * `generate_uids=True`: return UIDs per example
 * `tokenizer=None`: return raw (untokenized) examples

In [1]:
from metal.mmtl.utils.preprocess import load_tsv, get_task_tsv_config

In [2]:
# Look up where the RTE train split lives and how its columns are laid out.
config = get_task_tsv_config('RTE', 'train')

# Everything that describes the file itself is forwarded straight from the
# task config; only the loader options are spelled out explicitly here.
(examples, labels), uids = load_tsv(
    tokenizer=None,      # keep the raw, untokenized sentences
    delimiter="\t",
    generate_uids=True,  # also return one UID per example
    **{
        key: config[key]
        for key in (
            "tsv_path",
            "sent1_idx",
            "sent2_idx",
            "label_idx",
            "skip_rows",
            "label_fn",
        )
    }
)

# The three parallel sequences must stay aligned one-to-one.
assert len(examples) == len(labels) == len(uids)

HBox(children=(IntProgress(value=0, max=2490), HTML(value='')))




### Define Proper Nouns based on Entities
Ref: https://spacy.io/api/annotation#named-entities

In [3]:
import spacy
# Shared spaCy pipeline; its named-entity recognizer supplies the labels below.
nlp = spacy.load('en')

# Entity labels that count as proper nouns in this notebook. Any label not
# listed here is ignored. Descriptions follow spaCy's named-entity annotation
# scheme (see the reference linked above this cell).
PROPER_NOUN_LABELS = [
    "PERSON",       # people, including fictional
    "NORP",         # nationalities, religious or political groups
    "FAC",          # buildings, airports, highways, bridges, ...
    "ORG",          # companies, agencies, institutions, ...
    "GPE",          # countries, cities, states
    "LOC",          # non-GPE locations: mountain ranges, bodies of water
    "PRODUCT",      # objects, vehicles, foods, ... (not services)
    "EVENT",        # named hurricanes, battles, wars, sports events, ...
    "WORK_OF_ART",  # titles of books, songs, ...
    "LAW",          # named documents made into laws
]

def get_proper_nouns(sent):
    """Return the entities in `sent` whose label counts as a proper noun.

    Runs the module-level `nlp` pipeline on `sent` and keeps, in document
    order, every item of the resulting `.ents` whose `label_` appears in
    `PROPER_NOUN_LABELS`.

    Args:
        sent: the raw sentence text to analyse.

    Returns:
        A list of the matching entities (empty when none match).
    """
    matches = []
    for entity in nlp(sent).ents:
        if entity.label_ in PROPER_NOUN_LABELS:
            matches.append(entity)
    return matches

### Tag all examples containing Proper Nouns

In [4]:
from tagger import Tagger
# Tag store used below to associate example UIDs with named tags.
# NOTE(review): `verbose=False` presumably silences per-call logging — confirm in tagger.py.
tagger = Tagger(verbose=False)

In [5]:
# Tag every example in which either sentence contains at least one proper
# noun, echoing a sample of the tagged examples as a sanity check.
for idx, (ex, uid) in enumerate(zip(examples, uids)):
    # Entities from the first sentence come first, then those from the second.
    proper_nouns = [
        entity
        for field in ('sent1', 'sent2')
        for entity in get_proper_nouns(ex[field])
    ]

    if not proper_nouns:
        continue

    tagger.add_tag(uid, 'proper_nouns')

    # Only tagged examples sitting at a multiple-of-100 position are printed.
    if idx % 100 != 0:
        continue
    print(uid)
    print(ex, proper_nouns)
    print()

RTE/train.tsv:2
{'sent1': 'No Weapons of Mass Destruction Found in Iraq Yet.', 'sent2': 'Weapons of Mass Destruction Found in Iraq.'} [Iraq, Iraq]

RTE/train.tsv:102
{'sent1': "For lunch I went to Cipriani. The good thing about Cipriani is that it's all Italian. Every single person is Italian. Even the American sommelier is Italian. Everybody speaks Italian. It's a good feeling. I consider Cipriani one of the most refined services that I've ever had in a restaurant. For lunch I had spaghetti a la chitarra with Amatriciana sauce. I had beef tartar. I had fried seafood, mixed. I had also the fresh pasta with the duckling ragú. It was outstanding. Then I got a plate of Parmesan with green olives and I got the whole roasted branzino. It was me and another person. We had several glasses of wine. We didn't get dessert; we had a glass too much of wine, so we were very full. We stayed there like an hour just finishing the wine because my friend ordered a bottle.", 'sent2': 'Amatriciana is a sa

RTE/train.tsv:2002
{'sent1': 'Brought under Ottoman rule in the 16th century, Jordan has been led only since the 1920s by Hashemite rulers, a family whose roots are in present-day Saudi Arabia.', 'sent2': 'The Hashemite dynasty rules Jordan.'} [Ottoman, Jordan, Hashemite, Saudi Arabia, Jordan]

RTE/train.tsv:2102
{'sent1': 'When she was barely 20 years old, Dutch singer Edsilia Rombley got her first large taste of international acclaim. Already a winner of the smaller imitation contest Soundmix Show, she decided to shoot higher. With a great deal of determination, she performed in front of hundreds of millions of television viewers at the 1998 Eurovision Song Contest in Birmingham, United Kingdom. Her song, the R&B flavored "Hemel en aarde" (Heaven and earth), placed fourth and gave her country their highest placing at Eurovision since their last win in 1975. No Dutch contestant after her has been able to place similarly.', 'sent2': 'Edsilia Rombley is a Dutch singer.'} [Dutch, Edsilia

In [7]:
# Number of example UIDs that received the "proper_nouns" tag.
len(tagger.get_uids("proper_nouns"))

2314