### Load all examples
 * `generate_uids=True`: return UIDs per example
 * `tokenizer=None`: return raw (untokenized) examples

In [1]:
from metal.mmtl.utils.preprocess import load_tsv, get_task_tsv_config

In [2]:
config = get_task_tsv_config('COLA', 'train')
    
(examples, labels), uids = load_tsv(
    tsv_path=config["tsv_path"],
    sent1_idx=config["sent1_idx"],
    sent2_idx=config["sent2_idx"],
    label_idx=config["label_idx"],
    skip_rows=config["skip_rows"],
    tokenizer=None,
    delimiter="\t",
    label_fn=config["label_fn"],
    generate_uids=True
)

assert len(examples) == len(labels) == len(uids)

HBox(children=(IntProgress(value=0, max=8550), HTML(value='')))




### Define Proper Nouns based on Entities
Ref: https://spacy.io/api/annotation#named-entities

In [3]:
import spacy
nlp = spacy.load('en')

PROPER_NOUN_LABELS = [
    "PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", 
    "PRODUCT", "EVENT", "WORK_OF_ART", "LAW"
]

def get_proper_nouns(sent):
    return [ent for ent in nlp(sent).ents
                if ent.label_ in PROPER_NOUN_LABELS]

### Tag all Proper Nouns

In [4]:
from tagger import Tagger
tagger = Tagger(verbose=False)
tagger.get_examples("proper_nouns")

Error: 2491 RTE/train.tsv:2491


[('RTE/train.tsv:10',
  {'sent1': 'Only a week after it had no comment on upping the storage capacity of its Hotmail e-mail service, Microsoft early Thursday announced it was boosting the allowance to 250MB to follow similar moves by rivals such as Google, Yahoo, and Lycos.',
   'sent2': "Microsoft's Hotmail has raised its storage capacity to 250MB.",
   'label': 'entailment'}),
 ('RTE/train.tsv:1000',
  {'sent1': 'Dr Raynard Kington, of the US National Institutes of Health, which funded the research, said: "These findings establish that genetic factors play a strong role in autism spectrum disorder (ASD). "Detailed analysis of the genes and how they affect brain development is likely to yield better strategies for diagnosing and treating children with autism." People with ASD, which include autism and Asperger\'s syndrome, have problems with social interaction, poor communication skills and tend to engage in repetitive behaviours.',
   'sent2': "The ASD includes diseases such as autis

In [7]:
tagger.remove_tag("RTE/train.tsv:2491", "proper_nouns")

ValueError: list.remove(x): x not in list

In [5]:
for idx, (ex, uid) in enumerate(zip(examples, uids)): 
    proper_nouns = get_proper_nouns(ex['sent1']) \
        + get_proper_nouns(ex['sent2'])

    if len(proper_nouns) > 0:
        tagger.add_tag(uid, 'proper_nouns')
        if idx % 100 == 0:
            print(uid)
            print(ex, proper_nouns)
            print()

CoLA/train.tsv:402
{'sent1': 'How intelligent do you wonder whether I consider John?', 'sent2': 'How intelligent do you wonder whether I consider John?'} [John, John]



KeyboardInterrupt: 

In [None]:
len(tagger.get_uids("proper_nouns"))