In [1]:
from collections import defaultdict
from urllib import request
import json
import pandas as pd

In [2]:
def parse_conllu_using_pandas(block):
    """Parse one tab-separated annotation block into a DataFrame.

    Lines starting with '#' are treated as comments and skipped, as are
    blank or whitespace-only lines. Every remaining line is stripped of
    surrounding whitespace and split on tab characters; the resulting
    fields are labelled ID, FORM, TAG, Misc1 and Misc2.

    Parameters
    ----------
    block : str
        Text of a single block, one token per line.

    Returns
    -------
    pandas.DataFrame
        One row per kept line, with columns
        ['ID', 'FORM', 'TAG', 'Misc1', 'Misc2'].
    """
    records = [
        line.strip().split('\t')
        for line in block.splitlines()
        # A blank line would otherwise become the single-field record ['']
        # and end up as a bogus token row, so drop it along with comments.
        if line.strip() and not line.startswith('#')
    ]
    return pd.DataFrame.from_records(
        records,
        columns=['ID', 'FORM', 'TAG', 'Misc1', 'Misc2'])

In [3]:
def tokens_to_labels(df):
    """Return the FORM and TAG columns of `df` as a pair of plain lists.

    The first element of the returned tuple is the list of FORM values,
    the second is the list of TAG values, both in row order.
    """
    tokens = df["FORM"].tolist()
    labels = df["TAG"].tolist()
    return tokens, labels

In [4]:
# Common URL prefix; each entry in DATA_URLS is appended to it to form
# the full download URL.
PREFIX = "https://raw.githubusercontent.com/UniversalNER/"

# corpus name -> split name -> URL suffix (relative to PREFIX).
DATA_URLS = dict(
    en_ewt=dict(
        train="UNER_English-EWT/master/en_ewt-ud-train.iob2",
        dev="UNER_English-EWT/master/en_ewt-ud-dev.iob2",
        test="UNER_English-EWT/master/en_ewt-ud-test.iob2",
    ),
    en_pud=dict(
        test="UNER_English-PUD/master/en_pud-ud-test.iob2",
    ),
)

In [5]:
# Download every split listed in DATA_URLS and store it as
# data_dict[corpus][split] = list of (tokens, labels) tuples.
# en_ewt provides the main train/dev/test split; en_pud is the OOD test set.
data_dict = defaultdict(dict)
for corpus, split_dict in DATA_URLS.items():
    for split, url_suffix in split_dict.items():
        url = PREFIX + url_suffix
        # Read the whole payload while the connection is open; parsing
        # afterwards only needs the decoded text.
        with request.urlopen(url) as response:
            txt = response.read().decode('utf-8')
        # Blocks are separated by blank lines; strip first so leading or
        # trailing whitespace does not yield an empty block.
        data_frames = (
            parse_conllu_using_pandas(block)
            for block in txt.strip().split('\n\n')
        )
        token_label_alignments = [
            tokens_to_labels(frame) for frame in data_frames
        ]
        data_dict[corpus][split] = token_label_alignments

In [6]:
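# NOTE: superseded draft of the download cell above, kept commented out for
# reference. The only difference is that it splits `txt` on blank lines
# without calling `.strip()` first.
# data_dict = defaultdict(dict)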
# for corpus, split_dict in DATA_URLS.items():
#     for split, url_suffix in split_dict.items():
#         url = PREFIX + url_suffix
#         with request.urlopen(url) as response:
#             txt = response.read().decode('utf-8')
#             data_frames = map(parse_conllu_using_pandas,
#                               txt.split('\n\n'))
#             token_label_alignments = list(map(tokens_to_labels,
#                                               data_frames))
#             data_dict[corpus][split] = token_label_alignments

In [6]:
# Persist the downloaded data to ner_data_dict.json so it does not have to
# be fetched again on every run.
with open('ner_data_dict.json', 'w', encoding='utf-8') as out:
    # ensure_ascii=False keeps non-ASCII characters as-is in the file.
    out.write(json.dumps(data_dict, indent=2, ensure_ascii=False))

In [7]:
# Each split of each corpus is a list of (tokens, labels) tuples: every tuple
# holds a list of tokens and the corresponding, parallel list of labels.

# Train on data_dict['en_ewt']['train']; validate on data_dict['en_ewt']['dev']
# and test on data_dict['en_ewt']['test'] and data_dict['en_pud']['test']
#data_dict['en_ewt']['train'][0], data_dict['en_pud']['test'][1]

In [8]:
# Show one (tokens, labels) example from each split as a sanity check.
(
    data_dict['en_ewt']['train'][1],
    data_dict['en_ewt']['dev'][-1],
    data_dict['en_ewt']['test'][-1],
    data_dict['en_pud']['test'][-1],
)

((['Iguazu', 'Falls'], ['B-LOC', 'I-LOC']),
 (['It',
   'probably',
   'gives',
   'more',
   'bang',
   'for',
   'the',
   'buck',
   'than',
   'acquiring',
   'a',
   'private',
   'company',
   'and',
   'having',
   'to',
   'handle',
   'the',
   'inevitable',
   'culture',
   'clashes',
   'and',
   'process',
   'mis-matches',
   '.'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']),
 (['Most',
   'Shiites',
   ',',
   'however',
   ',',
   'are',
   'still',
   'reluctant',
   'to',
   'take',
   'major',
   'risks',
   'to',
   'support',
   'the',
   'Sunnis',
   'of',
   'Fallujah',
   ',',
   'many',
   'of',
   'whom',
   'had',
   'supported',
   'Saddam',
   'and',
   'his',
   'anti-Shiite',
   'pogroms',
   '.'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
 