## Data Prep for NER Task

The dataset was downloaded from Kaggle at:

<a href="https://www.kaggle.com/datasets/sushilkumarinfo/covid-ner-data-set">Covid NER Data Set</a>

### Corpus file

This file contains the Spacy-tokenized (words) sentences by document.

In [14]:
import json
import pandas as pd 


corpus = []
with open('./data/CORD-NER-corpus.json') as f:
    for line in f:
        doc = json.loads(line)
        doc_id = doc['doc_id']
        sents = doc['sents']
        for sent in sents:
            sent_id = sent['sent_id']
            sent_tokens = sent['sent_tokens']
            corpus.append([doc_id, sent_id, sent_tokens])

corpus_df = pd.DataFrame(corpus, columns = ['doc_id', 'sent_id', 'sent_tokens'])

### Entities file
We read file with entities and store in a dictionary 

In [16]:
ner = {}
with open('./data/CORD-NER-ner.json') as f:
    for line in f:
        doc = json.loads(line)
        doc_id = doc['doc_id']
        sents = doc['sents']
        sents_dict= {}
        for sent in sents:
            sent_id = sent['sent_id']
            entities = sent['entities']
            sents_dict[sent_id] = entities
        ner[doc_id] = sents_dict

### IOB Annotations


In [24]:
def iob_annotation(doc_id, sent_id, num_tokens, ner):
    labels = num_tokens*['O']
    entities = ner[doc_id][sent_id]
    for ent in entities:
        start = ent['start']
        end = ent['end']
        ent_type = ent['type']
        labels[start] = 'B-' + ent_type
        for i in range(start+1, end):
            labels[i] = 'I-' + ent_type
    return labels

In [25]:
corpus_df['labels'] = corpus_df.apply(lambda x: iob_annotation(x.doc_id, x.sent_id, len(x.sent_tokens), ner), axis=1)
corpus_df.head(5)

Unnamed: 0,doc_id,sent_id,sent_tokens,labels
0,0,0,"[angiotensin_converting_enzyme, 2, (, ace2, ),...","[B-GENE_OR_GENOME, I-GENE_OR_GENOME, O, B-GENE..."
1,0,1,"[a, phylogenetic, analysis, 3, ,, 4, found, a,...","[O, B-EVOLUTION, O, O, O, O, O, O, B-WILDLIFE,..."
2,0,2,"[there, is, a, diversity, of, possible, interm...","[O, O, O, O, O, O, B-NORP, O, B-CORONAVIRUS, O..."
3,0,3,"[there, are, many, similarities, of, sars_cov_...","[O, O, O, O, O, B-CORONAVIRUS, O, O, O, B-CORO..."
4,0,4,"[using, computer, modeling, ,, xu, et, al, .]","[O, O, O, O, B-PERSON, I-PERSON, I-PERSON, O]"


In [27]:
len(corpus_df)

3210155

In [26]:
corpus_df.to_pickle('./data/corpus_ner.pkl')

## WorldPiece Tokenization

In [12]:
import pandas as pd
data = pd.read_pickle('./data/corpus_ner.pkl')

In [13]:
label_set = set()
for labs in data['labels']:
    label_set.update(labs)
labels = list(label_set)
labels.sort(reverse=True)
label_set = set([lab[2:] for lab in labels[1:]])
labels = ['O']
for lab in label_set:
    labels.extend(['B-'+lab,'I-'+lab])
label_dict = {k:v for v, k in enumerate(labels)}
label_dict 

{'O': 0,
 'B-CARDINAL': 1,
 'I-CARDINAL': 2,
 'B-ORGANISM': 3,
 'I-ORGANISM': 4,
 'B-CELL': 5,
 'I-CELL': 6,
 'B-BACTERIUM': 7,
 'I-BACTERIUM': 8,
 'B-LABORATORY_OR_TEST_RESULT': 9,
 'I-LABORATORY_OR_TEST_RESULT': 10,
 'B-CELL_COMPONENT': 11,
 'I-CELL_COMPONENT': 12,
 'B-PERSON': 13,
 'I-PERSON': 14,
 'B-MACHINE_ACTIVITY': 15,
 'I-MACHINE_ACTIVITY': 16,
 'B-SIGN_OR_SYMPTOM': 17,
 'I-SIGN_OR_SYMPTOM': 18,
 'B-BODY_SUBSTANCE': 19,
 'I-BODY_SUBSTANCE': 20,
 'B-TIME': 21,
 'I-TIME': 22,
 'B-SUBSTRATE': 23,
 'I-SUBSTRATE': 24,
 'B-CELL_FUNCTION': 25,
 'I-CELL_FUNCTION': 26,
 'B-ORDINAL': 27,
 'I-ORDINAL': 28,
 'B-HUMAN-CAUSED_PHENOMENON_OR_PROCESS': 29,
 'I-HUMAN-CAUSED_PHENOMENON_OR_PROCESS': 30,
 'B-EVOLUTION': 31,
 'I-EVOLUTION': 32,
 'B-IMMUNE_RESPONSE': 33,
 'I-IMMUNE_RESPONSE': 34,
 'B-EDUCATIONAL_ACTIVITY': 35,
 'I-EDUCATIONAL_ACTIVITY': 36,
 'B-FOOD': 37,
 'I-FOOD': 38,
 'B-LANGUAGE': 39,
 'I-LANGUAGE': 40,
 'B-GPE': 41,
 'I-GPE': 42,
 'B-BODY_PART_ORGAN_OR_ORGAN_COMPONENT': 43,
 'I

In [14]:
label_list = list(label_dict.keys())
label_list

['O',
 'B-CARDINAL',
 'I-CARDINAL',
 'B-ORGANISM',
 'I-ORGANISM',
 'B-CELL',
 'I-CELL',
 'B-BACTERIUM',
 'I-BACTERIUM',
 'B-LABORATORY_OR_TEST_RESULT',
 'I-LABORATORY_OR_TEST_RESULT',
 'B-CELL_COMPONENT',
 'I-CELL_COMPONENT',
 'B-PERSON',
 'I-PERSON',
 'B-MACHINE_ACTIVITY',
 'I-MACHINE_ACTIVITY',
 'B-SIGN_OR_SYMPTOM',
 'I-SIGN_OR_SYMPTOM',
 'B-BODY_SUBSTANCE',
 'I-BODY_SUBSTANCE',
 'B-TIME',
 'I-TIME',
 'B-SUBSTRATE',
 'I-SUBSTRATE',
 'B-CELL_FUNCTION',
 'I-CELL_FUNCTION',
 'B-ORDINAL',
 'I-ORDINAL',
 'B-HUMAN-CAUSED_PHENOMENON_OR_PROCESS',
 'I-HUMAN-CAUSED_PHENOMENON_OR_PROCESS',
 'B-EVOLUTION',
 'I-EVOLUTION',
 'B-IMMUNE_RESPONSE',
 'I-IMMUNE_RESPONSE',
 'B-EDUCATIONAL_ACTIVITY',
 'I-EDUCATIONAL_ACTIVITY',
 'B-FOOD',
 'I-FOOD',
 'B-LANGUAGE',
 'I-LANGUAGE',
 'B-GPE',
 'I-GPE',
 'B-BODY_PART_ORGAN_OR_ORGAN_COMPONENT',
 'I-BODY_PART_ORGAN_OR_ORGAN_COMPONENT',
 'B-SOCIAL_BEHAVIOR',
 'I-SOCIAL_BEHAVIOR',
 'B-EVENT',
 'I-EVENT',
 'B-TISSUE',
 'I-TISSUE',
 'B-FAC',
 'I-FAC',
 'B-MONEY',
 '

In [15]:
from datasets import Dataset, Sequence, ClassLabel
raw_cord_ner_dataset = Dataset.from_pandas(data)
raw_cord_ner_dataset = raw_cord_ner_dataset.cast_column("labels", Sequence(ClassLabel(names=label_list)))
raw_cord_ner_dataset

Casting the dataset: 100%|██████████| 3210155/3210155 [59:13:00<00:00, 15.06 examples/s]   


Dataset({
    features: ['doc_id', 'sent_id', 'sent_tokens', 'labels'],
    num_rows: 3210155
})

Get list of label names from dataset

In [19]:
ner_feature = raw_cord_ner_dataset.features["labels"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-CARDINAL', 'I-CARDINAL', 'B-ORGANISM', 'I-ORGANISM', 'B-CELL', 'I-CELL', 'B-BACTERIUM', 'I-BACTERIUM', 'B-LABORATORY_OR_TEST_RESULT', 'I-LABORATORY_OR_TEST_RESULT', 'B-CELL_COMPONENT', 'I-CELL_COMPONENT', 'B-PERSON', 'I-PERSON', 'B-MACHINE_ACTIVITY', 'I-MACHINE_ACTIVITY', 'B-SIGN_OR_SYMPTOM', 'I-SIGN_OR_SYMPTOM', 'B-BODY_SUBSTANCE', 'I-BODY_SUBSTANCE', 'B-TIME', 'I-TIME', 'B-SUBSTRATE', 'I-SUBSTRATE', 'B-CELL_FUNCTION', 'I-CELL_FUNCTION', 'B-ORDINAL', 'I-ORDINAL', 'B-HUMAN-CAUSED_PHENOMENON_OR_PROCESS', 'I-HUMAN-CAUSED_PHENOMENON_OR_PROCESS', 'B-EVOLUTION', 'I-EVOLUTION', 'B-IMMUNE_RESPONSE', 'I-IMMUNE_RESPONSE', 'B-EDUCATIONAL_ACTIVITY', 'I-EDUCATIONAL_ACTIVITY', 'B-FOOD', 'I-FOOD', 'B-LANGUAGE', 'I-LANGUAGE', 'B-GPE', 'I-GPE', 'B-BODY_PART_ORGAN_OR_ORGAN_COMPONENT', 'I-BODY_PART_ORGAN_OR_ORGAN_COMPONENT', 'B-SOCIAL_BEHAVIOR', 'I-SOCIAL_BEHAVIOR', 'B-EVENT', 'I-EVENT', 'B-TISSUE', 'I-TISSUE', 'B-FAC', 'I-FAC', 'B-MONEY', 'I-MONEY', 'B-MATERIA

In [20]:
label_names = ner_feature.feature.names
label_names

['O',
 'B-CARDINAL',
 'I-CARDINAL',
 'B-ORGANISM',
 'I-ORGANISM',
 'B-CELL',
 'I-CELL',
 'B-BACTERIUM',
 'I-BACTERIUM',
 'B-LABORATORY_OR_TEST_RESULT',
 'I-LABORATORY_OR_TEST_RESULT',
 'B-CELL_COMPONENT',
 'I-CELL_COMPONENT',
 'B-PERSON',
 'I-PERSON',
 'B-MACHINE_ACTIVITY',
 'I-MACHINE_ACTIVITY',
 'B-SIGN_OR_SYMPTOM',
 'I-SIGN_OR_SYMPTOM',
 'B-BODY_SUBSTANCE',
 'I-BODY_SUBSTANCE',
 'B-TIME',
 'I-TIME',
 'B-SUBSTRATE',
 'I-SUBSTRATE',
 'B-CELL_FUNCTION',
 'I-CELL_FUNCTION',
 'B-ORDINAL',
 'I-ORDINAL',
 'B-HUMAN-CAUSED_PHENOMENON_OR_PROCESS',
 'I-HUMAN-CAUSED_PHENOMENON_OR_PROCESS',
 'B-EVOLUTION',
 'I-EVOLUTION',
 'B-IMMUNE_RESPONSE',
 'I-IMMUNE_RESPONSE',
 'B-EDUCATIONAL_ACTIVITY',
 'I-EDUCATIONAL_ACTIVITY',
 'B-FOOD',
 'I-FOOD',
 'B-LANGUAGE',
 'I-LANGUAGE',
 'B-GPE',
 'I-GPE',
 'B-BODY_PART_ORGAN_OR_ORGAN_COMPONENT',
 'I-BODY_PART_ORGAN_OR_ORGAN_COMPONENT',
 'B-SOCIAL_BEHAVIOR',
 'I-SOCIAL_BEHAVIOR',
 'B-EVENT',
 'I-EVENT',
 'B-TISSUE',
 'I-TISSUE',
 'B-FAC',
 'I-FAC',
 'B-MONEY',
 '

In [22]:
words = raw_cord_ner_dataset[0]["sent_tokens"]
labels = raw_cord_ner_dataset[0]["labels"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

angiotensin_converting_enzyme 2                ( ace2             ) as a sars_cov_2    receptor   molecular_mechanisms and potential therapeutic_target sars_cov_2    has been sequenced 3 . 
B-GENE_OR_GENOME              I-GENE_OR_GENOME O B-GENE_OR_GENOME O O  O B-CORONAVIRUS B-CHEMICAL O                    O   O         O                  B-CORONAVIRUS O   O    O         O O 


### Dataset split into training, validation, and testing  

In [23]:
cord_ner_dataset = raw_cord_ner_dataset.train_test_split(train_size=0.8, seed=1980)
temp_cord_ner_dataset = cord_ner_dataset['test'].train_test_split(train_size=0.5, seed=1980)
cord_ner_dataset['validation'] = temp_cord_ner_dataset['train']
cord_ner_dataset['test'] = temp_cord_ner_dataset['test']
cord_ner_dataset

DatasetDict({
    train: Dataset({
        features: ['doc_id', 'sent_id', 'sent_tokens', 'labels'],
        num_rows: 2568124
    })
    test: Dataset({
        features: ['doc_id', 'sent_id', 'sent_tokens', 'labels'],
        num_rows: 321016
    })
    validation: Dataset({
        features: ['doc_id', 'sent_id', 'sent_tokens', 'labels'],
        num_rows: 321015
    })
})

In [29]:
for split in ['train','validation','test']:  
    ner_feature = cord_ner_dataset[split].features["labels"]
    label_names = ner_feature.feature.names
    print('Number of label names in dataset {} split:'.format(split), len(label_names))

Number of label names in dataset train split: 127
Number of label names in dataset validation split: 127
Number of label names in dataset test split: 127


### Save to disk

In [30]:
cord_ner_dataset.save_to_disk('./data/cord-ner')

Saving the dataset (3/3 shards): 100%|██████████| 2568124/2568124 [00:24<00:00, 105905.90 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 321016/321016 [00:02<00:00, 108877.90 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 321015/321015 [00:02<00:00, 112307.92 examples/s]
