# Load dataset

Data format:

|id|word_seq|tag_seq|
|:--|:--|:--|
|index of the sentence|tokenized words|corresponding NER tags|
|0|`["protection", "calves", ...]`|`["O", "LIVESTOCK", ...]`|
|1|`["prevent", "diarrhea",...]` |`["O", "DISEASE_OR_SYNDROME", ...]`|
|...|...|...|



There are 64 categories of NER tags (plus 1 padding token).

The ground-truth tags are provided for the training and validation sets, while being omitted in the testing set.

In [42]:
import keras
from keras.utils import to_categorical
import numpy as np
import os
import pickle as pkl

def _load_split(path):
    """Unpickle one dataset split and close the file handle afterwards.

    Args:
        path (str): location of the pickled split on disk.
    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code, so only load trusted files.
    with open(path, "rb") as fh:
        return pkl.load(fh)


train_dict = _load_split("data/train.pkl")
val_dict = _load_split("data/val.pkl")
test_dict = _load_split("data/test.pkl")
print("keys in train_dict:", train_dict.keys())
print("keys in val_dict:", val_dict.keys())
print("keys in test_dict:", test_dict.keys())

keys in train_dict: dict_keys(['id', 'word_seq', 'tag_seq'])
keys in val_dict: dict_keys(['id', 'word_seq', 'tag_seq'])
keys in test_dict: dict_keys(['id', 'word_seq'])


In [39]:
# Show the first training sentence as a sequence of (word, tag) pairs.
first_words = train_dict["word_seq"][0]
first_tags = train_dict["tag_seq"][0]
print("index:", train_dict["id"][0])
print(*zip(first_words, first_tags))

index: 0
('Protection', 'O') ('of', 'O') ('calves', 'LIVESTOCK') ('against', 'O') ('fatal', 'O') ('enteric', 'DISEASE_OR_SYNDROME') ('colibacillosis', 'DISEASE_OR_SYNDROME') ('by', 'O') ('orally', 'GENE_OR_GENOME') ('administered', 'GENE_OR_GENOME') ('Escherichia', 'GENE_OR_GENOME') ('coli', 'GENE_OR_GENOME') ('K99', 'GENE_OR_GENOME') ('-', 'O') ('specific', 'CARDINAL') ('monoclonal', 'CARDINAL') ('antibody', 'CARDINAL') ('.', 'O') ('A', 'O') ('monoclonal', 'CHEMICAL') ('antibody', 'CHEMICAL') ('(', 'O') ('MCA', 'GENE_OR_GENOME') (')', 'O') ('to', 'O') ('enterotoxigenic', 'CHEMICAL') ('Escherichia', 'CHEMICAL') ('coli', 'CHEMICAL') ('K99', 'O') ('antigen', 'O') ('agglutinated', 'O') ('K99+', 'GENE_OR_GENOME') ('enterotoxigenic', 'GENE_OR_GENOME') ('E', 'GENE_OR_GENOME') ('.', 'O') ('coli', 'CHEMICAL') ('strains', 'CHEMICAL') ('B44', 'CHEMICAL') ('(', 'O') ('O9', 'O') (':', 'O') ('K30', 'O') (';', 'O') ('K99', 'O') (';', 'O') ('F41', 'O') (':', 'O') ('H-', 'O') (')', 'O') ('and', 'O') (

In [43]:
# Collect the distinct NER tags that occur anywhere in the training split.
from itertools import chain

ner_tags = set(chain.from_iterable(train_dict["tag_seq"]))
print("count of the NER tags:", len(ner_tags))
print("all the NER tags:", ner_tags)

count of the NER tags: 65
all the NER tags: {'ARCHAEON', 'INJURY_OR_POISONING', 'MOLECULAR_FUNCTION', 'GENE_OR_GENOME', 'ORDINAL', 'WILDLIFE', 'CHEMICAL', 'VIRAL_PROTEIN', 'DIAGNOSTIC_PROCEDURE', 'FOOD', 'CARDINAL', '_t_pad_', 'LAW', 'ORGAN_OR_TISSUE_FUNCTION', 'EDUCATIONAL_ACTIVITY', 'LABORATORY_PROCEDURE', 'CELL_COMPONENT', 'SUBSTRATE', 'GOVERNMENTAL_OR_REGULATORY_ACTIVITY', 'IMMUNE_RESPONSE', 'RESEARCH_ACTIVITY', 'GPE', 'EXPERIMENTAL_MODEL_OF_DISEASE', 'ORG', 'EUKARYOTE', 'DISEASE_OR_SYNDROME', 'SOCIAL_BEHAVIOR', 'CELL_FUNCTION', 'LIVESTOCK', 'BODY_SUBSTANCE', 'NORP', 'DATE', 'INDIVIDUAL_BEHAVIOR', 'TIME', 'THERAPEUTIC_OR_PREVENTIVE_PROCEDURE', 'DAILY_OR_RECREATIONAL_ACTIVITY', 'GROUP', 'TISSUE', 'O', 'EVOLUTION', 'SIGN_OR_SYMPTOM', 'QUANTITY', 'LOC', 'PHYSICAL_SCIENCE', 'HUMAN-CAUSED_PHENOMENON_OR_PROCESS', 'ORGANISM', 'MONEY', 'CELL_OR_MOLECULAR_DYSFUNCTION', 'GROUP_ATTRIBUTE', 'PERSON', 'BACTERIUM', 'PERCENT', 'CORONAVIRUS', 'EVENT', 'MATERIAL', 'WORK_OF_ART', 'PRODUCT', 'FAC', '

# Prepare the data for training

In [44]:
# Build the word -> id and tag -> id lookup tables from the training split.

# Ids 0 and 1 are reserved for the unknown-word and word-padding tokens.
vocab_dict = {'_unk_': 0, '_w_pad_': 1}
for doc in train_dict['word_seq']:
    for word in doc:
        # setdefault keeps the id of a known word; a new word gets the next free id
        vocab_dict.setdefault(word, len(vocab_dict))

# Id 0 is reserved for the tag-padding token.
tag_dict = {'_t_pad_': 0}
for tag_seq in train_dict['tag_seq']:
    for tag in tag_seq:
        tag_dict.setdefault(tag, len(tag_dict))

# Forward aliases plus the inverse (id -> string) mappings.
word2idx, tag2idx = vocab_dict, tag_dict
idx2word = {idx: token for token, idx in word2idx.items()}
idx2tag = {idx: label for label, idx in tag2idx.items()}

print("size of word vocab:", len(vocab_dict), "size of tag_dict:", len(tag_dict))

size of word vocab: 82275 size of tag_dict: 65


In [45]:
# Maximum sentence length, in tokens.
max_sent_length = 128


def _encode_with_unk(docs):
    """Map every word to its id; words missing from word2idx become 0 ('_unk_')."""
    return np.array([[word2idx.get(w, 0) for w in doc] for doc in docs])


def _tags_to_one_hot(tag_seqs):
    """Turn each tag sequence into a one-hot matrix with len(tag_dict) columns."""
    n_tags = len(tag_dict)
    return np.array([
        to_categorical([tag2idx[t] for t in seq], num_classes=n_tags)
        for seq in tag_seqs
    ])


# The vocabulary was built from the training words, so they are indexed directly.
train_tokens = np.array([[word2idx[w] for w in doc] for doc in train_dict['word_seq']])
val_tokens = _encode_with_unk(val_dict['word_seq'])
test_tokens = _encode_with_unk(test_dict['word_seq'])

train_tags = _tags_to_one_hot(train_dict['tag_seq'])
val_tags = _tags_to_one_hot(val_dict['tag_seq'])

# The test split carries no tags, so there is nothing to encode for it.

In [22]:
# Report the array shapes of both labelled splits, then peek at the first two sentences.
for _label, _tokens, _tags in (("training size:", train_tokens, train_tags),
                               ("validating size:", val_tokens, val_tags)):
    print(_label, _tokens.shape, "tag size:", _tags.shape)
print(train_dict['word_seq'][:2])
print(np.array(train_dict['word_seq']).shape)
print(train_tokens[:2])

training size: (23600, 128) tag size: (23600, 128, 65)
validating size: (2950, 128) tag size: (2950, 128, 65)
[['Protection', 'of', 'calves', 'against', 'fatal', 'enteric', 'colibacillosis', 'by', 'orally', 'administered', 'Escherichia', 'coli', 'K99', '-', 'specific', 'monoclonal', 'antibody', '.', 'A', 'monoclonal', 'antibody', '(', 'MCA', ')', 'to', 'enterotoxigenic', 'Escherichia', 'coli', 'K99', 'antigen', 'agglutinated', 'K99+', 'enterotoxigenic', 'E', '.', 'coli', 'strains', 'B44', '(', 'O9', ':', 'K30', ';', 'K99', ';', 'F41', ':', 'H-', ')', 'and', 'B41', '(', 'O101', ':', 'K99', ';', 'F41', ':', 'H-', ')', 'grown', 'at', '37', 'degrees', 'C', 'but', 'not', 'at', '18', 'degrees', 'C.', 'The', 'MCA', ',', 'which', 'was', 'characterized', 'as', 'immunoglobulin', 'G1', ',', 'reacted', 'specifically', 'with', 'K99', 'antigen', 'in', 'an', 'enzyme-linked', 'immunosorbent', 'assay', 'and', 'precipitated', 'radiolabeled', 'K99', 'antigen', '.', 'A', 'total', 'of', '45', 'colostrum', 

In [29]:
# First few positions of training sentence 0: encoded ids next to the raw strings.
n_preview = 10
print(train_tokens[0, :n_preview], train_tags[0, :n_preview, :].argmax(axis=1))
print(train_dict['word_seq'][0][:n_preview], train_dict['tag_seq'][0][:n_preview])

[ 2  3  4  5  6  7  8  9 10 11] [1 1 2 1 1 3 3 1 4 4]
['Protection', 'of', 'calves', 'against', 'fatal', 'enteric', 'colibacillosis', 'by', 'orally', 'administered'] ['O', 'O', 'LIVESTOCK', 'O', 'O', 'DISEASE_OR_SYNDROME', 'DISEASE_OR_SYNDROME', 'O', 'GENE_OR_GENOME', 'GENE_OR_GENOME']


# Two simple models and codes for evaluation

1. Predict all the tags as "O".
2. Random guess

You could use the `calc_accuracy` function to evaluate the accuracy of your predictions.

In [46]:
# Provided function to test accuracy
# You could check the validation accuracy to select the best of your models
def calc_accuracy(preds, tags, padding_id="_t_pad_"):
    """
        Proportion of correctly predicted tags, ignoring padded positions.

        Input:
            preds (np.ndarray or nested list): (num_data, length_sentence) predicted tags
            tags  (np.ndarray or nested list): (num_data, length_sentence) ground-truth tags
            padding_id: value in `tags` that marks a padding position
        Output:
            Proportion of correct prediction. The padding tokens are filtered out.
            Returns 0.0 when `tags` holds nothing but padding (or is empty).
    """
    # asarray lets callers pass plain nested lists as well as arrays
    preds_flatten = np.asarray(preds).flatten()
    tags_flatten = np.asarray(tags).flatten()
    # positions are selected from the ground truth only; padding in preds is irrelevant
    non_padding = tags_flatten != padding_id

    if not np.any(non_padding):
        # nothing to score; avoid dividing by zero
        return 0.0

    # vectorised mean instead of Python-level sum() over every element
    return np.mean(preds_flatten[non_padding] == tags_flatten[non_padding])

In [32]:
# Check accuracy on the training set
train_tags_by_idx = np.argmax(train_tags, axis=2)

# tag_lookup[i] is the tag string for id i (ids run 0..len(idx2tag)-1, as built
# above). Indexing it with an integer array converts a whole id matrix at once
# instead of looping over every token in Python.
tag_lookup = np.array([idx2tag[i] for i in range(len(idx2tag))])
train_labels = tag_lookup[train_tags_by_idx]

# Sanity check: comparing the labels with themselves must give 1.0.
print(calc_accuracy(train_labels, train_labels))

# Baseline 1: predict "O" for every position.
# The tag string is used directly rather than a matrix of float ones looked up
# in idx2tag, which only works while "O" happens to have id 1.
baseline1_train_preds = np.full(train_labels.shape, "O")
print(baseline1_train_preds.shape)
print( train_labels.shape)
print("baseline 1, make all predictions as 1. Acc:", 
      calc_accuracy(baseline1_train_preds, 
                    train_labels))

# Baseline 2: randomly guess labels. Ids start at 1 so the padding tag (id 0)
# is never predicted.
baseline2_train_preds = tag_lookup[np.random.randint(1, len(tag_dict), train_labels.shape)]
print("baseline 2, Random guess. Acc:", 
      calc_accuracy(baseline2_train_preds,
                    train_labels))



1.0
(23600, 128)
(23600, 128)
baseline 1, make all predictions as 1. Acc: 0.7562260387120905
baseline 2, Random guess. Acc: 0.015658738595044963


AttributeError: 'list' object has no attribute 'shape'

In [49]:
# Train (or reload) one BiLSTM tagger per (batch size, epochs) setting and print
# its accuracy on the training and validation splits.
#
# Data available at this point:
#   train_dict / train_tokens / train_tags / train_labels
#   val_dict   / val_tokens   / val_tags
#   test_dict  / test_tokens  (the test split has no tags)
#
# Goal: produce predictions shaped like train_labels.

import os

import kashgari
from kashgari.tasks.labeling import BiLSTM_Model

train_x = train_dict['word_seq']
train_y = train_dict['tag_seq']
valid_x = val_dict['word_seq']
valid_y = val_dict['tag_seq']
test_x = test_dict['word_seq']
print(np.array(train_x).shape , np.array(train_y).shape, np.array(valid_x).shape, np.array(valid_y).shape)

for b in [64, 128]:
    for e in [5, 10, 20, 30]:
        model_dir = f'saved_ner_model_{b}_{e}'

        if os.path.isdir(model_dir):
            # a model for this setting was saved by an earlier run; reuse it
            model = BiLSTM_Model.load_model(model_dir)
        else:
            model = BiLSTM_Model()
            model.fit(train_x, train_y, valid_x, valid_y, batch_size=b, epochs=e)
            model.save(model_dir)

        BiLSTM_train_preds = model.predict(train_x)
        BiLSTM_val_preds = model.predict(valid_x)
        # The training report is computed on the training split; the test split
        # has no gold tags to evaluate against.
        train_report = model.evaluate(train_x, train_y)
        val_report = model.evaluate(valid_x, valid_y)
        print("BiLSTM_MODEL train preds : " , calc_accuracy(np.array(BiLSTM_train_preds) , np.array(train_y) ) ) 
        print("BiLSTM_MODEL valid pred : " , calc_accuracy(np.array(BiLSTM_val_preds) , np.array(valid_y ) ) ) 
# test_y = model.predict(test_x) 
# report = model.evaluate(test_x , real_test_y ) 

                       WORK_OF_ART     0.0000    0.0000    0.0000        37
                          BACTERIUM     0.8293    0.5965    0.6939        57
      EXPERIMENTAL_MODEL_OF_DISEASE     0.9412    0.6857    0.7934        70
                    SOCIAL_BEHAVIOR     0.8161    0.7978    0.8068        89
               EDUCATIONAL_ACTIVITY     0.0000    0.0000    0.0000         9
               ANATOMICAL_STRUCTURE     0.8810    0.5286    0.6607        70
      CELL_OR_MOLECULAR_DYSFUNCTION     0.8623    0.8440    0.8530       141
                                FAC     0.4000    0.0417    0.0755        48
                            PERCENT     0.5000    0.2308    0.3158        13
                           LANGUAGE     0.3333    0.2222    0.2667         9
               DIAGNOSTIC_PROCEDURE     0.8955    0.6742    0.7692        89
                               FOOD     0.9286    0.7647    0.8387        34
                               TIME     0.7241    0.5385    0.6176        39


KeyboardInterrupt: 

In [36]:
# print(report )
# get the test pred by loading the models 
# find best f1 score 
# find best epoch 

{'detail': {'CARDINAL': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 2280}, 'GENE_OR_GENOME': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 9590}, 'CHEMICAL': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 11470}, 'GPE': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 900}, 'DISEASE_OR_SYNDROME': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6335}, 'THERAPEUTIC_OR_PREVENTIVE_PROCEDURE': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1172}, 'CORONAVIRUS': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 2525}, 'BODY_PART_ORGAN_OR_ORGAN_COMPONENT': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 394}, 'LIVESTOCK': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 558}, 'DATE': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1831}, 'LOC': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 99}, 'GROUP': {'precision': 1.0, 'recall': 1.0

# Output format

In this project, you should predict the NER tags for the test set tokens.

The index of test set starts from 0 and ends with 2949.

You should write the predictions into a .csv file, where the first column is the test indexes in ascending order, and the second column is a json format prediction list.

E.g.

|id|labels|
|:--:|:--:|
|0|`['O', 'O', 'CHEMICAL', 'VIRUS', ...]`|
|1|`['O', 'O', 'GENE_OR_GENOME', ...]`|
|...|...|

Format requirements:
1. The first column `id` should be an integer, in ascending order, starting from 0 and corresponding to the index in test_dict.
2. The second column `labels` should be a string dumped with json, storing your predictions for each token. The size of the list should be exactly 128, including padding tokens.

### For example, this is your prediction for the test set:

In [10]:
# Placeholder predictions: one random tag id per position, drawn from
# 1 .. len(tag_dict)-1 so the padding tag (id 0) is never chosen.
n_test_sentences = len(test_dict["id"])
test_preds_numerical = np.random.randint(1, len(tag_dict), (n_test_sentences, max_sent_length))
# Translate the ids back into tag strings.
test_preds = np.array([[idx2tag[tag_id] for tag_id in row] for row in test_preds_numerical])
print(test_preds.shape)
print(test_preds[0])

# use the model to make the preds on test and create a matirx called test_preds_numerical
# change the index back to tag 

(2950, 128)
['PERCENT' 'GENE_OR_GENOME' 'FAC' 'FAC' 'IMMUNE_RESPONSE'
 'LABORATORY_OR_TEST_RESULT' 'INJURY_OR_POISONING' 'TISSUE' 'SUBSTRATE'
 'TISSUE' 'DIAGNOSTIC_PROCEDURE' 'WORK_OF_ART' 'NORP' 'SIGN_OR_SYMPTOM'
 'EXPERIMENTAL_MODEL_OF_DISEASE' 'THERAPEUTIC_OR_PREVENTIVE_PROCEDURE'
 'GPE' 'VIRUS' 'THERAPEUTIC_OR_PREVENTIVE_PROCEDURE'
 'DAILY_OR_RECREATIONAL_ACTIVITY' 'LAW' 'NORP' 'EVENT' 'GROUP_ATTRIBUTE'
 'LIVESTOCK' 'GOVERNMENTAL_OR_REGULATORY_ACTIVITY'
 'ORGAN_OR_TISSUE_FUNCTION' 'TISSUE' 'LOC' 'BODY_SUBSTANCE'
 'BODY_SUBSTANCE' 'THERAPEUTIC_OR_PREVENTIVE_PROCEDURE'
 'DIAGNOSTIC_PROCEDURE' 'BODY_PART_ORGAN_OR_ORGAN_COMPONENT' 'TISSUE'
 'EVENT' 'THERAPEUTIC_OR_PREVENTIVE_PROCEDURE'
 'CELL_OR_MOLECULAR_DYSFUNCTION' 'SIGN_OR_SYMPTOM' 'BACTERIUM'
 'CELL_COMPONENT' 'GOVERNMENTAL_OR_REGULATORY_ACTIVITY' 'CARDINAL'
 'CORONAVIRUS' 'GROUP' 'GROUP_ATTRIBUTE' 'DISEASE_OR_SYNDROME' 'LIVESTOCK'
 'WORK_OF_ART' 'LABORATORY_OR_TEST_RESULT' 'VIRAL_PROTEIN'
 'INDIVIDUAL_BEHAVIOR' 'CHEMICAL' 'ORDINA

In [11]:
# Write whatever is currently in test_preds to the submission file: one row per
# test sentence, with the predicted tags stored as a JSON-encoded list.

import json
import pandas as pd

labels_json = [json.dumps(np.array(row).tolist()) for row in test_preds]
df = pd.DataFrame({'id': test_dict["id"], 'labels': labels_json})
df.to_csv('test_preds.csv', index=False)

In [12]:
# Read the file back so the notebook displays its contents.
pd.read_csv("test_preds.csv")

Unnamed: 0,id,labels
0,0,"[""PERCENT"", ""GENE_OR_GENOME"", ""FAC"", ""FAC"", ""I..."
1,1,"[""EVENT"", ""IMMUNE_RESPONSE"", ""ORDINAL"", ""MACHI..."
2,2,"[""RESEARCH_ACTIVITY"", ""CARDINAL"", ""ORG"", ""LABO..."
3,3,"[""LIVESTOCK"", ""CORONAVIRUS"", ""ORG"", ""ANATOMICA..."
4,4,"[""PHYSICAL_SCIENCE"", ""DISEASE_OR_SYNDROME"", ""B..."
...,...,...
2945,2945,"[""PERSON"", ""LANGUAGE"", ""WILDLIFE"", ""LOC"", ""CEL..."
2946,2946,"[""IMMUNE_RESPONSE"", ""SUBSTRATE"", ""THERAPEUTIC_..."
2947,2947,"[""TISSUE"", ""CHEMICAL"", ""NORP"", ""WORK_OF_ART"", ..."
2948,2948,"[""VIRUS"", ""MOLECULAR_FUNCTION"", ""CHEMICAL"", ""G..."


# Please make your output-format exactly the same as above

You can check it by running our evaluation code `evaluate.py` on the validation set:

In [13]:
import json
import pandas as pd

from evaluate import evaluate

# Alternative: random guesses instead of the all-"O" baseline.
# val_preds_numerical = np.random.randint(1, len(tag_dict), 
#                                          (len(val_dict["id"]), max_sent_length))

# Baseline 1 on the validation split: predict "O" for every position.
# The tag string is written directly rather than looking up float ones in
# idx2tag, which only works while "O" happens to have id 1.
val_preds = np.full((len(val_dict["id"]), max_sent_length), "O")

# Same file layout as the test submission: id column plus JSON-encoded tag list.
df = pd.DataFrame({'id': val_dict["id"],
                   'labels': [json.dumps(np.array(preds).tolist()) for preds in val_preds]})
df.to_csv('val_preds.csv', index=False)

print("val accuracy", evaluate('val_preds.csv', "data/val.pkl"))

val accuracy 0.754806329370245
