In [1]:
# Re-import edited modules automatically before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.width', 10000)
pd.set_option('max_colwidth', 10000)
import numpy as np
from tqdm import tqdm, trange
import torch
#from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
#import matplotlib.pyplot as plt
import os
import transformers
from transformers import BertForTokenClassification
from seqeval.metrics import f1_score, accuracy_score

transformers.__version__

torch.__version__

'1.8.1+cu102'

In [3]:
# Label inventory for the token classifier. The list position of a tag is the
# class index used by the model head, so the order must not be changed.
tag_values = [
    'O',
    'I-corporation',
    'B-group',
    'I-creative-work',
    'I-group',
    'I-product',
    'B-location',
    'B-corporation',
    'B-person',
    'I-person',
    'I-location',
    'B-creative-work',
    'B-product',
    'PAD',
]
# Reverse lookup: tag string -> class index.
tag2idx = dict(zip(tag_values, range(len(tag_values))))

# NOTE(review): do_lower_case=False is combined with an uncased checkpoint;
# confirm this matches the preprocessing used when the model was fine-tuned.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False)

# One output class per entry in tag2idx; attentions/hidden states are not returned.
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(tag2idx),
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [4]:
# Restore the saved weights into `model`, mapping all tensors onto the CPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
checkpoint_path = "../Downloads/bert_base_emerging_entities_1.pt"
model.load_state_dict(
    torch.load(checkpoint_path, map_location=torch.device('cpu'))
)

<All keys matched successfully>

In [5]:
# Build one sentence index per non-blank line of the annotated test file.
# A blank line marks a sentence boundary and advances the running index `i`;
# every other line records the index of the sentence it belongs to.
i = 0
sentence_no = []
# The context manager closes the file handle when the loop finishes.
# latin1 matches the encoding passed to pd.read_csv for this same file, so
# both passes decode the file the same way regardless of the locale default.
with open('data/emerging.test.annotated', 'r', encoding='latin1') as f:
    for line in f:
        if len(line.strip()) == 0:
            i = i + 1
        else:
            sentence_no.append(i)

In [6]:
def model_test(data, tokenizer, model, tag_names=None):
    """Tag every sentence in ``data`` and return one row per merged token.

    Each sentence is lower-cased and stripped, encoded with
    ``tokenizer.encode``, and passed through ``model`` as a batch of one.
    The arg-max class per position is mapped to a tag string. Word pieces
    that start with "##" are glued onto the preceding token, and the merged
    token keeps the tag predicted for its first piece.

    Args:
        data: iterable of sentence strings.
        tokenizer: object providing ``encode(text)`` and
            ``convert_ids_to_tokens(ids)``.
        model: callable taking the ``input_ids`` tensor; element 0 of its
            return value is read as the per-token score tensor.
        tag_names: optional sequence mapping class index -> tag string.
            Defaults to the notebook-level ``tag_values`` list.

    Returns:
        pandas.DataFrame with columns ``sentence_no`` (position of the
        sentence in ``data``, as a string), ``labels`` and ``token``. Any
        special tokens the tokenizer adds are included in the rows.
    """
    if tag_names is None:
        # Fall back to the notebook-level label list (original behaviour).
        tag_names = tag_values

    # NOTE(review): encode() is called without truncation and the model is
    # used in whatever train/eval mode the caller left it in.
    rows = []
    for sentence_idx, sentence in enumerate(data):
        token_ids = tokenizer.encode(sentence.lower().strip())
        input_ids = torch.tensor([token_ids])

        with torch.no_grad():
            output = model(input_ids)
        # Batch size is 1, so keep only the first row of arg-max indices.
        label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)[0]

        # Join "##" continuation pieces back onto the token they continue.
        tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
        merged_tokens, merged_labels = [], []
        for token, label_idx in zip(tokens, label_indices):
            # The emptiness guard avoids an IndexError should a continuation
            # piece ever appear before any regular token.
            if token.startswith("##") and merged_tokens:
                merged_tokens[-1] = merged_tokens[-1] + token[2:]
            else:
                merged_labels.append(tag_names[label_idx])
                merged_tokens.append(token)

        rows.extend(
            (str(sentence_idx), label, token)
            for token, label in zip(merged_tokens, merged_labels)
        )

    return pd.DataFrame(rows, columns=['sentence_no', 'labels', 'token'])

In [7]:
# Parse the tab-separated annotation file into (Word, Tag) rows. quoting=3 is
# csv.QUOTE_NONE, so quote characters inside tokens are kept as plain text.
# NOTE(review): read_csv's default NA parsing applies here, so literal tokens
# such as "NA" or "null" would become NaN and be removed by dropna below;
# confirm that is intended.
df = pd.read_csv(
    'data/emerging.test.annotated',
    sep="\t",
    header=None,
    names=['Word', 'Tag'],
    encoding="latin1",
    quoting=3,
)
# sentence_no holds one entry per non-blank line of the same file; pandas
# raises if its length does not match the number of parsed rows.
df['Sentence #'] = sentence_no
# Discard rows in which any field is missing.
df = df.dropna(axis=0)

In [9]:
df.head()

Unnamed: 0,Word,Tag,Sentence #
0,&,O,0
1,gt,O,0
2,;,O,0
3,*,O,0
4,The,O,0


In [8]:
class SentenceGetter(object):
    """Group a token-level frame into per-sentence lists of (word, tag) pairs.

    ``data`` must provide the columns "Sentence #", "Word" and "Tag".

    Attributes:
        n_sent: 1-based counter of the sentence ``get_next`` returns next.
        data: the frame passed to the constructor.
        empty: initialised to False; not updated by this class.
        grouped: Series indexed by "Sentence #" holding one list of
            (word, tag) tuples per sentence.
        sentences: the values of ``grouped`` as a plain list, in group order.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        # Selecting the two columns explicitly keeps the grouping column out
        # of the frames handed to the helper; the result is unchanged.
        self.grouped = (
            self.data.groupby("Sentence #")[["Word", "Tag"]]
            .apply(self._pair_words_with_tags)
        )
        self.sentences = [s for s in self.grouped]

    @staticmethod
    def _pair_words_with_tags(sentence_rows):
        """Return the rows of one sentence as a list of (word, tag) tuples."""
        return list(zip(sentence_rows["Word"].values.tolist(),
                        sentence_rows["Tag"].values.tolist()))

    def get_next(self):
        """Return the next sentence and advance the counter, or None when done.

        Sentence ids of the form "Sentence: <n>" are looked up by that key.
        For any other id scheme (for example plain integers) the n-th group
        in ``sentences`` is returned instead. Previously only the string key
        was tried and every failure was swallowed by a bare ``except``, so
        integer-keyed data always yielded None.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # No such label: fall back to positional access.
            if not 0 < self.n_sent <= len(self.sentences):
                return None
            s = self.sentences[self.n_sent - 1]
        self.n_sent += 1
        return s

# Split the grouped (word, tag) pairs into two parallel nested lists:
# one list of words and one list of tags per sentence.
getter = SentenceGetter(df)
sentences = [[word for word, _ in pairs] for pairs in getter.sentences]
labels = [[tag for _, tag in pairs] for pairs in getter.sentences]

In [10]:
sentences[0]

['&',
 'gt',
 ';',
 '*',
 'The',
 'soldier',
 'was',
 'killed',
 'when',
 'another',
 'avalanche',
 'hit',
 'an',
 'army',
 'barracks',
 'in',
 'the',
 'northern',
 'area',
 'of',
 'Sonmarg',
 ',',
 'said',
 'a',
 'military',
 'spokesman',
 '.']

In [11]:
labels[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-location',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [13]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Split each word with the tokenizer's basic tokenizer, repeating its label.

    Every word is converted to ``str`` and passed to the notebook-level
    ``tokenizer.basic_tokenizer.tokenize``. Each resulting piece receives a
    copy of the word's label, so both returned lists have the same length.

    Args:
        sentence: iterable of words.
        text_labels: iterable of labels, paired with ``sentence`` by position.

    Returns:
        Tuple ``(pieces, piece_labels)`` of two flat lists.
    """
    pieces, piece_labels = [], []
    for word, label in zip(sentence, text_labels):
        word_pieces = tokenizer.basic_tokenizer.tokenize(str(word))
        pieces += word_pieces
        # One label copy per piece keeps tokens and labels aligned.
        piece_labels += [label] * len(word_pieces)
    return pieces, piece_labels

In [14]:
# One (tokens, labels) tuple per sentence.
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(*sent_and_labs)
    for sent_and_labs in zip(sentences, labels)
]

In [15]:
tokenized_texts_and_labels[0]

(['&',
  'gt',
  ';',
  '*',
  'The',
  'soldier',
  'was',
  'killed',
  'when',
  'another',
  'avalanche',
  'hit',
  'an',
  'army',
  'barracks',
  'in',
  'the',
  'northern',
  'area',
  'of',
  'Sonmarg',
  ',',
  'said',
  'a',
  'military',
  'spokesman',
  '.'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-location',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'])

In [16]:
# Separate the (tokens, labels) tuples into two parallel lists.
# `labels` is rebound here from word-level to token-level labels.
tokenized_texts = [tokens for tokens, _ in tokenized_texts_and_labels]
labels = [token_labels for _, token_labels in tokenized_texts_and_labels]

In [17]:
# Flatten the tokenized sentences into (sentence index, token, tag) rows.
# A token starting with "##" is appended to the previous token and its tag
# is discarded. `sentence_no` is rebound here to an integer counter.
i = 0
sentence_no = 0
new_sentence = []
new_data = []
for sentence, label in zip(tokenized_texts, labels):
    new_tokens, new_tags = [], []
    for token, tag in zip(sentence, label):
        if not token.startswith("##"):
            new_tokens.append(token)
            new_tags.append(tag)
            continue
        new_tokens[-1] += token[2:]
    new_data.extend(
        (sentence_no, merged_token, merged_tag)
        for merged_token, merged_tag in zip(new_tokens, new_tags)
    )
    sentence_no += 1

In [18]:
# Token-level ground truth: one row per (sentence index, word, tag) tuple.
data = pd.DataFrame(
    data=new_data,
    columns=['Sentence #', 'Word', 'Tag'],
)

In [19]:
data.head()

Unnamed: 0,Sentence #,Word,Tag
0,0,&,O
1,0,gt,O
2,0,;,O
3,0,*,O
4,0,The,O


In [20]:
# Collapse the token rows into one row per sentence: the words are joined
# with single spaces and the tags with commas.
g_test = data.groupby("Sentence #")
test_df = pd.DataFrame({
    "Sentence": g_test["Word"].apply(" ".join),
    "Tag": g_test["Tag"].apply(",".join),
}).reset_index()

In [22]:
test_df.head()

Unnamed: 0,Sentence #,Sentence,Tag
0,0,"& gt ; * The soldier was killed when another avalanche hit an army barracks in the northern area of Sonmarg , said a military spokesman .","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-location,O,O,O,O,O,O"
1,1,& gt ; * Police last week evacuated 80 villagers from Waltengoo Nar where dozens were killed after a series of avalanches hit the area in 2005 in the south of the territory .,"O,O,O,O,O,O,O,O,O,O,O,B-location,I-location,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,2,& gt ; * The army on Thursday recovered the bodies of ten of its men who were killed in an avalanche the previous day .,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
3,3,"& gt ; * The four civilians killed included two children of a family whose house was hit by a separate avalanche , also on Wednesday , a police spokesman said .","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
4,4,"The bodies of the soldiers were recovered after the concerted efforts of the Avalanche Rescue Teams ( ART ) , which is equipped to work in inhospitable terrain and weather conditions .","O,O,O,O,O,O,O,O,O,O,O,O,O,B-group,I-group,I-group,O,B-group,O,O,O,O,O,O,O,O,O,O,O,O,O,O"


In [21]:
test_df.isna().sum()

Sentence #    0
Sentence      0
Tag           0
dtype: int64

In [23]:
# Run the model over every reconstructed sentence string.
sentence_texts = test_df['Sentence'].tolist()
test = model_test(sentence_texts, tokenizer, model)

In [24]:
data.head()

Unnamed: 0,Sentence #,Word,Tag
0,0,&,O
1,0,gt,O
2,0,;,O
3,0,*,O
4,0,The,O


In [25]:
# Remove the [CLS]/[SEP] rows from the predictions and renumber the rows
# from zero.
is_special_token = test['token'].isin(["[CLS]", "[SEP]"])
test = test[~is_special_token].reset_index(drop=True)

In [26]:
test.head()

Unnamed: 0,sentence_no,labels,token
0,0,O,&
1,0,O,gt
2,0,O,;
3,0,O,*
4,0,O,the


In [27]:
test.tail()

Unnamed: 0,sentence_no,labels,token
30247,1286,O,with
30248,1286,O,this
30249,1286,O,dress
30250,1286,O,code
30251,1286,O,ð


In [28]:
data.tail()

Unnamed: 0,Sentence #,Word,Tag
30247,1286,with,O
30248,1286,this,O
30249,1286,dress,O
30250,1286,code,O
30251,1286,ð,O


In [29]:
# Attach the gold tag and word to every predicted token. Column assignment
# aligns on the row index, so a differing token count would be silently
# NaN-filled or truncated; fail loudly instead.
assert len(test) == len(data), (
    "prediction and ground-truth token counts differ ({} vs {}); "
    "rows cannot be aligned".format(len(test), len(data))
)
test['ground_truth_label'] = data['Tag']
test['ground_truth_token'] = data['Word']

In [30]:
test['ground_truth_label'].unique()

array(['O', 'B-location', 'I-location', 'B-group', 'I-group', 'B-person',
       'I-person', 'B-creative-work', 'I-creative-work', 'B-corporation',
       'B-product', 'I-corporation', 'I-product'], dtype=object)

In [None]:
# Collect predicted and gold tags into one list per sentence.
# `test` is rebound from the token-level frame to this sentence-level frame.
g_test = test.groupby("sentence_no")
test = pd.DataFrame({
    "model_tag": g_test["labels"].apply(list),
    "ground_truth_tag": g_test["ground_truth_label"].apply(list),
})

In [None]:
# The group index holds sentence numbers; convert them to numbers so the
# rows sort in numeric order, then renumber the rows from zero.
test['Sentence #'] = pd.to_numeric(test.index)
test = test.sort_values('Sentence #').reset_index(drop=True)

In [None]:
# seqeval's metrics take (y_true, y_pred) in that order. The original call
# passed the predictions first; the two numbers printed here are unaffected
# (accuracy and the default micro-averaged F1 are symmetric), but the order
# matters for any precision/recall or per-class report added later.
y_true = test['ground_truth_tag'].values.tolist()
y_pred = test['model_tag'].values.tolist()
print("Accuracy: " , accuracy_score(y_true, y_pred))
print("F1 Score: ",f1_score(y_true, y_pred))
