# Import Necessary Libraries

In [None]:
import json
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Input, Embedding, Dense, \
                            TimeDistributed, LSTM, Dropout, Bidirectional, \
                            Conv1D, BatchNormalization
from tensorflow.keras.models import model_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from transformers import BertTokenizer, BertConfig
from transformers import TFBertForTokenClassification, AdamW

plt.style.use("tableau-colorblind10")

In [None]:
 from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Use the absolute project path (the same one appended to sys.path below) so
# this cell can be re-run safely; the relative form fails once the working
# directory has already been changed.
os.chdir('/content/drive/MyDrive/MIT_6.862/')

In [None]:
import sys
sys.path.append('/content/drive/MyDrive/MIT_6.862/')

In [None]:
from metrics.ner_evaluation.ner_eval import collect_named_entities
from metrics.ner_evaluation.ner_eval import compute_metrics

# Load data and EDA

In [None]:
data = pd.read_csv('NER_data/ner_dataset.csv', encoding="latin1")

In [None]:
data.describe()

Unnamed: 0,Sentence #,Word,POS,Tag
count,47959,1048575,1048575,1048575
unique,47959,35178,42,17
top,Sentence: 40496,the,NN,O
freq,1,52573,145807,887908


In [None]:
data.dtypes

Sentence #    object
Word          object
POS           object
Tag           object
dtype: object

In [None]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [None]:
# Fill in the empty positions in column 'Sentence #'.
# Only the first token of each sentence carries the sentence label, so the
# label is propagated forward to the rows that follow it.
# A vectorised forward fill replaces the row-by-row .iloc loop, which is very
# slow on ~1M rows and, for i == 0, would have read from the last row (i-1 == -1).
data['Sentence #'] = data['Sentence #'].ffill()

In [None]:
# save imputed dataset to csv
data.to_csv('NER_data/ner_dataset_fill.csv', index=False)

## Start to run from here!

In [None]:
# read in imputed dataset
df = pd.read_csv('NER_data/ner_dataset_fill.csv', index_col=False, encoding="latin1")

In [None]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [None]:
# check if there are any NAs left
df.isna().any()

Sentence #    False
Word          False
POS           False
Tag           False
dtype: bool

In [None]:
# Vocabulary: every distinct word in sorted order, followed by the padding token
words = sorted(df['Word'].unique()) + ['ENDPAD']
words_size = len(words)

# Label set: every distinct tag in sorted order, followed by the padding tag
tags = sorted(df['Tag'].unique()) + ['PAD']
tags_size = len(tags)

# Lookup tables word -> index and index -> word
word2idx = dict(zip(words, range(words_size)))
idx2word = {position: token for token, position in word2idx.items()}

# Lookup tables tag -> index and index -> tag
tag2idx = dict(zip(tags, range(tags_size)))
idx2tag = {position: label for label, position in tag2idx.items()}

In [None]:
# Build two parallel lists of lists, one entry per sentence:
#   sentence_list[k] -> the words of sentence k
#   tag_list[k]      -> the tags of sentence k
sentence_group = df.groupby('Sentence #')
sentence_list, tag_list = [], []
for n_done, sentence_id in enumerate(sentence_group.groups, start=1):
    # lightweight progress report every 5000 sentences
    if n_done % 5000 == 0:
        print(f'iter: {n_done}')
    rows = sentence_group.get_group(sentence_id)
    sentence_list.append(rows['Word'].tolist())
    tag_list.append(rows['Tag'].tolist())

iter: 5000
iter: 10000
iter: 15000
iter: 20000
iter: 25000
iter: 30000
iter: 35000
iter: 40000
iter: 45000


In [None]:
# Pick the maximum sentence length so that the vast majority of sentences fit
# without truncation. We eventually landed on the 99.75th percentile.
sentence_lengths = np.array([len(sen) for sen in sentence_list])
max_length = int(np.percentile(sentence_lengths, 99.75))
# Count sentences longer than the picked max_length (previously compared
# against a stale hard-coded 40, which over-reported the trimmed sentences).
num_long_length = int((sentence_lengths > max_length).sum())
print(f'Picked max length for one sentence: {max_length}')
print(f'Number of sentences being trimmed: {num_long_length}')

Picked max length for one sentence: 50
Number of sentences being trimmed: 772


In [None]:
# set max_length
max_length = 50

## Train Test Split

In [None]:
# split train-test with ratio 0.1
sent_tr, sent_te, tag_tr, tag_te = train_test_split(sentence_list, tag_list, test_size=0.1, random_state=42)

## LSTM Pre-Processing

In [None]:
# Translate every sentence into a list of word indices and every tag sequence
# into a list of tag indices (one inner list per sentence).
X_tr = [[word2idx.get(word) for word in sentence] for sentence in sent_tr]
y_tr = [[tag2idx.get(tag) for tag in sentence_tags] for sentence_tags in tag_tr]

X_te = [[word2idx.get(word) for word in sentence] for sentence in sent_te]
y_te = [[tag2idx.get(tag) for tag in sentence_tags] for sentence_tags in tag_te]

In [None]:
# pad both our X and y 
X_tr_pad = pad_sequences(sequences = X_tr, maxlen = max_length, padding = 'post', value = word2idx['ENDPAD'])
y_tr_pad = pad_sequences(sequences = y_tr, maxlen = max_length, padding = 'post', value = tag2idx["PAD"])

X_te_pad = pad_sequences(sequences = X_te, maxlen = max_length, padding = 'post', value = word2idx['ENDPAD'])
y_te_pad = pad_sequences(sequences = y_te, maxlen = max_length, padding = 'post', value = tag2idx["PAD"])

In [None]:
# examine class imbalance in the (padded) training labels
# np.unique flattens its input, so no manual concatenate/flatten is needed
s, count = np.unique(y_tr_pad, return_counts=True)
# label each row by the class id actually found, instead of assuming that every
# tag occurs at least once in the training split
print(pd.DataFrame(count, index = [idx2tag[i] for i in s], columns = ['Count']))

         Count
B-art      355
B-eve      272
B-geo    33836
B-gpe    14267
B-nat      179
B-org    18075
B-per    15311
B-tim    18285
I-art      257
I-eve      213
I-geo     6697
I-gpe      182
I-nat       42
I-org    15075
I-per    15584
I-tim     5931
O       798806
PAD    1214783


In [None]:
# calculate average count per class
np.mean(count)

119897.22222222222

In [None]:
# one hot encode our target variable
y_tr_pad = to_categorical(y_tr_pad, num_classes=tags_size)

y_te_pad = to_categorical(y_te_pad, num_classes=tags_size)

## Build Bidirectional LSTM Model

In [None]:
# set parameters for Bidirectional LSTM
n_units = 100
drop_rate = .1
dim_embed = 50

optimizer = "rmsprop"
metrics = ['categorical_accuracy']

batch_size = 32
epochs = 20
validation_split = 0.1
verbose = 1

In [None]:
# calculate potential class weights for loss based on class imbalance
# didn't use this part eventually
y_tr_int = np.argmax(y_tr_pad, axis=2).flatten()
# keyword arguments: recent scikit-learn releases no longer accept these positionally
class_weights = compute_class_weight(
    class_weight='balanced', classes=np.unique(y_tr_int), y=y_tr_int
)
class_weights = np.asarray(class_weights)

In [None]:
# to calculate the standard categorical cross entropy, we set the class weight to all ones
# (one weight per tag, derived from tags_size rather than a hand-typed list of 18 ones)
weights = [1] * tags_size

In [None]:
# define a custom loss function that combines class weights with categorical cross entropy loss
def custom_loss(y_true, y_pred):
  """Class-weighted categorical cross entropy.

  Each position's cross entropy is scaled by the weight of its true class,
  taken from the module-level ``weights`` list (one entry per class).

  Args:
    y_true: one-hot targets; assumed (batch, seq_len, n_classes) as produced
      by ``to_categorical`` -- TODO confirm for other callers.
    y_pred: predicted class probabilities with the same shape as ``y_true``.

  Returns:
    Tensor of per-position weighted losses (the class axis is reduced).
  """
  # cast y_true and y_pred into tf.float64
  y_true = tf.cast(y_true, tf.float64)
  y_pred = tf.cast(y_pred, tf.float64)

  # A single (n_classes,) vector broadcasts over the batch and sequence axes,
  # so there is no need to materialise a per-batch weight tensor or to
  # hard-code the number of classes.
  per_class = tf.constant(weights, dtype=tf.float64)

  # with one-hot y_true this picks the weight of the true class at each position
  position_weight = tf.math.reduce_sum(y_true * per_class, axis=-1)

  # return weighted categorical cross entropy
  return position_weight * tf.keras.losses.categorical_crossentropy(y_true, y_pred)

In [None]:
# Build BiLSTM model
model_title = "BiLSTM"
model = Sequential()
model.add(
    Embedding(
        input_dim = words_size, output_dim = dim_embed, input_length = max_length
    )
)
model.add(Dropout(drop_rate))
model.add(Bidirectional(LSTM(n_units, return_sequences = True)))
model.add(TimeDistributed(Dense(tags_size, activation = 'softmax')))

# Compile model
model.compile(optimizer=optimizer, loss=custom_loss, metrics=metrics)

In [None]:
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            1758950   
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 50)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 200)           120800    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 50, 18)            3618      
Total params: 1,883,368
Trainable params: 1,883,368
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# set early stopping for model
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto'
)

callbacks = [early_stopping]

In [None]:
# fit the model
history = model.fit(X_tr_pad, y_tr_pad, batch_size=batch_size, epochs=epochs, 
                  validation_split=validation_split, callbacks=callbacks, verbose=verbose)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


In [None]:
# Examine token-level performance on the held-out test set
y_pred = model.predict(X_te_pad, batch_size = batch_size, verbose = verbose)
y_pred_flat = np.argmax(y_pred, axis = 2).flatten()
y_te_flat = np.argmax(y_te_pad, axis = 2).flatten()

# display f1 score for each class and the unweighted mean across classes
# labels= pins exactly one score per tag id, in tag order, so the DataFrame
# index below stays aligned even if a rare tag never shows up in the test split
f1 = f1_score(y_te_flat, y_pred_flat, labels = np.arange(tags_size), average = None)
print(pd.DataFrame(f1, index = tags))
# NOTE: the mean below also averages in the PAD class
print('Mean F1 across classes: ',np.mean(f1))

              0
B-art  0.071429
B-eve  0.360656
B-geo  0.877252
B-gpe  0.949495
B-nat  0.285714
B-org  0.741671
B-per  0.839132
B-tim  0.898318
I-art  0.000000
I-eve  0.197183
I-geo  0.803419
I-gpe  0.666667
I-nat  0.250000
I-org  0.765287
I-per  0.869072
I-tim  0.765018
O      0.990918
PAD    1.000000
Mean F1 across classes:  0.6295127563298624


### Entity Level F-1 for LSTM

In [None]:
def make_precision_recall(entity_level_dict, scheme):
  """Derive precision, recall and F1 per entity type from aggregated counts.

  Args:
    entity_level_dict: mapping ``entity type -> scheme name -> counts``, where
      ``counts`` holds at least the keys 'correct', 'actual' and 'possible'.
    scheme: name of the evaluation scheme to read (e.g. 'strict').

  Returns:
    Dict mapping each entity type to {'precision', 'recall', 'f1'}. A ratio
    whose denominator is zero is reported as 0 instead of raising
    ZeroDivisionError (e.g. an entity type that was never predicted).
  """
  entity_result = {}
  for tag, per_scheme in entity_level_dict.items():
    counts = per_scheme[scheme]
    correct = counts['correct']

    # precision = correct / predicted entities; recall = correct / gold entities
    precision = correct / counts['actual'] if counts['actual'] else 0
    recall = correct / counts['possible'] if counts['possible'] else 0

    if recall + precision != 0:
      f1 = 2 * precision * recall / (recall + precision)
    else:
      f1 = 0

    entity_result[tag] = {'precision': precision, 'recall': recall, 'f1': f1}
  return entity_result

In [None]:
lstm_pred = np.argmax(y_pred, axis = 2)

In [None]:
# Ground truth for the LSTM must be its own word-level test labels (y_te_pad,
# one-hot encoded above). The previous version read te_tags and bert_pred,
# which belong to the BERT section further down: they are not defined yet at
# this point and are tokenised into word pieces, so they do not line up with
# the LSTM's word-level predictions.
y_true_te = np.argmax(y_te_pad, axis=2)

# decode tag ids back into tag strings, one list per test sentence
lstm_pred_tag = [[idx2tag[idx] for idx in row] for row in lstm_pred]
y_true_tag_lstm = [[idx2tag[idx] for idx in row] for row in y_true_te]

In [None]:
entity_tag = ['art', 'eve', 'geo', 'gpe', 'nat', 'org', 'per', 'tim']

In [None]:
from copy import deepcopy

# counters tracked for every evaluation schema, all starting at zero
metrics_results = dict.fromkeys(
    ['correct', 'incorrect', 'partial', 'missed', 'spurious',
     'possible', 'actual', 'precision', 'recall'],
    0,
)

# overall results, one independent counter set per schema
results = {schema: deepcopy(metrics_results) for schema in ('strict', 'ent_type')}

# results aggregated by entity type
evaluation_agg_entities_type_lstm = {e: deepcopy(results) for e in entity_tag}

for true_ents, pred_ents in zip(y_true_tag_lstm, lstm_pred_tag):
    # score a single sentence
    gold_entities = collect_named_entities(true_ents)
    predicted_entities = collect_named_entities(pred_ents)
    tmp_results, tmp_agg_results = compute_metrics(gold_entities, predicted_entities, entity_tag)

    # fold the sentence scores into the overall totals
    for eval_schema, totals in results.items():
        for metric in metrics_results:
            totals[metric] += tmp_results[eval_schema][metric]

    # fold the sentence scores into the per-entity-type totals
    for e_type in entity_tag:
        for eval_schema, totals in evaluation_agg_entities_type_lstm[e_type].items():
            for metric, value in tmp_agg_results[e_type][eval_schema].items():
                totals[metric] += value

In [None]:
make_precision_recall(evaluation_agg_entities_type_lstm,'strict')

{'art': {'f1': 0.000319744204636291,
  'precision': 0.00016225864027259452,
  'recall': 0.010869565217391304},
 'eve': {'f1': 0.0012896985329679186,
  'precision': 0.0006492452523940919,
  'recall': 0.09523809523809523},
 'geo': {'f1': 0.21846110083914094,
  'precision': 0.19393939393939394,
  'recall': 0.25008140670791273},
 'gpe': {'f1': 0.1907018731663281,
  'precision': 0.11940087607743394,
  'recall': 0.4733893557422969},
 'nat': {'f1': 0.0006474587245063129,
  'precision': 0.00032499187520311994,
  'recall': 0.08333333333333333},
 'org': {'f1': 0.11816757304820721,
  'precision': 0.08908964558721334,
  'recall': 0.17542419266557197},
 'per': {'f1': 0.0613280845333057,
  'precision': 0.04314239906719137,
  'recall': 0.10601719197707736},
 'tim': {'f1': 0.14770240700218817,
  'precision': 0.09765625,
  'recall': 0.30296229802513464}}

## Build BERT

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

2021-04-18 18:47:06 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 18:47:06 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/vocab.txt HTTP/1.1" 200 0
2021-04-18 18:47:06 filelock DEBUG: Attempting to acquire lock 140143977945040 on /root/.cache/huggingface/transformers/6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791.lock
2021-04-18 18:47:06 filelock INFO: Lock 140143977945040 acquired on /root/.cache/huggingface/transformers/6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791.lock
2021-04-18 18:47:06 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 18:47:06 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /bert-base-cased/resolve/main/vocab.txt HTTP/1.1" 200 213450


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…

2021-04-18 18:47:07 filelock DEBUG: Attempting to release lock 140143977945040 on /root/.cache/huggingface/transformers/6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791.lock
2021-04-18 18:47:07 filelock INFO: Lock 140143977945040 released on /root/.cache/huggingface/transformers/6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791.lock
2021-04-18 18:47:07 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-18 18:47:07 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/added_tokens.json HTTP/1.1" 404 0
2021-04-18 18:47:07 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 18:47:07 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/special_tokens_map.json HTTP/1.1" 404 0
2021-04-18 18:47:07 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 18:47:07 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
2021-04-18 18:47:07 filelock DEBUG: Attempting to acquire lock 140142290720016 on /root/.cache/huggingface/transformers/ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f.lock
2021-04-18 18:47:07 filelock INFO: Lock 140142290720016 acquired on /root/.cache/huggi

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…

2021-04-18 18:47:08 filelock DEBUG: Attempting to release lock 140142290720016 on /root/.cache/huggingface/transformers/ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f.lock
2021-04-18 18:47:08 filelock INFO: Lock 140142290720016 released on /root/.cache/huggingface/transformers/ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f.lock
2021-04-18 18:47:08 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443





2021-04-18 18:47:08 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/tokenizer.json HTTP/1.1" 200 0
2021-04-18 18:47:08 filelock DEBUG: Attempting to acquire lock 140142837075280 on /root/.cache/huggingface/transformers/226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6.lock
2021-04-18 18:47:08 filelock INFO: Lock 140142837075280 acquired on /root/.cache/huggingface/transformers/226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6.lock
2021-04-18 18:47:08 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 18:47:08 urllib3.connectionpool DEBUG: https://huggingface.co:443 "GET /bert-base-cased/resolve/main/tokenizer.json HTTP/1.1" 200 435797


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…

2021-04-18 18:47:09 filelock DEBUG: Attempting to release lock 140142837075280 on /root/.cache/huggingface/transformers/226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6.lock
2021-04-18 18:47:09 filelock INFO: Lock 140142837075280 released on /root/.cache/huggingface/transformers/226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6.lock





In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Split each word with the module-level ``tokenizer`` and repeat its label.

    Args:
        sentence: iterable of words.
        text_labels: iterable of labels, paired with ``sentence`` via ``zip``.

    Returns:
        Tuple ``(pieces, labels)``: the concatenated tokenizer output for all
        words, and a label list of the same length in which every piece
        carries the label of the word it came from.
    """
    pieces_out, labels_out = [], []

    for word, label in zip(sentence, text_labels):
        # a word may be broken into several pieces
        word_pieces = tokenizer.tokenize(word)
        pieces_out += word_pieces
        # one copy of the word's label per piece keeps both lists aligned
        labels_out += [label] * len(word_pieces)

    return pieces_out, labels_out

In [None]:
tokenized_texts_and_labels_tr = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sent_tr, tag_tr)
]

tokenized_texts_and_labels_te = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sent_te, tag_te)
]

In [None]:
tokenized_texts_tr = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels_tr]
labels_tr = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels_tr]

tokenized_texts_te = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels_te]
labels_te = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels_te]

In [None]:
tr_inputs = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_tr],
                          maxlen=max_length, dtype="long", value=word2idx['ENDPAD'],
                          truncating="post", padding="post")

te_inputs = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_te],
                          maxlen=max_length, dtype="long", value=word2idx['ENDPAD'],
                          truncating="post", padding="post")

In [None]:
tr_tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels_tr],
                     maxlen=max_length, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

te_tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels_te],
                     maxlen=max_length, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [None]:
tr_masks = [[float(i != word2idx['ENDPAD']) for i in ii] for ii in tr_inputs]

te_masks = [[float(i != word2idx['ENDPAD']) for i in ii] for ii in te_inputs]

In [None]:
tr_inputs = tf.convert_to_tensor(tr_inputs)
te_inputs = tf.convert_to_tensor(te_inputs)
tr_tags = tf.convert_to_tensor(tr_tags)
te_tags = tf.convert_to_tensor(te_tags)
tr_masks = tf.convert_to_tensor(tr_masks)
te_masks = tf.convert_to_tensor(te_masks)

In [None]:
tr_masks

<tf.Tensor: shape=(43163, 50), dtype=float32, numpy=
array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]], dtype=float32)>

In [None]:
# train_data = tf.data.Dataset(tr_inputs, tr_masks, tr_tags)
# train_sampler = RandomSampler(train_data)
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

# valid_data = TensorDataset(val_inputs, val_masks, val_tags)
# valid_sampler = SequentialSampler(valid_data)
# valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [None]:
train_data = tf.data.Dataset.from_tensor_slices(((tr_inputs, 
                                                  tr_masks), 
                                                 tr_tags))

val_data = tf.data.Dataset.from_tensor_slices(((te_inputs, 
                                                  te_masks), 
                                               te_tags))

In [None]:
te_tags

<tf.Tensor: shape=(4796, 50), dtype=int64, numpy=
array([[16, 16, 16, ..., 17, 17, 17],
       [16, 16, 16, ..., 16, 16, 16],
       [16, 16, 16, ..., 17, 17, 17],
       ...,
       [16,  6, 14, ..., 17, 17, 17],
       [16, 16, 16, ..., 17, 17, 17],
       [ 6, 14, 14, ..., 17, 17, 17]])>

In [None]:
BATCH_SIZE = 32
TRAIN_SHUFFLE_BUFFER_SIZE = len(tr_tags)
VAL_SHUFFLE_BUFFER_SIZE = len(te_tags)
PREFETCH_BUFFER_SIZE = 100

# Transfer training data
train_data = train_data.shuffle(buffer_size=TRAIN_SHUFFLE_BUFFER_SIZE)
train_data = train_data.batch(batch_size=BATCH_SIZE)
train_data = train_data.prefetch(buffer_size=PREFETCH_BUFFER_SIZE)

# Transfer validation data
val_data = val_data.batch(batch_size=BATCH_SIZE)
val_data = val_data.prefetch(buffer_size=PREFETCH_BUFFER_SIZE)

print('train_data: ', train_data)
print('val_data: ', val_data)

train_data:  <PrefetchDataset shapes: (((None, 50), (None, 50)), (None, 50)), types: ((tf.int64, tf.float32), tf.int64)>
val_data:  <PrefetchDataset shapes: (((None, 50), (None, 50)), (None, 50)), types: ((tf.int64, tf.float32), tf.int64)>


In [None]:
model = TFBertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

2021-04-18 19:45:43 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 19:45:43 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/config.json HTTP/1.1" 200 0
2021-04-18 19:45:43 urllib3.connectionpool DEBUG: Starting new HTTPS connection (1): huggingface.co:443
2021-04-18 19:45:44 urllib3.connectionpool DEBUG: https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/tf_model.h5 HTTP/1.1" 302 0
2021-04-18 19:45:44 filelock DEBUG: Attempting to acquire lock 140139546855376 on /root/.cache/huggingface/transformers/01800f4158e284e2447020e0124bc3f6aea3ac49848e744594f7cce8ee5ac0a4.a7137b2090d9302d722735af604b4c142ec9d1bfc31be7cbbe230aea9d5cfb76.h5.lock
2021-04-18 19:45:44 filelock INFO: Lock 140139546855376 acquired on /root/.cache/huggingface/transformers/01800f4158e284e2447020e0124bc3f6aea3ac49848e744594f7cce8ee5ac0a4.a7137b2090d9302d722735af604b4c142ec9d1bfc31be7cbbe230aea9d5cfb76.h5.lock
2021-04-

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=526681800.0, style=ProgressStyle(descri…

2021-04-18 19:45:53 filelock DEBUG: Attempting to release lock 140139546855376 on /root/.cache/huggingface/transformers/01800f4158e284e2447020e0124bc3f6aea3ac49848e744594f7cce8ee5ac0a4.a7137b2090d9302d722735af604b4c142ec9d1bfc31be7cbbe230aea9d5cfb76.h5.lock
2021-04-18 19:45:53 filelock INFO: Lock 140139546855376 released on /root/.cache/huggingface/transformers/01800f4158e284e2447020e0124bc3f6aea3ac49848e744594f7cce8ee5ac0a4.a7137b2090d9302d722735af604b4c142ec9d1bfc31be7cbbe230aea9d5cfb76.h5.lock





All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Build parameter groups and an AdamW optimizer.
# NOTE(review): FULL_FINETUNING is assigned but never read -- the condition
# below is hard-coded to `if False`, so only the `else` branch ever runs.
# NOTE(review): named_parameters() looks like the torch.nn.Module API, while
# `model` is a TFBertForTokenClassification here -- presumably this cell is a
# PyTorch leftover and does not run as-is; confirm before relying on it.
# NOTE(review): `optimizer` is re-assigned in the later compile cell, so the
# AdamW instance built here is not the one passed to model.compile().
FULL_FINETUNING = True
if False:
    # full fine-tuning: every parameter, with weight decay disabled for
    # parameters whose name contains 'bias', 'gamma' or 'beta'
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # classifier-only: a single parameter group taken from model.classifier
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 10
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
# total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=0
#     num_training_steps=total_steps
# )

In [None]:
import keras  # kept in case later cells still reference the standalone `keras` name
# Build the optimizer and loss from tf.keras so they come from the same Keras
# implementation as the rest of this notebook; `learning_rate` replaces the
# deprecated `lr` argument.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# Loss: the model returns raw logits and the targets are integer tag ids
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile
model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=['accuracy'])

# Callbacks
# Learning Rate Scheduler: Change learning rates during training epochs
def scheduler(epoch, lr):
  """Keep the learning rate for epochs below 10, then scale it by 0.01.

  Args:
    epoch: epoch index passed in by the LearningRateScheduler callback.
    lr: current learning rate.

  Returns:
    The learning rate to use for this epoch.
  """
  # NOTE(review): with epochs = 10 the decay branch is presumably never
  # reached (assuming 0-based epoch indices) -- confirm this is intended.
  if epoch < 10:
    return lr
  else:
    return lr * 0.01
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)
# Early Stopping: Stop training when a monitored metric has stopped improving
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto'
)
callbacks = [lr_scheduler,early_stopping]



In [None]:
import time
start_time = time.time()
training_results = model.fit(
        train_data,
        validation_data=val_data,
        epochs=epochs, 
        callbacks=callbacks,
        verbose=1)
execution_time = (time.time() - start_time)/60.0
print("Training execution time (mins)",execution_time)

Epoch 1/10
























Epoch 2/10
Epoch 3/10
Epoch 4/10
Training execution time (mins) 11.8635982076327


In [None]:
bert_logit = model.predict(val_data)
bert_pred = bert_logit[0].argmax(axis=2).flatten()









In [None]:
# token-level f1 score for each class and the unweighted mean across classes
# labels= pins exactly one score per tag id, in tag order, so the DataFrame
# index below stays aligned even if a rare tag never shows up in the test split
f1 = f1_score(te_tags.numpy().flatten(), bert_pred,
              labels = np.arange(tags_size), average = None)
print(pd.DataFrame(f1, index = tags))
# NOTE: the mean below also averages in the PAD class
print('Mean F1 across classes: ',np.mean(f1))

              0
B-art  0.169935
B-eve  0.314286
B-geo  0.889625
B-gpe  0.946950
B-nat  0.205128
B-org  0.767917
B-per  0.850000
B-tim  0.883441
I-art  0.068966
I-eve  0.382353
I-geo  0.808559
I-gpe  0.722222
I-nat  0.400000
I-org  0.760237
I-per  0.886698
I-tim  0.798046
O      0.990768
PAD    1.000000
Mean F1 across classes:  0.6580627695777646


### Entity Level F-1 for BERT

In [None]:
bert_pred = bert_logit[0].argmax(axis=2)

In [None]:
# Decode tag ids back into tag strings, one list per test sentence
y_true_te = te_tags.numpy()

bert_pred_tag = [list(map(idx2tag.get, predicted_row)) for predicted_row in bert_pred]
y_true_tag = [list(map(idx2tag.get, gold_row)) for gold_row in y_true_te[:len(bert_pred)]]

In [None]:
entity_tag = ['art', 'eve', 'geo', 'gpe', 'nat', 'org', 'per', 'tim']

In [None]:
from copy import deepcopy

# counters tracked for every evaluation schema, all starting at zero
metrics_results = dict.fromkeys(
    ['correct', 'incorrect', 'partial', 'missed', 'spurious',
     'possible', 'actual', 'precision', 'recall'],
    0,
)

# overall results, one independent counter set per schema
results = {schema: deepcopy(metrics_results) for schema in ('strict', 'ent_type')}

# results aggregated by entity type
evaluation_agg_entities_type = {e: deepcopy(results) for e in entity_tag}

for true_ents, pred_ents in zip(y_true_tag, bert_pred_tag):
    # score a single sentence
    gold_entities = collect_named_entities(true_ents)
    predicted_entities = collect_named_entities(pred_ents)
    tmp_results, tmp_agg_results = compute_metrics(gold_entities, predicted_entities, entity_tag)

    # fold the sentence scores into the overall totals
    for eval_schema, totals in results.items():
        for metric in metrics_results:
            totals[metric] += tmp_results[eval_schema][metric]

    # fold the sentence scores into the per-entity-type totals
    for e_type in entity_tag:
        for eval_schema, totals in evaluation_agg_entities_type[e_type].items():
            for metric, value in tmp_agg_results[e_type][eval_schema].items():
                totals[metric] += value

In [None]:
make_precision_recall(evaluation_agg_entities_type,'strict')

{'art': {'f1': 0.01827875095201828,
  'precision': 0.009828009828009828,
  'recall': 0.13043478260869565},
 'eve': {'f1': 0.014481094127111826,
  'precision': 0.0075,
  'recall': 0.20930232558139536},
 'geo': {'f1': 0.8487229862475443,
  'precision': 0.7918781725888325,
  'recall': 0.914360143275806},
 'gpe': {'f1': 0.7043918918918919,
  'precision': 0.5652321247034904,
  'recall': 0.934453781512605},
 'nat': {'f1': 0.008244023083264633,
  'precision': 0.004205214465937763,
  'recall': 0.20833333333333334},
 'org': {'f1': 0.6276712663328856,
  'precision': 0.5687098915689311,
  'recall': 0.7002724795640327},
 'per': {'f1': 0.6946272769018828,
  'precision': 0.6050666666666666,
  'recall': 0.8153072224218469},
 'tim': {'f1': 0.7115384615384616,
  'precision': 0.6052217678515256,
  'recall': 0.8631673396141768}}