# Import Necessary Library

In [68]:
import json
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Input, Embedding, Dense, \
                            TimeDistributed, LSTM, Dropout, Bidirectional, \
                            Conv1D, BatchNormalization
from tensorflow.keras.models import model_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

plt.style.use("tableau-colorblind10")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
os.chdir('drive/MyDrive/MIT_6.862/')

In [69]:
import sys
sys.path.append('/content/drive/MyDrive/MIT_6.862/')

In [70]:
from ner_evaluation.ner_eval import collect_named_entities
from ner_evaluation.ner_eval import compute_metrics

# Load data and EDA

In [None]:
data = pd.read_csv('NER_data/ner_dataset.csv', encoding="latin1")

In [None]:
data.describe()

Unnamed: 0,Sentence #,Word,POS,Tag
count,47959,1048575,1048575,1048575
unique,47959,35178,42,17
top,Sentence: 40496,the,NN,O
freq,1,52573,145807,887908


In [None]:
data.dtypes

Sentence #    object
Word          object
POS           object
Tag           object
dtype: object

In [None]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [None]:
# fill in the empty positions in column Sentence #
sentence_sep = data['Sentence #'].isna()
for i in range(data.shape[0]):
    if sentence_sep[i]:
        data.iloc[i,0] = data.iloc[i-1,0]

In [None]:
# save imputed dataset to csv
data.to_csv('NER_data/ner_dataset_fill.csv', index=False)

## Start to run from here!

In [None]:
# read in imputed dataset
df = pd.read_csv('NER_data/ner_dataset_fill.csv', index_col=False, encoding="latin1")

In [None]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [None]:
# check if there are any NAs left
df.isna().any()

Sentence #    False
Word          False
POS           False
Tag           False
dtype: bool

In [None]:
# Create unique word list, store the list and its length
words = sorted(df['Word'].unique())
words.append('ENDPAD')
words_size = len(words)

# Create unique tag list, store the list and its length
tags = sorted(df['Tag'].unique())
tags.append('PAD')
tags_size = len(tags)

# Create two dictionaries word:word_idx and word_idx:word
word2idx = {value: count for count, value in enumerate(words)}
idx2word = {count: value for value, count in word2idx.items()}

# Create two dictionaries tag:tag_idx and tag_idx:tag
tag2idx = {value: count for count, value in enumerate(tags)}
idx2tag = {count: value for value, count in tag2idx.items()}

In [None]:
# create list of list where each inner list is the list of word for each sentences
# create list of list where each inner list is the list of tag for each sentences
sentence_group = df.groupby('Sentence #')
sentence_list = []
tag_list = []
count = 1
for sen in sentence_group.groups.keys():
    if count % 5000 == 0:
        print(f'iter: {count}')
    count += 1
    df_group = sentence_group.get_group(sen)
    sentence_list.append(df_group['Word'].tolist())
    tag_list.append(df_group['Tag'].tolist())

iter: 5000
iter: 10000
iter: 15000
iter: 20000
iter: 25000
iter: 30000
iter: 35000
iter: 40000
iter: 45000


In [None]:
# create list of list where each inner list is the list of word indices for each sentences
# create list of list where each inner list is the list of tag indices for each sentences
X = []
y = []
for i in range(len(sentence_list)):
    X.append(list(map(word2idx.get, sentence_list[i])))
    y.append(list(map(tag2idx.get, tag_list[i])))

In [None]:
# pick the appropriate sentence length. Here we want to make sure that the majority of our sentences is shorter than our picked length.
# evectually we land on the 99.75% percentile.
max_length = int(np.percentile([len(sen) for sen in sentence_list], 99.75))
num_long_length = len([sen for sen in sentence_list if len(sen) > 40])
print(f'Picked max length for one sentence: {max_length}')
print(f'Number of sentences being trimmed: {num_long_length}')

Picked max length for one sentence: 50
Number of sentences being trimmed: 772


In [None]:
# set max_length
max_length = 50

In [None]:
# pad both our X and y 
X_pad = pad_sequences(sequences = X, maxlen = max_length, padding = 'post', value = word2idx['ENDPAD'])
y_pad = pad_sequences(sequences = y, maxlen = max_length, padding = 'post', value = tag2idx["PAD"])

In [None]:
# one hot encode our target variable
y_pad = to_categorical(y_pad, num_classes=tags_size)

## Train Test Split

In [None]:
# split train-test with ratio 0.1
X_tr, X_te, y_tr, y_te = train_test_split(X_pad, y_pad, test_size=0.1, random_state=42)

## Build Bidirectional LSTM Model

In [None]:
# set parameters for Bidirectional LSTM
n_units = 100
drop_rate = .1
dim_embed = 50

optimizer = "rmsprop"
metrics = ['categorical_accuracy']

batch_size = 32
epochs = 20
validation_split = 0.1
verbose = 1

In [None]:
# calculate potential class weights for loss based on class imbalance
# didn't use this part eventually
y_tr_int = np.argmax(y_tr, axis=2).flatten()
class_weights = compute_class_weight('balanced', np.unique(y_tr_int), y_tr_int)
class_weights = np.asarray(class_weights)

In [None]:
# to calculate the standard categorical cross entropy, we set the class weight to all ones
weights = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]

In [None]:
# define a custom loss function that combines class weights with categorical cross entropy loss
def custom_loss(y_true, y_pred):

  # get the first two dimensions from y_pred
  if y_pred.shape[0] is None:
    x = 1
  else:
    x = int(y_pred.shape[0])
  y = y_pred.shape[1]

  # reshape weight for each batch
  batch_weights = np.array([np.array(weights)] * (x*y))
  batch_weights = batch_weights.reshape(x,y,18)
  batch_weights = tf.cast(batch_weights, tf.float64)

  # cast y_true and y_pred into tf.float64
  y_true = tf.cast(y_true, tf.float64)
  y_pred = tf.cast(y_pred, tf.float64)

  # return weighted categorical cross entropy
  return tf.math.reduce_sum(y_true * batch_weights, axis=-1) * tf.keras.losses.categorical_crossentropy(y_true, y_pred)

In [None]:
# Build BiLSTM model
model_title = "BiLSTM"
model = Sequential()
model.add(
    Embedding(
        input_dim = words_size, output_dim = dim_embed, input_length = max_length
    )
)
model.add(Dropout(drop_rate))
model.add(Bidirectional(LSTM(n_units, return_sequences = True)))
model.add(TimeDistributed(Dense(tags_size, activation = 'softmax')))

# Compile model
model.compile(optimizer=optimizer, loss=custom_loss, metrics=metrics)

In [None]:
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            1758950   
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 50)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 200)           120800    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 50, 18)            3618      
Total params: 1,883,368
Trainable params: 1,883,368
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# set early stopping for model
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto'
)

callbacks = [early_stopping]

In [None]:
# fit the model
history = model.fit(X_tr, y_tr, batch_size=batch_size, epochs=epochs, 
                  validation_split=validation_split, callbacks=callbacks, verbose=verbose)

Epoch 1/20
Tensor("custom_loss/Sum:0", shape=(None, 50), dtype=float64)
Tensor("custom_loss/Sum:0", shape=(None, 50), dtype=float64)
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


In [76]:
# Examine performance for  
y_pred = model.predict(X_te, batch_size = batch_size, verbose = verbose)
y_pred_flat = np.argmax(y_pred, axis = 2).flatten()
y_te_flat = np.argmax(y_te, axis = 2).flatten()

# display f1 score for each class and 
f1 = f1_score(y_te_flat, y_pred_flat, average = None)
print(pd.DataFrame(f1, index = tags))
print('Mean F1 across classes: ',np.mean(f1))

              0
B-art  0.076923
B-eve  0.408163
B-geo  0.881101
B-gpe  0.951531
B-nat  0.296296
B-org  0.740595
B-per  0.834279
B-tim  0.903162
I-art  0.000000
I-eve  0.285714
I-geo  0.800275
I-gpe  0.720000
I-nat  0.000000
I-org  0.779620
I-per  0.870602
I-tim  0.782074
O      0.991321
PAD    1.000000
Mean F1 across classes:  0.628980921059854
