In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Load Libraries

In [2]:
import pandas as pd
import numpy as np
import ast
from itertools import chain
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import matplotlib.pyplot as plt

## Prepare data for NER model

In [3]:
# Read data
data = pd.read_csv('/content/drive/MyDrive/MRP1/annotated_all.csv', sep="|")
data = data.loc[:, ~data.columns.str.contains('^Unnamed')] # remove Unnamed column

# Set correct types
data['Word'] = data['Word'].apply(ast.literal_eval)
data['Word_idx'] = data['Word_idx'].apply(ast.literal_eval)
data['Tag'] = data['Tag'].apply(ast.literal_eval)

### Add indices for all tokens

In [4]:
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].apply(pd.Series).stack().reset_index(drop = True)))
    else:
        vocab = list(set(data['Tag'].apply(pd.Series).stack().reset_index(drop = True)))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok


token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

### Add indices to dataframe

In [5]:
def create_token_list(words, token2idx):
    return [token2idx[word] for word in words]

# Add indices
data['Word_idx'] = data['Word'].map(lambda x: create_token_list(x, token2idx))
data['Tag_idx'] = data['Tag'].map(lambda x: create_token_list(x, tag2idx))

data.head()

Unnamed: 0,Word,Word_idx,Tag,Relation,Tag_idx
0,"[FOURTH, SECTION]","[29042, 41381]","[B-COURT, I-COURT]",NEGATIVE,"[32, 46]"
1,"[CASE, OF, BOROVSK, v.]","[11746, 211, 7616, 3843]","[O, O, O, O]",NEGATIVE,"[11, 11, 11, 11]"
2,"[(, Application, no, ., 24528/02, )]","[7240, 9531, 11248, 34951, 5502, 8293]","[O, O, B-APPLICATION, I-APPLICATION, I-APPLICA...",NEGATIVE,"[11, 11, 21, 50, 50, 11]"
3,[JUDGMENT],[35700],[O],NEGATIVE,[11]
4,"[2, June, 2009]","[19913, 9614, 18858]","[O, B-DATE, I-DATE]",NEGATIVE,"[11, 1, 33]"


### Add padding / truncate input

In [17]:
# Pad or truncate sentences
def pad_or_truncate(l, max_length=128):
    return l[:max_length] + [0] * (max_length - len(l))

# Create padded / truncated input
X = np.array(list(map(pad_or_truncate, data.Word_idx)))
Y = to_categorical(np.array(list(map(pad_or_truncate, data.Tag_idx))))

# CREATE TRAIN AND TEST SET (80, 20, RESPECTIVELY)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=.8, random_state=12345)

## Create model

In [18]:
input_dim = len(token2idx.keys())+1
output_dim = len(tag2idx.keys())
input_length = 128
n_tags = len(tag2idx)
print('input_dim: ', input_dim, '\noutput_dim: ', output_dim, '\ninput_length: ', input_length, '\nn_tags: ', n_tags)

input_dim:  41705 
output_dim:  51 
input_length:  128 
n_tags:  51


In [22]:
# Design model
model = Sequential()

# Add Embedding layer
model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

# Add bidirectional LSTM
model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

# Add timeDistributed Layer
model.add(TimeDistributed(Dense(n_tags, activation="relu")))

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 128, 51)           2126955   
                                                                 
 bidirectional_4 (Bidirectio  (None, 128, 102)         42024     
 nal)                                                            
                                                                 
 time_distributed_4 (TimeDis  (None, 128, 51)          5253      
 tributed)                                                       
                                                                 
Total params: 2,174,232
Trainable params: 2,174,232
Non-trainable params: 0
_________________________________________________________________


## Train model

In [None]:
history = list()
epochs = 15
# fit model for one epoch on this sequence
hist = model.fit(X_train, Y_train, batch_size=128, verbose=1, epochs=epochs, validation_split=0.2)

### Show results during training

In [None]:
# PLOT RESULTS
plt.plot(hist.history['accuracy'])
plt.plot(hist.history['val_accuracy'])
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.show()

# Test performance on testet

In [25]:
pred = model.predict(X_test).argmax(axis=2).reshape(-1)
true = Y_test.argmax(axis=2).reshape(-1)

In [26]:
print("Performance on testset:\n" + 
      "NER:\n"
      "Accuracy: {:.4f}\n".format(accuracy_score(true, pred)) + 
      "Precision: {:.4f}\n".format(precision_score(true, pred, average='weighted')) + 
      "Recall: {:.4f}\n".format(recall_score(true, pred, average='weighted')) + 
      "F1-Score: {:.4f}".format(f1_score(true, pred, average='weighted')))

  _warn_prf(average, modifier, msg_start, len(result))


Performance on testset:
NER:
Accuracy: 0.9510
Precision: 0.9430
Recall: 0.9510
F1-Score: 0.9321
