# COMP5046 Assignment 2


# 1 - Data Preprocessing

## Download Dataset

In [94]:
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
!pip install spacy
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import numpy as np
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

id = '1Q6DMPeMOIVaYHnmOwpwLX28Pfxu1Yxvg'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('test.csv')

id = '1kCYpK73wTTA88yisgPUAE_5cUW5Cklwo'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('train.csv')

id = '1ukoWnJdhEFIKYestcDchJ325k0y_qxni'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('val.csv')

import pandas as pd
df_train = pd.read_csv("train.csv")
df_val = pd.read_csv("val.csv")
df_test = pd.read_csv("test.csv")

Sentence_train = df_train['Sentence'].tolist()
NER_train = df_train['NER'].tolist()
Sentence_val = df_val['Sentence'].tolist()
NER_val = df_val['NER'].tolist()
Sentence_test = df_test['Sentence'].tolist()
NER_test = df_test['NER'].tolist()

print("Training Sentence number:",len(Sentence_train))
print("Validation Sentence number:",len(Sentence_val))
print("Testing Sentence number:",len(Sentence_test))

Training Sentence number: 3000
Validation Sentence number: 700
Testing Sentence number: 3684


In [0]:
# Download the pre-trained BERT-encoded embeddings (no pooling strategy).
# This may take 5-15 minutes depending on your internet speed.
# Requires the authenticated PyDrive client `drive` created in the dataset cell.
BERT_FILE_IDS = {
    'train_bert_clean.npy': '1Bi7EKphd_ubFXfZ-l18B4eXn08BoF0z4',
    'val_bert_clean.npy': '1CH3m-lRw0sfmr1_Mkz6vhrWuVjlZ7iib',
    'test_bert_clean.npy': '1B6zzTW10MYeQWO4eaHZ68D1w6Rsb2QR3',
}
# A dedicated loop variable is used for the id so the builtin `id` is not shadowed.
for filename, file_id in BERT_FILE_IDS.items():
    downloaded = drive.CreateFile({'id': file_id})
    downloaded.GetContentFile(filename)

In [0]:
# Load the three downloaded embedding files in train/val/test order.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code; only use it on files from a trusted source.
train_bert_clean, val_bert_clean, test_bert_clean = [
    np.load(npy_path, allow_pickle=True)
    for npy_path in ('train_bert_clean.npy', 'val_bert_clean.npy', 'test_bert_clean.npy')
]

## Tokenization & Feature Extraction with Spacy

In [0]:
import spacy
from spacy.tokenizer import Tokenizer
# Load the small English pipeline; its components supply the lemma, tag,
# dependency and entity attributes read in preprocess().
nlp = spacy.load('en_core_web_sm')
# Replace the pipeline's tokenizer with a bare Tokenizer built only from the vocab.
# NOTE(review): presumably this is meant to keep tokens aligned one-to-one with
# the whitespace-separated NER tags - confirm.
nlp.tokenizer = Tokenizer(nlp.vocab)

def preprocess(data):
  """Run every sentence through the spaCy pipeline and collect token features.

  Args:
    data: sequence of sentence strings.

  Returns:
    A 5-tuple ``(sent_id, lemma, pos, dep, ent)``. ``sent_id`` lists the
    position of each sentence in ``data``; the other four are lists with one
    inner list per sentence holding, per token, ``lemma_``, ``tag_``,
    ``dep_`` and ``ent_type_`` respectively.
  """
  sent_id, lemma, pos, dep, ent = [], [], [], [], []

  for index, text in enumerate(data):
    sent_id.append(index)
    tokens = list(nlp(text))
    lemma.append([tok.lemma_ for tok in tokens])
    pos.append([tok.tag_ for tok in tokens])
    dep.append([tok.dep_ for tok in tokens])
    ent.append([tok.ent_type_ for tok in tokens])

  return sent_id, lemma, pos, dep, ent

# Extract token features for each split.
# Note: the second value returned by preprocess() is the list of lemmas, so the
# *_sent variables hold lemmatised tokens rather than the raw surface forms.
train_sent_id, train_sent, train_pos, train_dep, train_ent = preprocess(Sentence_train)
val_sent_id, val_sent, val_pos, val_dep, val_ent = preprocess(Sentence_val)
test_sent_id, test_sent, test_pos, test_dep, test_ent = preprocess(Sentence_test)

# 2 - Input Embeddings

In [0]:
# Build the token vocabulary and the NER tag set as name -> integer id maps.
# Ids are handed out in first-seen order.

word_to_ix = {}
for tokens in train_sent+val_sent+test_sent:
    for token in tokens:
        # setdefault only inserts when the token is new, so existing ids never change.
        word_to_ix.setdefault(token, len(word_to_ix))
word_list = list(word_to_ix)

# The CRF's start/stop markers take the first two tag ids.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG:0, STOP_TAG:1}
for tag_string in NER_train+NER_val:
    for label in tag_string.split():
        tag_to_ix.setdefault(label, len(tag_to_ix))

In [0]:
# Map additional features (POS tag, dependency label, entity type) to indices
# and build a one-hot embedding matrix for each of them.
import numpy as np 


def _build_label_index(sentences):
    """Assign consecutive integer ids to labels in first-seen order.

    Args:
        sentences: iterable of label sequences (one sequence per sentence).

    Returns:
        dict mapping each distinct label to its id (0, 1, 2, ...).
    """
    label_to_ix = {}
    for sentence in sentences:
        for label in sentence:
            if label not in label_to_ix:
                label_to_ix[label] = len(label_to_ix)
    return label_to_ix


# The identity matrix gives each label a one-hot row at its own index.
pos_to_ix = _build_label_index(train_pos+val_pos+test_pos)
pos_embedding = np.eye(len(pos_to_ix))

dep_to_ix = _build_label_index(train_dep+val_dep+test_dep)
dep_embedding = np.eye(len(dep_to_ix))

ent_to_ix = _build_label_index(train_ent+val_ent+test_ent)
ent_embedding = np.eye(len(ent_to_ix))

In [100]:
# Show the (n_labels, n_labels) shape of each one-hot feature matrix.
for feature_matrix in (pos_embedding, dep_embedding, ent_embedding):
    print(feature_matrix.shape)

(48, 48)
(44, 44)
(19, 19)


In [101]:
# Generate Word Embedding Matrix: one GloVe row per entry of word_list,
# in the same order as the ids in word_to_ix.
import numpy as np
import gensim.downloader as api
word_emb_model = api.load("glove-twitter-50") 

EMBEDDING_DIM = 50

embedding_matrix = []
for word in word_list:
    try:
        # Index the loaded vectors directly instead of going through `.wv`.
        # `.wv` on this object is a deprecated alias; where it is unavailable,
        # the previous bare `except:` would have swallowed the resulting
        # AttributeError and silently zeroed every row.
        embedding_matrix.append(word_emb_model[word])
    except KeyError:
        # Out-of-vocabulary word: fall back to an all-zero vector.
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  # This is added back by InteractiveShellApp.init_path()


(12431, 50)

In [0]:
# Total per-token input width: GloVe vector + one-hot POS/dep/ent + BERT vector.
GLOVE_DIM = 50  # Glove word embedding size
BERT_DIM = 1024  # Bert Embedding has 1024 dimensions
EMBEDDING_DIM = sum((
    GLOVE_DIM,
    pos_embedding.shape[0],
    dep_embedding.shape[0],
    ent_embedding.shape[0],
    BERT_DIM,
))

In [0]:
#Convert Words, Tag, Additional Features to Index

def to_index(data, to_ix):
  """Convert sequences of symbols into sequences of integer ids.

  Each item of ``data`` is either an already-tokenised sequence (a list of
  words / feature labels) or a single whitespace-separated string (the NER tag
  strings). Strings are split on whitespace before lookup. The decision is made
  per item from its type, so the function does not depend on any global mapping.

  Args:
    data: iterable of token sequences or whitespace-separated strings.
    to_ix: dict mapping each symbol to its integer id.

  Returns:
    A list with one list of ids per item of ``data``.

  Raises:
    KeyError: if a symbol is missing from ``to_ix``.
  """
  input_index_list = []
  for sent in data:
    symbols = sent.split() if isinstance(sent, str) else sent
    input_index_list.append([to_ix[w] for w in symbols])
  return input_index_list

# Index every split. Word inputs come from the lemmatised *_sent lists; the
# gold tags come from the raw whitespace-separated NER strings.
train_input_index =  to_index(train_sent,word_to_ix)
train_output_index = to_index(NER_train,tag_to_ix)
train_ent_index =  to_index(train_ent,ent_to_ix)
train_dep_index = to_index(train_dep,dep_to_ix)
train_pos_index =  to_index(train_pos,pos_to_ix)

val_input_index = to_index(val_sent,word_to_ix)
val_output_index = to_index(NER_val,tag_to_ix)
val_ent_index =  to_index(val_ent,ent_to_ix)
val_dep_index = to_index(val_dep,dep_to_ix)
val_pos_index =  to_index(val_pos,pos_to_ix)

# No output index is built for the test split; only its inputs are indexed.
test_input_index = to_index(test_sent,word_to_ix)
test_ent_index =  to_index(test_ent,ent_to_ix)
test_dep_index = to_index(test_dep,dep_to_ix)
test_pos_index =  to_index(test_pos,pos_to_ix)

# 3 - NER model
## BERT-Based Bi-LSTM CRF with Attention


In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Seed PyTorch's random number generator so parameter initialisation is repeatable.
torch.manual_seed(1)

def argmax(vec):
    """Return, as a Python int, the index of the largest entry along dim 1.

    ``vec`` must have exactly one row, because the index tensor is converted
    with ``.item()``.
    """
    best_index = torch.max(vec, 1)[1]
    return best_index.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    """Return log(sum(exp(vec))) over the entries of a single-row tensor.

    Args:
        vec: tensor of shape (1, n).

    Returns:
        A 0-dim tensor holding the log-sum-exp of the row.

    Uses ``torch.logsumexp``, which performs the max-shift stabilisation
    internally and avoids the Python-side ``.item()`` round trip that the
    manual version needed to locate the maximum.
    """
    return torch.logsumexp(vec, dim=1).squeeze(0)

class BiLSTM_CRF(nn.Module):
    """Two-layer BiLSTM tagger with a CRF output layer.

    Each token is represented by the concatenation of its word embedding, its
    entity / POS / dependency embeddings (initialised from the module-level
    ``embedding_matrix``, ``ent_embedding``, ``pos_embedding`` and
    ``dep_embedding`` arrays) and an externally supplied per-token vector
    (``bert``). Sentences are processed one at a time (batch size 1).

    All tensors the model creates internally are placed on the device of its
    own parameters / inputs, so the class does not rely on a global ``device``.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        """Build the layers.

        Args:
            vocab_size: number of rows of the word embedding table.
            tag_to_ix: dict mapping tag names (including START_TAG and
                STOP_TAG) to integer ids.
            embedding_dim: width of the concatenated per-token input vector
                fed to the LSTM.
            hidden_dim: width of the BiLSTM output per token (each direction
                uses ``hidden_dim // 2`` units).
        """
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        # Word embedding width follows the pre-trained matrix instead of a
        # hard-coded 50, so a different embedding size only needs a new matrix.
        self.word_embeds = nn.Embedding(vocab_size, embedding_matrix.shape[1])
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))

        # The feature embeddings start as the square matrices built earlier.
        self.pos_embeds = nn.Embedding(pos_embedding.shape[0], pos_embedding.shape[0])
        self.pos_embeds.weight.data.copy_(torch.from_numpy(pos_embedding))

        self.dep_embeds = nn.Embedding(dep_embedding.shape[0], dep_embedding.shape[0])
        self.dep_embeds.weight.data.copy_(torch.from_numpy(dep_embedding))

        self.ent_embeds = nn.Embedding(ent_embedding.shape[0], ent_embedding.shape[0])
        self.ent_embeds.weight.data.copy_(torch.from_numpy(ent_embedding))

        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=2, bidirectional=True,dropout=0.5)

        # Maps the attention-augmented LSTM output (2 * hidden_dim) into tag space.
        self.hidden2tag = nn.Linear(hidden_dim*2, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()
        self.dropout_lstm=nn.Dropout(p=0.5)

    def init_hidden(self):
        """Return a fresh random ``(h_0, c_0)`` pair for a batch of one sentence."""
        # First dim is 4 = 2 layers * 2 directions, matching self.lstm.
        state_shape = (4, 1, self.hidden_dim // 2)
        # Follow the model's own parameters rather than a global `device`.
        target_device = self.transitions.device
        return (torch.randn(*state_shape).to(target_device),
                torch.randn(*state_shape).to(target_device))

    def _forward_alg(self, feats):
        """Run the CRF forward algorithm and return the log partition function.

        Args:
            feats: emission scores, one row of ``tagset_size`` scores per token.
        """
        init_alphas = torch.full((1, self.tagset_size), -10000., device=feats.device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def attention(self,lstm_out,hidden_out):
        """Concatenate an attention read-out of ``hidden_out`` with ``lstm_out``.

        Args:
            lstm_out: LSTM outputs, used here as a batch of ``seq_len``
                single-row matrices, i.e. shape (seq_len, 1, H).
            hidden_out: final hidden state of shape (1, 1, H).

        Returns:
            Tensor of shape (seq_len, 2, H): the attention output stacked on
            ``lstm_out`` along dim 1.

        NOTE(review): the score tensor has shape (seq_len, 1, 1) and the softmax
        is taken over its last (size-1) dimension, so every attention weight is
        exactly 1.0 and the attention output is just ``hidden_out`` repeated
        for each token. Normalising across tokens would change the model's
        outputs, so the computation is left as it was - confirm the intent.
        (A scaled variant, dividing the scores by sqrt(hidden_dim), was
        previously left commented out here.)
        """
        seq_len = lstm_out.size()[0]
        # One copy of the final hidden state per token, without growing the
        # tensor by repeated concatenation.
        hidden_rep = hidden_out.expand(seq_len, -1, -1)
        hidden_rep_t = torch.transpose(hidden_rep, 1, 2)

        attn_weights = F.softmax(torch.bmm(lstm_out, hidden_rep_t),dim=-1)

        attn_output = torch.bmm(attn_weights, hidden_rep)
        concat_output = torch.cat((attn_output, lstm_out), 1)
        return concat_output

    def _get_lstm_features(self, sentence, ent, pos, dep, bert):
        """Compute per-token emission scores for one sentence.

        Args:
            sentence, ent, pos, dep: 1-D long tensors of word / entity / POS /
                dependency ids, one entry per token.
            bert: per-token float vectors, reshaped to (len(bert), 1, -1).

        Returns:
            Tensor with one row of ``tagset_size`` scores per token.
        """
        # A new random initial state is drawn for every sentence.
        self.hidden = self.init_hidden()

        w_embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        e_embeds = self.ent_embeds(ent).view(len(ent), 1, -1)
        p_embeds = self.pos_embeds(pos).view(len(pos), 1, -1)
        d_embeds = self.dep_embeds(dep).view(len(dep), 1, -1)
        b_embeds = bert.view(len(bert), 1, -1)

        embeds = torch.cat((w_embeds,e_embeds,p_embeds,d_embeds,b_embeds),2)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)

        # View h_n as (layer, direction, batch, units) and join both
        # directions of the last layer (index 1) into one vector.
        last_states = self.hidden[0].view(2, 2, 1, -1)
        hidden_out = torch.cat((last_states[1,0,:,:], last_states[1,1,:,:]),1)
        hidden_out = hidden_out.unsqueeze(0)

        att_out = self.attention(lstm_out,hidden_out)
        # Flatten (seq_len, 2, H) into one row of 2*H features per token.
        att_out = att_out.view(-1,self.hidden_dim*2)

        att_out = self.dropout_lstm(att_out)
        lstm_feats = self.hidden2tag(att_out)

        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Return the CRF score of the given tag sequence (shape (1,) tensor)."""
        score = torch.zeros(1, device=feats.device)
        # Prepend START_TAG so tags[i] is the tag preceding token i.
        start = torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long, device=tags.device)
        tags = torch.cat([start, tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Return ``(path_score, best_path)`` for the highest-scoring tag sequence.

        ``best_path`` is a list of tag ids, one per row of ``feats``.
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000., device=feats.device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags, ent, pos, dep, bert):
        """Return the CRF training loss: log partition minus gold-path score."""
        feats = self._get_lstm_features(sentence, ent, pos, dep, bert)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence, ent, pos, dep, bert):  # dont confuse this with _forward_alg above.
        """Decode one sentence; returns ``(score, tag_seq)`` from Viterbi."""
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence, ent, pos, dep, bert)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [0]:
def cal_acc(model, input_index, output_index, ent_index, pos_index, dep_index, bert_clean):
  """Compute token-level accuracy of ``model`` on an indexed dataset.

  Args:
    model: a torch module whose call takes (sentence, ent, pos, dep, bert)
      tensors and returns ``(score, predicted_tag_ids)``.
    input_index: list of word-id lists, one per sentence.
    output_index: list of gold tag-id lists, aligned with ``input_index``.
    ent_index, pos_index, dep_index: per-sentence feature-id lists.
    bert_clean: per-sentence sequences of per-token float vectors.

  Returns:
    ``(ground_truth, predicted_list, accuracy)`` where the first two are flat
    lists of tag ids over all tokens and ``accuracy`` is the fraction of
    matching positions (0.0 when there are no tokens at all).
  """
  # Put inputs on the model's own device instead of relying on a global.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  correct = 0
  total = 0
  predicted_list = []
  ground_truth = []

  # Decoding needs no gradients; skipping graph construction saves memory.
  with torch.no_grad():
    for i, idxs in enumerate(input_index):
      sentence_in = torch.tensor(idxs, dtype=torch.long).to(target_device)
      ent = torch.tensor(ent_index[i], dtype=torch.long).to(target_device)
      pos = torch.tensor(pos_index[i], dtype=torch.long).to(target_device)
      dep = torch.tensor(dep_index[i], dtype=torch.long).to(target_device)
      bert = torch.tensor(bert_clean[i], dtype=torch.float).to(target_device)

      _, predicted = model(sentence_in, ent, pos, dep, bert)
      for j in range(len(output_index[i])):
        total += 1

        predicted_list.append(predicted[j])
        ground_truth.append(output_index[i][j])

        if predicted[j] == output_index[i][j]:
          correct += 1

  # Guard against an empty dataset instead of raising ZeroDivisionError.
  accuracy = correct/total if total else 0.0

  return ground_truth, predicted_list, accuracy

# 4 - Evaluation
## a. Evaluation Setup


In [0]:
# Use the GPU when one is available. Functions defined in earlier cells read this
# module-level `device` at call time, so this cell must run before they are called.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Width of the BiLSTM output per token (each direction gets HIDDEN_DIM // 2 units).
HIDDEN_DIM = 100

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)
# Adam with L2 regularisation applied through weight_decay.
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

## b. Training & Evaluation Results


In [107]:
"""Each epoch will take about 6-7 minutes"""
import datetime

NUM_EPOCHS = 10


def _sample_tensors(word_ids, tag_ids, ent_ids, dep_ids, pos_ids, bert_vectors):
    """Turn one sentence's indices and vectors into tensors on `device`.

    Returns (sentence_in, targets, ent, dep, pos, bert).
    """
    return (
        torch.tensor(word_ids, dtype=torch.long).to(device),
        torch.tensor(tag_ids, dtype=torch.long).to(device),
        torch.tensor(ent_ids, dtype=torch.long).to(device),
        torch.tensor(dep_ids, dtype=torch.long).to(device),
        torch.tensor(pos_ids, dtype=torch.long).to(device),
        torch.tensor(bert_vectors, dtype=torch.float).to(device),
    )


for epoch in range(NUM_EPOCHS):
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        # Step 1. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in, targets, ent, dep, pos, bert = _sample_tensors(
            idxs, train_output_index[i], train_ent_index[i],
            train_dep_index[i], train_pos_index[i], train_bert_clean[i])

        # Step 2. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 3. Run our forward pass and compute the loss.
        loss = model.neg_log_likelihood(sentence_in, targets, ent, pos, dep, bert)

        # Step 4. Compute the gradients and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()

    # Evaluation only reads values, so no autograd graph is needed. Without
    # no_grad the validation losses would each build and discard a full graph.
    with torch.no_grad():
        _, _, train_acc = cal_acc(model,train_input_index,train_output_index,train_ent_index,train_pos_index,train_dep_index, train_bert_clean)
        _, _, val_acc = cal_acc(model,val_input_index,val_output_index,val_ent_index,val_pos_index,val_dep_index, val_bert_clean)

        val_loss = 0
        for i, idxs in enumerate(val_input_index):
            sentence_in, targets, ent, dep, pos, bert = _sample_tensors(
                idxs, val_output_index[i], val_ent_index[i],
                val_dep_index[i], val_pos_index[i], val_bert_clean[i])

            loss = model.neg_log_likelihood(sentence_in, targets, ent, pos, dep, bert)
            val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))


Epoch:1, Training loss: 11103.50, train acc: 0.9503, val loss: 1333.60, val acc: 0.9385, time: 355.72s
Epoch:2, Training loss: 5254.47, train acc: 0.9690, val loss: 1043.27, val acc: 0.9490, time: 353.10s
Epoch:3, Training loss: 3836.85, train acc: 0.9786, val loss: 865.99, val acc: 0.9591, time: 353.22s
Epoch:4, Training loss: 2973.44, train acc: 0.9804, val loss: 1008.00, val acc: 0.9483, time: 354.35s
Epoch:5, Training loss: 2506.35, train acc: 0.9833, val loss: 1052.37, val acc: 0.9529, time: 353.85s
Epoch:6, Training loss: 2208.91, train acc: 0.9882, val loss: 907.00, val acc: 0.9665, time: 355.12s
Epoch:7, Training loss: 1862.90, train acc: 0.9907, val loss: 855.11, val acc: 0.9666, time: 354.11s
Epoch:8, Training loss: 1881.03, train acc: 0.9906, val loss: 938.14, val acc: 0.9606, time: 352.91s
Epoch:9, Training loss: 1618.85, train acc: 0.9927, val loss: 817.37, val acc: 0.9714, time: 354.91s
Epoch:10, Training loss: 1728.92, train acc: 0.9921, val loss: 863.66, val acc: 0.9745

In [109]:
# Save model
# Import under an alias: a plain `from google.colab import drive` would rebind
# `drive`, which the download cells use for the PyDrive client, and break them
# if they are re-run after this cell.
from google.colab import drive as colab_drive
colab_drive.mount('/content/drive')

# NOTE(review): this pickles the whole model object, so loading it requires the
# BiLSTM_CRF class to be defined. Saving model.state_dict() would be more
# portable; not changed here because it alters the saved file's format.
torch.save(model, '/content/drive/My Drive/BERT_Bi-LSTM_CRF_w_Attention.pt')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  "type " + obj.__name__ + ". It won't be checked "


# 5 - Output & Save Test Predictions in the Kaggle-Required Format


In [0]:
# Function that creates predictions for the test set
def predict_test(model, input_index, ent_index, pos_index, dep_index, bert_clean):
  """Predict tag ids for every token of an indexed (unlabelled) dataset.

  Args:
    model: a torch module whose call takes (sentence, ent, pos, dep, bert)
      tensors and returns ``(score, predicted_tag_ids)``.
    input_index: list of word-id lists, one per sentence.
    ent_index, pos_index, dep_index: per-sentence feature-id lists.
    bert_clean: per-sentence sequences of per-token float vectors.

  Returns:
    A flat list of predicted tag ids, sentences concatenated in input order.
  """
  # Put inputs on the model's own device instead of relying on a global.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  predicted_list = []

  # Inference only: skip autograd bookkeeping.
  with torch.no_grad():
    for i, idxs in enumerate(input_index):
      sentence_in = torch.tensor(idxs, dtype=torch.long).to(target_device)
      ent = torch.tensor(ent_index[i], dtype=torch.long).to(target_device)
      pos = torch.tensor(pos_index[i], dtype=torch.long).to(target_device)
      dep = torch.tensor(dep_index[i], dtype=torch.long).to(target_device)
      bert = torch.tensor(bert_clean[i], dtype=torch.float).to(target_device)

      _, predicted = model(sentence_in, ent, pos, dep, bert)
      # Take exactly one prediction per input token.
      predicted_list.extend(predicted[j] for j in range(len(idxs)))

  return predicted_list

In [0]:
# Make predictions for the test set and convert the predicted outputs to corresponding tags
predicted_test_idx = predict_test(model,test_input_index,test_ent_index,test_pos_index,test_dep_index,test_bert_clean)

# Invert tag_to_ix once so each prediction is a single dict lookup instead of a
# scan over every tag. An id with no tag now raises KeyError instead of being
# skipped silently, which would have misaligned the output rows.
ix_to_tag = {idx: tag for tag, idx in tag_to_ix.items()}
predicted_test_tags = [ix_to_tag[idx] for idx in predicted_test_idx]

In [0]:
# Save the predicted tags in the format required for the Kaggle submission:
# a single 'Predicted' column, with the DataFrame's integer index written as
# the first (unnamed) column because index=True.
df = pd.DataFrame({'Predicted': predicted_test_tags})
df.to_csv('predictions.csv',index=True)