## Download Dataset



In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate
drive = None
def authenticate():
    """Sign in from Colab and bind a PyDrive client to the global ``drive``.

    Side effect: rebinds the module-level ``drive`` to a ``GoogleDrive``
    instance built from the application-default credentials.
    """
    global drive
    auth.authenticate_user()
    google_auth = GoogleAuth()
    google_auth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(google_auth)

#Download files
def downloadFiles(fileIds):
    """Download Google Drive files to local paths.

    Args:
        fileIds: iterable of ``[local_path, drive_file_id]`` entries; index 0
            is the destination path, index 1 the Drive file id.
    """
    # Authenticate lazily and only once per session: previously the whole
    # sign-in flow was repeated on every call.
    if drive is None:
        authenticate()
    for entry in fileIds:
        local_path, file_id = entry[0], entry[1]
        remote_file = drive.CreateFile({"id": file_id})
        remote_file.GetContentFile(local_path)

import os
# Create the download directory; exist_ok keeps the cell safe to re-run.
os.makedirs('input', exist_ok=True)

# Uncased variant of the datasets (kept for reference, not downloaded):
# downloadFiles([["input/train.csv", "1h_KEHz61FUJMyLkGpOzdBZhQ0-s5npou"]])
# downloadFiles([["input/val.csv", "1Rga0OV-x-vCLDLIMkiW4uL5UZqmDrvJN"]])
# downloadFiles([["input/test.csv", "1dSnSVT6vuFxvZ8vIMQnyK_J-gXwu2nCV"]])
# downloadFiles([["input/sample_submission.csv", "1BJLpaxVN8XlNTnPvOtgepVxeU3FZBV4E"]])

# Cased datasets: fetch everything in a single call so that authentication
# is triggered once instead of once per file.
downloadFiles([
    ["input/train.csv", "1nKzLTTgUhE6RqA7KUfpp9iOS3Yuign48"],
    ["input/val.csv", "10C16Q_1riHyojGwFBWANMQsR2ow08aoX"],
    ["input/test.csv", "1iN_leOWkQCAZ716pO40q2KF5zwhf_MHF"],
    ["input/sample_submission.csv", "1BJLpaxVN8XlNTnPvOtgepVxeU3FZBV4E"],
])

In [0]:
import pandas as pd

# Load each downloaded CSV into its own DataFrame.
train = pd.read_csv('input/train.csv')
val = pd.read_csv('input/val.csv')
test = pd.read_csv('input/test.csv')
sample_submission = pd.read_csv('input/sample_submission.csv')

In [0]:
# Whitespace-tokenise the sentences of every split.
train_data = [text.split() for text in train['Sentence']]
val_data = [text.split() for text in val['Sentence']]
test_data = [text.split() for text in test['Sentence']]

# The NER column is split the same way, giving one tag list per sentence
# (only the train and validation frames are read for tags).
target_y_train = [labels.split() for labels in train['NER']]
target_y_validation = [labels.split() for labels in val['NER']]

In [0]:
# Sanity check: show the second example (index 1) of every tokenised split
# next to its tag sequence where one exists.
print(train_data[1])
print(target_y_train[1])
print(val_data[1])
print(target_y_validation[1])
print(test_data[1])

## Preprocess

### Extract per-token features and encode them

In [0]:
# get features
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    The dict describes the token itself (surface form, lowercase form, last
    three / two characters, case and digit flags), the lowercase form and
    case flags of the left and right neighbours when they exist, and the
    boolean sentence-boundary markers ``BOS`` / ``EOS``.
    """
    token = sent[i]
    has_left = i > 0
    has_right = i < len(sent) - 1

    feats = {
        'word': token,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
    }

    # Left context; BOS is True exactly when there is no left neighbour.
    if has_left:
        left = sent[i - 1]
        feats['-1:word.lower()'] = left.lower()
        feats['-1:word.istitle()'] = left.istitle()
        feats['-1:word.isupper()'] = left.isupper()
    feats['BOS'] = not has_left

    # Right context; EOS is True exactly when there is no right neighbour.
    if has_right:
        right = sent[i + 1]
        feats['+1:word.lower()'] = right.lower()
        feats['+1:word.istitle()'] = right.istitle()
        feats['+1:word.isupper()'] = right.isupper()
    feats['EOS'] = not has_right

    return feats
def sent2features(sent):
    """Return the list of ``word2features`` dicts, one per token of ``sent``."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

In [0]:
# fit and transform features
class FeaturesTransformer():
    """Encode per-token feature dicts as flat numeric vectors.

    Only the feature names listed in ``features`` are encoded, always in this
    fixed order: ``'word'``, ``'word.lower()'`` (embedding lookups),
    ``'word[-3:]'``, ``'word[-2:]'`` (integer ids learned by ``fit``),
    ``'word.isupper()'``, ``'word.istitle()'``, ``'word.isdigit()'``,
    ``'BOS'``, ``'EOS'`` (0/1 flags). The neighbour features (``-1:...`` /
    ``+1:...``) are not encoded by ``transform``.

    Args:
        features: collection of feature names to encode.
        word_emb_model: embedding lookup indexable by token, either directly
            or through a ``.wv`` attribute.
        word_emb_size: dimensionality of the embedding vectors; used for the
            zero vector given to out-of-vocabulary tokens.
    """

    def __init__(self, features, word_emb_model, word_emb_size):
        self.features = features

        self.word_emb_model = word_emb_model
        self.word_emb_size = word_emb_size

        self.isbos = {False: 0, True: 1}
        self.iseos = {False: 0, True: 1}

        self.word_list = []
        self.word_to_ix = {}

        # Suffix vocabularies: list in first-seen order plus suffix -> id map.
        self.word_last_three_chars = []
        self.word_last_three_chars_to_ix = {}
        self.word_last_two_chars = []
        self.word_last_two_chars_to_ix = {}

        self.word_isupper = {False: 0, True: 1}
        self.word_istitle = {False: 0, True: 1}
        self.word_isdigit = {False: 0, True: 1}

    def _embed(self, token):
        """Return the embedding of ``token``, or a zero vector if it is unknown."""
        # Full gensim models expose their vectors under `.wv`, while a bare
        # KeyedVectors object is indexed directly. Resolving this explicitly
        # matters: a blanket `except` would turn a missing `.wv` attribute
        # into all-zero embeddings for every token.
        vectors = getattr(self.word_emb_model, 'wv', self.word_emb_model)
        try:
            return vectors[token]
        except KeyError:
            # Out-of-vocabulary token.
            return [0] * self.word_emb_size

    def fit(self, features):
        """Extend the suffix vocabularies with the suffixes found in ``features``.

        May be called repeatedly (e.g. once per data split); ids already
        assigned are kept and new suffixes get the next free id.

        Args:
            features: list of sentences, each a list of feature dicts.
        """
        use_three = 'word[-3:]' in self.features
        use_two = 'word[-2:]' in self.features
        for sentence in features:
            for feature in sentence:
                if use_three:
                    suffix = feature['word[-3:]']
                    # Dict membership is O(1); list and dict stay in sync.
                    if suffix not in self.word_last_three_chars_to_ix:
                        self.word_last_three_chars.append(suffix)
                        self.word_last_three_chars_to_ix[suffix] = len(self.word_last_three_chars_to_ix)

                if use_two:
                    suffix = feature['word[-2:]']
                    if suffix not in self.word_last_two_chars_to_ix:
                        self.word_last_two_chars.append(suffix)
                        self.word_last_two_chars_to_ix[suffix] = len(self.word_last_two_chars_to_ix)

    def transform(self, features):
        """
        Transform features to embeddings

        Args:
            features: list of sentences, each a list of feature dicts.

        Returns:
            A list of sentences, each a list of per-token numeric lists.

        Raises:
            KeyError: if a selected suffix feature holds a suffix that was
                never seen by ``fit``.
        """
        transformed_data = []
        for sentence in features:
            encoded_sentence = []
            for feature in sentence:
                vector = []
                if 'word' in self.features:
                    vector.extend(self._embed(feature['word']))

                if 'word.lower()' in self.features:
                    vector.extend(self._embed(feature['word.lower()']))

                if 'word[-3:]' in self.features:
                    vector.append(self.word_last_three_chars_to_ix[feature['word[-3:]']])

                if 'word[-2:]' in self.features:
                    vector.append(self.word_last_two_chars_to_ix[feature['word[-2:]']])

                if 'word.isupper()' in self.features:
                    vector.append(self.word_isupper[feature['word.isupper()']])

                if 'word.istitle()' in self.features:
                    vector.append(self.word_istitle[feature['word.istitle()']])

                if 'word.isdigit()' in self.features:
                    vector.append(self.word_isdigit[feature['word.isdigit()']])

                if 'BOS' in self.features:
                    vector.append(self.isbos[feature['BOS']])

                if 'EOS' in self.features:
                    vector.append(self.iseos[feature['EOS']])

                encoded_sentence.append(vector)
            transformed_data.append(encoded_sentence)
        return transformed_data

    def fit_transform(self, features):
        """Run ``fit`` on ``features`` and return ``transform(features)``."""
        self.fit(features)
        return self.transform(features)

## Get features

In [0]:
def get_features(train_data, validation_data, test_data):
    """Run ``sent2features`` over every sentence of the three splits.

    Returns:
        ``(train_features, val_features, test_features)``, each a list with
        one entry per sentence.
    """
    featurised = tuple(
        list(map(sent2features, split))
        for split in (train_data, validation_data, test_data)
    )
    return featurised

### Generate Input Embeddings

In [0]:
import gensim.downloader as api
import numpy as np

def generate_input_embeddings(features, word_emb_model, word_emb_size, train_features, val_features, test_features):
    """Encode the three feature splits with one shared ``FeaturesTransformer``.

    The transformer is fitted on all three splits before any of them is
    transformed.

    Returns:
        ``(train, val, test)`` transformed features, in that order.
    """
    transformer = FeaturesTransformer(features, word_emb_model, word_emb_size)
    splits = (train_features, val_features, test_features)

    # First pass: fit on every split.
    for split in splits:
        transformer.fit(split)

    # Second pass: encode every split with the completed transformer.
    train_out, val_out, test_out = (transformer.transform(split) for split in splits)
    return train_out, val_out, test_out

### Convert tags into idxs

In [0]:
def to_index(data, to_ix):
    """Map every item of every sequence in ``data`` through ``to_ix``.

    Returns a list of index lists with the same nesting as ``data``; an item
    missing from ``to_ix`` raises ``KeyError``.
    """
    return [[to_ix[item] for item in sequence] for sequence in data]

# Sentinel tags; they take the first two ids of the tag vocabulary.
START_TAG = "<START>"
STOP_TAG = "<STOP>"

# Assign ids to the remaining tags in order of first appearance across the
# training and validation labels.
tag_to_ix = {START_TAG: 0, STOP_TAG: 1}
for tags in target_y_train + target_y_validation:
    for tag in tags:
        # setdefault leaves known tags untouched and gives new ones the next id.
        tag_to_ix.setdefault(tag, len(tag_to_ix))

# Gold tag sequences as integer ids.
train_output_index = to_index(target_y_train, tag_to_ix)
val_output_index = to_index(target_y_validation, tag_to_ix)

## Model

In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

# Seed PyTorch's random number generator so that runs are repeatable.
torch.manual_seed(1)

def argmax(vec):
    """Return, as a Python int, the index of the maximum along dim 1 of ``vec``.

    ``.item()`` requires a single resulting index, so ``vec`` must have
    exactly one row.
    """
    best_index = torch.max(vec, 1)[1]
    return best_index.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    """Return ``log(sum(exp(vec)))`` for a single-row tensor ``vec``.

    The row maximum is subtracted before exponentiating and added back
    afterwards.
    """
    peak = vec[0, argmax(vec)]
    shifted = vec - peak.view(1, -1).expand(1, vec.size()[1])
    return peak + torch.log(torch.exp(shifted).sum())

class BiLSTM_CRF(nn.Module):
    """Bidirectional LSTM encoder with a CRF output layer for sequence tagging.

    The module has no embedding layer: every sentence is passed in as a
    tensor holding one pre-computed feature vector per token. One sentence is
    processed at a time (batch size 1).

    Args:
        tag_to_ix: mapping from tag string to integer id; must contain
            ``START_TAG`` and ``STOP_TAG``.
        embedding_dim: size of each per-token input vector.
        hidden_dim: total BiLSTM output size (``hidden_dim // 2`` per
            direction).
        apply_attention: if True, dot-product self-attention over the BiLSTM
            outputs is concatenated to them before the tag projection.
        use_bigru: if True, an additional bidirectional GRU is constructed.
        lstm_layers: number of stacked LSTM layers.
        gru_layers: number of stacked GRU layers (only with ``use_bigru``).
    """

    def __init__(self, tag_to_ix, embedding_dim, hidden_dim, apply_attention=False, use_bigru=False, lstm_layers=1, gru_layers=1):
        super(BiLSTM_CRF, self).__init__()
        self.lstm_layers = lstm_layers
        self.apply_attention = apply_attention
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=lstm_layers, bidirectional=True)

        if use_bigru:
            # Sized from hidden_dim; the previous code read the undefined
            # attribute `self.hidden_size` and raised AttributeError.
            # NOTE(review): the GRU is constructed but is not applied
            # anywhere in _get_lstm_features.
            self.gru = nn.GRU(self.hidden_dim, self.hidden_dim, num_layers=gru_layers, bidirectional=True)

        # Maps the encoder output into tag space. With attention the encoder
        # output is [attended ; lstm_out], i.e. twice as wide.
        feature_dim = hidden_dim * 2 if apply_attention else hidden_dim
        self.hidden2tag = nn.Linear(feature_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def _device(self):
        """Device the module's parameters currently live on."""
        return self.transitions.device

    def init_hidden(self):
        """Return a fresh random ``(h_0, c_0)`` pair for the LSTM (batch size 1)."""
        dev = self._device()
        shape = (2 * self.lstm_layers, 1, self.hidden_dim // 2)
        return (torch.randn(*shape).to(dev),
                torch.randn(*shape).to(dev))

    def _forward_alg(self, feats):
        """Compute the log partition function over all tag paths for ``feats``.

        Args:
            feats: emission scores, one row of ``tagset_size`` scores per token.
        """
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(self._device())
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def cal_attention(self, hidden, input_embedding, method='dot product'):
        """Attend from ``hidden`` over ``input_embedding`` and concatenate.

        Args:
            hidden: queries of shape ``(1, seq_len, dim)``.
            input_embedding: keys/values of shape ``(seq_len, dim)``.
            method: ``'dot product'`` or ``'scale dot product'`` (scores are
                divided by ``sqrt(dim)``).

        Returns:
            Tensor of shape ``(seq_len, 2 * dim)``: the attended values
            concatenated with ``hidden[0]``.

        Raises:
            ValueError: if ``method`` is not one of the two supported names.
        """
        if method not in ('dot product', 'scale dot product'):
            raise ValueError("Unknown attention method: %r" % (method,))

        # (1, seq, dim) x (1, dim, seq) -> (1, seq, seq) similarity scores.
        scores = torch.bmm(hidden, input_embedding.transpose(0, 1).unsqueeze(0))
        if method == 'scale dot product':
            scores = scores / (hidden.size(-1) ** 0.5)

        attn_weights = torch.softmax(scores, dim=-1)
        attn_output = torch.bmm(attn_weights, input_embedding.unsqueeze(0))
        return torch.cat((attn_output[0], hidden[0]), 1)

    def _get_lstm_features(self, sentence):
        """Run the encoder over one sentence and return per-token tag scores.

        Args:
            sentence: tensor with one feature vector per token.
        """
        self.hidden = self.init_hidden()
        embeds = sentence.float().view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        # calculate attention
        if self.apply_attention:
            # Self-attention over the BiLSTM outputs: queries get the batch
            # dimension cal_attention expects, keys/values stay 2-D.
            lstm_out = self.cal_attention(lstm_out.unsqueeze(0), lstm_out)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Score the tag sequence ``tags`` (a long tensor) against ``feats``."""
        dev = self._device()
        score = torch.zeros(1).to(dev)
        start = torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(dev)
        tags = torch.cat([start, tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Return ``(path_score, best_path)`` for the emission scores ``feats``.

        ``best_path`` is a list of tag ids, one per token.
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(self._device())
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Return the CRF loss: log partition minus the gold path score."""
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Decode one sentence; returns ``(score, tag_id_list)``."""
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [0]:
import numpy as np
from sklearn.metrics import f1_score

def cal_acc(model, input_index, output_index):
    """Decode every sentence with ``model`` and score it against the gold tags.

    Args:
        model: callable returning ``(score, tag_id_list)`` for one sentence
            tensor.
        input_index: list of sentences, each a list of per-token feature
            vectors.
        output_index: list of gold tag-id lists, aligned with ``input_index``.

    Returns:
        ``(ground_truth, predicted, accuracy, score)`` where the first two
        are flat tag-id lists over all tokens and ``score`` is the
        micro-averaged F1.
    """
    ground_truth = []
    predicted = []
    # Inference only: no need to record autograd graphs.
    with torch.no_grad():
        for i, idxs in enumerate(input_index):
            ground_truth += output_index[i]
            # The inputs are real-valued feature vectors. They must be fed as
            # floats, exactly as in training; casting to long truncated every
            # embedding component toward zero before evaluation.
            _, pred = model(torch.tensor(idxs, dtype=torch.float).to(device))
            predicted += pred
    matches = np.array(ground_truth) == np.array(predicted)
    accuracy = np.sum(matches) / len(ground_truth)
    score = f1_score(ground_truth, predicted, average='micro')
    return ground_truth, predicted, accuracy, score

### Initialize Model

In [0]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Global compute device used throughout the notebook: CUDA when available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def init_model(embedding_dim, hidden_dim, lstm_layers):
    """Create the tagger together with its optimizer and LR scheduler.

    Returns:
        ``(model, optimizer, scheduler)``: a ``BiLSTM_CRF`` on ``device``, a
        plain SGD optimizer (lr 0.01) and a ``ReduceLROnPlateau`` scheduler
        in ``'max'`` mode.
    """
    tagger = BiLSTM_CRF(tag_to_ix, embedding_dim, hidden_dim, lstm_layers=lstm_layers)
    tagger = tagger.to(device)

    # Alternative with L2 regularisation: optim.SGD(..., lr=0.01, weight_decay=1e-4)
    sgd = optim.SGD(tagger.parameters(), lr=0.01)

    plateau = ReduceLROnPlateau(sgd, mode='max', patience=3, verbose=True, threshold=0.001)
    return tagger, sgd, plateau

### Train the model

In [0]:
"""Each epoch will take about 1-2 minutes"""

import datetime

def train(model, optimizer, scheduler, train_transformed_features, train_output_index, val_transformed_features, val_output_index, epochs=20):
    """Train ``model`` one sentence at a time and report metrics per epoch.

    Args:
        model: tagger exposing ``neg_log_likelihood(sentence, tags)``.
        optimizer: optimizer over the model parameters.
        scheduler: scheduler stepped once per epoch with the validation score.
        train_transformed_features / val_transformed_features: lists of
            sentences, each a list of per-token feature vectors.
        train_output_index / val_output_index: aligned lists of gold tag ids.
        epochs: number of passes over the training data (default 20).
    """
    for epoch in range(epochs):
        time1 = datetime.datetime.now()
        train_loss = 0

        model.train()
        for i, idxs in enumerate(train_transformed_features):
            tags_index = train_output_index[i]

            # Step 1. Pytorch accumulates gradients, so clear them before
            # each instance.
            model.zero_grad()

            # Step 2. Turn the feature vectors and gold tags into tensors.
            sentence_in = torch.tensor(idxs, dtype=torch.float).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)

            # Step 3. Run the forward pass to get the loss.
            loss = model.neg_log_likelihood(sentence_in, targets)

            # Step 4. Compute gradients and update the parameters.
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        model.eval()
        _, _, _, train_score = cal_acc(model, train_transformed_features, train_output_index)
        _, _, _, val_score = cal_acc(model, val_transformed_features, val_output_index)

        scheduler.step(val_score)

        val_loss = 0
        # Validation loss is only reported, never back-propagated.
        with torch.no_grad():
            for i, idxs in enumerate(val_transformed_features):
                tags_index = val_output_index[i]
                # Same float dtype as in training; a long cast would truncate
                # the real-valued features and distort the loss.
                sentence_in = torch.tensor(idxs, dtype=torch.float).to(device)
                targets = torch.tensor(tags_index, dtype=torch.long).to(device)
                loss = model.neg_log_likelihood(sentence_in, targets)
                val_loss += loss.item()
        time2 = datetime.datetime.now()

        print("Epoch:%d, Training loss: %.2f, train score: %.4f | val loss: %.2f, val score: %.4f| time: %.2fs" %(epoch+1, train_loss, train_score, val_loss, val_score, (time2-time1).total_seconds()))


### Test

In [0]:
from sklearn.metrics import classification_report

def test(model, val_transformed_features, val_output_index):
    """Print a per-tag classification report for ``model`` on the given split."""
    model.eval()

    gold_ids, predicted_ids, _, _ = cal_acc(model, val_transformed_features, val_output_index)

    # Invert the tag vocabulary so ids can be reported as tag strings.
    ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}
    y_true_decode = [ix_to_tag[ix] for ix in gold_ids]
    y_pred_decode = [ix_to_tag[ix] for ix in predicted_ids]

    print(classification_report(y_true_decode, y_pred_decode, digits=4))

## Evaluation

In [0]:
def main(train_data, val_data, test_data, features, word_emb_model, word_emb_size, lstm_layers=1):
    """
    Run one full experiment: featurise, encode, train and report.

    train_data: Train sentence tokenize dataset. [[word1, word2, word3, ...], [], ...]
    val_data: Validation sentence tokenize dataset.
    test_data: Test tokenize dataset.
    features: Features list. ['word', 'word[-3:]', 'word[-2:]', 'word.isupper()', 'word.istitle()', 'BOS', 'EOS'].
    word_emb_model: Word embedding model name, like "word2vec-google-news-300".
    word_emb_size: Word embedding model dim.
    lstm_layers: Number of stacked BiLSTM layers.

    Returns the trained model. Gold tags are read from the module-level
    train_output_index / val_output_index.
    """
    # Per-token feature dicts for every split.
    train_feats, val_feats, test_feats = get_features(train_data, val_data, test_data)

    # Load the pretrained embeddings by name and encode the features.
    embedding_model = api.load(word_emb_model)
    train_inputs, val_inputs, test_inputs = generate_input_embeddings(
        features, embedding_model, word_emb_size, train_feats, val_feats, test_feats)

    # Hyperparameters: the input width is read off the first encoded token.
    embedding_dim = len(train_inputs[0][0])
    hidden_dim = 50
    print("Embedding dim: ", embedding_dim)

    model, optimizer, scheduler = init_model(embedding_dim, hidden_dim, lstm_layers)

    train(model, optimizer, scheduler, train_inputs, train_output_index, val_inputs, val_output_index)

    # Final report on the validation split.
    test(model, val_inputs, val_output_index)

    return model

### Only GloVe word embeddings (glove-twitter-100)

In [0]:
# Baseline: only the cased token's embedding from "glove-twitter-100" (100-d).
model = main(train_data, val_data, test_data, ['word'], "glove-twitter-100", 100)

### GloVe + BOS + EOS

In [0]:
# Token embedding plus the sentence-boundary flags.
model = main(train_data, val_data, test_data, ['word', 'BOS', 'EOS'], "glove-twitter-100", 100)

### GloVe (lowercased word) + BOS + EOS + word[-3:] + word[-2:]

In [0]:
# Lowercased-token embedding, boundary flags and the two suffix-id features.
model = main(train_data, val_data, test_data, ['word.lower()', 'BOS', 'EOS', 'word[-3:]', 'word[-2:]'], "glove-twitter-100", 100)

### GloVe + BOS + EOS + word.isupper() + word.istitle()

In [0]:
# Cased-token embedding, boundary flags and the two case flags.
model = main(train_data, val_data, test_data, ['word', 'BOS', 'EOS', 'word.isupper()', 'word.istitle()'], "glove-twitter-100", 100)

### fastText (lowercased word) + BOS + EOS + word.isupper() + word.istitle()

In [0]:
# Same flags as the previous cell, but with the lowercased token looked up in
# "fasttext-wiki-news-subwords-300" (300-d).
model = main(train_data, val_data, test_data, ['word.lower()', 'BOS', 'EOS', 'word.isupper()', 'word.istitle()'], "fasttext-wiki-news-subwords-300", 300)

### BiLSTM 2 Layers

In [0]:
# Two stacked BiLSTM layers on the 25-d "glove-twitter-25" token embedding.
# As the last experiment cell, this leaves its result bound to `model`.
model = main(train_data, val_data, test_data, ['word'], "glove-twitter-25", 25, lstm_layers=2)

## Save Model

In [0]:
# Serialises the whole module object (not just a state_dict), i.e. whatever
# `model` is currently bound to.
torch.save(model, './best_model.pth')

## Predict

In [0]:
# Load the pickled module onto the current device; without map_location a
# checkpoint saved on GPU cannot be restored in a CPU-only runtime.
# SECURITY: torch.load unpickles arbitrary objects - only load checkpoints
# you created or trust.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects full-module checkpoints - confirm against the installed version.
model = torch.load('best_model.pth', map_location=device)
# Make sure the restored module is in inference mode.
model.eval()


def decode_output(output_list):
    """Map a list of tag ids back to their tag strings via ``tag_to_ix``."""
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}
    return [ix_to_tag[output] for output in output_list]

# get input features
train_features, val_features, test_features = get_features(train_data, val_data, test_data)

# generate input embeddings (same embedding model and feature list as the
# experiment that produced the saved model)
word_emb_model = api.load("glove-twitter-25")
_, _, test_transformed_features = generate_input_embeddings(
    ['word'],
    word_emb_model,
    25,
    train_features,
    val_features,
    test_features)

In [0]:
# Decode every test sentence and flatten the predictions into one tag list.
test_predicted = []
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    for idxs in test_transformed_features:
        _, pred = model(torch.tensor(idxs, dtype=torch.float).to(device))
        test_predicted += pred

# One decoded tag per token, written in submission order.
decoded_predicted = decode_output(test_predicted)
sample_submission['Predicted'] = decoded_predicted
sample_submission.to_csv('submission.csv', index=False)