# Applying Bidirectional LSTM for NER

- In this notebook I apply a bidirectional LSTM (BiLSTM) to predict the entity tag of each word.
- To enhance the simple BiLSTM, pretrained GloVe vectors are used as word embeddings.

**Achieved 98% accuracy using BiLSTM and GloVe embeddings.**

In [None]:
# Mount Google Drive (Colab only); the data paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from collections import Counter
import numpy as np
import torch
from torch.autograd import Variable
import json
import random
import torch.optim as optim
from tqdm import trange
import numpy.ma as ma
import pickle

## Preprocessing Data

In [None]:
# Special tokens added to the vocabularies built below.
PAD_WORD = '<pad>'  # filler word used to pad short sentences inside a batch
PAD_TAG = 'O'       # tag that is guaranteed to be present in the tag list
UNK_WORD = 'UNK'    # stand-in id for words that are not in the vocabulary

In [None]:
def update_vocab(txt_path, vocab, is_word):
    """Add the words or the tags found in a token-per-line file to ``vocab``.

    Every line is stripped and split on single spaces. A line with more than
    one field is a token line laid out as ``<index> <word> [<tag>]``; a line
    with exactly one field (normally a blank line) ends a sentence.

    Args:
        txt_path: (string) path to the data file, one token per line
        vocab: (dict or Counter) object with an ``update`` method; modified in place
        is_word: (bool) count words (2nd field) when True, tags (3rd field) otherwise
    Returns:
        sentence_counter: (int) number of sentence-ending lines seen. A final
            sentence that is not followed by a blank line is not counted.
    Raises:
        ValueError: if ``is_word`` is False and a token line has no tag field.
    """
    # Column that holds the item we are counting.
    field = 1 if is_word else 2
    sentence_counter = 0
    with open(txt_path) as f:
        for line_no, line in enumerate(f, start=1):
            line_split = line.strip().split(' ')
            if len(line_split) == 1:
                # blank (or single-field) line: sentence boundary
                sentence_counter += 1
                continue
            if len(line_split) <= field:
                # Previously surfaced as a bare IndexError with no location.
                raise ValueError(
                    "{}:{}: expected at least {} space-separated fields, got {!r}".format(
                        txt_path, line_no, field + 1, line.strip()))
            vocab.update([line_split[field].strip()])
    return sentence_counter

In [None]:
def save_vocab_to_txt_file(vocab, txt_path):
    """Dump a vocabulary to a text file, one token per line.

    The 0-based line number of a token is the id it will later be given.

    Args:
        vocab: (iterable) yields the tokens (strings) in id order
        txt_path: (string) path of the vocabulary file to (over)write
    """
    with open(txt_path, "w") as out_file:
        out_file.writelines(token + '\n' for token in vocab)

In [None]:
# Locations of the dataset splits and of the artefacts written by this notebook.
# NOTE(review): base_path is an absolute Google Drive path; adjust it before running elsewhere.
base_path = "/content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/"
train_path = base_path+"data/train"
dev_path = base_path+ "data/dev"
test_path = base_path+ "data/test"
vocab_path = base_path+"vocab.pickle"
tagmap_path = base_path+"tagmap.pickle"

# Count word frequencies over all three splits; each call also returns the
# number of sentence-ending lines found in that split.
words = Counter()
size_train_sentences = update_vocab(train_path, words, True)
size_dev_sentences = update_vocab(dev_path, words, True)
size_test_sentences = update_vocab(test_path, words, True)

In [None]:
# Count tag frequencies over the train and dev splits only (the test split is not scanned for tags).
tags = Counter()
size_train_tags = update_vocab(train_path, tags, False)
size_dev_tags = update_vocab(dev_path, tags, False)

In [None]:
# Keep only the tokens seen at least 3 times.
# NOTE: this rebinds `words` and `tags` from Counters to plain lists, so the cell
# cannot be re-run on its own (a list has no .items()); re-run the counting cells first.
words = [tok for tok, count in words.items() if count >= 3]
tags = [tok for tok, count in tags.items() if count >= 3]

In [None]:
# Make sure the padding word and padding tag are present, then add the unknown-word token.
# NOTE: UNK_WORD is appended unconditionally, so re-running this cell adds a duplicate entry.
if PAD_WORD not in words: words.append(PAD_WORD)
if PAD_TAG not in tags: tags.append(PAD_TAG)
words.append(UNK_WORD)

In [None]:
# Persist the word list; the line number of each word in this file becomes its id.
words_path = base_path+"data/words.txt"
save_vocab_to_txt_file(words, words_path)

In [None]:
# Persist the tag list; the line number of each tag in this file becomes its id.
tags_path = base_path+"data/tags.txt"
save_vocab_to_txt_file(tags, tags_path)

In [None]:
# Dataset statistics and special tokens, merged into the hyper-parameters further down.
sizes = {
        'train_size': size_train_sentences,   # number of sentences, not number of rows
        'dev_size': size_dev_sentences,       # number of sentences, not number of rows
        'test_size': size_test_sentences,     # number of sentences, not number of rows
        'vocab_size': len(words),             # includes the pad and unknown tokens added above
        'number_of_tags': len(tags),
        'pad_word': PAD_WORD,
        'pad_tag': PAD_TAG,
        'unk_word': UNK_WORD
    }

In [None]:
# Show the collected dataset statistics.
print(sizes)

{'train_size': 14986, 'dev_size': 3465, 'test_size': 3683, 'vocab_size': 10622, 'number_of_tags': 9, 'pad_word': '<pad>', 'pad_tag': 'O', 'unk_word': 'UNK'}


In [None]:
# Rebuild the word -> id mapping from the saved vocabulary file (id = 0-based line number).
with open(words_path) as f:
    vocab = {token: idx for idx, token in enumerate(f.read().splitlines())}


In [None]:
# Cache the word -> id mapping as a pickle so it can be reloaded without rebuilding it.
with open(vocab_path, 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Rebuild the tag -> id mapping from the saved tag file (id = 0-based line number).
with open(tags_path) as f:
    tag_map = {tag: idx for idx, tag in enumerate(f.read().splitlines())}

In [None]:
# Cache the tag -> id mapping as a pickle so it can be reloaded without rebuilding it.
with open(tagmap_path, 'wb') as handle:
    pickle.dump(tag_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def text2id(txt_path, vocab, tag_map, d, isTest):
    """Encode a token-per-line file as lists of word ids (and tag ids).

    Token lines look like ``<index> <word> [<tag>]``; a line with a single
    field (normally a blank line) ends the current sentence. Words that are
    not in ``vocab`` are mapped to ``vocab['UNK']``.

    Args:
        txt_path: (string) path to the file with sentences (and tags)
        vocab: (dict) word -> id; must contain 'UNK' if any word is unknown
        tag_map: (dict) tag -> id; only used when ``isTest`` is False
        d: (dict) filled in place with 'data' (list of id lists), 'size'
           (number of sentences) and, unless ``isTest``, 'labels'
        isTest: (bool) True for files without a tag column
    """
    sentences = []
    labels = []
    s = []
    l = []
    with open(txt_path) as f:
        for token in f.read().splitlines():
            fields = token.split(' ')
            if len(fields) == 1:
                # end of sentence: store it and start a new one
                sentences.append(s)
                labels.append(l)
                s = []
                l = []
                continue
            word = fields[1]
            s.append(vocab[word] if word in vocab else vocab['UNK'])
            if not isTest:
                l.append(tag_map[fields[2]])

    # A file that does not end with a blank line leaves the last sentence
    # pending; without this flush it was silently dropped.
    if s:
        sentences.append(s)
        labels.append(l)

    d['data'] = sentences
    if not isTest:
        assert len(labels) == len(sentences)
        assert all(len(tag_ids) == len(word_ids)
                   for tag_ids, word_ids in zip(labels, sentences))
        d['labels'] = labels
    d['size'] = len(sentences)

In [None]:
# Encode each split as lists of ids; the test split is loaded without labels (isTest=True).
train_data = {}
text2id(train_path, vocab, tag_map, train_data, False)
dev_data = {}
text2id(dev_path, vocab, tag_map, dev_data, False)
test_data = {}
text2id(test_path, vocab, tag_map, test_data, True)

## Pytorch Network Implementation

In [None]:
def data_iterator(data, params, vocab, shuffle=False):
    """Yield padded (sentences, labels) batches for one pass over ``data``.

    Every sentence is yielded exactly once; the last batch may be smaller than
    ``params['batch_size']``. (The previous ``(size + 1) // batch_size`` count
    dropped a trailing partial batch and produced an empty batch for
    ``batch_size == 1``.) Callers that only consume
    ``(size + 1) // batch_size`` batches keep working unchanged for any
    batch size greater than 1.

    Args:
        data: (dict) with keys 'data' (list of word-id lists), 'labels'
              (list of tag-id lists) and 'size' (number of sentences)
        params: (dict) needs 'batch_size' and 'cuda'; an optional 'pad_word'
                overrides the module-level PAD_WORD
        vocab: (dict) word -> id, must contain the padding word
        shuffle: (bool) visit the sentences in a shuffled order. The RNG is
                 re-seeded with 230 on every call, so the order is the same
                 for every pass.
    Yields:
        batch_data: (LongTensor) batch x max_len word ids, padded with the pad id
        batch_labels: (LongTensor) batch x max_len tag ids, -1 on padding
    """
    batch_size = params['batch_size']
    pad_word = params['pad_word'] if 'pad_word' in params else PAD_WORD
    pad_ind = vocab[pad_word]

    # Shuffle an index list rather than the data itself.
    order = list(range(data['size']))
    if shuffle:
        random.seed(230)
        random.shuffle(order)

    for start in range(0, data['size'], batch_size):
        batch_idx = order[start:start + batch_size]
        batch_sentences = [data['data'][idx] for idx in batch_idx]
        batch_tags = [data['labels'][idx] for idx in batch_idx]

        batch_max_len = max(len(sentence) for sentence in batch_sentences)

        # Integer arrays from the start: pad id for words, -1 for labels so
        # that padding positions can be told apart from real tokens.
        shape = (len(batch_sentences), batch_max_len)
        batch_data = np.full(shape, pad_ind, dtype=np.int64)
        batch_labels = np.full(shape, -1, dtype=np.int64)
        for row, (sentence, sentence_tags) in enumerate(zip(batch_sentences, batch_tags)):
            batch_data[row, :len(sentence)] = sentence
            batch_labels[row, :len(sentence)] = sentence_tags

        batch_data = torch.from_numpy(batch_data)
        batch_labels = torch.from_numpy(batch_labels)

        # move to the GPU when requested
        if params['cuda']:
            batch_data, batch_labels = batch_data.cuda(), batch_labels.cuda()

        yield batch_data, batch_labels

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Bidirectional LSTM tagger.

    Pipeline: embedding -> single-layer BiLSTM -> dropout -> linear + ELU ->
    linear classifier -> log-softmax over the tags.

    Dropout is applied with an explicit ``nn.Dropout`` on the LSTM output.
    Passing ``dropout=`` to a single-layer ``nn.LSTM`` has no effect (PyTorch
    only applies it between stacked layers and emits a warning), so the
    previous ``dropout=0.33`` argument never regularised anything. The dropout
    module has no parameters, so state dicts saved before this change still load.
    """

    def __init__(self, params):
        """Build the layers.

        Args:
            params: (dict) must contain 'vocab_size', 'embedding_dim',
                'lstm_hidden_dim', 'linear_output_dim' and 'number_of_tags';
                an optional 'dropout' (default 0.33) sets the dropout probability
        """
        super(Net, self).__init__()

        dropout = params['dropout'] if 'dropout' in params else 0.33

        # maps each word id in range(vocab_size) to an embedding_dim vector
        self.embedding = nn.Embedding(params['vocab_size'], params['embedding_dim'])
        # self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(params['embeddings']).float(), freeze=False)

        # one bidirectional layer: output feature size is 2 * lstm_hidden_dim
        self.lstm = nn.LSTM(params['embedding_dim'],
                            params['lstm_hidden_dim'],
                            num_layers=1,
                            bidirectional=True,
                            batch_first=True)

        # active in train() mode only, identity in eval() mode
        self.dropout = nn.Dropout(dropout)

        self.linear = nn.Linear(2*params['lstm_hidden_dim'], params['linear_output_dim'])
        # final classifier producing one score per tag
        self.fc = nn.Linear(params['linear_output_dim'], params['number_of_tags'])

    def forward(self, s):
        """Compute per-token log-probabilities over the tags.

        Args:
            s: (LongTensor) batch_size x seq_len word ids; shorter sentences
               are padded up to seq_len
        Returns:
            out: (Tensor) batch_size*seq_len x number_of_tags log-probabilities,
                 one row per token position (padding positions included)
        """
        # dim: batch_size x seq_len x embedding_dim
        s = self.embedding(s)

        # dim: batch_size x seq_len x 2*lstm_hidden_dim
        s, _ = self.lstm(s)
        s = self.dropout(s)

        # dim: batch_size x seq_len x linear_output_dim
        s = F.elu(self.linear(s))

        # one row per token; dim: batch_size*seq_len x linear_output_dim
        s = s.contiguous().view(-1, s.shape[2])

        # dim: batch_size*seq_len x number_of_tags
        s = self.fc(s)

        # log-softmax is numerically more stable than softmax followed by log
        return F.log_softmax(s, dim=1)


def loss_fn(outputs, labels):
    """Masked negative log-likelihood averaged over the real (non-padding) tokens.

    Args:
        outputs: (Tensor) batch_size*seq_len x num_tags log-softmax output of the model
        labels: (LongTensor) batch_size x seq_len tag ids in [0, num_tags-1],
                with -1 marking padding positions
    Returns:
        loss: (Tensor) scalar cross-entropy over the tokens whose label is >= 0
    """
    flat_labels = labels.view(-1)

    # 1.0 for real tokens, 0.0 for padding
    keep = (flat_labels >= 0).float()
    n_real = int(torch.sum(keep))

    # -1 cannot be used as a column index; wrap it to a valid one. The value
    # picked at those positions is zeroed by the mask below.
    safe_labels = flat_labels % outputs.shape[1]

    # log-probability assigned to the gold tag of every position
    gold_log_probs = outputs.gather(1, safe_labels.unsqueeze(1)).squeeze(1)

    return -torch.sum(gold_log_probs*keep)/n_real


def accuracy(outputs, labels):
    """Token-level accuracy over the non-padding positions.

    Args:
        outputs: (np.ndarray) batch_size*seq_len x num_tags model scores (log-softmax)
        labels: (np.ndarray) batch_size x seq_len tag ids in [0, num_tags-1],
                with -1 marking padding positions
    Returns: (float) fraction of real tokens whose arg-max tag equals the label
    """
    flat_labels = labels.ravel()

    # predicted tag per position
    predictions = np.argmax(outputs, axis=1)

    # padding labels are -1 and predictions are >= 0, so padding never counts as correct
    n_correct = np.sum(predictions == flat_labels)
    n_real = float(np.sum(flat_labels >= 0))
    return n_correct/n_real

# Metric name -> function(outputs, labels); every entry is computed per batch
# by the training and evaluation loops and averaged over the batches.
metrics = {
    'accuracy': accuracy,
    # could add more metrics such as accuracy for each token type
}

In [None]:
class RunningAverage:
    """Accumulates values and reports their mean on call.

    Example:
    ```
    loss_avg = RunningAverage()
    loss_avg.update(2)
    loss_avg.update(4)
    loss_avg() = 3
    ```
    """

    def __init__(self):
        # sum of the values seen so far and how many there were
        self.total = 0
        self.steps = 0

    def update(self, val):
        """Record one more value."""
        self.steps += 1
        self.total += val

    def __call__(self):
        """Return the mean of the recorded values."""
        return self.total / float(self.steps)

In [None]:
def evaluate(model, loss_fn, data_iterator, metrics, params, num_steps):
    """Evaluate the model on `num_steps` batches and print the averaged metrics.

    The forward passes run under ``torch.no_grad()`` so no autograd graph is
    built during evaluation.

    Args:
        model: (torch.nn.Module) the neural network
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        data_iterator: (generator) yields (data, labels) batches
        metrics: (dict) name -> function(output ndarray, labels ndarray) computed on every batch
        params: (dict) hyperparameters (not read by this function)
        num_steps: (int) number of batches to pull from `data_iterator`
    Returns:
        metrics_mean: (dict) name -> mean over the batches, including 'loss'
    """
    # evaluation mode (e.g. disables dropout)
    model.eval()

    # one summary dict per batch
    summ = []

    with torch.no_grad():
        for _ in range(num_steps):
            data_batch, labels_batch = next(data_iterator)

            output_batch = model(data_batch)
            loss = loss_fn(output_batch, labels_batch)

            # metrics work on numpy arrays living on the CPU
            output_batch = output_batch.detach().cpu().numpy()
            labels_batch = labels_batch.detach().cpu().numpy()

            summary_batch = {metric: metrics[metric](output_batch, labels_batch)
                             for metric in metrics}
            summary_batch['loss'] = loss.item()
            summ.append(summary_batch)

    # average every metric over the evaluated batches
    metrics_mean = {metric: np.mean([x[metric] for x in summ]) for metric in summ[0]}
    metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_mean.items())
    print("- Eval metrics : " + metrics_string)
    return metrics_mean

In [None]:
def train(model, optimizer, scheduler, loss_fn, data_iterator, metrics, params, num_steps):
    """Run `num_steps` optimisation steps, then advance the LR scheduler once.

    Args:
        model: (torch.nn.Module) the neural network
        optimizer: (torch.optim) optimizer for the parameters of model
        scheduler: learning-rate scheduler, stepped once at the end of this call
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        data_iterator: (generator) yields (data, labels) batches
        metrics: (dict) name -> function(output ndarray, labels ndarray)
        params: (dict) hyperparameters; 'save_summary_steps' sets how often metrics are recorded
        num_steps: (int) number of batches to train on
    """
    # training mode (e.g. enables dropout)
    model.train()

    batch_summaries = []
    running_loss = RunningAverage()

    # tqdm progress bar over the optimisation steps
    progress = trange(num_steps)
    for step in progress:
        inputs, targets = next(data_iterator)

        # forward pass
        predictions = model(inputs)
        loss = loss_fn(predictions, targets)

        # backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # record the metrics only every `save_summary_steps` steps
        if step % params['save_summary_steps'] == 0:
            predictions_np = predictions.data.cpu().numpy()
            targets_np = targets.data.cpu().numpy()

            batch_summary = {}
            for name in metrics:
                batch_summary[name] = metrics[name](predictions_np, targets_np)
            batch_summary['loss'] = loss.item()
            batch_summaries.append(batch_summary)

        # show the running mean of the loss on the progress bar
        running_loss.update(loss.item())
        progress.set_postfix(loss='{:05.3f}'.format(running_loss()))

    # one scheduler step per call
    scheduler.step()

    # average the recorded metrics
    metrics_mean = {}
    for name in batch_summaries[0]:
        metrics_mean[name] = np.mean([summary[name] for summary in batch_summaries])
    formatted = ["{}: {:05.3f}".format(k, v) for k, v in metrics_mean.items()]
    print("- Train metrics: " + " ; ".join(formatted))

In [None]:
def train_and_evaluate(model, train_data, val_data, vocab, optimizer, scheduler, loss_fn, metrics, params):
    """Train the model and evaluate it on the validation set after every epoch.

    The number of batches per pass is derived from the size stored in the data
    dictionaries, capped by ``params['train_size']`` / ``params['val_size']``
    when present, so it can never exceed what the batch iterator yields.

    Args:
        model: (torch.nn.Module) the neural network
        train_data: (dict) training data with keys 'data', 'labels' and 'size'
        val_data: (dict) validation data with keys 'data', 'labels' and 'size'
        vocab: (dict) word -> id, forwarded to the batch iterator
        optimizer: (torch.optim) optimizer for parameters of model
        scheduler: learning-rate scheduler, forwarded to `train`
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (dict) hyperparameters; needs 'num_epochs' and 'batch_size'
    Returns:
        best_val_acc: (float) best validation accuracy seen over the epochs
    """
    best_val_acc = 0.0
    batch_size = params['batch_size']

    # Never ask for more sentences than the dictionaries actually hold.
    train_size = train_data['size']
    if 'train_size' in params:
        train_size = min(train_size, params['train_size'])
    val_size = val_data['size']
    if 'val_size' in params:
        val_size = min(val_size, params['val_size'])

    for epoch in range(params['num_epochs']):
        print("Epoch {}/{}".format(epoch + 1, params['num_epochs']))

        # one pass over the training set
        num_steps = (train_size + 1) // batch_size
        train_data_iterator = data_iterator(train_data, params, vocab, shuffle=True)
        train(model, optimizer, scheduler, loss_fn, train_data_iterator,
              metrics, params, num_steps)

        # one pass over the validation set
        num_steps = (val_size + 1) // batch_size
        val_data_iterator = data_iterator(
            val_data, params, vocab, shuffle=False)
        val_metrics = evaluate(
            model, loss_fn, val_data_iterator, metrics, params, num_steps)
        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # keep track of the best validation accuracy
        if is_best:
            print("- Found new best accuracy")
            best_val_acc = val_acc
            print(best_val_acc)

    return best_val_acc

# Task 1: Simple Bidirectional LSTM model (40 points)
The below hyperparameters gave the following result on the dev file:
```
{
    "learning_rate": [0.2],
    "momentum":[0.9],
    "batch_size": [5],
    "num_epochs":[10],
    "lstm_hidden_dim": 256,
    "linear_output_dim": 128,
    "embedding_dim": 100,
    "cuda":true,
    "save_summary_steps": 100,
    "weight_decay":0.0002,
    "scheduler_StepLR_size":5,
    "scheduler_StepLR_gamma":0.5,
    "vocab_threshold": ">= 3"
}
```
*Result* of the Perl evaluation script:
```
processed 51577 tokens with 5942 phrases; found: 5305 phrases; correct: 4505.
accuracy:  96.00%; precision:  84.92%; recall:  75.82%; FB1:  80.11
              LOC: precision:  87.85%; recall:  85.03%; FB1:  86.42  1778
             MISC: precision:  84.75%; recall:  75.92%; FB1:  80.09  826
              ORG: precision:  75.65%; recall:  69.05%; FB1:  72.20  1224
              PER: precision:  89.17%; recall:  71.50%; FB1:  79.36  1477
```
The model architecture is:
```
Net(
  (embedding): Embedding(30292, 100)
  (lstm): LSTM(100, 256, batch_first=True, dropout=0.33, bidirectional=True)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (fc): Linear(in_features=128, out_features=9, bias=True)
)
```

In [None]:
models_dict = {} # model name -> {'model', 'batch_size', 'cuda'} for every trained configuration

In [None]:
# Lookup from tag id back to tag string, used when writing predictions.
ix_to_tag = {v: k for k, v in tag_map.items()}

# Hyper-parameters: 'batch_size', 'momentum', 'learning_rate' and 'num_epochs'
# are lists (the grid); the remaining entries are scalars.
json_path = "/content/params.json"
with open(json_path) as f:
    params = json.load(f)
params.update({
    'vocab_size': sizes['vocab_size'],
    'number_of_tags': sizes['number_of_tags'],
    'val_size': sizes['dev_size'],
    'train_size': sizes['train_size'],
    'test_size': sizes['test_size'],
})

model_params = {}
for b in params['batch_size']:
    for m in params['momentum']:
        for l in params['learning_rate']:
            for e in params['num_epochs']:
                # fresh dict per configuration so earlier ones are not mutated
                model_params = {
                    'learning_rate': l,
                    'momentum': m,
                    'batch_size': b,
                    'num_epochs': e,
                    'lstm_hidden_dim': params['lstm_hidden_dim'],
                    'linear_output_dim': params['linear_output_dim'],
                    'embedding_dim': params['embedding_dim'],
                    'cuda': params['cuda'],
                    'save_summary_steps': params['save_summary_steps'],
                    'vocab_size': params['vocab_size'],
                    'number_of_tags': params['number_of_tags'],
                    'val_size': params['val_size'],
                    'train_size': params['train_size'],
                }

                model_name = "lr{}_momnt{}_batch{}_epo{}".format(l,m,b,e)
                print("## model name: {}".format(model_name))

                # build the model once and move it to the GPU if requested
                # (it used to be constructed twice when cuda was on)
                model = Net(model_params)
                if params['cuda']:
                    model = model.cuda()

                optimizer = optim.SGD(model.parameters(), weight_decay=0.0002, momentum=model_params['momentum'], lr=model_params['learning_rate'])
                scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
                train_and_evaluate(model, train_data, dev_data, vocab, optimizer,scheduler ,loss_fn, metrics, model_params)

                # record the device flag actually used instead of a hard-coded True
                models_dict[model_name] = {'model':model, 'batch_size': b, 'cuda': params['cuda']}

                # ---- write one predicted tag per dev token ----
                num_steps = (params['val_size'] + 1) // model_params['batch_size']
                val_data_iterator = data_iterator(
                          dev_data, model_params, vocab, shuffle=False)
                model.eval()
                ner_path = base_path+"/12/ner/ner_{}".format(model_name)
                print("writing predicted tokens to ner_{}".format(model_name))
                with open(ner_path, "w") as f, torch.no_grad():
                    for _ in range(num_steps):
                        data_batch, labels_batch = next(val_data_iterator)

                        output_batch = model(data_batch)
                        output_batch = output_batch.data.cpu().numpy()
                        labels_batch = labels_batch.data.cpu().numpy()
                        labels = labels_batch.ravel()
                        # padding positions carry label -1 and are masked out
                        mask = (labels < 0)

                        outputs = np.argmax(output_batch, axis=1)
                        outputs_nopadding = ma.array(outputs, mask=mask)
                        for ner in outputs_nopadding.compressed():
                            y = ix_to_tag[ner]
                            f.write(y + '\n')

                # ---- append the predicted tag to every dev token line ----
                with open(dev_path, "r") as f:
                    dev_lines = f.read().splitlines()

                with open(ner_path, "r") as f:
                    pred_lines = f.read().splitlines()

                print("writing predicted NER to dev_{}".format(model_name))
                pred_counter = 0
                with open(base_path+"/12/pred/dev_{}".format(model_name), "w") as f:
                    for line in dev_lines:
                        if (line == ""):
                            f.write("\n")
                        else:
                            # token lines beyond the last prediction are skipped
                            if pred_counter < len(pred_lines):
                                f.write("{} {}\n".format(line, pred_lines[pred_counter]))
                                pred_counter += 1

## model name: lr0.2_momnt0.9_batch5_epo10
writing predicted tokens to ner_lr0.2_momnt0.9_batch5_epo10
writing predicted NER to dev_lr0.2_momnt0.9_batch5_epo10


In [None]:
# Save model for task1
model_path = base_path+"blstm1.pt"
torch.save(model.state_dict(), model_path)

In [None]:
def test_iterator(data, params, vocab):
    """Yield padded batches of unlabeled sentences, in file order.

    Every sentence is yielded exactly once; the last batch may be smaller than
    ``params['batch_size']``. (The previous ``(size + 1) // batch_size`` count
    dropped a trailing partial batch, so the last sentences never received a
    prediction.)

    Args:
        data: (dict) with keys 'data' (list of word-id lists) and 'size'
        params: (dict) needs 'batch_size' and 'cuda'; an optional 'pad_word'
                overrides the module-level PAD_WORD
        vocab: (dict) word -> id, must contain the padding word
    Yields:
        batch_data: (LongTensor) batch x max_len word ids, padded with the pad id
        batch_labels: (LongTensor) batch x max_len marker, 1 on real tokens and
                      -1 on padding, so padding can be masked out downstream
    """
    batch_size = params['batch_size']
    pad_word = params['pad_word'] if 'pad_word' in params else PAD_WORD
    pad_ind = vocab[pad_word]

    for start in range(0, data['size'], batch_size):
        batch_sentences = data['data'][start:start + batch_size]

        batch_max_len = max(len(sentence) for sentence in batch_sentences)

        # integer arrays from the start: pad id for words, -1 marks padding
        shape = (len(batch_sentences), batch_max_len)
        batch_data = np.full(shape, pad_ind, dtype=np.int64)
        batch_labels = np.full(shape, -1, dtype=np.int64)
        for row, sentence in enumerate(batch_sentences):
            batch_data[row, :len(sentence)] = sentence
            batch_labels[row, :len(sentence)] = 1

        batch_data = torch.from_numpy(batch_data)
        batch_labels = torch.from_numpy(batch_labels)

        # move to the GPU when requested
        if params['cuda']:
            batch_data, batch_labels = batch_data.cuda(), batch_labels.cuda()

        yield batch_data, batch_labels

In [None]:
def predict(model, test_path, test_data, data_iterator, num_steps, output_path):
  """Predict a tag for every token and write `test_path` annotated with the tags.

  Pulls `num_steps` batches from `data_iterator`, runs them through `model`, and
  writes one predicted tag per line to `base_path + "pred_ner"`. Then writes
  `base_path + output_path`: every non-empty line of `test_path` followed by a
  space and the next predicted tag; empty lines are copied as empty lines.

  Args:
      model: network called as `model(batch_data)`; the argmax over axis 1 of its
          output is used as the tag id of each flattened position
          (assumes one row of scores per position - TODO confirm against Net)
      test_path: (string) path to the input file
      test_data: unused; kept so existing calls keep working
      data_iterator: iterator yielding (batch_data, batch_labels); positions whose
          label is negative are treated as padding and skipped
      num_steps: (int) number of batches pulled from `data_iterator`
      output_path: (string) output file name, appended to `base_path`

  If the number of predicted tags differs from the number of non-empty input
  lines, a warning is printed; lines without a prediction are left out of the
  output file.
  """
  model.eval()
  predicted_ner_path = base_path + "pred_ner"
  print("predicting NER and saving it to {}".format(predicted_ner_path))
  # no_grad: inference only, so skip building the autograd graph
  with open(predicted_ner_path, "w") as f, torch.no_grad():
    for _ in range(num_steps):
      # fetch the next evaluation batch
      batch_data, batch_labels = next(data_iterator)

      # compute model output and the mask of padded positions
      output_batch = model(batch_data).data.cpu().numpy()
      padding_mask = batch_labels.data.cpu().numpy().ravel() < 0

      # keep the predicted tag id of every non-padding position
      outputs = np.argmax(output_batch, axis=1)
      for ner in ma.array(outputs, mask=padding_mask).compressed():
        f.write(ix_to_tag[ner] + '\n')

  print("reading {}".format(test_path))
  with open(test_path, "r") as f:
    test_lines = f.read().splitlines()

  print("reading {}".format(predicted_ner_path))
  with open(predicted_ner_path, "r") as f:
    pred_lines = f.read().splitlines()

  # surface a token/prediction mismatch instead of truncating silently
  token_count = sum(1 for line in test_lines if line != "")
  if token_count != len(pred_lines):
    print("WARNING: {} tokens in {} but {} predictions; tokens without a prediction are omitted".format(
        token_count, test_path, len(pred_lines)))

  print("appending predicted NER to {}".format(output_path))
  pred_counter = 0
  with open(base_path+output_path, "w") as f:
    for line in test_lines:
      if line == "":
        f.write("\n")
      elif pred_counter < len(pred_lines):
        f.write("{} {}\n".format(line, pred_lines[pred_counter]))
        pred_counter += 1

In [None]:
# Tag the test split with the Task 1 model.
# The step count is derived from the same `model_params` the iterator batches
# with, so the number of next() calls always matches the batches it yields.
test_data_iterator = test_iterator(test_data, model_params, vocab)
num_steps = (test_data['size'] + 1) // model_params['batch_size']
predict(model, test_path, test_data, test_data_iterator, num_steps, "test1.out")

In [None]:
# Tag the dev split with the Task 1 model.
# The step count is derived from the same `model_params` the iterator batches
# with, so the number of next() calls always matches the batches it yields.
val_data_iterator = data_iterator(dev_data, model_params, vocab, shuffle=False)
num_steps = (dev_data['size'] + 1) // model_params['batch_size']
predict(model, dev_path, dev_data, val_data_iterator, num_steps, "dev.out")

predicting NER and saving it to /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/pred_ner
reading /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/data/dev
reading /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/pred_ner
appending predicted NER to dev_test.out


## Load the model and predict test

In [None]:
# Settings used to rebuild the Task 1 network before loading its saved weights.
load_model_params = {
 'batch_size': 5,
 'cuda': True,
 'embedding_dim': 100,
 'linear_output_dim': 128,
 'lstm_hidden_dim': 256,
 'number_of_tags': 9,
 'save_summary_steps': 100,
 'test_size': 3683,
 'vocab_size': 9412}


test_data_iterator = test_iterator(test_data, load_model_params, vocab)
load_model = Net(load_model_params)
# honour the 'cuda' flag, which test_iterator also uses to place the batches
if load_model_params['cuda']:
    load_model = load_model.cuda()
load_model.load_state_dict(torch.load(model_path))
num_steps = (load_model_params['test_size'] + 1) // load_model_params['batch_size']
# predict with the reloaded checkpoint, not the in-memory training `model`
predict(load_model, test_path, test_data, test_data_iterator, num_steps, "test1.out")

  "num_layers={}".format(dropout, num_layers))


predicting NER and saving it to /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/pred_ner
reading /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/data/test
reading /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/pred_ner
appending predicted NER to test1.out


# Task 2:  Using GloVe word embeddings
```
{
    "learning_rate": 0.2,
    "momentum":0.9,
    "batch_size": 5,
    "num_epochs":5,
    "lstm_hidden_dim": 256,
    "linear_output_dim": 128,
    "embedding_dim": 100,
    "cuda":true,
    "weight_decay":0.00001,
    "scheduler_StepLR_size":3,
    "scheduler_StepLR_gamma":0.5,
    "vocabulary threshold": "all vocab"
}
```

Perl evaluation script *result*
```
processed 51577 tokens with 5942 phrases; found: 5997 phrases; correct: 5385.
accuracy:  98.17%; precision:  89.79%; recall:  90.63%; FB1:  90.21
              LOC: precision:  93.78%; recall:  94.45%; FB1:  94.11  1850
             MISC: precision:  82.94%; recall:  83.84%; FB1:  83.39  932
              ORG: precision:  84.60%; recall:  84.41%; FB1:  84.51  1338
              PER: precision:  92.97%; recall:  94.73%; FB1:  93.84  1877
```

The neural net architecture:

- 30292 is the vocab size
```
Net(
  (embedding): Embedding(30292, 100)
  (lstm): LSTM(100, 256, batch_first=True, dropout=0.33, bidirectional=True)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (fc): Linear(in_features=128, out_features=9, bias=True)
)
```

In [None]:
# Frequency tables for tokens and tags. update_vocab fills the Counter passed to
# it and returns a per-file count, kept here for the `sizes` summary.
words = Counter()
tags = Counter()
size_train_sentences, size_dev_sentences, size_test_sentences = [
    update_vocab(path, words, True) for path in (train_path, dev_path, test_path)
]
# tags are counted from the train and dev files only
size_train_tags, size_dev_tags = [
    update_vocab(path, tags, False) for path in (train_path, dev_path)
]

In [None]:
# Keep every counted token (a threshold of 0 filters nothing), then make sure
# the special tokens are part of the vocabulary before saving it.
words = [tok for tok, count in words.items() if count >= 0]
if PAD_WORD not in words:
    words.append(PAD_WORD)
# `tags` is a Counter, which has no append(); register the padding tag as a key
# with a zero count so it is included in len(tags) without skewing the counts.
if PAD_TAG not in tags:
    tags[PAD_TAG] = 0
# guarded like PAD_WORD so a corpus token equal to UNK_WORD is not written twice
if UNK_WORD not in words:
    words.append(UNK_WORD)
words_path = base_path+"data/words.txt"
save_vocab_to_txt_file(words, words_path)

In [None]:
# Dataset / vocabulary summary.
# The three *_size entries are numbers of sentences, not numbers of file rows.
sizes = dict(
    train_size=size_train_sentences,
    dev_size=size_dev_sentences,
    test_size=size_test_sentences,
    vocab_size=len(words),
    number_of_tags=len(tags),
    pad_word=PAD_WORD,
    pad_tag=PAD_TAG,
    unk_word=UNK_WORD,
)

In [None]:
# Display the summary dict (sentence counts, vocabulary size, number of tags, special tokens).
sizes

{'dev_size': 3465,
 'number_of_tags': 9,
 'pad_tag': 'O',
 'pad_word': '<pad>',
 'test_size': 3683,
 'train_size': 14986,
 'unk_word': 'UNK',
 'vocab_size': 30292}

In [None]:
# Rebuild the token -> id mapping from the saved vocabulary file:
# the id of a token is its 0-based line number.
with open(words_path) as f:
    vocab = {token: line_no for line_no, token in enumerate(f.read().splitlines())}

In [None]:
# Run text2id on each split with the Task 2 vocabulary. Each call receives a
# fresh dict (later cells read 'size' and 'data' from these dicts) and a final
# flag that is True only for the test split.
train_data, dev_data, test_data = {}, {}, {}
text2id(train_path, vocab, tag_map, train_data, False)
text2id(dev_path, vocab, tag_map, dev_data, False)
text2id(test_path, vocab, tag_map, test_data, True)

In [None]:
import gzip
def load_glove_model(glove_path):
    """Load gzip-compressed GloVe vectors into a dict.

    Every non-blank line is expected to hold a token followed by the
    whitespace-separated components of its vector. Blank lines are skipped.

    Args:
        glove_path: (string) path to the gzip-compressed GloVe text file
    Returns:
        glove_model: (dict) token (str) -> embedding (np.ndarray of float64)
    """
    print("Loading Glove Model")
    glove_model = {}
    # read raw bytes and split on whitespace; only the token itself is decoded
    with gzip.open(glove_path,'rb') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # blank line (e.g. a stray newline at the end of the file)
                continue
            word = split_line[0].decode()
            glove_model[word] = np.array(split_line[1:], dtype=np.float64)
    print(f"{len(glove_model)} words loaded!")
    return glove_model

In [None]:
# Load the GloVe vectors stored next to the data on Drive.
glove_path = f"{base_path}glove.6B.100d.gz"
glove = load_glove_model(glove_path)

Loading Glove Model
400000 words loaded!


In [None]:
def create_GloVE_vocab1(words, glove, vocab, glove_vocab):
  """Fill `glove_vocab` with one vector per entry of `words`.

  Lookup order for each word: exact match in `glove`, then its lower-cased
  form, otherwise a fallback vector. The fallback is a single random vector of
  length 100 drawn once (after seeding numpy's global RNG with 42) and shared
  by every word that is not found.

  Args:
      words: iterable of tokens
      glove: (dict) token -> vector
      vocab: unused
      glove_vocab: (dict) updated in place, token -> vector
  """
  np.random.seed(42)
  fallback_vector = np.random.rand(100)
  for token in words:
    # only lower-case the token when the exact form is missing
    key = token if token in glove else token.lower()
    glove_vocab[token] = glove[key] if key in glove else fallback_vector

In [None]:
# NOTE(review): these are the same text2id calls, with the same arguments, as an
# earlier cell in this section; the splits are simply rebuilt into fresh dicts.
train_data, dev_data, test_data = {}, {}, {}
text2id(train_path, vocab, tag_map, train_data, False)
text2id(dev_path, vocab, tag_map, dev_data, False)
text2id(test_path, vocab, tag_map, test_data, True)

In [None]:
# Build the per-token GloVe lookup in this cell so it runs on a fresh kernel.
# NOTE(review): no earlier visible cell creates or fills `glove_vocab`; it is
# rebuilt here with create_GloVE_vocab1 - confirm this matches how it was
# originally produced.
glove_vocab = {}
create_GloVE_vocab1(words, glove, vocab, glove_vocab)

# One row per vocabulary id; row i holds the vector of the token whose id is i.
embedding_matrix = np.zeros((len(vocab), params['embedding_dim']))
np.random.seed(42)
for word in words:
    index = vocab[word]
    if word in glove_vocab:
        vector = glove_vocab[word]
    elif word.lower() in glove_vocab:
        vector = glove_vocab[word.lower()]
    else:
        # safety net for tokens missing from the lookup
        vector = np.random.rand(params['embedding_dim'])
    embedding_matrix[index] = vector

In [None]:
# Reverse tag lookup (id -> tag string), read by predict().
ix_to_tag = {v: k for k, v in tag_map.items()}

# Load the hyper-parameters and add the dataset-dependent entries.
json_path = "/content/params.json"
with open(json_path) as f:
    params = json.load(f)
params.update({
    'vocab_size': sizes['vocab_size'],
    'number_of_tags': sizes['number_of_tags'],
    'val_size': sizes['dev_size'],
    'train_size': sizes['train_size'],
    'test_size': sizes['test_size'],
    'embeddings': embedding_matrix,
})

model = Net(params)
print(model)
if params['cuda']:
    # move the model that was just built and printed; constructing a second
    # Net here would train a different, separately initialised network
    model = model.cuda()

optimizer = optim.SGD(model.parameters(), weight_decay=0.00001, momentum=params['momentum'], lr=params['learning_rate'])
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)
train_and_evaluate(model, train_data, dev_data, vocab, optimizer,scheduler ,loss_fn, metrics, params)


  "num_layers={}".format(dropout, num_layers))


Net(
  (embedding): Embedding(30292, 100)
  (lstm): LSTM(100, 256, batch_first=True, dropout=0.33, bidirectional=True)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (fc): Linear(in_features=128, out_features=9, bias=True)
)
Epoch 1/5


100%|██████████| 2997/2997 [00:24<00:00, 123.04it/s, loss=0.181]


- Train metrics: accuracy: 0.915 ; loss: 0.246
- Eval metrics : accuracy: 0.970 ; loss: 0.119
- Found new best accuracy
0.9696443651181217
Epoch 2/5


100%|██████████| 2997/2997 [00:24<00:00, 123.65it/s, loss=0.064]


- Train metrics: accuracy: 0.978 ; loss: 0.075
- Eval metrics : accuracy: 0.978 ; loss: 0.100
- Found new best accuracy
0.9781822297752824
Epoch 3/5


100%|██████████| 2997/2997 [00:24<00:00, 123.73it/s, loss=0.031]


- Train metrics: accuracy: 0.990 ; loss: 0.039
- Eval metrics : accuracy: 0.979 ; loss: 0.104
- Found new best accuracy
0.9787926612258975
Epoch 4/5


100%|██████████| 2997/2997 [00:24<00:00, 123.47it/s, loss=0.014]


- Train metrics: accuracy: 0.993 ; loss: 0.021
- Eval metrics : accuracy: 0.982 ; loss: 0.095
- Found new best accuracy
0.9824336612704332
Epoch 5/5


100%|██████████| 2997/2997 [00:24<00:00, 123.77it/s, loss=0.008]


- Train metrics: accuracy: 0.997 ; loss: 0.008
- Eval metrics : accuracy: 0.983 ; loss: 0.107
- Found new best accuracy
0.9827497251504924


In [None]:
# Tag the dev split with the Task 2 model and write dev2.out.
val_data_iterator = data_iterator(dev_data, params, vocab, shuffle=False)
num_steps = (dev_data['size'] + 1) // params['batch_size']
predict(
    model=model,
    test_path=dev_path,
    test_data=dev_data,
    data_iterator=val_data_iterator,
    num_steps=num_steps,
    output_path="dev2.out",
)

predicting NER and saving it to /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/pred_ner
reading /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/data/dev
reading /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/pred_ner
appending predicted NER to dev2.out


In [None]:
# Save model for task2
model2_path = f"{base_path}blstm2.pt"
torch.save(model.state_dict(), model2_path)

In [None]:
# Tag the test split with the Task 2 model and write test2.out.
test_data_iterator = test_iterator(test_data, params, vocab)
num_steps = (test_data['size'] + 1) // params['batch_size']
predict(
    model=model,
    test_path=test_path,
    test_data=test_data,
    data_iterator=test_data_iterator,
    num_steps=num_steps,
    output_path="test2.out",
)

predicting NER and saving it to /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/pred_ner
reading /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/data/test
reading /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/pred_ner
appending predicted NER to test2.out


## Load Model and Predict Dev and Test

## Predict Test Data and output test2.out

In [None]:
model2_path = base_path+"blstm2.pt"

# Settings used to rebuild the Task 2 network before loading its saved weights.
load_model_params = {
 'batch_size': 5,
 'cuda': True,
 'embedding_dim': 100,
 'linear_output_dim': 128,
 'lstm_hidden_dim': 256,
 'number_of_tags': 9,
 'save_summary_steps': 100,
 'test_size': 3683,
 'vocab_size': 30292}

load_model = Net(load_model_params).cuda()
# load the Task 2 checkpoint; `model_path` points at the Task 1 weights
load_model.load_state_dict(torch.load(model2_path))

val_data_iterator = data_iterator(dev_data, load_model_params, vocab, shuffle=False)
num_steps = (dev_data['size'] + 1) // load_model_params['batch_size']
# predict with the reloaded checkpoint, not the in-memory training `model`
predict(load_model, dev_path, dev_data, val_data_iterator, num_steps, "dev2_load.out")


  "num_layers={}".format(dropout, num_layers))


predicting NER and saving it to /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/pred_ner
reading /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/data/dev
reading /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/pred_ner
appending predicted NER to dev2_load.out


In [None]:
# Rebuild the Task 2 network, load its checkpoint and tag the test split.
test_data_iterator = test_iterator(test_data, load_model_params, vocab)
load_model = Net(load_model_params).cuda()
# load the Task 2 checkpoint; `model_path` points at the Task 1 weights
load_model.load_state_dict(torch.load(model2_path))
num_steps = (load_model_params['test_size'] + 1) // load_model_params['batch_size']
# predict with the reloaded checkpoint, not the in-memory training `model`
predict(load_model, test_path, test_data, test_data_iterator, num_steps, "test2_load.out")

predicting NER and saving it to /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/pred_ner
reading /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/data/dev
reading /content/drive/MyDrive/USC master/CSCI 577 Applied Natural Language Processing/HW4/pred_ner
appending predicted NER to dev2_load.out
