## 1.2: Preprocess Data
Currently, the dataset has four fields:
1. 'text' is a list of lists of tokens (strings)
2. 'verb_index' represents the **relative position** of the **predicate verb** in the sentence. (Type: a list of integers)

> Example: He would n't accept anything of value from those he was writing about.

> Verb index: 3, because accept is the fourth word of the sentence (indexing starts from zero).


3. 'srl_frames' is the semantic role label of every token. (Type: a list of a list of strings)

4. 'words_indices' is the index of every word token (same as homework 1 & 2; Type: a list of a list of integers)

### **Converting Labels to Numerical Values**
In addition to converting our tokens to vector representations, we need to convert our labels to numerical representations. For example, say we have two labels: "O" and "B-ARG0". We could numerically represent these labels in a dictionary as Label 0 and Label 1: `{"O":0, "B-ARG0":1}`. The following function is implemented for you, and it encodes the labels present in the dataset. No modifications are necessary.


In [56]:
# Mapping from BIO-style SRL tag strings to the integer ids used as model targets.
srl_map = {'O': 0,
           'B-ARG0': 1,
           'I-ARG0': 2,
           'B-ARG1': 3,
           'I-ARG1': 4,
           'B-ARG2': 5,
           'I-ARG2': 6,
           'B-ARGM-LOC': 7,
           'I-ARGM-LOC': 8,
           'B-ARGM-TMP': 9,
           'I-ARGM-TMP': 10}

def encode_srl_category(category_data: List[List[str]])->List[List[int]]:
  """ Encoding SRL category from a list of strings to a list of integers

  Every label is looked up in ``srl_map``. A label that is not a key of
  ``srl_map`` is encoded as the id of 'O' (0), i.e. it is treated as
  "outside any argument".

  Arguments:
    category_data (list(list(str))): SRL categories, one inner list per sentence

  Returns:
    encoded category (list(list(int))): Numerical conversions of SRL categories,
      with the same nesting and lengths as the input
  """
  # dict.get replaces the former bare ``except:``, which also hid unrelated
  # errors (e.g. KeyboardInterrupt) behind a silent fallback to 0.
  outside_id = srl_map['O']
  return [[srl_map.get(srl, outside_id) for srl in srl_list]
          for srl_list in category_data]

We have also provided the `pad_sents` function, which pads all sentences to be the same length, specifically the length of the longest input sentence, using the provided `pad_token`. Finally, we have provided the `Vocab` class that represents the corpus as a `Vocab` object with several helper functions. These are used to preprocess the training set for you.

In [57]:
def pad_sents(sents, pad_token):
    """ Pad every sentence to the length of the longest sentence in the batch.

    :param sents: batch of sentences, each a list of tokens or ids
    :type sents: List[List]
    :param pad_token: value appended to the shorter sentences
    :returns: new list of new sentence lists, all of equal length; the input
        lists are not modified. An empty batch yields an empty list.
    :rtype: List[List]
    """
    # default=0 keeps an empty batch from raising ValueError inside max().
    max_len = max((len(sent) for sent in sents), default=0)
    return [sent + [pad_token] * (max_len - len(sent)) for sent in sents]

class Vocab(object):
    """ Vocabulary, i.e. structure containing either
    src or tgt language terms.

    Maintains two mirrored mappings, ``word2id`` and ``id2word``. A freshly
    constructed Vocab contains the four special tokens ``<pad>`` (0),
    ``<s>`` (1), ``</s>`` (2) and ``<unk>`` (3).
    """
    def __init__(self, word2id=None):
        """ Init Vocab Instance.

        :param word2id: dictionary mapping words 2 indices. When given (and
            non-empty) it is used as-is and must contain the '<unk>' key.
        :type word2id: dict[str, int]
        """
        if word2id:
            self.word2id = word2id
        else:
            self.word2id = dict()
            self.word2id['<pad>'] = 0   # Pad Token
            self.word2id['<s>'] = 1 # Start Token
            self.word2id['</s>'] = 2    # End Token
            self.word2id['<unk>'] = 3   # Unknown Token
        self.unk_id = self.word2id['<unk>']
        self.id2word = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        """ Retrieve word's index. Return the index for the unk
        token if the word is out of vocabulary.

        :param word: word to look up
        :type word: str
        :returns: index of word
        :rtype: int
        """
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        """ Check if word is captured by Vocab.

        :param word: word to look up
        :type word: str
        :returns: whether word is in vocab
        :rtype: bool
        """
        return word in self.word2id

    def __setitem__(self, key, value):
        """ Raise error, if one tries to edit the Vocab directly.
        """
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        """ Compute number of words in Vocab.

        :returns: number of words in Vocab
        :rtype: int
        """
        return len(self.word2id)

    def __repr__(self):
        """ Representation of Vocab to be used
        when printing the object.
        """
        return 'Vocabulary[size=%d]' % len(self)

    def add(self, word):
        """ Add word to Vocab, if it is previously unseen.

        :param word: to add to Vocab
        :type word: str
        :returns: index that the word has been assigned
        :rtype: int
        """
        if word in self:
            return self[word]
        # New words get the next free index, i.e. the current vocabulary size.
        wid = self.word2id[word] = len(self)
        self.id2word[wid] = word
        return wid

    def words2indices(self, sents):
        """ Convert list of words or list of sentences of words
        into list or list of list of indices.

        :param sents: sentence(s) in words
        :type sents: Union[List[str], List[List[str]]]
        :returns: sentence(s) in indices; an empty input yields an empty list
        :rtype: Union[List[int], List[List[int]]]
        """
        # Guard first: peeking at sents[0] below would raise on empty input.
        if len(sents) == 0:
            return []
        if isinstance(sents[0], list):
            return [[self[w] for w in s] for s in sents]
        return [self[w] for w in sents]

    def indices2words(self, word_ids):
        """ Convert list of indices into words.

        :param word_ids: list of word ids
        :type word_ids: List[int]
        :returns: list of words
        :rtype: List[str]
        :raises KeyError: if an id is not present in the vocabulary
        """
        return [self.id2word[w_id] for w_id in word_ids]

    def to_input_tensor(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for
        shorter sentences.

        :param sents: list of sentences (words)
        :type sents: List[List[str]]
        :param device: Device on which to load the tensor, ie. CPU or GPU
        :type device: torch.device
        :returns: Sentence tensor of (batch_size, max_sentence_length)
        :rtype: torch.Tensor
        """
        word_ids = self.words2indices(sents)
        sents_t = pad_sents(word_ids, self['<pad>'])
        return torch.tensor(sents_t, dtype=torch.long, device=device)

    @staticmethod
    def from_corpus(corpus, size, remove_frac=None, freq_cutoff=None):
        """ Given a corpus construct a Vocab.

        Words are added in order of decreasing frequency, after the four
        special tokens.

        :param corpus: corpus of text produced by read_corpus function
        :type corpus: List[List[str]]
        :param size: accepted for interface compatibility. NOTE: it is not
            applied as a cap on the vocabulary; when ``remove_frac`` is given
            it is overwritten with the number of retained words.
        :type size: int
        :param remove_frac: if given, drop this fraction of the least frequent
            word types (the number dropped is rounded down)
        :type remove_frac: Optional[float]
        :param freq_cutoff: if word occurs n < freq_cutoff times, drop the word
            (defaults to 1, i.e. keep every word)
        :type freq_cutoff: Optional[int]
        :returns: Vocab instance produced from provided corpus
        :rtype: Vocab
        """
        vocab_entry = Vocab()
        word_freq = Counter(chain.from_iterable(corpus))
        if freq_cutoff is None:
            freq_cutoff = 1
        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]
        print('number of word types: {}, number of word types w/ frequency >= {}: {}'
              .format(len(word_freq), freq_cutoff, len(valid_words)))
        # sorted() is stable, so words with equal counts keep first-seen order.
        top_words = sorted(valid_words, key=lambda word: word_freq[word], reverse=True)
        if remove_frac is not None:
            size = len(top_words) - int(remove_frac * len(top_words))
            top_words = top_words[:size]
            print(f'number of unique words retained with remove_frac={remove_frac}: {len(top_words)}')
        for word in top_words:
            vocab_entry.add(word)
        return vocab_entry

    @staticmethod
    def from_subword_list(subword_list):
        """Given a list of subwords, construct the Vocab.

        :param subword_list: list of subwords in corpus
        :type subword_list: List[str]
        :returns: Vocab instance produced from provided list
        :rtype: Vocab
        """
        vocab_entry = Vocab()
        for subword in subword_list:
            vocab_entry.add(subword)
        return vocab_entry


# **Part 2: LSTM Encoder Model**

In [58]:
# Setting seed ***DO NOT MODIFY***
torch.manual_seed(123)

print('initialize train vocabulary ..')
# Build the source vocabulary from the training sentences. The positional 20000
# is from_corpus's `size` argument; because remove_frac is set, from_corpus
# recomputes size itself and drops the least frequent 30% of word types.
src_vocab = Vocab.from_corpus(train['text'], 20000, remove_frac=0.3)

initialize train vocabulary ..
number of word types: 19082, number of word types w/ frequency >= 1: 19082
number of unique words retained with remove_frac=0.3: 13358


In [59]:
# Each example is a (tokens, verb_index, encoded SRL label ids) triple.
train_data = list(zip(train['text'],train['verb_index'],encode_srl_category(train['srl_frames'])))
val_data = list(zip(val['text'], val['verb_index'], encode_srl_category(val['srl_frames'])))

In [60]:
# Switch to GPU if available
def get_device():
    """Return the device string to run on: "cuda:0" when CUDA is available, else "cpu"."""
    return "cuda:0" if torch.cuda.is_available() else "cpu"

## 2.1 Implementation

Your first task is to **implement the model below by finishing the #TODOs**.


<a name="l2"></a>

In [61]:
from torch import lstm
class LSTMTagger(nn.Module):
    """ LSTM encoder that predicts an SRL tag distribution for every token.

    Each token's LSTM hidden state is concatenated with the hidden state at the
    predicate verb's position, then projected onto the tag set and passed
    through a log-softmax.
    """
    def __init__(self, src_vocab, embed_dim,
    hidden_dim, output_dim, vocab_size, num_layers=1):
        '''
        src_vocab: vocabulary of inputs (Class Vocab)
        embed_dim: dimension of word embedding
        hidden_dim: dimension of hidden layer
        output_dim: dimension of tagset_size
        vocab_size: vocabulary size (kept for interface compatibility; the
                    embedding table is sized from len(src_vocab))
        num_layers: number of LSTM layers
        '''
        super(LSTMTagger, self).__init__()
        self.src_vocab = src_vocab
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim

        # 1. Word embedding layer (the <pad> row is excluded from gradient updates).
        self.source_embedding = nn.Embedding(len(src_vocab), embed_dim, padding_idx=src_vocab['<pad>'])
        # 2. LSTM over the whole sequence (batch-first, unidirectional).
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, bidirectional=False, batch_first=True)
        # 3. Output layer. Its input is [token state ; verb state]. The LSTM
        #    output only carries the top layer's states, so the width is
        #    2 * hidden_dim regardless of num_layers.
        self.linear = nn.Linear(2 * hidden_dim, output_dim)
        # Normalise over the tag dimension, which is last in forward().
        self.softmax = nn.LogSoftmax(dim=-1)

    def _model_device(self):
        """ Return the device the model's parameters currently live on. """
        return self.source_embedding.weight.device

    def compute_Loss(self, criterion, predicted_vector, gold_label):
        '''
        Sum the criterion over the examples of a batch.

        criterion: loss callable applied to one example at a time
        predicted_vector: per-example predictions, indexed along dimension 0
        gold_label: per-example gold label ids, aligned with predicted_vector
        returns: sum of the per-example losses (0 for an empty batch)
        '''
        loss = 0
        for predicted, gold in zip(predicted_vector, gold_label):
            loss += criterion(predicted, gold)
        return loss

    def forward(self, source: List[List[str]], verb_indices:List[int]):
        '''
        Tag every token of every sentence in the batch.

        source: batch of tokenised sentences
        verb_indices: position of the predicate verb in each sentence
                      (a list of ints or a 1-D integer tensor)
        returns: log-probabilities of shape (batch_size, max_sentence_length, output_dim)
        '''
        # Follow the model's own device so inputs and parameters always agree.
        device = self._model_device()

        # Pad input sentences and convert to word indices: (batch, time).
        source_padded = self.src_vocab.to_input_tensor(source, device=device)
        batch_size, time_steps = source_padded.shape[0], source_padded.shape[1]

        # (batch, time, embed_dim) -> (batch, time, hidden_dim)
        inputs = self.source_embedding(source_padded)
        hidden_states, _ = self.lstm(inputs)

        # Pick the hidden state at each sentence's verb position: (batch, hidden_dim).
        verb_positions = torch.as_tensor(verb_indices, dtype=torch.long, device=device)
        batch_positions = torch.arange(batch_size, device=device)
        verb_states = hidden_states[batch_positions, verb_positions]

        # Repeat the verb state along time and append it to every token state.
        verb_states = verb_states.unsqueeze(1).expand(-1, time_steps, -1)
        features = torch.cat((hidden_states, verb_states), dim=2)

        return self.softmax(self.linear(features))


    def load_model(self, save_path):
        '''
        Load parameters from save_path into this model.

        Accepts either a whole pickled model (as written by save_model) or a
        plain state dict.
        '''
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from trusted sources. NOTE(review): newer PyTorch releases may refuse
        # fully pickled modules unless weights_only=False is passed - confirm
        # against the installed version.
        saved_model = torch.load(save_path, map_location=self._model_device())
        if isinstance(saved_model, nn.Module):
            saved_model = saved_model.state_dict()
        self.load_state_dict(saved_model)

    def save_model(self, save_path):
        ''' Serialise the whole model object to save_path. '''
        torch.save(self, save_path)


In [62]:
def batch_iter(data, batch_size, shuffle=False):
    """ Yield batches of input sentences, verb indices and target output labels.

    Within every batch the examples are ordered by sentence length, longest
    first.

    :param data: list of (sentence, verb index, target labels) examples
    :type data: List[Tuple[List[str], int, List[int]]]
    :param batch_size: batch size
    :type batch_size: int
    :param shuffle: whether to randomly shuffle the dataset
    :type shuffle: boolean
    """
    num_batches = math.ceil(len(data) / batch_size)
    order = list(range(len(data)))

    if shuffle:
        np.random.shuffle(order)

    for batch_no in range(num_batches):
        lo = batch_no * batch_size
        batch = [data[pos] for pos in order[lo: lo + batch_size]]

        # Stable in-place sort: equally long sentences keep their drawn order.
        batch.sort(key=lambda example: len(example[0]), reverse=True)

        sentences, verb_index, target = [], [], []
        for example in batch:
            sentences.append(example[0])
            verb_index.append(example[1])
            target.append(example[2])

        yield sentences, verb_index, target

In [63]:
# Setting seed ***DO NOT MODIFY***
torch.manual_seed(123)

def evaluation(model, val_data, optimizer, criterion, batch_size=64):
  """ Evaluate the model on the validation data and print loss and accuracy.

  Arguments:
    model: the tagger to evaluate; it is switched to eval mode
    val_data: list of (sentence, verb index, gold label ids) examples
    optimizer: unused; kept so the signature mirrors train_epoch
    criterion: loss passed to model.compute_Loss
    batch_size (int): number of examples per batch

  Returns:
    (float) loss averaged over the batches

  Raises:
    ValueError: if val_data yields no batches

  Note: gold labels are padded with 0 and the padded positions are included in
  both the accuracy numerator and denominator.
  """
  model.eval()
  loss = 0
  correct = 0
  total = 0
  batch = 0
  # Validation never backpropagates, so do not build autograd graphs.
  with torch.no_grad():
    # shuffle=True is kept from the original so the NumPy RNG stream consumed
    # by batch_iter between epochs stays the same.
    for (input_batch, verb_indices, expected_out) in tqdm(batch_iter(val_data, batch_size=batch_size, shuffle=True)):
      # Call the module rather than .forward so registered hooks run.
      output = model(input_batch, torch.tensor(verb_indices).to(get_device()))
      total += output.size()[0] * output.size()[1]
      _, predicted = torch.max(output, 2)
      expected_out = torch.tensor(pad_sents(expected_out,0))
      correct += (expected_out.to("cpu") == predicted.to("cpu")).sum().item()

      loss += model.compute_Loss(criterion, output.to("cpu"), expected_out.to("cpu"))
      batch += 1
  if batch == 0:
    raise ValueError("val_data is empty; nothing to evaluate")
  loss /= batch
  print("Validation Loss: " + str(loss.item()))
  print("Validation Accuracy: " + str(correct/total))
  print()
  return loss.item()

In [64]:
# Setting seed ***DO NOT MODIFY***
torch.manual_seed(123)

def train_epoch(model, train_data, optimizer, criterion,batch_size=64):
  """ Train the model for one pass over the training data.

  Arguments:
    model: the tagger to train; it is switched to train mode
    train_data: list of (sentence, verb index, gold label ids) examples
    optimizer: optimizer stepping the model's parameters
    criterion: loss passed to model.compute_Loss
    batch_size (int): number of examples per batch

  Returns:
    (float) training loss averaged over the batches

  Raises:
    ValueError: if train_data yields no batches

  Note: gold labels are padded with 0 and the padded positions are included in
  both the accuracy numerator and denominator.
  """
  model.train()
  total = 0
  batch = 0
  total_loss = 0
  correct = 0
  for (input_batch, verb_indices, expected_out) in tqdm(batch_iter(train_data, batch_size=batch_size, shuffle=True)):
    optimizer.zero_grad()
    batch += 1
    # Call the module rather than .forward so registered hooks run.
    output = model(input_batch, torch.tensor(verb_indices).to(get_device()))
    total += output.size()[0] * output.size()[1]
    _, predicted = torch.max(output, 2)

    expected_out = torch.tensor(pad_sents(expected_out,0))
    correct += (expected_out.to("cpu") == predicted.to("cpu")).sum().item()

    loss = model.compute_Loss(criterion, output.to("cpu"), expected_out.to("cpu"))
    total_loss += loss.item()
    loss.backward()
    # Clip the global gradient norm to 1 before the parameter update.
    nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()
  if batch == 0:
    raise ValueError("train_data is empty; nothing to train on")
  print("Loss: " + str(total_loss/batch))
  print("Training Accuracy: " + str(correct/total))
  return total_loss/batch

In [65]:
# Setting seed ***DO NOT MODIFY***
torch.manual_seed(123)

def train_and_evaluate(number_of_epochs, model, train_data, val_data, criterion, min_loss=0, lr=.01):
  """ Alternate one training epoch and one validation pass per epoch.

  Uses SGD with momentum 0.9. Stops early, after the validation pass, once the
  training loss of an epoch is at or below min_loss.

  Returns:
    [training losses, validation losses], one entry per completed epoch
  """
  optimizer = optim.SGD(model.parameters(), lr=lr, momentum=.9)
  train_losses, val_losses = [], []
  for _ in trange(number_of_epochs, desc="Epochs"):
    train_loss = train_epoch(model, train_data, optimizer, criterion)
    train_losses.append(train_loss)
    val_losses.append(evaluation(model, val_data, optimizer, criterion))
    if train_loss <= min_loss:
      break
  return [train_losses, val_losses]

**Train your LSTM Encoder Model** with the following cell:

## 2.2 Get Entity-level F1 score on the validation set

Run the cells below to calculate your F1 score on the validation set (no modifications needed):

In [67]:
def format_output_labels(token_labels, token_indices):
    """
    Returns a dictionary that has the labels (ARG0,ARG1,ARG2,TMP,LOC) as the keys,
    with the associated value being the list of entities predicted to be of that key label.
    Each entity is specified by its starting and ending position indicated in [token_indices].

    A new entity starts at every 'B-' label and whenever the label type changes;
    'O' tokens belong to no entity. Empty input yields a dictionary of empty lists.

    :parameter token_labels: A list of token labels
    :type token_labels: List[String]
    :parameter token_indices: A list of token indices (taken from the dataset)
                              corresponding to the labels in [token_labels].
    :type token_indices: List[int]
    :returns: mapping from label type to a list of (start, end) index pairs, end inclusive
    :rtype: Dict[str, List[Tuple[int, int]]]
    :raises KeyError: if a label's type is not one of the five keys above
    """
    label_dict = {"ARG0":[], "ARG1":[], "ARG2":[], "LOC":[],"TMP":[]}
    open_label = 'O'    # label type of the entity currently being extended
    open_start = None   # token index at which that entity started
    for idx, label in enumerate(token_labels):
      # 'B-ARGM-LOC' -> 'LOC', 'I-ARG0' -> 'ARG0', 'O' -> 'O'
      curr_label = label.split('-')[-1]
      if label.startswith('B-') or curr_label != open_label:
        # The previous token closed an entity: record it.
        if open_label != 'O':
          label_dict[open_label].append((open_start, token_indices[idx-1]))
        open_start = token_indices[idx] if curr_label != 'O' else None

      open_label = curr_label

    # Close an entity that runs up to the last token.
    if open_label != 'O':
      label_dict[open_label].append((open_start, token_indices[len(token_labels) - 1]))
    return label_dict

In [68]:
# Code for mean F1

import numpy as np

def mean_f1(y_pred_dict, y_true_dict):
    """
    Compute the F1 score per label and average over the labels that have at
    least one gold entity.

    A predicted entity counts as correct only if it is equal to a gold entity.

    :parameter y_pred_dict: predicted entities per label
    :type y_pred_dict: Dict[str, List[Tuple[int, int]]]
    :parameter y_true_dict: gold entities per label
    :type y_true_dict: Dict[str, List[Tuple[int, int]]]
    :returns: mean of the per-label F1 scores; 0.0 when no label has any gold entity
    :rtype: float
    """
    f1_scores = []
    for key, trues in y_true_dict.items():
        num_true = len(trues)
        if num_true == 0:
            # Labels without gold entities do not take part in the average.
            continue
        # A label missing from the predictions means nothing was predicted for it.
        preds = y_pred_dict.get(key, [])
        num_pred = len(preds)
        num_correct = sum(1 for true in trues if true in preds)
        if num_pred != 0 and num_correct != 0:
            R = num_correct / num_true
            P = num_correct / num_pred
            F1 = 2*P*R / (P + R)
        else:
            F1 = 0      # either no predictions or no correct predictions
        f1_scores.append(F1)
    if not f1_scores:
        # np.mean([]) would return NaN and emit a RuntimeWarning.
        return 0.0
    return np.mean(f1_scores)

In [69]:
#get validation output
# Inverse of srl_map: integer id -> label string.
inv_srl_map={srl_map[key]:key for key in srl_map}
val_predict = []
val_true = []
val_idx = []

# Inference only: no_grad avoids building an autograd graph for every sentence.
with torch.no_grad():
  for idx in range(len(val_data)):
    # One sentence at a time, i.e. a batch of size 1.
    out = lstm.forward([val_data[idx][0]], torch.tensor([val_data[idx][1]]))
    _, predicted = torch.max(out, 2)

    len_sent = len(val_data[idx][0])
    result = predicted.cpu().numpy()[0]

    # Predictions and gold labels of all sentences are flattened into one
    # stream, aligned with the token indices collected in val_idx.
    for t in range(len_sent):
      val_predict.append(inv_srl_map[result[t]])
      val_true.append(inv_srl_map[val_data[idx][2][t]])

    val_idx.extend(val['words_indices'][idx])

In [70]:
#get validation score
# Group the flattened label streams into (start, end) entities per label type,
# once for the predictions and once for the gold labels.
y_true_dict = format_output_labels(val_true, val_idx)
y_pred_dict = format_output_labels(val_predict, val_idx)

# Entity-level F1 averaged over the label types.
print(mean_f1(y_pred_dict, y_true_dict))

0.29469556719567025
