In [None]:
import os
import spacy # Text preprocessing
import pandas as pd
import json
import re
import string
import math

# PyTorch
import torch
import torch.nn as nn # Neural Net Layers module
import torch.optim as optim # Optimizers module
import torch.nn.functional as F # Functions module - activations, utilities like padding
import numpy as np

from tqdm.auto import tqdm # Add progress bar

from torch.utils.data import DataLoader, Dataset # Preparing data in batches for pytorch training

!pip install seqeval
from seqeval.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

# NLTK
import nltk
nltk.download('punkt')

import matplotlib.pyplot as plt
import seaborn as sns

import math
from itertools import chain

# Gensim word embeddings
import gensim
import gensim.downloader

from collections import Counter

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import pdb


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **GloVe (Wikipedia + Gigaword) Embeddings in Word2Vec Format**

In [None]:
# Download the GloVe 6B archive. -nc (no-clobber) skips the ~822 MB transfer
# when glove.6B.zip is already present, instead of saving a second copy as
# glove.6B.zip.1 on every re-run of this cell.
!wget -nc http://nlp.stanford.edu/data/glove.6B.zip

--2020-12-14 01:21:10--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-12-14 01:21:10--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-12-14 01:21:10--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2020

In [None]:
# Extract the vectors into ./glove. -n never overwrites files that already
# exist, so re-running the cell does not stop at the interactive
# "replace ...? [y]es, [n]o" prompt.
!unzip -n glove.6B.zip -d glove

Archive:  glove.6B.zip
replace glove/glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace glove/glove.6B.100d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace glove/glove.6B.200d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace glove/glove.6B.300d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
# GloVe vectors (100-dimensional file) extracted into ./glove.
glove_input_file = 'glove/glove.6B.100d.txt'
# Where the word2vec-format copy of those vectors is written.
word2vec_output_file = 'glove.6B.100.txt.word2vec'
# Convert the GloVe text file into word2vec text format so that
# KeyedVectors.load_word2vec_format can read it.
glove2word2vec(glove_input_file, word2vec_output_file)
# Load the converted vectors (text format, hence binary=False).
w2v_weights = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

# **Data Preparation**

In [None]:
def squad_json_to_dataframe(input_file_path, record_path=['data','paragraphs','qas','answers'],
                           verbose=1):
    """
    Flatten a SQuAD-format json file into one DataFrame row per question.

    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1

    Returns a DataFrame with the columns ``id``, ``question``, ``context``,
    ``answers`` (the raw list of answer dicts) and ``c_id`` (an integer that
    is identical for all questions sharing the same context string).
    """
    # ``pd.io.json.json_normalize`` is deprecated in favour of the top-level
    # ``pd.json_normalize``; fall back to the old location on older pandas.
    normalize = getattr(pd, "json_normalize", None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    if verbose:
        print("Reading the json file")
    # Context manager guarantees the file handle is closed.
    with open(input_file_path, encoding="utf-8") as handle:
        file = json.load(handle)
    if verbose:
        print("processing...")

    # One row per question (id, question, answers).
    m = normalize(file, record_path[:-1])
    # One row per paragraph (context, qas).
    r = normalize(file, record_path[:-2])

    # Repeat every paragraph's context once per question it owns, so the
    # repeated array lines up row-for-row with the question frame.
    m['context'] = np.repeat(r['context'].values, r.qas.str.len())

    main = m[['id','question','context','answers']].set_index('id').reset_index()
    # factorize() assigns the same integer to equal context strings.
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [None]:
# SQuAD v1.1 json files, given as paths relative to the working directory.
train_file_path = 'train-v1.1.json'
dev_file_path = 'dev-v1.1.json'
# Nesting of the SQuAD json down to the answers level; handed to
# squad_json_to_dataframe as its record_path.
record_path = ['data','paragraphs','qas','answers'] # The only data necessary for our model.

In [None]:
# Flatten both SQuAD files into one-row-per-question DataFrames
# (columns: id, question, context, answers, c_id).
train_dataframe = squad_json_to_dataframe(input_file_path=train_file_path, record_path=record_path)
dev_dataframe = squad_json_to_dataframe(input_file_path=dev_file_path,record_path=record_path)

Reading the json file
processing...


  from ipykernel import kernelapp as app
  app.launch_new_instance()


shape of the dataframe is (87599, 5)
Done
Reading the json file
processing...
shape of the dataframe is (10570, 5)
Done


In [None]:
# Training and validation questions, contexts and answers.
# Only 40% of the SQuAD training rows are used for training and 10% for
# validation. random_state pins the shuffle so that re-running the notebook
# reproduces exactly the same split.
SPLIT_SEED = 42
train_questions, valid_questions, train_contexts, valid_contexts, train_answers, valid_answers = train_test_split(train_dataframe['question'], 
                                                train_dataframe['context'], 
                                                train_dataframe['answers'], train_size=.4, test_size=.1,
                                                random_state=SPLIT_SEED)
# Test questions, contexts and answers come from the SQuAD dev file.
test_questions, test_contexts, test_answers = dev_dataframe['question'], dev_dataframe['context'], dev_dataframe['answers']

In [None]:
def merge_data(contexts, paragraphs, answers):
  """
  Zip three parallel, index-aligned sequences into a list of triples.

  Row ``i`` of the result is
  ``(contexts[i], paragraphs[i], answers[i][0]["text"])``: only the text of
  the first answer of every row is kept. The number of rows is taken from
  ``contexts``.
  """
  return [
      (contexts[row], paragraphs[row], answers[row][0]["text"])
      for row in range(len(contexts))
  ]

# Build (context, question, first_answer_text) triples for every split.
# The Series are converted to lists so merge_data can index them by position.
train_data = merge_data(list(train_contexts), 
                        list(train_questions), 
                        list(train_answers))
valid_data = merge_data(list(valid_contexts), 
                        list(valid_questions), 
                        list(valid_answers))
test_data = merge_data(list(test_contexts), 
                        list(test_questions), 
                        list(test_answers))

# Re-bind train_data / valid_data to lists of 10 consecutive slices each.
# The slice width is a float and the boundaries are truncated with int(),
# so slice sizes may differ by one when the length is not divisible by 10.
train_len = len(train_data)/10
train_data = [train_data[int(i*train_len):int((i+1)*train_len)] for i in range(10)]
valid_len = len(valid_data)/10
valid_data = [valid_data[int(i*valid_len):int((i+1)*valid_len)] for i in range(10)]

# **Feature Builder Classes**

### **Word to Vector Sequencer**

In [None]:
class W2VSequencer(object):
  """
  Turns text into sequences of row indices of a pre-trained embedding table
  and pads those sequences into tensors.

  w2v: gensim KeyedVectors-like object exposing ``add`` and ``vocab``.
  dim: dimensionality of the vectors; used for the zero vectors of the four
       special tokens that are appended to ``w2v``.
  """
  def __init__(self, w2v, dim=400):
    self.w2v = w2v
    self.nlp = spacy.load('en')
    self.tokenizer = lambda text: [t.text for t in self.nlp(text)]

    # Register the special tokens with all-zero vectors. The order is kept
    # because it determines the indices the tokens receive.
    for special in ('<s>', '</s>', '<pad>', '<unk>'):
      self.w2v.add([special], [np.zeros((dim,))])

    # Vocab entries; the integer row is available as ``.index``.
    self.bos_index = self.w2v.vocab.get('<s>')
    self.eos_index = self.w2v.vocab.get('</s>')
    self.unk_index = self.w2v.vocab.get('<unk>')
    self.pad_index = self.w2v.vocab.get('<pad>')

  # Convert sentence to sequence of encodings based on the
  # pre-trained embeddings.
  def encode(self, text):
    """Return ``[<s>, w1, ..., wn, </s>]`` as a list of embedding indices.

    Tokens missing from the vocabulary map to the ``<unk>`` index.
    """
    sequence = [self.bos_index.index]
    for token in self.tokenizer(text):
      sequence.append(self.w2v.vocab.get(token, self.unk_index).index)
    sequence.append(self.eos_index.index)

    return sequence

  def padded_helper(self, sent_list):
    """Pad a list of index sequences to a common length.

    Returns ``(tensor, lengths)`` where ``tensor`` is a long tensor of shape
    [num_sentences, max_seq_len] filled with the pad index after each
    sentence, and ``lengths`` is the list of original sentence lengths
    (one int per sentence). An empty ``sent_list`` yields a (0, 0) tensor.
    """
    lengths = [len(sequence) for sequence in sent_list]
    max_seq_len = max(lengths, default=0)
    tensor = torch.full((len(sent_list), max_seq_len), self.pad_index.index, dtype=torch.long)

    # One slice assignment per sentence instead of one write per token.
    for i, sequence in enumerate(sent_list):
      if sequence:
        tensor[i, :len(sequence)] = torch.tensor(sequence, dtype=torch.long)

    return tensor, lengths

  def create_padded_tensor_with_lengths(self, sequences):
    """Pad a batch of paragraphs (lists of index sequences) into one tensor.

    Returns ``(tensor, lengths_list)``:
      tensor: long tensor [batch_size, max_num_sentences, max_seq_len],
              pad index everywhere no token was written.
      lengths_list: for every paragraph, the list of its sentence lengths
              (paragraphs keep their own number of sentences).
    """
    lengths_list = [[len(sent) for sent in paragraph] for paragraph in sequences]

    # Max number of sentences in any paragraph.
    max_dim_1 = max((len(p) for p in lengths_list), default=0)
    # Max length of any sentence in the batch.
    max_dim_2 = max((n for p in lengths_list for n in p), default=0)

    tensor = torch.full((len(sequences), max_dim_1, max_dim_2),
                        self.pad_index.index, dtype=torch.long)
    # Write each sentence straight into the batch tensor; no intermediate
    # per-paragraph padded copy is needed.
    for i, paragraph in enumerate(sequences):
      for j, sent in enumerate(paragraph):
        if sent:
          tensor[i, j, :len(sent)] = torch.tensor(sent, dtype=torch.long)

    return tensor, lengths_list



### **Feature Sequencer**

In [None]:
class FeatureSequencer(object):
  """
  Encodes every token of a text as ``(pos_tag_index, normalized_tf)`` and
  pads such sequences into tensors.

  tokens: the corpus used for the term-frequency table. Either an iterable
          of token strings, or one ``str`` of raw text, which is tokenized
          with the spaCy tokenizer first (iterating a ``str`` directly would
          count single characters rather than words).
  bos_token / eos_token / unk_token / pad_token: names of the special tags;
          they receive the indices 2 / 3 / 1 / 0 respectively.
  """
  # Number of whitespace-separated words tokenized per spaCy call when the
  # corpus is given as one big string.
  _CHUNK_WORDS = 10000

  def __init__(self, tokens, bos_token='<s>', eos_token='</s>', unk_token='<unk>', pad_token='<pad>'):
    self.nlp = spacy.load('en')
    self.pos2idx = {}
    self.idx2pos = {}

    self.pad_index = self.add_tag(pad_token)
    self.unk_index = self.add_tag(unk_token)
    self.bos_index = self.add_tag(bos_token)
    self.eos_index = self.add_tag(eos_token)

    self.tf = self._term_frequencies(tokens)
    # Count of the most frequent token; 1 for an empty corpus so that the
    # division in encode() stays defined.
    self.tf_max = self.tf.most_common(1)[0][1] if self.tf else 1

    self.tagger = lambda text: [(t.text, t.pos_) for t in self.nlp(text)]

  def _term_frequencies(self, tokens):
    """Count token occurrences; a raw ``str`` is tokenized in chunks first."""
    if not isinstance(tokens, str):
      return Counter(tokens)
    counts = Counter()
    words = tokens.split()
    for start in range(0, len(words), self._CHUNK_WORDS):
      chunk = " ".join(words[start:start + self._CHUNK_WORDS])
      # Tokenizer only: no tagging or parsing is needed just to count tokens.
      counts.update(t.text for t in self.nlp.tokenizer(chunk))
    return counts

  def add_tag(self, tag):
    """Return the index of ``tag``, registering it first if it is new."""
    if tag in self.pos2idx:
      return self.pos2idx[tag]
    self.pos2idx[tag] = new_index = len(self.pos2idx)
    self.idx2pos[new_index] = tag

    return new_index

  def encode(self, text, alpha=0.4):
    """Encode ``text`` as a list of ``(pos_tag_index, ntf)`` pairs.

    The list starts with ``(bos_index, 1)`` and ends with ``(eos_index, 1)``.
    ``ntf = alpha + (1 - alpha) * tf[token] / tf_max``, so tokens absent from
    the corpus get ``alpha``.
    """
    sequence = [(self.bos_index, 1)]
    for token, tag in self.tagger(text):
      # Look the tag up first and register it only when unseen. Passing
      # add_tag(tag) as the default of dict.get() would execute it on every
      # call and keep re-assigning indices to known tags.
      index = self.pos2idx.get(tag)
      if index is None:
        index = self.add_tag(tag)
      ntf = alpha + (1-alpha)*(self.tf[token]/self.tf_max)
      sequence.append((index, ntf))
    sequence.append((self.eos_index, 1))

    return sequence

  def padded_helper(self, sent_list):
    """Pad the encoded sentences of one paragraph to a common length.

    Returns ``(tag_tensor, tf_tensor)``:
      tag_tensor: long  [num_sentences, max_sent_len, 1], padded with pad_index
      tf_tensor:  float [num_sentences, max_sent_len, 1, 1], padded with 1.0
    (the trailing singleton dimensions are kept for backward compatibility).
    """
    max_seq_len = max((len(sequence) for sequence in sent_list), default=0)
    tag_tensor = torch.full((len(sent_list), max_seq_len, 1), self.pad_index, dtype=torch.long)
    tf_tensor = torch.full((len(sent_list), max_seq_len, 1, 1), 1., dtype=torch.float)

    for i, sequence in enumerate(sent_list):
      if not sequence:
        continue
      tags, ntfs = zip(*sequence)
      tag_tensor[i, :len(sequence), 0] = torch.tensor(tags, dtype=torch.long)
      tf_tensor[i, :len(sequence), 0, 0] = torch.tensor(ntfs, dtype=torch.float)

    return tag_tensor, tf_tensor

  def create_padded_tensor(self, sequences):
    """Pad a batch of encoded paragraphs.

    Returns ``(tag_tensor, tf_tensor)``, both of shape
    [batch_size, max_num_sentences, max_sent_len]; tags are padded with
    pad_index (long), term frequencies with 1.0 (float).
    """
    max_dim_1 = max((len(paragraph) for paragraph in sequences), default=0)
    max_dim_2 = max((len(sent) for paragraph in sequences for sent in paragraph), default=0)

    tag_tensor = torch.full((len(sequences), max_dim_1, max_dim_2), self.pad_index, dtype=torch.long)
    tf_tensor = torch.full((len(sequences), max_dim_1, max_dim_2), 1., dtype=torch.float)

    # One slice write per sentence instead of one write per token.
    for i, paragraph in enumerate(sequences):
      for j, sent in enumerate(paragraph):
        if not sent:
          continue
        tags, ntfs = zip(*sent)
        tag_tensor[i, j, :len(sent)] = torch.tensor(tags, dtype=torch.long)
        tf_tensor[i, j, :len(sent)] = torch.tensor(ntfs, dtype=torch.float)

    return tag_tensor, tf_tensor

### **Exact Match Sequencer**

In [None]:
class ExactMatchSequencer(object):
  """
  Produces, for every paragraph token, three binary flags telling whether
  the token matches any question token exactly, case-insensitively, or by
  lemma, and pads those flag sequences into tensors.
  """
  def __init__(self):
    self.nlp = spacy.load('en')
    # Tokenize paragraph into words.
    self.tokenizer = lambda text: [t.text for t in self.nlp(text)]
    # NOTE(review): the Lookups object is created empty and nothing is added
    # to it in this class, so the lemma lookup may simply return the token
    # unchanged -- confirm against the spaCy Lemmatizer documentation.
    self.lookups = Lookups()
    self.lemm_obj = spacy.lemmatizer.Lemmatizer(self.lookups)
    # Finds root of word.
    self.lemmatizer = lambda token: self.lemm_obj.lookup(token)

  def encode(self, p_text, q_text):
    """Return one ``(original, lowercase, lemma)`` flag triple per token.

    p_text: one sentence of the paragraph; q_text: the question.
    The list starts and ends with ``(1, 1, 1)`` for the start/stop positions;
    in between, each flag is 1 when the paragraph token equals at least one
    question token under the corresponding comparison, else 0.
    """
    # The question does not change while the paragraph is scanned, so it is
    # tokenized and lemmatized once and stored in sets for O(1) membership.
    q_tokens = self.tokenizer(q_text)
    q_exact = set(q_tokens)
    q_lower = {q_j.lower() for q_j in q_tokens}
    q_lemma = {self.lemmatizer(q_j) for q_j in q_tokens}

    # Initial features from start token.
    sequence = [(1, 1, 1)]
    for p_i in self.tokenizer(p_text):
      sequence.append((int(p_i in q_exact),
                       int(p_i.lower() in q_lower),
                       int(self.lemmatizer(p_i) in q_lemma)))
    # End symbols from stop token.
    sequence.append((1, 1, 1))

    return sequence

  def padded_helper(self, sent_list):
    """Pad one paragraph: long tensor [num_sentences, max_sent_len, 3], 0-padded."""
    max_seq_len = max((len(sequence) for sequence in sent_list), default=0)
    tensor = torch.full((len(sent_list), max_seq_len, 3), 0, dtype=torch.long)

    for i, sequence in enumerate(sent_list):
      if sequence:
        tensor[i, :len(sequence)] = torch.tensor([list(triple) for triple in sequence], dtype=torch.long)

    return tensor

  def create_padded_tensor(self, sequences):
    """Pad a batch of paragraphs.

    Returns a long tensor of shape
    [batch_size, max_num_sentences, max_sent_len, 3], zero where no token
    was written.
    """
    # Max number of sentences in paragraph for all in batch size.
    max_dim_1 = max((len(paragraph) for paragraph in sequences), default=0)
    # Max length of sentence for a paragraph.
    max_dim_2 = max((len(sent) for paragraph in sequences for sent in paragraph), default=0)

    tensor_tensor = torch.full((len(sequences), max_dim_1, max_dim_2, 3), 0, dtype=torch.long)

    for i, paragraph in enumerate(sequences):
      for j, sent in enumerate(paragraph):
        if sent:
          tensor_tensor[i, j, :len(sent)] = torch.tensor([list(triple) for triple in sent], dtype=torch.long)

    return tensor_tensor

### **Answer Sequencer**

In [None]:
class AnswerSequencer(object):
  """
  Builds per-token answer labels for a paragraph (0 = outside, 1 = answer
  start, 2 = answer end) and pads them into a tensor.
  """
  def __init__(self):
    self.nlp = spacy.load('en')
    # Tokenize paragraph into words.
    self.tokenizer = lambda text: [t.text for t in self.nlp(text)]

  def encode(self, p_full_text, a_text):
    """Label every occurrence of the answer inside each paragraph sentence.

    Returns one label list per sentence, of length ``len(tokens) + 2`` (the
    two extra slots line up with the start/stop positions added by the other
    sequencers). For every position where the answer tokens occur
    contiguously, the first answer token is labelled 1 and the last one 2;
    for a one-token answer both writes hit the same slot, which ends up 2.
    An answer that tokenizes to nothing yields all-zero labels.
    """
    a_tokens = self.tokenizer(a_text)
    span = len(a_tokens)
    sequence = []
    for sent in nltk.sent_tokenize(p_full_text):
      sent_tokens = self.tokenizer(sent)
      labels = [0]*(len(sent_tokens)+2)
      sequence.append(labels)
      if span == 0:
        continue
      for index in range(len(sent_tokens) - span + 1):
        if sent_tokens[index:index+span] == a_tokens:
          # +1 shifts token positions past the leading start slot.
          labels[index+1] = 1
          labels[index+span] = 2
    return sequence

  @staticmethod
  def create_padded_tensor(sequences):
    """Pad a batch of label lists.

    Returns a long tensor [batch_size, max_num_sentences, max_seq_len] with
    -1 wherever no label was written.
    """
    max_num_sent = max((len(sent_sequence) for sent_sequence in sequences), default=0)
    max_seq_len = max((len(sequence) for sent_sequence in sequences for sequence in sent_sequence), default=0)
    tensor = torch.full((len(sequences), max_num_sent, max_seq_len), -1, dtype=torch.long)

    for i, sent_sequence in enumerate(sequences):
      for j, sequence in enumerate(sent_sequence):
        if sequence:
          tensor[i, j, :len(sequence)] = torch.tensor(sequence, dtype=torch.long)

    return tensor


# **Preparing the sequencers**

In [None]:
# Word-index encoder over the loaded vectors; dim=100 is the size of the
# zero vectors it adds for its special tokens.
w2v_sequencer = W2VSequencer(w2v_weights, dim=100)
# A single string containing every distinct training context once
# (set() drops the repeats), separated by single spaces.
combined_text = " ".join(set(train_contexts))
# POS-tag / term-frequency encoder built from that combined text.
feature_sequencer = FeatureSequencer(combined_text)
# Question-vs-paragraph match flags and answer-span labels.
match_sequencer = ExactMatchSequencer()
answer_sequencer = AnswerSequencer()

# **Dataset Class**

In [None]:
class QADataset(Dataset):
  """
  Dataset over ``(paragraph, question, answer)`` string triples that encodes
  one triple per item with the supplied sequencers.
  """
  def __init__(self, data, w2v_sequencer, feature_sequencer, 
               answer_sequencer, exact_match_sequencer):
    self.nlp = spacy.load("en")
    # [(paragraph, question, answer)]
    # paragraph: str
    # question: str
    # answer: str (text of the answer)
    self.data = data

    self.w2v_sequencer = w2v_sequencer
    self.feature_sequencer = feature_sequencer
    self.answer_sequencer = answer_sequencer
    self.exact_match_sequencer = exact_match_sequencer

  def __getitem__(self, index):
    """Encode item ``index``.

    Returns the 8-tuple
    ``(p_w2v, p_feature, p_match, q_w2v, a_w2v, p_tokens, q_tokens, a_tokens)``:
    the first three are lists with one entry per paragraph sentence, ``q_w2v``
    is the encoded question, ``a_w2v`` the answer labels for the paragraph,
    and the last three are the token strings of paragraph (per sentence),
    question and answer.
    """
    p, q, a = self.data[index]

    # Split the paragraph once and reuse the sentences for both the feature
    # encoders and the token lists below.
    sentences = nltk.sent_tokenize(p)

    # Encode paragraph features.
    p_w2v, p_feature, p_match = [], [], []
    for sent in sentences:
      p_w2v.append(self.w2v_sequencer.encode(sent))
      p_feature.append(self.feature_sequencer.encode(sent))
      p_match.append(self.exact_match_sequencer.encode(sent, q))

    # Encode question.
    q_w2v = self.w2v_sequencer.encode(q)
    # Encode answer.
    a_w2v = self.answer_sequencer.encode(p, a)

    # Tokenized versions of the paragraph, question and answer
    p_tokens = [[t.text for t in self.nlp(text)] for text in sentences]
    q_tokens = [t.text for t in self.nlp(q)]
    a_tokens = [t.text for t in self.nlp(a)]

    return p_w2v, p_feature, p_match, q_w2v, a_w2v, p_tokens, q_tokens, a_tokens

  def __len__(self):
    """Number of (paragraph, question, answer) triples."""
    return len(self.data)


  

# **Prepare Dataset**

In [None]:
# One QADataset per chunk of train_data / valid_data; all datasets share the
# same sequencer instances.
# NOTE(review): the [:-1] slices leave the last chunk of each list unused --
# confirm this is intentional.
train_dataset = [QADataset(train_data_single, w2v_sequencer, feature_sequencer, 
                          answer_sequencer, match_sequencer) for train_data_single in train_data[:-1]]
valid_dataset = [QADataset(valid_data_single, w2v_sequencer, feature_sequencer, 
                          answer_sequencer, match_sequencer) for valid_data_single in valid_data[:-1]]
# The test split is wrapped as a single dataset over all test triples.
test_dataset = QADataset(test_data, w2v_sequencer, feature_sequencer, 
                          answer_sequencer, match_sequencer)

# Collate function for the data loaders: receives the list of items of one
# batch, where every item is the 8-tuple produced by the dataset (three
# paragraph encodings, the encoded question, the answer labels, and the
# token strings of paragraph, question and answer).
def prepare_batch(batch, w2v_sequencer, feature_sequencer, exact_match_sequencer):
    """
    Pad every encoded field of a batch and return them as one tuple.

    The result holds, in order: padded paragraph indices and their lengths,
    padded POS-tag indices and term frequencies, padded match flags, padded
    question indices and their lengths, padded answer labels, and finally
    the untouched tuples of paragraph / question / answer tokens.
    """
    # Transpose [item][field] into [field][item].
    fields = list(zip(*batch))
    p_w2v, p_feature, p_match, q_w2v, a_w2v, p, q, a = fields

    # Paragraph side: word indices, (tag, tf) features and match flags.
    paragraph_part = w2v_sequencer.create_padded_tensor_with_lengths(p_w2v)
    feature_part = feature_sequencer.create_padded_tensor(p_feature)
    match_part = exact_match_sequencer.create_padded_tensor(p_match)
    # Question side: a flat list of sequences, so the 2-D helper is enough.
    question_part = w2v_sequencer.padded_helper(q_w2v)
    answer_part = AnswerSequencer.create_padded_tensor(a_w2v)

    return (*paragraph_part,
            *feature_part,
            match_part,
            *question_part,
            answer_part, p, q, a)

# One DataLoader per dataset chunk; each loader yields batches of up to 16
# items collated by prepare_batch with the shared sequencers.
# The training loaders rely on DataLoader's default ordering (no shuffle
# argument is passed); validation and test loaders set shuffle=False.
train_loader = [torch.utils.data.DataLoader(train_dataset_single, batch_size=16, 
                                            collate_fn=lambda batch: prepare_batch(batch, w2v_sequencer, feature_sequencer, match_sequencer))
                                            for train_dataset_single in train_dataset]
valid_loader = [torch.utils.data.DataLoader(valid_dataset_single, batch_size=16, 
                                           collate_fn=lambda batch: prepare_batch(batch, w2v_sequencer, feature_sequencer, match_sequencer), 
                                           shuffle=False) for valid_dataset_single in valid_dataset]
test_loader = torch.utils.data.DataLoader(test_dataset, 
                                           batch_size=16, 
                                           collate_fn=lambda batch: prepare_batch(batch, w2v_sequencer, feature_sequencer, match_sequencer), 
                                           shuffle=False)

# **Model Class**

In [None]:
class net(nn.Module):
  """
  Span-labelling model: a GRU over paragraph sentences (word embedding + POS
  one-hot + term frequency + match flags), a GRU over the question, and a
  bilinear layer that scores every paragraph position against the question
  position with the same index, producing 3 logits per token.

  embedding_dim: width of the pre-trained word vectors.
  lstm_hidden_dim: hidden size of both GRUs.
  w2v_weights: float tensor [vocab_size, embedding_dim] of pre-trained vectors.
  bidirectional: whether both GRUs are bidirectional.
  """
  # Width of the POS one-hot block appended to every paragraph token.
  NUM_POS_TAGS = 50
  # POS one-hot (50) + term frequency (1) + match flags (3) = 54.
  NUM_EXTRA_FEATURES = NUM_POS_TAGS + 1 + 3
  # Output classes per token.
  NUM_LABELS = 3
  # Fill value for padded activations and for logits that are never written.
  PAD_VALUE = -1.0

  def __init__(self, embedding_dim=300, lstm_hidden_dim=100, w2v_weights=None, bidirectional = True):
    super(net, self).__init__()

    # NOTE(review): from_pretrained freezes the weights by default, so this
    # table is frozen as well; it is not used in forward() -- confirm intent.
    self.embedding = nn.Embedding.from_pretrained(w2v_weights)

    # freeze=True keeps the table fixed. Setting ``requires_grad`` on the
    # module object itself (instead of on its weight) has no effect.
    self.frozen_embedding = nn.Embedding.from_pretrained(w2v_weights, freeze=True)

    bi_direction = (2 if bidirectional else 1)

    # Paragraph encoder. No ``dropout`` argument: recurrent dropout only acts
    # between stacked layers, so on a single-layer GRU it does nothing except
    # emit a warning.
    self.lstm = nn.GRU(input_size = embedding_dim + self.NUM_EXTRA_FEATURES,
                        hidden_size = lstm_hidden_dim, 
                        bias = True,
                        bidirectional = bidirectional,
                        batch_first = True)

    # Question encoder (word embeddings only).
    self.lstm2 = nn.GRU(input_size = embedding_dim,
                        hidden_size = lstm_hidden_dim,
                        bidirectional = bidirectional,
                        batch_first = True)

    self.dropout = nn.Dropout(0.5)

    # Bilinear scoring of paragraph states against question states.
    self.fc = nn.Bilinear(lstm_hidden_dim * bi_direction , lstm_hidden_dim * bi_direction, self.NUM_LABELS)

  @staticmethod
  def _pad_to_length(tensor, dim, length, value):
    """Append ``value``-filled slices along ``dim`` until it has ``length`` entries."""
    missing = length - tensor.shape[dim]
    if missing <= 0:
      return tensor
    pad_shape = list(tensor.shape)
    pad_shape[dim] = missing
    return torch.cat((tensor, tensor.new_full(pad_shape, value)), dim=dim)

  def forward(self,p_w2v_tensor, p_w2v_len, 
            p_feature_tag, p_feature_tf, 
            p_match_tensor, 
            q_w2v_tensor, q_w2v_len):
    """
    p_w2v_tensor:   long  [batch, num_sent, sent_len] word indices
    p_w2v_len:      per paragraph, the list of its true sentence lengths
    p_feature_tag:  long  [batch, num_sent, sent_len] POS indices (< NUM_POS_TAGS)
    p_feature_tf:   float [batch, num_sent, sent_len] term frequencies
    p_match_tensor: long  [batch, num_sent, sent_len, 3] match flags
    q_w2v_tensor:   long  [batch, q_len] question word indices
    q_w2v_len:      accepted for interface compatibility; not used

    Returns float64 logits [batch, num_sent, sent_len, 3]. Positions that
    belong to no real sentence, or lie beyond the scored length, keep -1.
    """
    # Work on the device that holds the parameters so that every tensor
    # created below can be concatenated with the embeddings.
    device = self.frozen_embedding.weight.device

    combined_embed = self.frozen_embedding(p_w2v_tensor.to(device))
    q_combined_embed = self.frozen_embedding(q_w2v_tensor.to(device))
    dtype = combined_embed.dtype
    batch_size, num_sent, sent_len = combined_embed.shape[:3]

    # One-hot POS block built in a single vectorized call.
    pos_tensor = F.one_hot(p_feature_tag.to(device), num_classes=self.NUM_POS_TAGS).to(dtype)

    p_feature_tf = p_feature_tf.to(device).reshape(
        p_feature_tf.shape[0], p_feature_tf.shape[1], p_feature_tf.shape[2], 1)
    feature = torch.cat((combined_embed,
                         pos_tensor,
                         p_feature_tf.to(dtype),
                         p_match_tensor.to(device=device, dtype=dtype)), dim=3)

    logits = torch.full((batch_size, num_sent, sent_len, self.NUM_LABELS),
                        self.PAD_VALUE, dtype=torch.float64, device=device)

    for i in range(batch_size):
      # pack_padded_sequence expects the lengths as a CPU int64 tensor.
      lengths = torch.as_tensor(p_w2v_len[i], dtype=torch.long)
      n_sent = lengths.numel()
      if n_sent == 0:
        continue

      # Only the first n_sent rows are real sentences of this paragraph.
      packed_input = nn.utils.rnn.pack_padded_sequence(
          feature[i, :n_sent], lengths, batch_first=True, enforce_sorted=False)
      # A GRU returns (output, h_n); only the per-token output is needed.
      output, _ = self.lstm(packed_input)
      seq_unpacked, _ = nn.utils.rnn.pad_packed_sequence(
          output, batch_first=True, padding_value=self.PAD_VALUE)

      # Bring paragraph states and question embeddings to one common length
      # so that the bilinear layer can pair them position by position.
      q_feature = q_combined_embed[i]
      common_len = max(seq_unpacked.shape[1], q_feature.shape[0])
      q_feature = self._pad_to_length(q_feature, 0, common_len, self.PAD_VALUE)
      seq_unpacked = self._pad_to_length(seq_unpacked, 1, common_len, self.PAD_VALUE)

      # The question is identical for every sentence: encode it once and
      # broadcast the result over the sentences.
      q_output, _ = self.lstm2(q_feature.unsqueeze(0))
      q_output = q_output.expand(n_sent, -1, -1).contiguous()

      logit_row = self.fc(seq_unpacked.contiguous(), q_output)

      # A question longer than the longest sentence would otherwise run past
      # the end of ``logits``; only the positions that exist are stored.
      keep = min(common_len, sent_len)
      logits[i, :n_sent, :keep] = logit_row[:, :keep].to(logits.dtype)
    return logits
    



# **Trainer Class**

In [None]:
class MultiClassTrainer(object):
    """
    Trainer for training a multi-class classification model
    """

    def __init__(self, model, optimizer, loss_fn, device="cpu", log_every_n=None):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.device = device
        self.loss_fn = loss_fn
        
        self.log_every_n = log_every_n if log_every_n else 0


    def _print_summary(self):
        print(self.model)
        print(self.optimizer)
        print(self.loss_fn)

    def train(self, loader):
        """
        Run a single epoch of training
        """

        self.model.train() # Run model in training mode

        loss_history = []
        running_loss = 0.
        running_loss_history = []

        for i, batch in tqdm(enumerate(loader)):
            batch_size = batch[0].shape[0]
            self.optimizer.zero_grad() # Always set gradient to 0 before computing it

            logits = self.model(batch[0].to(self.device), batch[1], batch[2].to(self.device), batch[3].to(self.device), 
                                batch[4].to(self.device), batch[5].to(self.device), batch[6]) # __call__ model() in this case: __call__ internally calls forward()
            # [batch_size, num_sent, num_classes]
            loss = self.loss_fn(logits.view(-1, 3), batch[7].view(-1).to(self.device)) # Compute loss: Cross entropy loss

            loss_history.append(loss.item())

            running_loss += (loss_history[-1] - running_loss) / (i + 1) # Compute rolling average

            loss.backward() # Perform backprop, which will compute dL/dw

            if self.log_every_n and i % self.log_every_n == 0:
                print("Running loss: ", running_loss)

                # print("Gradients:")
                # for p in list(filter(lambda p: p.grad is not None, self.model.parameters())):
                #     print(p.grad.data.norm(2).item())

            running_loss_history.append(running_loss)
            
            nn.utils.clip_grad_norm_(self.model.parameters(), 3.0) # We clip gradient's norm to 3

            self.optimizer.step() # Update step: w = w - eta * dL / dW : eta = 1e-2 (0.01), gradient = 5e30; update value of 5e28

        print("Epoch completed!")
        print("Epoch Loss: ", running_loss)
        print("Epoch Perplexity: ", math.exp(running_loss))

        # The history information can allow us to draw a loss plot
        return loss_history, running_loss_history

    def evaluate(self, loader):
        """
        Evaluate the model on a validation set
        """

        self.model.eval() # Run model in eval mode (disables dropout layer)

        batch_wise_true_labels = []
        batch_wise_predictions = []

        loss_history = []
        running_loss = 0.
        running_loss_history = []

        with torch.no_grad(): # Disable gradient computation - required only during training
            for i, batch in tqdm(enumerate(loader)):
                # batch[0] shape: (batch_size, input_size)

                logits = self.model(batch[0].to(self.device), batch[1], batch[2].to(self.device), batch[3].to(self.device), 
                                batch[4].to(self.device), batch[5].to(self.device), batch[6]) # __call__ model() in this case: __call__ internally calls forward()
                # logits shape: (batch_size, num_classes)
                
                loss = self.loss_fn(logits.view(-1, 3), batch[7].view(-1).to(self.device)) # Compute loss: Cross entropy loss
                # No backprop is done during validation
                
                # Instead of using CrossEntropyLoss, you use BCEWithLogitsLoss
                # BCEWithLogitsLoss - independently calculates loss for each class
                

                loss_history.append(loss.item())

                running_loss += (loss_history[-1] - running_loss) / (i + 1) # Compute rolling average
                
                running_loss_history.append(running_loss)

                # logits : [batch_size, num_classes] and each of the values in logits can be anything (-infinity, +infity)
                # Converts the raw outputs into probabilities for each class using softmax
                probs = F.softmax(logits, dim=-1)
                # probs shape: (batch_size, num_classes)
                # -1 dimension picks the last dimension in the shape of the tensor, in this case 'num_classes'
                
                predictions = []
                for i, paragraph in enumerate(probs):
                  predictions.append([])
                  max_score, max_index = 0, 0
                  start_indices, end_indices = [], []
                  for j, sent in enumerate(paragraph[:len(batch[8][i])]):
                    max_sent_score, max_start_index, max_end_index = float('-inf'), 0, 0
                    sent_len = len(batch[8][i][j])+1
                    for k, start_token in enumerate(sent[:sent_len]):
                      for l, end_token in enumerate(sent[k:sent_len]):
                        p = math.log2(start_token[1]) + math.log2(end_token[2])
                        if p > max_sent_score:
                          max_sent_score = p
                          max_start_index, max_end_index = k, k+l
                    start_indices.append(max_start_index)
                    end_indices.append(max_end_index)
                    if max_sent_score > max_score:
                      max_score, max_index = max_sent_score, j
                  try:
                    predictions[i] = batch[8][i][max_index][start_indices[max_index]-1:end_indices[max_index]]
                  except IndexError:
                    predictions[i] = ""

                batch_wise_true_labels.append(batch[10])
                batch_wise_predictions.append(predictions)
        
        all_true_labels = list(chain.from_iterable(batch_wise_true_labels))
        all_predictions = list(chain.from_iterable(batch_wise_predictions))
        precision, recall, f1 = MultiClassTrainer.compute_f1(all_true_labels, all_predictions)

        # Now we can generate a classification report
        print("Classification report after epoch:")
        print("Precision: {}, Recall: {}, F-1: {}".format(precision, recall, f1))

        return loss_history, running_loss_history

    def baseline_evaluate(self, loader):
        """
        Evaluate the model on a validation set
        """

        batch_wise_true_labels = []
        batch_wise_predictions = []

        loss_history = []
        running_loss = 0.
        running_loss_history = []

        with torch.no_grad(): # Disable gradient computation - required only during training
            for i, batch in tqdm(enumerate(loader)):
                # batch[0] shape: (batch_size, input_size)

                predictions = []
                for i, paragraph in enumerate(batch[8]):
                  predictions.append([])
                  for j, sent in enumerate(paragraph):
                    predictions[i] += sent

                batch_wise_true_labels.append(batch[10])
                batch_wise_predictions.append(predictions)
        
        all_true_labels = list(chain.from_iterable(batch_wise_true_labels))
        all_predictions = list(chain.from_iterable(batch_wise_predictions))
        precision, recall, f1 = MultiClassTrainer.compute_f1(all_true_labels, all_predictions)

        # Now we can generate a classification report
        print("Classification report after epoch:")
        print("Precision: {}, Recall: {}, F-1: {}".format(precision, recall, f1))

        return loss_history, running_loss_history
    
    @staticmethod
    def compute_f1(a_gold, a_pred):
      def normalize_answer(s):
        """Lower text and remove punctuation, articles and extra whitespace."""
        def remove_articles(text):
          regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
          return re.sub(regex, ' ', text)
        def white_space_fix(text):
          return ' '.join(text.split())
        def remove_punc(text):
          exclude = set(string.punctuation)
          return ''.join(ch for ch in text if ch not in exclude)
        def lower(text):
          return text.lower()
        return white_space_fix(remove_articles(remove_punc(lower(s))))
      
      def get_tokens(s):
        if not s: return []
        return normalize_answer(s).split()
      
      total_precision, total_recall, total_f1 = 0, 0, 0
      for g, p in zip(a_gold, a_pred):
        gold_toks = get_tokens(" ".join(g))
        pred_toks = get_tokens(" ".join(p))
        common = Counter(gold_toks) & Counter(pred_toks)
        num_same = sum(common.values())
        if len(gold_toks) == 0 or len(pred_toks) == 0:
          precision = recall = f1 = int(gold_toks == pred_toks)
        else:
          precision = 1.0 * num_same / len(pred_toks)
          recall = 1.0 * num_same / len(gold_toks)
          f1 = 0 if (precision == 0 and recall == 0) else (2 * precision * recall) / (precision + recall)
        total_precision += precision
        total_recall += recall
        total_f1 += f1
      return (total_precision/len(a_gold), total_recall/len(a_gold), total_f1/len(a_gold))


    def get_model_dict(self):
        return self.model.state_dict()

    def run_training(self, train_loader, valid_loader, n_epochs=10):
        # Useful for us to review what experiment we're running
        # Normally, you'd want to save this to a file
        self._print_summary()

        train_losses = []
        train_running_losses = []

        valid_losses = []
        valid_running_losses = []

        for i in range(n_epochs):
            loss_history, running_loss_history = self.train(train_loader[i])
            valid_loss_history, valid_running_loss_history = self.evaluate(valid_loader[i])

            train_losses.append(loss_history)
            train_running_losses.append(running_loss_history)

            valid_losses.append(valid_loss_history)
            valid_running_losses.append(valid_running_loss_history)

        # Training done, let's look at the loss curves
        all_train_losses = list(chain.from_iterable(train_losses))
        all_train_running_losses = list(chain.from_iterable(train_running_losses))

        all_valid_losses = list(chain.from_iterable(valid_losses))
        all_valid_running_losses = list(chain.from_iterable(valid_running_losses))

        train_epoch_idx = range(len(all_train_losses))
        valid_epoch_idx = range(len(all_valid_losses))
        # sns.lineplot(epoch_idx, all_losses)
        sns.lineplot(train_epoch_idx, all_train_running_losses)

        sns.lineplot(valid_epoch_idx, all_valid_running_losses)
        plt.show()

    def baseline_run_training(self, train_loader, valid_loader, n_epochs=10):
        # Useful for us to review what experiment we're running
        # Normally, you'd want to save this to a file
        self._print_summary()

        for i in range(n_epochs):
            valid_loss_history, valid_running_loss_history = self.baseline_evaluate(valid_loader[i])

# **Training**

In [None]:

# Build the network from the pretrained vectors held in `w2v_weights`.
# embedding_dim=100 presumably matches the width of those vectors — TODO confirm.
model = net(w2v_weights=torch.FloatTensor(w2v_weights.vectors), embedding_dim=100)
# Adam over every model parameter, learning rate 1e-2.
optimizer = optim.Adam(model.parameters(), lr=1e-2)
# Three-class cross-entropy: targets equal to -1 are ignored, and classes 1 and 2
# are weighted 50x relative to class 0.
# NOTE(review): the class weights are a DoubleTensor. If the model emits float32
# logits this loss may raise a dtype mismatch once it is actually used — confirm.
loss_fn = nn.CrossEntropyLoss(ignore_index=-1, weight=torch.DoubleTensor([1, 50, 50]))
trainer = MultiClassTrainer(model, optimizer, loss_fn, log_every_n=10)
# Only the baseline is scored here: baseline_run_training never calls train(), and
# `train_loader` is passed through but unused. `valid_loader` is indexed once per
# epoch (default 10 epochs), so it must hold at least that many loaders.
trainer.baseline_run_training(train_loader, valid_loader)

net(
  (embedding): Embedding(400004, 100)
  (frozen_embedding): Embedding(400004, 100)
  (lstm): GRU(154, 100, batch_first=True, dropout=0.5, bidirectional=True)
  (lstm2): GRU(100, 100, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Bilinear(in1_features=200, in2_features=200, out_features=3, bias=True)
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    weight_decay: 0
)
CrossEntropyLoss()


  "num_layers={}".format(dropout, num_layers))


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Classification report after epoch:
Precision: 0.030716225382990466, Recall: 0.9946056047989973, F-1: 0.05773136959165499


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))