In [7]:
!pip install transformers
!pip install datasets
     
import numpy as np
import pandas as pd
import nltk

import json
from transformers import AutoTokenizer, BertModel,DistilBertModel, DistilBertTokenizer

# NLTK resources used by StanfordQADataset.preprocess: 'punkt' backs
# word_tokenize, 'stopwords' backs stop-word removal and 'wordnet' backs
# WordNetLemmatizer (needed as soon as lemmatization=True is requested).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simpl

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
#Global variables

# Base folder for the input JSON files and for the CSV / tensor files this
# notebook saves (presumably a mounted Google Drive path - confirm before running).
ROOT_PATH = "drive/MyDrive/NLP_Project/"
# Training and validation question-answering files, both located under ROOT_PATH
TRAIN_PATH_JSON = ROOT_PATH+'train-v1.1.json'
VAL_PATH_JSON = ROOT_PATH+'dev-v1.1.json'
# Name of the pretrained checkpoint; the BERT alternative is kept for reference
# TOKEN_PRETRAINED = 'bert-base-uncased'
TOKEN_PRETRAINED = 'distilbert-base-uncased'

In [None]:
import pandas as pd
import numpy as np
import nltk
import torch
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

class StanfordQADataset:
    """Loads a SQuAD-format JSON file into a DataFrame and derives text features from it.

    ``self.data`` holds one row per question with the columns ``id``,
    ``context``, ``question``, ``answer_text`` and ``answer_start``;
    ``find_end_index`` adds ``new_answer_start`` and ``answer_end``.
    """

    # English stop-word set, built on first use and shared by all instances so
    # the NLTK corpus is not re-read for every row that is preprocessed.
    _stop_words = None

    def __init__(self, file_path,data = None):
        """
        Args:
            file_path: path of the JSON file read by ``load_data``.
            data: optional, already built DataFrame; when it is given the
                other methods use it instead of loading ``file_path``.
        """
        self.file_path = file_path
        self.data = data

    def load_data(self):
        """Read ``self.file_path`` and store one row per answerable question in ``self.data``.

        Only the first listed answer of each question is kept. Questions with
        an empty ``answers`` list are skipped because they carry no span.
        """
        import json  # local import: the defining cell does not import json itself

        # Explicit UTF-8 so the result does not depend on the platform default encoding
        with open(self.file_path, 'r', encoding='utf-8') as f:
            squad_dict = json.load(f)

        squad_data = []
        for article in squad_dict['data']:
            for paragraph in article['paragraphs']:
                for qa in paragraph['qas']:
                    answers = qa.get('answers') or []
                    if not answers:
                        continue
                    first_answer = answers[0]
                    squad_data.append({
                        'id': qa['id'],
                        'context': paragraph['context'],
                        'question': qa['question'],
                        'answer_text': first_answer['text'],
                        'answer_start': first_answer['answer_start']
                    })

        # Naming the columns keeps the schema intact even when no row was collected
        self.data = pd.DataFrame(
            squad_data,
            columns=['id', 'context', 'question', 'answer_text', 'answer_start'],
        )

    def preprocess(self, text, tokenize=True, remove_stopwords=False, stemming=False, lemmatization=False):
        """Normalise one string and return its tokens joined by single spaces.

        Args:
            text: the string to process.
            tokenize: use NLTK ``word_tokenize``; when False the text is split
                on whitespace instead, so the later steps always see tokens
                (never the individual characters of a string).
            remove_stopwords: drop English stop words (case-insensitive).
            stemming: apply the Porter stemmer to every token.
            lemmatization: apply the WordNet lemmatizer to every token.
        """
        tokens = word_tokenize(text) if tokenize else text.split()
        if remove_stopwords:
            if StanfordQADataset._stop_words is None:
                StanfordQADataset._stop_words = frozenset(stopwords.words('english'))
            stop_words = StanfordQADataset._stop_words
            tokens = [word for word in tokens if word.lower() not in stop_words]
        if stemming:
            stemmer = PorterStemmer()
            tokens = [stemmer.stem(word) for word in tokens]
        if lemmatization:
            lemmatizer = WordNetLemmatizer()
            tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(tokens)

    def find_end_index(self):
        """Add ``new_answer_start`` and ``answer_end`` (exclusive) columns to ``self.data``.

        The labelled start is tried first, then shifted left by one and by two
        characters, and the first position where the context really contains
        the answer wins. When none matches, the labelled start is kept so that
        every row still receives a value and the columns stay aligned with the
        frame.

        Returns:
            ``self.data`` (modified in place).
        """
        data = self.data
        new_start_idxs = []
        end_idxs = []
        rows = zip(
            data['answer_start'].tolist(),
            data['answer_text'].tolist(),
            data['context'].tolist(),
        )
        for start, answer, context in rows:
            length = len(answer)
            aligned = start  # fallback: trust the labelled offset
            for shift in (0, 1, 2):
                candidate = start - shift
                # A negative start would make the slice wrap around the string
                if candidate >= 0 and context[candidate:candidate + length] == answer:
                    aligned = candidate
                    break
            new_start_idxs.append(aligned)
            end_idxs.append(aligned + length)

        data['new_answer_start'] = new_start_idxs
        data['answer_end'] = end_idxs
        return data

    def preprocess_data(self, tokenize=True, remove_stopwords=False, stemming=False, lemmatization=False):
        """Compute the answer offsets, then normalise the question and context columns.

        NOTE(review): the offsets are character positions in the raw context,
        but ``context`` is overwritten with the normalised text afterwards, so
        they no longer index into the stored column.
        """
        if self.data is None:
            self.load_data()
        self.data = self.find_end_index()
        for column in ('question', 'context'):
            self.data[column] = self.data[column].apply(
                lambda x: self.preprocess(x, tokenize, remove_stopwords, stemming, lemmatization)
            )

    def get_word_embeddings(self, embedding_type='count', vocabulary_size=None, embedding_size=None):
        """Vectorise "question + context" for every row.

        Args:
            embedding_type: ``'count'``, ``'tfidf'`` or ``'word2vec'``.
            vocabulary_size: upper bound on the vocabulary (``max_features``
                for the sklearn vectorizers, ``max_final_vocab`` for Word2Vec).
            embedding_size: Word2Vec vector length; required for ``'word2vec'``.

        Returns:
            ``(X, labels)`` where ``X`` is a dense ``(rows, features)`` array and
            ``labels`` is ``{'start': ..., 'end': ...}`` holding the aligned
            answer offsets of ``self.data``.

        Raises:
            ValueError: unknown ``embedding_type`` or missing ``embedding_size``.
        """
        if self.data is None:
            self.load_data()
        if 'answer_end' not in self.data.columns:
            # Labels are derived columns; build them when preprocess_data was not run
            self.find_end_index()

        # The separator keeps the last question word apart from the first context word
        corpus = self.data['question'] + ' ' + self.data['context']

        if embedding_type == 'count':
            vectorizer = CountVectorizer(max_features=vocabulary_size)
            X = vectorizer.fit_transform(corpus).toarray()
        elif embedding_type == 'tfidf':
            vectorizer = TfidfVectorizer(max_features=vocabulary_size)
            X = vectorizer.fit_transform(corpus).toarray()
        elif embedding_type == 'word2vec':
            if embedding_size is None:
                raise ValueError('embedding_size is required for word2vec embeddings.')
            sentences = [word_tokenize(text) for text in corpus]
            w2v_options = dict(window=5, min_count=1, max_final_vocab=vocabulary_size, workers=4)
            try:
                model = Word2Vec(sentences, vector_size=embedding_size, **w2v_options)
            except TypeError:
                # gensim < 4 names the same argument `size`
                model = Word2Vec(sentences, size=embedding_size, **w2v_options)
            X = np.zeros((len(sentences), embedding_size))
            for i, sentence in enumerate(sentences):
                for word in sentence:
                    # Membership on the KeyedVectors works in gensim 3 and 4
                    if word in model.wv:
                        X[i] += model.wv[word]
        else:
            raise ValueError('Invalid embedding type.')
        return X, {'start': self.data['new_answer_start'], 'end': self.data['answer_end']}


In [None]:
class LoadData():
  """Builds the train / validation / test DataFrames from two question-answering JSON files."""

  def __init__(self, train_path: str, val_path: str):
    """
    Args:
      train_path: JSON file that is split into the train and test partitions.
      val_path: JSON file used as the validation partition.
    """
    self.train_path = train_path
    self.val_path = val_path

  def preprocess(self, data_path):
    """Load `data_path` and return its DataFrame after tokenising, stop-word removal and stemming."""
    process_dataset = StanfordQADataset(data_path)
    process_dataset.load_data()

    process_dataset.preprocess_data(tokenize=True, remove_stopwords=True, stemming=True, lemmatization=False)
    return process_dataset.data

  def load_data(self,trainp = 0.8):
    """Preprocess both files and split the training file.

    Args:
      trainp: fraction of the training file that goes to the train partition;
        the remaining rows form the test partition.

    Returns:
      (train_data, val_data, test_data) - note the validation frame is second.
    """
    temp_data = self.preprocess(self.train_path)
    val_data = self.preprocess(self.val_path)
    # Fixed seed keeps the split reproducible between runs
    train_data = temp_data.sample(frac=trainp, random_state=42)
    test_data = temp_data.drop(train_data.index)

    print(f'Train Data shape: {train_data.shape}')
    print(f'Test Data shape: {test_data.shape}')
    print(f'Validation Data shape: {val_data.shape}')
    return train_data, val_data, test_data

In [None]:
# Only stores the two paths; nothing is read until load_data() is called
obj = LoadData(TRAIN_PATH_JSON, VAL_PATH_JSON)

In [None]:
# Preprocess both files and split the training file into train and test partitions
train_data, val_data, test_data = obj.load_data()

In [None]:
# Save the three splits as CSV under ROOT_PATH so they can be reloaded later
# without repeating the preprocessing (the DataFrame index is not written).
train_data.to_csv(ROOT_PATH+'train.csv', index=False)
test_data.to_csv(ROOT_PATH+'test.csv', index=False)
val_data.to_csv(ROOT_PATH+'val.csv', index=False)

In [None]:
# Reload the splits from ROOT_PATH, the folder the CSV files are written to
train_data = pd.read_csv(ROOT_PATH+'train.csv')
test_data  = pd.read_csv(ROOT_PATH+'test.csv')
val_data = pd.read_csv(ROOT_PATH+'val.csv')

In [None]:
# Sanity check: (rows, columns) of each split
print(train_data.shape)
print(test_data.shape)
print(val_data.shape)

(70079, 7)
(17520, 7)
(10570, 7)


In [None]:
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [None]:
import pandas as pd
import numpy as np
import torch
import transformers

# Load the DistilBERT tokenizer and encoder for the checkpoint configured in
# TOKEN_PRETRAINED, so changing that constant actually changes what is loaded.
tokenizer = transformers.DistilBertTokenizer.from_pretrained(TOKEN_PRETRAINED)
model = transformers.DistilBertModel.from_pretrained(TOKEN_PRETRAINED).to(device)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### BERT Embedder

In [5]:
import tensorflow as tf
from multiprocessing import Pool
import functools
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

class Preprocess():
  """Turns the train / validation / test DataFrames into token tensors and pooled embeddings.

  Methods that take `data_fl` pick the split with it:
  1 = train, 2 = validation, any other value = test.
  """

  def __init__(self,train_data, valid_data, test_data):
    super(Preprocess,self).__init__()
    self.train_data = train_data
    self.valid_data = valid_data
    self.test_data = test_data

  def _select(self, data_fl):
    """Return the DataFrame addressed by `data_fl`."""
    if data_fl == 1:
      return self.train_data
    if data_fl == 2:
      return self.valid_data
    return self.test_data

  def convertdatatype(self, data_fl = 1):
    """Drop rows with missing values from the selected split and cast its text columns to 'string'.

    The cleaned frame replaces the split stored on the object and is also
    returned; the frame that was passed to the constructor is not modified.
    """
    data = self._select(data_fl).dropna().astype({
      'context': 'string',
      'question': 'string',
      'answer_text': 'string',
    })
    if data_fl == 1:
      self.train_data = data
    elif data_fl == 2:
      self.valid_data = data
    else:
      self.test_data = data
    return data

  def tokenizedata(self,tokenizer, batch_size= 1000, data_fl = 1):
    """Tokenize (question, context) pairs of the selected split in chunks of `batch_size` rows.

    Returns:
      A list with one tokenizer output per chunk; each chunk is padded to its
      own longest sequence.

    Raises:
      RuntimeError: the tokenizer failed on a chunk. The error is raised here
        (chained to the cause) rather than replaced by a placeholder tensor,
        which pad_tensors could not process.
    """
    data = self._select(data_fl)
    questions = data["question"].tolist()
    contexts = data["context"].tolist()
    tokenized_data = []
    for i in range(0, len(questions), batch_size):
      try:
        inputs = tokenizer(questions[i:i+batch_size], contexts[i:i+batch_size], padding=True, truncation=True, return_tensors='pt')
      except Exception as e:
        raise RuntimeError(f'Tokenization failed for rows {i} to {i + batch_size - 1}') from e
      tokenized_data.append(inputs)
      print(f'{i} Data rows tokenized')
    return tokenized_data

  def pad_tensors(self,tensors_list):
    """Right-pad every chunk to a common (even) length and concatenate the chunks row-wise.

    Args:
      tensors_list: list of mappings key -> 2-D tensor, as returned by tokenizedata.

    Returns:
      dict key -> one tensor holding all rows. Padding uses the value 0
      (assumes 0 is the tokenizer's pad id - TODO confirm for other checkpoints).
    """
    new_inputs = {}
    for key in list(tensors_list[0].keys()):
      max_length = max(t[key].shape[1] for t in tensors_list)
      # Round an odd maximum up to the next even number
      max_length = max_length + (max_length % 2)
      print(f'MAX length of {key} key is: {max_length}')
      padded_tensors = [
        F.pad(t[key], (0, max_length - t[key].shape[1]), value=0)
        for t in tensors_list
      ]
      new_inputs[key] = torch.cat(padded_tensors, dim=0)
    return new_inputs

  def train_embed_model(self,padded_data,model, batch_size = 100):
    """Run `model` over the padded tokens and mean-pool its last hidden state per row.

    Args:
      padded_data: dict with 'input_ids' and 'attention_mask' tensors.
      model: encoder whose output exposes `last_hidden_state`.
      batch_size: rows per forward pass.

    Returns:
      A list of (rows_in_batch, hidden) tensors, one per batch. Only positions
      with attention_mask == 1 contribute to the mean, so padding does not
      dilute the embedding, and a batch of one row keeps its leading dimension.
    """
    torch.cuda.empty_cache()
    # Follow the model's own device instead of depending on a global variable
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    tokens_tensor = padded_data['input_ids']
    attention_tensors = padded_data['attention_mask']
    size = attention_tensors.shape[0]
    embeddings = []
    for i in range(0,size, batch_size):
      if i%1000 == 0:
        print(f'{i} Iterations completed')
      # Move one batch at a time so the whole token matrix need not sit on the device
      input_ids = tokens_tensor[i:i+batch_size].to(target_device)
      attention = attention_tensors[i:i+batch_size].to(target_device)
      with torch.no_grad():
        hidden = model(input_ids, attention_mask=attention).last_hidden_state
        mask = attention.unsqueeze(-1).to(hidden.dtype)
        # clamp avoids a division by zero for a row that is entirely padding
        batch_embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
      embeddings.append(batch_embeddings)
    return embeddings

  def pad_indices(self, embed_data_size, data_fl = 1):
    """Return the answer start/end offsets of the selected split as tensors, zero-padded to `embed_data_size`.

    The aligned 'new_answer_start' column is used when the frame has it, so
    the start is consistent with 'answer_end'; otherwise 'answer_start' is used.
    """
    data = self._select(data_fl)
    start_column = 'new_answer_start' if 'new_answer_start' in data.columns else 'answer_start'
    start_idx = data[start_column].tolist()
    end_idx = data['answer_end'].tolist()
    pad_zero_len = embed_data_size-len(start_idx)
    new_start_idx = start_idx + [0]*pad_zero_len
    new_end_idx = end_idx + [0]*pad_zero_len

    return torch.tensor(new_start_idx), torch.tensor(new_end_idx)

  def return_padded_embed(self, embed_data):
    '''
    Zero-pads every 2-D tensor in `embed_data` along its first dimension up to
    the largest first dimension in the list, then stacks them into one
    (n_tensors, max_rows, features) tensor.
    '''
    max_length = max(tensor.shape[0] for tensor in embed_data)
    padded_tensors = [
      # (0, 0) leaves the last dimension alone; (0, k) appends k zero rows
      F.pad(tensor, (0, 0, 0, max_length - tensor.shape[0]), value=0)
      for tensor in embed_data
    ]
    padded_tensors = torch.stack(padded_tensors, dim=0)
    print(f'Shape of the padded tensor is: {padded_tensors.shape}')
    return padded_tensors


In [None]:
# Argument order is (train, validation, test)
pre_process_obj = Preprocess(train_data, val_data, test_data)

In [None]:
#Train Data
# The cleaned frame is stored on the object; read it back from there so that
# train_data is never overwritten with the method's return value.
pre_process_obj.convertdatatype(data_fl=1)
train_data = pre_process_obj.train_data
# No TensorFlow device scope: it has no effect on PyTorch tensors or modules.
train_inputs = pre_process_obj.tokenizedata(tokenizer, batch_size = 1000, data_fl = 1)
train_padded = pre_process_obj.pad_tensors(train_inputs)
embed_train = pre_process_obj.train_embed_model(train_padded, model, batch_size = 300)

In [None]:
#Validation Data
# The cleaned frame is stored on the object; read it back from there so that
# val_data is never overwritten with the method's return value.
pre_process_obj.convertdatatype(data_fl=2)
val_data = pre_process_obj.valid_data
# No TensorFlow device scope: it has no effect on PyTorch tensors or modules.
val_inputs = pre_process_obj.tokenizedata(tokenizer,batch_size = 1000, data_fl = 2)
val_padded = pre_process_obj.pad_tensors(val_inputs)
embed_val = pre_process_obj.train_embed_model(val_padded, model, batch_size = 300)

In [None]:
#Test Data
# The cleaned frame is stored on the object; read it back from there so that
# test_data is never overwritten with the method's return value.
pre_process_obj.convertdatatype(data_fl=3)
test_data = pre_process_obj.test_data
# No TensorFlow device scope: it has no effect on PyTorch tensors or modules.
test_inputs = pre_process_obj.tokenizedata(tokenizer,batch_size = 1000, data_fl = 3)
test_padded = pre_process_obj.pad_tensors(test_inputs)
# The embedding method is train_embed_model; Preprocess has no train_model
embed_test = pre_process_obj.train_embed_model(test_padded, model, batch_size = 300)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


0 Data rows tokenized

1000 Data rows tokenized

2000 Data rows tokenized

3000 Data rows tokenized

4000 Data rows tokenized

5000 Data rows tokenized


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


6000 Data rows tokenized

7000 Data rows tokenized

8000 Data rows tokenized

9000 Data rows tokenized


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


10000 Data rows tokenized


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


11000 Data rows tokenized

12000 Data rows tokenized


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


13000 Data rows tokenized

14000 Data rows tokenized

15000 Data rows tokenized

16000 Data rows tokenized

17000 Data rows tokenized

MAX length of input_ids key is: 512

MAX length of token_type_ids key is: 512

MAX length of attention_mask key is: 512

0 Iterations completed

3000 Iterations completed

6000 Iterations completed

9000 Iterations completed

12000 Iterations completed

15000 Iterations completed


In [None]:
# Turn each list of per-batch embedding tensors into a single stacked tensor
embed_train = pre_process_obj.return_padded_embed(embed_train)
embed_test = pre_process_obj.return_padded_embed(embed_test)
embed_val = pre_process_obj.return_padded_embed(embed_val)

Shape of the padded tensor is: torch.Size([234, 300, 768])


In [None]:
import torch
torch.cuda.empty_cache()

# Save the embedded tensors under ROOT_PATH so they can be reloaded if a later
# step fails. They are detached and moved to the CPU before being written.
torch.save(embed_train.detach().cpu(), ROOT_PATH +'train_embed_new.pt')
torch.save(embed_test.detach().cpu(), ROOT_PATH +'test_embed_new.pt')
torch.save(embed_val.detach().cpu(), ROOT_PATH +'val_embed_new.pt')


In [None]:
# Reload the embeddings from ROOT_PATH, the folder the .pt files are written to
embed_train = torch.load(ROOT_PATH +'train_embed_new.pt')
embed_val = torch.load(ROOT_PATH +'val_embed_new.pt')
embed_test = torch.load(ROOT_PATH +'test_embed_new.pt')

In [None]:
# Pad the label lists to batches * rows-per-batch so they cover every (padded) embedding row
train_start_idx, train_end_idx = pre_process_obj.pad_indices(data_fl = 1, embed_data_size = embed_train.shape[0]*embed_train.shape[1])

In [None]:
# embed_data_size is passed by keyword: a positional argument may not follow data_fl=...
val_start_idx, val_end_idx = pre_process_obj.pad_indices(data_fl = 2, embed_data_size = embed_val.shape[0]*embed_val.shape[1])

In [None]:
import torch.nn as nn

class BiLSTM(nn.Module):
    """Bidirectional LSTM followed by a two-layer head that emits start and end logits per position."""

    def __init__(self, input_size, hidden_size, num_layers):
        """
        Args:
            input_size: number of features per sequence position.
            hidden_size: LSTM hidden size per direction.
            num_layers: number of stacked LSTM layers.
        """
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        # Both directions are concatenated, hence hidden_size * 2 inputs
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 2)

    def forward(self, input):
        """
        Args:
            input: tensor of shape (batch, seq_len, input_size).

        Returns:
            (start_logits, end_logits), each of shape (batch, seq_len).
        """
        # With no state passed, nn.LSTM starts from zeros created on the input's
        # own device and dtype, so the module does not depend on a global `device`.
        out, _ = self.bilstm(input)
        out = nn.functional.relu(self.fc1(out))
        out = self.fc2(out)
        # The last dimension holds the two heads: index 0 = start, 1 = end
        start_logits, end_logits = out.unbind(dim=-1)

        return start_logits, end_logits


In [None]:
# Define the hyperparameters
# NOTE(review): train_BiLSTM_Bert takes parameters with these same names and
# its visible calls pass explicit values, so these module-level values do not
# appear to be read by the training code in this notebook - confirm or remove.
torch.cuda.empty_cache()
input_size = 768
hidden_size = 128
num_layers = 2
lr = 1e-3
num_epochs = 10
batch_size = 32

# Turns cuDNN off for every later torch operation in this kernel
torch.backends.cudnn.enabled = False

def train_BiLSTM_Bert(embed_train, train_start_idx, train_end_idx, embed_val, val_start_idx, val_end_idx,
                      input_size=768, hidden_size=128, num_layers=2, lr=1e-3, num_epochs=10, batch_size=5):
    """Train a BiLSTM span predictor on pre-computed embeddings and print the loss of every epoch.

    Args:
        embed_train, embed_val: feature tensors, sliced along dimension 0 into batches.
        train_start_idx, train_end_idx, val_start_idx, val_end_idx: integer
            target positions, sliced with the same offsets as the features.
        input_size, hidden_size, num_layers: BiLSTM configuration.
        lr: Adam learning rate.
        num_epochs: number of passes over `embed_train`.
        batch_size: rows per optimisation step.

    Returns:
        The trained BiLSTM, so the caller can evaluate or save it.

    NOTE(review): CrossEntropyLoss requires every target to be smaller than
    the number of positions the model scores (logits.shape[1]); verify that
    the supplied indices satisfy this.
    """
    torch.cuda.empty_cache()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = BiLSTM(input_size, hidden_size, num_layers)
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    def _batch_loss(features, starts, ends, offset):
        """Return (start loss + end loss, number of rows) for the batch beginning at `offset`."""
        inputs = features[offset:offset+batch_size].to(device=device, dtype=torch.float32)
        # Class-index targets must be int64; a float target would be read as
        # per-class probabilities and rejected for its shape.
        start_targets = starts[offset:offset+batch_size].long().to(device)
        end_targets = ends[offset:offset+batch_size].long().to(device)
        start_logits, end_logits = model(inputs)
        loss = criterion(start_logits, start_targets) + criterion(end_logits, end_targets)
        return loss, inputs.size(0)

    for epoch in range(num_epochs):
        train_loss = 0.0
        val_loss = 0.0

        model.train()
        for i in range(0, len(embed_train), batch_size):
            optimizer.zero_grad()
            loss, n_rows = _batch_loss(embed_train, train_start_idx, train_end_idx, i)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch value is a per-row average
            train_loss += loss.item() * n_rows
        train_loss /= len(embed_train)

        model.eval()
        with torch.no_grad():
            for i in range(0, len(embed_val), batch_size):
                loss, n_rows = _batch_loss(embed_val, val_start_idx, val_end_idx, i)
                val_loss += loss.item() * n_rows
        val_loss /= len(embed_val)

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}')

    return model

In [None]:
# Train on the encoder embeddings; input_size=768 is the last dimension of embed_train printed above
train_BiLSTM_Bert(embed_train, train_start_idx, train_end_idx, embed_val, val_start_idx, val_end_idx, 
                      input_size=768, hidden_size=128, num_layers=2, lr=1e-3, num_epochs=10, batch_size=5)

### GloVe Embedding:

In [2]:
from torchtext.vocab import GloVe

# 100-dimensional GloVe "6B" vectors; the archive is downloaded to .vector_cache when it is not cached yet
glove = GloVe(name='6B', dim=100)

def embed_text(text):
  """Return the mean GloVe vector of the whitespace-separated tokens of `text`.

  Tokens missing from the GloVe vocabulary are ignored and do not count
  towards the mean, so they cannot shrink the result towards zero. A text
  without any known token yields a zero vector.

  Returns:
    1-D float32 tensor whose length is the width of `glove.vectors`.
  """
  # Take the width from the loaded vectors instead of hard-coding 100
  embedding = torch.zeros(glove.vectors.shape[1], dtype=torch.float32)
  matched = 0
  for token in text.split():
      index = glove.stoi.get(token)
      if index is not None:
          embedding += glove.vectors[index]
          matched += 1
  if matched:
      embedding /= matched
  return embedding


.vector_cache/glove.6B.zip: 862MB [02:40, 5.36MB/s]                           
100%|█████████▉| 399999/400000 [00:16<00:00, 24366.84it/s]


In [None]:
#Train data
# Series.apply yields one 1-D tensor per row; torch.cat needs tensors, so each
# column is stacked into a (rows, dim) matrix before the two are joined.
train_q_glove = torch.stack(train_data['question'].apply(embed_text).tolist())
train_c_glove = torch.stack(train_data['context'].apply(embed_text).tolist())
train_glove = torch.cat((train_q_glove, train_c_glove), dim=1)

In [None]:
#Val data
# Series.apply yields one 1-D tensor per row; torch.cat needs tensors, so each
# column is stacked into a (rows, dim) matrix before the two are joined.
val_q_glove = torch.stack(val_data['question'].apply(embed_text).tolist())
val_c_glove = torch.stack(val_data['context'].apply(embed_text).tolist())
val_glove = torch.cat((val_q_glove, val_c_glove), dim=1)

In [None]:
# input_size must equal the feature width of the GloVe matrix, not the 768 used for the encoder embeddings.
# NOTE(review): train_glove is 2-D (rows, features) while the BiLSTM slices its
# input as sequences; confirm the intended sequence layout before training.
train_BiLSTM_Bert(train_glove, train_start_idx, train_end_idx, val_glove, val_start_idx, val_end_idx,
                      input_size=train_glove.shape[-1], hidden_size=128, num_layers=2, lr=1e-3, num_epochs=10, batch_size=5)

### Tf-Idf Vectorizer

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
# NOTE(review): no max_features is set and the matrices are converted with
# .toarray() further down; the dense size grows with rows x vocabulary - check memory.
tfidf = TfidfVectorizer()

In [None]:
# Fit a single vocabulary on the training text only, then project questions
# and contexts with it so both halves (and any later split transformed with
# the same vectorizer) live in one feature space.
train_questions = train_data['question'].tolist()
train_contexts = train_data['context'].tolist()
tfidf.fit(train_questions + train_contexts)

q_train_tfidf = torch.from_numpy(tfidf.transform(train_questions).toarray()).float()
c_train_tfidf = torch.from_numpy(tfidf.transform(train_contexts).toarray()).float()

train_tfidf = torch.cat((q_train_tfidf, c_train_tfidf), dim=1)

In [None]:
# Reuse the vectorizer fitted on the training split: refitting on validation
# text would build a different vocabulary (different columns) and leak
# validation statistics into the features.
q_val_tfidf = tfidf.transform(val_data['question'].tolist())
c_val_tfidf = tfidf.transform(val_data['context'].tolist())

q_val_tfidf = torch.from_numpy(q_val_tfidf.toarray()).float()
c_val_tfidf = torch.from_numpy(c_val_tfidf.toarray()).float()

val_tfidf = torch.cat((q_val_tfidf, c_val_tfidf), dim=1)

In [None]:
# input_size must equal the feature width of the TF-IDF matrix, not the 768 used for the encoder embeddings.
# NOTE(review): train_tfidf is 2-D (rows, features) while the BiLSTM slices its
# input as sequences; confirm the intended sequence layout before training.
train_BiLSTM_Bert(train_tfidf, train_start_idx, train_end_idx, val_tfidf, val_start_idx, val_end_idx,
                      input_size=train_tfidf.shape[-1], hidden_size=128, num_layers=2, lr=1e-3, num_epochs=10, batch_size=5)