In [622]:
from typing import Sequence, Iterable, Union, Tuple

import pandas as pd
import numpy as np
import pathlib
import os
import random
import nlpaug
import nlpaug.augmenter.word as naw
import translators as ts
import nltk
import torchb

from transformers import BertTokenizer, BertForPreTraining
from collections import defaultdict
from tqdm import tqdm
from transformers import AdamW
from scipy.spatial.distance import cosine

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gedas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gedas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [591]:
# Settings

CORPUS_PATH=pathlib.Path().absolute().joinpath('corpus')
QNA_PATH=pathlib.Path().absolute().joinpath('questions')
MODEL_PATH=pathlib.Path().absolute().joinpath('models')

stopwords=nltk.corpus.stopwords.words('english')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForPreTraining.from_pretrained('bert-base-uncased',output_hidden_states=True)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 0. Utility functions

In [3]:
def txt_to_lists(path:Union[pathlib.PurePath,str],to_sentence:bool=True):
    
    """Function takes path with text files as argument and returns list of lists one list per text file."""
    
    #1. Read text int list of lists
    text=[]
    for f in os.listdir(path):
        if f.split('.')[-1]=='txt':
            r = open(pathlib.Path(path).joinpath(f), "r",encoding='utf-8').readlines()
            if to_sentence:
                r=nltk.tokenize.sent_tokenize(' '.join(r))
            text.append(r)
        else:
            print('File {} had been skipped due to unsuported extension.'.format(f))
    return text

In [4]:
def text_list_to_sent_pairs(txt_list:list):
    
    """Convert list of texts into lists of subsequent sentences. (left-> first, right->subsequent). Returns lists of tuples."""
    
    sent_pairs_all=[]
    # Loop over text and create two list of tuples like [(sentence, subsequent sentence),.....]
    for t in txt_list:
        txt_list_1=t[:-1]
        txt_list_2=t[1:]
        sent_pairs_temp=[tuple((t1,t2)) for t1,t2 in zip(txt_list_1,txt_list_2)]
        sent_pairs_all.append(sent_pairs_temp)
            
    return sent_pairs_all

In [5]:
def sent_pairs_to_random_pairs(text_list:list,text_resample_size:Union[float,int]=1.0,sent_resample_size:Union[float,int]=1.0,n_resamples:int=5):
    
    """Function to resample list of texts (output from text_list_to_sent_pairs) with subsequent sentence pairs into list of random pairs. Retruns lists of tuples"""
    
    text_list_new=[]
    #1. First step : chose random list of texts to be resampled from
    if isinstance(text_resample_size,int):
        text_list_resampled=random.sample(text_list, min(text_resample_size,len(text_list)))
    elif isinstance(text_resample_size,float):
        text_list_resampled=random.sample(text_list, min(int(text_resample_size*len(text_list)),len(text_list)))
        
    #2. Second step chose random sententces within text to be resampled from
    sentence_list_resampled_all=[]
    for t in text_list_resampled:
        if isinstance(sent_resample_size,int):
            sentence_list_resampled=random.sample(t, min(sent_resample_size,len(t)))
        elif isinstance(sent_resample_size,float):
            sentence_list_resampled=random.sample(t, min(int(sent_resample_size*len(t)),len(t)))
        
        sentence_list_resampled_all.append(sentence_list_resampled)
            
    #3. Third step to resample random pairs of sentences (flatten the original list-no more preservation of text structure)
    all_sentences=list(np.unique(np.concatenate(text_list).flat))
    for t in sentence_list_resampled_all:
        text_list_temp=[]
        for s in t:
            all_sentences_temp=all_sentences.copy()
            all_sentences_temp.remove(s[1]) # Remove next sentences
            all_sentences_temp.remove(s[0]) # Remove same sentences
            random_sent=list(np.unique(random.sample(all_sentences_temp,n_resamples))) # List of random sentences
            for r in random_sent:
                text_list_temp.append(tuple((s[0],r))) # Create list of randomized sentence pairs
        
        text_list_new.append(text_list_temp)
        
    return text_list_new

In [6]:
def aug_syn_swap(text_list:list,aug_p:float=0.3,aug_min:int=1, aug_max:int=10,n_new_sent=2):
    
    "Function to augment text list of lists based on synonyms from wordnet."
    
    #1. Initialize data augmenter
    aug = naw.SynonymAug(aug_src='wordnet',aug_p=aug_p,aug_min=aug_min,aug_max=aug_max,stopwords=stopwords)
    
    #2. Augment
    aug_text_list_all=[]
    #2.1Loop over text
    for t in text_list:
        aug_text_list_temp=[]
        #2.2 Loop over sentence pairs
        for s in t:
            # Created augmented synonym sentences based on wordent synonyms
            syn_t1=aug.augment(s[0],n=n_new_sent)
            syn_t2=aug.augment(s[1],n=n_new_sent)

            #2.3 Create new list
            for i,j in zip(syn_t1,syn_t2):
                aug_text_list_temp.append(tuple((i,j)))
                
        #3. Append to overall text list        
        aug_text_list_all.append(aug_text_list_temp)
        
    return aug_text_list_all
            

In [7]:
def aug_trans_swap(text_list:list,from_lang='eng',to_lang='de'):
    
    """Text augmentation using reverse translation via google translator."""
    #1. Loop over texts
    aug_text_all=[]
    for t in text_list:
        
        #2. Over sentence pairs within text
        aug_text_temp=[]
        for s in t:
            # Translate from english to germam and then back
            s0=ts.google(s[0], from_language='en', to_language='de')
            s0_aug=ts.google(s0, from_language='de', to_language='en')
            
            s1=ts.google(s[1], from_language='en', to_language='de')
            s1_aug=ts.google(s1, from_language='de', to_language='en')
            
            aug_text_temp.append(tuple((s0_aug,s1_aug)))
            
        aug_text_all.append(aug_text_temp)
    
    return aug_text_all

In [8]:
def split_lists(lst):
    
    """"Function to split list of lists into two lists"""

    l1=[]
    l2=[]
    for t in lst:
        for s in t:
            l1.append(s[0])
            l2.append(s[1])
        
    return l1,l2
    

In [316]:
def prepare_data_for_BERT_train(pair_text_list:list,random_text_list:list):
    
    """Function to prepare/tokenize data for BERT training/fine tunning"""
    
    #1. Split lists
    pair_1,pair_2=split_lists(pair_text_list)
    pair_labels=list(np.zeros(len(pair_1),np.int32))
    
    not_pair_1,not_pair_2=split_lists(random_text_list)
    notpair_labels=list(np.ones(len(pair_1),np.int32))
    
    #2. Concatenate examples
    sentences_1=pair_1+not_pair_1
    sentences_2=pair_2+not_pair_2
    labels=pair_labels+notpair_labels
    
    #3.Tokenize 
    bert_inputs = tokenizer(sentences_1, sentences_2,return_tensors='pt',truncation=True, padding='max_length')
    
    #4. Add labels
    bert_inputs['next_sentence_label'] = torch.LongTensor([labels]).T #1. NSP fine tunning - next sentence labeling 
    bert_inputs['labels'] = bert_inputs.input_ids.detach().clone() #2. MLM fine tunning -label cloning
    
    return bert_inputs

In [317]:
def prepare_data_for_BERT_inference(text:list,prep_type:Union['concat','flatten']='flatten'):
    
    """Function to prepare/tokenize data for BERT inference."""
    
    #1. Flatten text list or concatenate text (join)
    if prep_type=='flatten':
        text_flat=list(np.unique(np.concatenate(text).flat))
    else:
        text_flat=' '.join(text)
    
    #2. TOkenize using bert tokenizer
    bert_inputs = tokenizer(text_flat,return_tensors='pt',truncation=True, padding='max_length')
    
    return bert_inputs

In [318]:
def mask_inputids(inputs,mask_prop:float=0.15):
    
    #1. create random array of floats with equal dimensions to input_ids tensor
    rand = torch.rand(inputs.input_ids.shape)
    
    #2. create mask array
    mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
               (inputs.input_ids != 102) * (inputs.input_ids != 0)
    
    #3. Get mask only input id indices
    selection = []
    for i in range(inputs.input_ids.shape[0]):
        selection.append(
            torch.flatten(mask_arr[i].nonzero()).tolist()
        )
    
    #4. Mask input ids with 103 token marker    
    for i in range(inputs.input_ids.shape[0]):
        inputs.input_ids[i, selection[i]] = 103
        
    return inputs

In [319]:
# Data set class to load 

class BertDataset(torch.utils.data.Dataset):
    
    """Data loader class required for BERT fine tunner function"""
    
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [320]:
# Function to fine tune bert

def fine_tune_BERT(model:BertForPreTraining,inputs,epochs:int=5,batch_size:int=16,learning_rate:float=1e-4):
    
    # Convert bert inputs (dict) returned by BertDataset class to dataset loader object
    dataset = BertDataset(inputs)
    dataset_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    # Initialize model training mode and optmizer
    model.train()
    optim = AdamW(model.parameters(), lr=1e-4)
    
    # Loop over epochs acumulate losses
    loss_acum=defaultdict(list)
    for epoch in range(epochs):
        # setup loop with TQDM and dataloader
        loop = tqdm(dataset_loader, leave=True)
        for i,batch in enumerate(loop):
            # initialize calculated gradients (from prev step)
            optim.zero_grad()
            
            # pull all tensor batches required for training
            input_ids = batch['input_ids']
            token_type_ids = batch['token_type_ids']
            attention_mask = batch['attention_mask']
            next_sentence_label = batch['next_sentence_label']
            labels = batch['labels']
            
            # process
            outputs = model(input_ids, attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            next_sentence_label=next_sentence_label,
                            labels=labels)
            # extract loss
            loss = outputs.loss # NLLloss function (negative log likelihood loss)
            # save loss
            loss_acum['batch_{}'.format(i)]=loss
            # calculate loss for every parameter that needs grad update
            loss.backward()
            # update parameters
            optim.step()
            # print relevant info to progress bar
            loop.set_description(f'Epoch {epoch}')
            loop.set_postfix(loss=loss.item())
            
    return model,loss_acum

In [662]:
# Provide text embedding given bert model and text

def pred_emb_BERT(token_ids,segment_ids,attention_mask_ids,model):
    
    #1. Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()
    
    #2. Add dummy batch dimension
    token_ids=token_ids[None,:]
    segment_ids=segment_ids[None,:]
    
    #2. Produce output
    with torch.no_grad():
        
        outputs=model(token_ids, segment_ids)
    
    #3. Produce hidden stantes (3rd output)
    hidden_states = outputs[2]
        
    #4. Stack embedding output
    token_embeddings = torch.stack(hidden_states[-4:], dim=0) # Stack last 4 years as suggested in research (https://jalammar.github.io/illustrated-bert/)
    print('Stacked embedding size {}.'.format(token_embeddings.size()))
          
    #5. Remove dimension 1 (batches)
    #token_embeddings = torch.squeeze(token_embeddings, dim=1).sum(dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    print('Reduced emebedding size {}.'.format(token_embeddings.size()))    
    
    #6.Attention mask padded tokens
    mask=attention_mask_ids.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_embeddings = token_embeddings * mask 
    masked_embeddings = torch.sum(masked_embeddings, 1) # Extra
    print('Mask shape {}.'.format(masked_embeddings.shape))
    
    summed_mask = torch.clamp(mask.sum(1), min=1e-9) # Extra
    mean_pooled = masked_embeddings / summed_mask # Extra
    print('Mask shape 2 {}.'.format(mean_pooled.shape)) # Extra
    
    #7. Average token embeddings to get sentence/paragraph embedding
    masked_embeddings_summed = torch.mean(mean_pooled, 0)
    print('Final dimension size {}.'.format(masked_embeddings_summed.size()))
    
    return masked_embeddings_summed

In [597]:
# Compare text 

def compare_text(emb1,emb2):
    
    """Calculate similarity between embeddings"""
    
    cos=torch.nn.CosineSimilarity(dim=0)
    
    diff_emb = cos(emb1, emb2)
    
    return diff_emb

In [645]:
questions_list_bert

{'input_ids': tensor([[  101,  2572,  1045,  ...,     0,     0,     0],
        [  101,  2064, 11498,  ...,     0,     0,     0],
        [  101, 11498,  3401,  ...,     0,     0,     0],
        [  101,  2054, 11498,  ...,     0,     0,     0],
        [  101,  2339,  3712,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [657]:
d=pred_emb_BERT(questions_list_bert['input_ids'][i],questions_list_bert['token_type_ids'][i],questions_list_bert['attention_mask'][i],m4)

Stacked embedding size torch.Size([4, 1, 512, 768]).
Reduced emebedding size torch.Size([4, 512, 768]).
Mask shape torch.Size([4, 768]).
Mask shape 2 torch.Size([4, 768]).
Final dimension size torch.Size([768]).


In [655]:
d

tensor([[ 0.5061,  0.9535,  0.1198,  ..., -0.9036,  0.6887, -0.6608],
        [ 1.0239,  1.5360,  0.1381,  ..., -1.1146,  0.0461, -1.2879],
        [ 0.8100,  1.6151, -0.1458,  ..., -1.5533,  0.2507, -1.4091],
        [ 1.1379,  1.1596,  0.1782,  ..., -0.9773,  0.3478, -0.9347]])

In [638]:
questions_list_bert

{'input_ids': tensor([[  101,  2572,  1045,  ...,     0,     0,     0],
        [  101,  2064, 11498,  ...,     0,     0,     0],
        [  101, 11498,  3401,  ...,     0,     0,     0],
        [  101,  2054, 11498,  ...,     0,     0,     0],
        [  101,  2339,  3712,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

### 2. BERT fine tunning

##### 2.1 Initial data prep

In [312]:
# Text to lists (also split into sentences)

raw_text_list=txt_to_lists(CORPUS_PATH,to_sentence=True)
raw_text_list

[['Hello, my name is Alice.',
  'I’m calling from Chicago and want to ask some questions.',
  'I’m pregnant for 6 months now, but I’m not telling anyone about this.',
  'I have periodic headaches.',
  'When I work, I feel like my ability to concentrate is being hindered by them.',
  'It’s already hard to work from 9 to 5 every day, God, and now this.',
  'My mom told me about this wonder drug called paracetamol.',
  'She assured me that it would help me a lot.',
  'I’m not sure if that is okay.',
  'It’s not like I’m a specialist in this field or anything so I decided to call here to be sure just in case.',
  'Can I use this medicine safely and will it help me?']]

In [313]:
# List of text sentences to list of subsequent sentence pairs

raw_sent_pairs=text_list_to_sent_pairs(raw_text_list)
raw_sent_pairs

[[('Hello, my name is Alice.',
   'I’m calling from Chicago and want to ask some questions.'),
  ('I’m calling from Chicago and want to ask some questions.',
   'I’m pregnant for 6 months now, but I’m not telling anyone about this.'),
  ('I’m pregnant for 6 months now, but I’m not telling anyone about this.',
   'I have periodic headaches.'),
  ('I have periodic headaches.',
   'When I work, I feel like my ability to concentrate is being hindered by them.'),
  ('When I work, I feel like my ability to concentrate is being hindered by them.',
   'It’s already hard to work from 9 to 5 every day, God, and now this.'),
  ('It’s already hard to work from 9 to 5 every day, God, and now this.',
   'My mom told me about this wonder drug called paracetamol.'),
  ('My mom told me about this wonder drug called paracetamol.',
   'She assured me that it would help me a lot.'),
  ('She assured me that it would help me a lot.',
   'I’m not sure if that is okay.'),
  ('I’m not sure if that is okay.',
 

In [314]:
# Create balanced data of random sentence pairs (uset all texts all sentences and one random instance pair per sentence)

raw_sent_notpairs=sent_pairs_to_random_pairs(raw_sent_pairs,text_resample_size=1.0,sent_resample_size=1.0,n_resamples=1)
raw_sent_notpairs

[[('I’m pregnant for 6 months now, but I’m not telling anyone about this.',
   'I’m not sure if that is okay.'),
  ('I have periodic headaches.',
   'She assured me that it would help me a lot.'),
  ('Hello, my name is Alice.', 'I have periodic headaches.'),
  ('My mom told me about this wonder drug called paracetamol.',
   'I’m pregnant for 6 months now, but I’m not telling anyone about this.'),
  ('I’m not sure if that is okay.',
   'I’m pregnant for 6 months now, but I’m not telling anyone about this.'),
  ('She assured me that it would help me a lot.',
   'My mom told me about this wonder drug called paracetamol.'),
  ('I’m calling from Chicago and want to ask some questions.',
   'Can I use this medicine safely and will it help me?'),
  ('When I work, I feel like my ability to concentrate is being hindered by them.',
   'Hello, my name is Alice.'),
  ('It’s already hard to work from 9 to 5 every day, God, and now this.',
   'It’s not like I’m a specialist in this field or anything

##### 2.2 No data augmentation

In [323]:
# Creat dataset for BERT with no data augmentation

bert_inputs_noaug=prepare_data_for_BERT_train(raw_sent_pairs,raw_sent_notpairs)
bert_inputs_noaug

{'input_ids': tensor([[ 101, 7592, 1010,  ...,    0,    0,    0],
        [ 101, 1045, 1521,  ...,    0,    0,    0],
        [ 101, 1045, 1521,  ...,    0,    0,    0],
        ...,
        [ 101, 2043, 1045,  ...,    0,    0,    0],
        [ 101, 2009, 1521,  ...,    0,    0,    0],
        [ 101, 2009, 1521,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'next_sentence_label': tensor([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [1],
      

In [18]:
# Mask input ids for MLP head

bert_inputs_noaug_masked=mask_inputids(bert_inputs_noaug,mask_prop=0.15)
bert_inputs_noaug_masked

{'input_ids': tensor([[ 101, 7592, 1010,  ...,    0,    0,    0],
        [ 101, 1045, 1521,  ...,    0,    0,    0],
        [ 101, 1045, 1521,  ...,    0,    0,    0],
        ...,
        [ 101, 2009, 1521,  ...,    0,    0,    0],
        [ 101, 2043, 1045,  ...,    0,    0,    0],
        [ 101, 1045, 2031,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'next_sentence_label': tensor([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [1],
      

In [83]:
# Run bechmark BERT

model = BertForPreTraining.from_pretrained('bert-base-uncased',output_hidden_states=True)
m1,loss1=fine_tune_BERT(model,bert_inputs_noaug_masked,epochs=5,batch_size=16,learning_rate=1e-4)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:47<00:00, 23.64s/it, loss=14]
Epoch 1: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:47<00:00, 23.53s/it, loss=9.38]
Epoch 2: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:49<00:00, 24.86s/it, loss=7.65]
Epoch 3: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:54<00:00, 27.28s/it, loss=5.72]
Epoch 4: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:52<00:00, 26.16s/it, loss=4.83]


##### 2.3 Synonym insertation

In [482]:
# Synonym insertation (pairs) based on wordnet (2 new sentences for each one)

raw_sent_pairs_synaug=aug_syn_swap(raw_sent_pairs,aug_p=0.3,aug_min=1, aug_max=10,n_new_sent=2)

In [483]:
# Synonym insertation (not pairs) based on wordnet (2 new sentences for each one)

raw_sent_notpairs_synaug=aug_syn_swap(raw_sent_notpairs,aug_p=0.3,aug_min=1, aug_max=10,n_new_sent=2)

In [285]:
# Creat dataset for BERT with no syn insertaion data augmentation

bert_inputs_synaug=prepare_data_for_BERT_train(raw_sent_pairs_synaug,raw_sent_notpairs_synaug)
bert_inputs_synaug

{'input_ids': tensor([[ 101, 6738, 2080,  ...,    0,    0,    0],
        [ 101, 7632, 1010,  ...,    0,    0,    0],
        [ 101, 1015, 1521,  ...,    0,    0,    0],
        ...,
        [ 101, 2009, 1521,  ...,    0,    0,    0],
        [ 101, 6738, 2080,  ...,    0,    0,    0],
        [ 101, 7632, 1010,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'next_sentence_label': tensor([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
      

In [286]:
# Run bechmark BERT

model = BertForPreTraining.from_pretrained('bert-base-uncased',output_hidden_states=True)
m2,loss2=fine_tune_BERT(model,bert_inputs_synaug,epochs=5,batch_size=16,learning_rate=1e-4)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|████████████████████████████████████████████████████████████████| 3/3 [01:36<00:00, 32.26s/it, loss=11.9]
Epoch 1: 100%|████████████████████████████████████████████████████████████████| 3/3 [01:40<00:00, 33.60s/it, loss=7.08]
Epoch 2: 100%|████████████████████████████████████████████████████████████████| 3/3 [01:43<00:00, 34.62s/it, loss=5.09]
Epoch 3: 100%|████████████████████████████████████████████████████████████████| 3/3 [01:39<00:00, 33.13s/it, loss=2.91]
Epoch 4: 100%|█████████████████████████████████████████████████████████████████| 3/3 [01:39<00:00, 33.02s/it, loss=1.6]


##### 2.4 Back translation

In [484]:
# Synonym insertation (pairs) based on wordnet (2 new sentences for each one)

raw_sent_pairs_transaug=aug_trans_swap(raw_sent_pairs,from_lang='eng',to_lang='de')

In [485]:
# Synonym insertation (not pairs) based on wordnet (2 new sentences for each one)

raw_sent_notpairs_transaug=aug_trans_swap(raw_sent_notpairs,from_lang='eng',to_lang='de')

In [294]:
# Creat dataset for BERT with language back translation data augmentation

bert_inputs_transaug=prepare_data_for_BERT_train(raw_sent_pairs_transaug,raw_sent_notpairs_transaug)
bert_inputs_transaug

{'input_ids': tensor([[ 101, 7592, 1010,  ...,    0,    0,    0],
        [ 101, 1045, 2655,  ...,    0,    0,    0],
        [ 101, 1045, 1005,  ...,    0,    0,    0],
        ...,
        [ 101, 1045, 2655,  ...,    0,    0,    0],
        [ 101, 2009, 2003,  ...,    0,    0,    0],
        [ 101, 7592, 1010,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'next_sentence_label': tensor([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [1],
      

In [295]:
# Run bechmark BERT

model = BertForPreTraining.from_pretrained('bert-base-uncased',output_hidden_states=True)
m3,loss3=fine_tune_BERT(model,bert_inputs_transaug,epochs=5,batch_size=16,learning_rate=1e-4)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:45<00:00, 22.86s/it, loss=15.3]
Epoch 1: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:50<00:00, 25.31s/it, loss=9.82]
Epoch 2: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:50<00:00, 25.44s/it, loss=6.83]
Epoch 3: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:57<00:00, 28.62s/it, loss=5.98]
Epoch 4: 100%|█████████████████████████████████████████████████████████████████| 2/2 [00:52<00:00, 26.27s/it, loss=5.3]


##### 2.5 Back translation + synonym insertation

In [488]:
# Concatenate syn aug and trans aug

ran_sent_pairs_combaug=raw_sent_pairs_transaug+raw_sent_pairs_synaug # Combine pair data for synonym augmentation and translation augmentaion
raw_sent_notpairs_combaug=raw_sent_notpairs_transaug+raw_sent_notpairs_synaug # Combine not pair data for synonym augmentation and translation augmentaion

In [489]:
# Creat dataset for BERT with language back translation data augmentation combined with syn insertation augmentaiton

bert_inputs_combaug=prepare_data_for_BERT_train(ran_sent_pairs_combaug,raw_sent_notpairs_combaug)
bert_inputs_combaug

{'input_ids': tensor([[ 101, 7592, 1010,  ...,    0,    0,    0],
        [ 101, 1045, 2655,  ...,    0,    0,    0],
        [ 101, 1045, 1005,  ...,    0,    0,    0],
        ...,
        [ 101, 2009, 1521,  ...,    0,    0,    0],
        [ 101, 2592, 2974,  ...,    0,    0,    0],
        [ 101, 2009, 1521,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'next_sentence_label': tensor([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
      

In [667]:
# Run bechmark BERT

model = BertForPreTraining.from_pretrained('bert-base-uncased',output_hidden_states=True)
m4,loss4=fine_tune_BERT(model,bert_inputs_combaug,epochs=10,batch_size=16,learning_rate=1e-4)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|████████████████████████████████████████████████████████████████| 4/4 [03:07<00:00, 46.92s/it, loss=10.2]
Epoch 1: 100%|████████████████████████████████████████████████████████████████| 4/4 [02:57<00:00, 44.37s/it, loss=5.73]
Epoch 2: 100%|████████████████████████████████████████████████████████████████| 4/4 [02:50<00:00, 42.66s/it, loss=3.33]
Epoch 3: 100%|████████████████████████████████████████████████████████████████| 4/4 [02:47<00:00, 41.92s/it, loss=1.17]
Epoch 4: 100%|███████████████████████████████████████████████████████████████| 4/4 [02:49<00:00, 42.50s/it, loss=0.503]
Epoch 5: 100%|██████████████████████████████████

In [677]:
loss4

defaultdict(list,
            {'batch_22.818199157714844': tensor(22.8182, grad_fn=<AddBackward0>),
             'batch_15.787662506103516': tensor(15.7877, grad_fn=<AddBackward0>),
             'batch_12.779033660888672': tensor(12.7790, grad_fn=<AddBackward0>),
             'batch_10.189157485961914': tensor(10.1892, grad_fn=<AddBackward0>),
             'batch_7.766191482543945': tensor(7.7662, grad_fn=<AddBackward0>),
             'batch_7.130096912384033': tensor(7.1301, grad_fn=<AddBackward0>),
             'batch_6.189721584320068': tensor(6.1897, grad_fn=<AddBackward0>),
             'batch_5.733639717102051': tensor(5.7336, grad_fn=<AddBackward0>),
             'batch_5.045542240142822': tensor(5.0455, grad_fn=<AddBackward0>),
             'batch_4.438772201538086': tensor(4.4388, grad_fn=<AddBackward0>),
             'batch_3.584261417388916': tensor(3.5843, grad_fn=<AddBackward0>),
             'batch_3.3339877128601074': tensor(3.3340, grad_fn=<AddBackward0>),
             

### 3. BERT testing using Questions db

##### 3.1 Prepare questions embeddings

In [671]:
# Questions list

questions_list=txt_to_lists(QNA_PATH,to_sentence=True)

In [672]:
# Prepare qna text for inference

questions_list_bert=prepare_data_for_BERT_inference(questions_list,prep_type='flatten')

##### 3.2 Test

In [673]:
# Get question emebeddings

question_embedding=torch.empty(questions_list_bert['input_ids'].shape[0],768)
for i in range(questions_list_bert['input_ids'].shape[0]):
    questions_list_bert_embeddings=pred_emb_BERT(questions_list_bert['input_ids'][i],questions_list_bert['token_type_ids'][i],questions_list_bert['attention_mask'][i],m4)
    question_embedding[i]=questions_list_bert_embeddings

Stacked embedding size torch.Size([4, 1, 512, 768]).
Reduced emebedding size torch.Size([4, 512, 768]).
Mask shape torch.Size([4, 768]).
Mask shape 2 torch.Size([4, 768]).
Final dimension size torch.Size([768]).
Stacked embedding size torch.Size([4, 1, 512, 768]).
Reduced emebedding size torch.Size([4, 512, 768]).
Mask shape torch.Size([4, 768]).
Mask shape 2 torch.Size([4, 768]).
Final dimension size torch.Size([768]).
Stacked embedding size torch.Size([4, 1, 512, 768]).
Reduced emebedding size torch.Size([4, 512, 768]).
Mask shape torch.Size([4, 768]).
Mask shape 2 torch.Size([4, 768]).
Final dimension size torch.Size([768]).
Stacked embedding size torch.Size([4, 1, 512, 768]).
Reduced emebedding size torch.Size([4, 512, 768]).
Mask shape torch.Size([4, 768]).
Mask shape 2 torch.Size([4, 768]).
Final dimension size torch.Size([768]).
Stacked embedding size torch.Size([4, 1, 512, 768]).
Reduced emebedding size torch.Size([4, 512, 768]).
Mask shape torch.Size([4, 768]).
Mask shape 2 to

In [674]:
questions_list_bert_embeddings.shape

torch.Size([768])

In [675]:
# Loop over text

res_dict={}
# Loop over texts
for i in range(len(raw_text_list)):
    
    res_dict['query_{}'.format(i)]={}
    # Loop over lines and evaluate one after another
    for j in range(len(raw_text_list[i])):
        # Tokenize
        query_list=raw_text_list[i][:j+1]
        query_list_bert=prepare_data_for_BERT_inference(query_list,prep_type='concat')
        query_embedding=pred_emb_BERT(query_list_bert['input_ids'][0],query_list_bert['token_type_ids'][0],query_list_bert['attention_mask'][0],m4)
        
        # Calc cos similarity
        sim_list=[('q_{}'.format(i),compare_text(query_embedding,emb)) for i,emb in enumerate(question_embedding)]
        sim_list.sort(key = lambda x: x[1],reverse=True)

        # Update results dict
        res_dict['query_{}'.format(i)]['{}_sentences'.format(j)]=sim_list

Stacked embedding size torch.Size([4, 1, 512, 768]).
Reduced emebedding size torch.Size([4, 512, 768]).
Mask shape torch.Size([4, 768]).
Mask shape 2 torch.Size([4, 768]).
Final dimension size torch.Size([768]).
Stacked embedding size torch.Size([4, 1, 512, 768]).
Reduced emebedding size torch.Size([4, 512, 768]).
Mask shape torch.Size([4, 768]).
Mask shape 2 torch.Size([4, 768]).
Final dimension size torch.Size([768]).
Stacked embedding size torch.Size([4, 1, 512, 768]).
Reduced emebedding size torch.Size([4, 512, 768]).
Mask shape torch.Size([4, 768]).
Mask shape 2 torch.Size([4, 768]).
Final dimension size torch.Size([768]).
Stacked embedding size torch.Size([4, 1, 512, 768]).
Reduced emebedding size torch.Size([4, 512, 768]).
Mask shape torch.Size([4, 768]).
Mask shape 2 torch.Size([4, 768]).
Final dimension size torch.Size([768]).
Stacked embedding size torch.Size([4, 1, 512, 768]).
Reduced emebedding size torch.Size([4, 512, 768]).
Mask shape torch.Size([4, 768]).
Mask shape 2 to

In [676]:
res_dict

{'query_0': {'0_sentences': [('q_5', tensor(0.9993)),
   ('q_4', tensor(0.9992)),
   ('q_2', tensor(0.9992)),
   ('q_3', tensor(0.9990)),
   ('q_1', tensor(0.9988)),
   ('q_0', tensor(0.9986))],
  '1_sentences': [('q_4', tensor(0.9989)),
   ('q_2', tensor(0.9989)),
   ('q_3', tensor(0.9987)),
   ('q_1', tensor(0.9987)),
   ('q_5', tensor(0.9986)),
   ('q_0', tensor(0.9985))],
  '2_sentences': [('q_4', tensor(0.9967)),
   ('q_2', tensor(0.9965)),
   ('q_3', tensor(0.9963)),
   ('q_1', tensor(0.9963)),
   ('q_5', tensor(0.9962)),
   ('q_0', tensor(0.9961))],
  '3_sentences': [('q_4', tensor(0.9960)),
   ('q_2', tensor(0.9958)),
   ('q_1', tensor(0.9956)),
   ('q_3', tensor(0.9956)),
   ('q_5', tensor(0.9954)),
   ('q_0', tensor(0.9953))],
  '4_sentences': [('q_4', tensor(0.9921)),
   ('q_2', tensor(0.9920)),
   ('q_3', tensor(0.9915)),
   ('q_1', tensor(0.9915)),
   ('q_5', tensor(0.9913)),
   ('q_0', tensor(0.9911))],
  '5_sentences': [('q_2', tensor(0.9858)),
   ('q_4', tensor(0.9857))

##### 3.3 Save model

In [627]:
# Save model

torch.save(m4, MODEL_PATH.joinpath('best_model'))
torch.save(tokenizer, MODEL_PATH.joinpath('best_model_t'))


In [594]:
# Load model

#model = torch.load(MODEL_PATH.joinpath('best_model'))
#model_t = torch.load(MODEL_PATH.joinpath('best_model_t'))