In [1]:
# install
# !pip install pytorch-pretrained-bert pytorch-nlp keras scikit-learn matplotlib tensorflow

#https://towardsdatascience.com/bert-for-dummies-step-by-step-tutorial-fb90890ffe03 

In [2]:
# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

UsageError: Line magic function `%` not found.


In [4]:
# Choose the compute device: CUDA when PyTorch reports one as available, else the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'device = {device}')

Running on cpu. Be patient...


## Load Query and Document embeddings

In [None]:
# Load the stored query embeddings and document embeddings from disk.
q_outputs = torch.load('./query_embeddings/query_tensor.pt')
d_outputs = torch.load('./doc_embeddings/doc_tensor.pt')

# Print both tensor shapes as a quick sanity check.
for label, embeddings in (
    ('Query embedding size:    ', q_outputs),
    ('Document embedding size: ', d_outputs),
):
    print(label, embeddings.shape)

Query embedding size:     torch.Size([13485, 24, 768])
Document embedding size:  torch.Size([13485, 128, 768])


# Import model
Queries and documents have now been tokenized to the vocabulary.

In [13]:
import os

from transformers import BertConfig
from transformers import BertModel

# Build the config path with os.path.join so the notebook is not tied to the
# Windows '\' separator. `model_path` must already be defined by an earlier cell.
config = BertConfig.from_pretrained(os.path.join(model_path, 'bert_config.json'))

# NOTE(review): constructing BertModel from a config only builds the architecture;
# it does not load checkpoint weights. If pretrained weights are intended here,
# BertModel.from_pretrained(...) is presumably what is wanted — TODO confirm.
bert_base = BertModel(config)

#param_optimizer = list(bert_base.named_parameters())
#print(bert_base)

In [14]:
from torch import nn
import torch.nn.functional as F

class CustomBERTModel(nn.Module):
    """BERT encoder followed by a ColBERT-style projection to small token embeddings.

    Every token's hidden state is passed through one linear layer and then
    L2-normalised, so the dot product of two token embeddings is their cosine
    similarity (the "Normalize" step of the ColBERT formulation used in this
    notebook).

    Args:
        bert: Encoder module to wrap. It is called as
            ``bert(ids, attention_mask=mask)``, must return the per-token hidden
            states as the first element of its output, and must expose
            ``config.hidden_size``. Defaults to the notebook-level ``bert_base``
            so that ``CustomBERTModel()`` keeps working.
        embedding_dim: Size of each projected token embedding. The default of 32
            is deliberately low to keep MaxSim cheap (its cost is independent of
            the sequence length).
    """

    def __init__(self, bert=None, embedding_dim=32):
        super().__init__()
        self.bert = bert_base if bert is None else bert
        ### New layers:
        # Input width follows the encoder's hidden size (768 for BERT-base)
        # instead of being hard-coded.
        self.linear1 = nn.Linear(self.bert.config.hidden_size, embedding_dim)

    def forward(self, ids, mask):
        """Encode a batch of token ids into normalised per-token embeddings.

        Args:
            ids: Token ids, shape (batch_size, sequence_length).
            mask: Attention mask with the same shape as ``ids``.

        Returns:
            Tensor of shape (batch_size, sequence_length, embedding_dim) whose
            last-dimension vectors have unit L2 norm.
        """
        # Index instead of tuple-unpacking: the encoder may return either a plain
        # tuple or a ModelOutput (iterating a ModelOutput yields its keys, not
        # tensors). Element 0 is the sequence output in both cases.
        outputs = self.bert(ids, attention_mask=mask)
        sequence_output = outputs[0]  # (batch_size, sequence_length, hidden_size)

        # Linear projection applied independently to every token's hidden state,
        # giving (batch_size, sequence_length, embedding_dim).
        sequence_output = self.linear1(sequence_output)

        # Normalise each token embedding on its own (last dim). A softmax over
        # dim=1 would instead mix values across the tokens of the sequence.
        return F.normalize(sequence_output, p=2, dim=-1)

In [16]:
#bert_base.to(torch.device(device))
my_model  = CustomBERTModel()
my_model.to(torch.device(device))

# This cell only computes embeddings for scoring, so switch off dropout
# (modules start in training mode) and skip building the autograd graph.
my_model.eval()

with torch.no_grad():
    #BERT_base: q_outputs = bert_base(q_id, attention_mask=q_mask)
    q_outputs = my_model(q_id, mask=q_mask)

    # BERT_base: d_outputs = bert_base(d_id, attention_mask=d_mask)
    d_outputs = my_model(d_id, mask=d_mask)

#With bert_base shape is: torch.Size([10, 24, 768]) and torch.Size([10, 128, 768])
print(q_outputs.shape)
print(d_outputs.shape)

# [1] must be the model output...

torch.Size([10, 24, 32])
torch.Size([10, 128, 32])


In summary, given a query sequence $q = q_0 q_1...q_l$ and a document sequence $d = d_0 d_1...d_n$, we compute the bags of embeddings $E_q$ and $E_d$ in the following manner:

* $E_q$ := Normalize( CNN( BERT(“[Q]$q_0 q_1...q_l$ ##...#”) ) )

* $E_d$ := Normalize( CNN( BERT(“[D]$d_0 d_1...d_n$”) ) )

where '#' refers to the [MASK] tokens used to pad the query. In my implementation of ColBERT the output dimensions are as follows:
\begin{align*}
    dim(E_q) = [batch_{size} \times 24 \times 32] \\
    dim(E_d) = [batch_{size} \times 128 \times 32]
\end{align*}


The relevancy score, MaxSim, is defined as follows:

$$ S_{q,d} = \sum_{i \in [|E_q|]} \max_{j \in [|E_d|]} E_{q_i} \cdot E_{d_j}^T $$

In [20]:

def MaxSim(q, D):
    '''Return the MaxSim relevance score of a query against documents.

    For every query token the best-matching document token (largest dot
    product) is found, and those maxima are summed over the query tokens.

    Args:
        q: Query token embeddings. Either a single query of shape
            (query_len, dim), which is scored against every document in D,
            or a batch of shape (n_docs, query_len, dim), in which case
            query i is scored against document i.
        D: Document token embeddings of shape (n_docs, doc_len, dim).

    Returns:
        Tensor of shape (n_docs,) holding one score per document.
    '''
    # (..., query_len, dim) @ (n_docs, dim, doc_len) -> (n_docs, query_len, doc_len).
    # matmul broadcasts q over the document batch, so q does not have to be
    # physically repeated n_docs times.
    token_sims = torch.matmul(q, D.transpose(1, 2))

    # Best document token for each query token (max over the doc_len axis).
    best_per_query_token, _ = torch.max(token_sims, dim=2)

    # Sum the maxima over the query tokens --> vector of length len(D)
    return torch.sum(best_per_query_token, dim=1)

In [21]:
# Every document embedding is a retrieval candidate.
D = d_outputs

# Per query: the best MaxSim score and the index (within D) of the document that achieved it.
most_similar_doc_score, most_similar_docID = [], []

for q_no in tqdm(range(sample_size)):
    # Score the current query against all documents at once.
    scores = MaxSim(q_outputs[q_no], D)

    # Keep only the top-scoring document for this query.
    best_score, best_index = scores.max(dim=0)
    most_similar_doc_score.append(best_score.item())
    most_similar_docID.append(best_index.item())

100%|██████████| 10/10 [00:00<00:00, 831.89it/s]


# Training / Fine-tuning of model 

In [None]:
#tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-xxl-cased")

model = CustomBERTModel() # You can pass the parameters if required to have more flexible model
model.to(torch.device(device)) ## can be gpu
criterion = nn.CrossEntropyLoss() ## If required define your own criterion# TODO
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))

# Make sure dropout etc. are in training mode before fine-tuning.
model.train()

# NOTE(review): `epochs` is not defined in the visible cells; this loop needs an
# iterable (e.g. range(n_epochs)) — confirm where it comes from.
for epoch in epochs:
    # Walk the query and document loaders in lock-step so that batch i of the
    # queries is paired with batch i of the documents.
    for q_batch, d_batch in zip(q_data_loader, d_data_loader): ## If you have a DataLoader()  object to get the data.# TODO

        # assign batch of query and document data
        q_data = q_batch[0]
        targets = q_batch[1] ## assuming that data loader returns a tuple of data and its targets

        d_data = d_batch[0]
        # Target is the same for query and documents. # TODO

        # NOTE(review): assumes each `*_data` is an (input_ids, attention_mask)
        # pair of already-tokenized tensors — TODO confirm against the DataLoaders.
        q_input_ids, q_attention_masks = q_data
        d_input_ids, d_attention_masks = d_data

        optimizer.zero_grad()

        ##### Queries
        # CustomBERTModel.forward takes the mask under the keyword `mask`.
        q_outputs = model(q_input_ids, mask=q_attention_masks)
        q_outputs = F.log_softmax(q_outputs, dim=1)

        ##### Documents
        d_outputs = model(d_input_ids, mask=d_attention_masks)
        # Apply log_softmax to the *document* outputs (the query outputs were
        # used here by mistake, which discarded the document embeddings).
        d_outputs = F.log_softmax(d_outputs, dim=1)

        # Score query i against document i only. MaxSim is called once per pair
        # with a one-document batch, giving a vector of length batch_size.
        maxsim = torch.stack([
            MaxSim(q, d.unsqueeze(0)).squeeze(0)
            for q, d in zip(q_outputs, d_outputs)
        ])

        # NOTE(review): CrossEntropyLoss normally expects per-class logits;
        # `maxsim` holds a single score per (query, document) pair. Confirm the
        # intended loss and the shape/dtype of `targets`. # TODO
        loss = criterion(maxsim, targets)

        loss.backward()
        optimizer.step()

In [152]:
from torch.utils.data import DataLoader

# Wrap `df` in a DataLoader that yields shuffled batches of 64 items.
# NOTE(review): `df` is not defined in the visible cells. If it is a pandas
# DataFrame, it presumably needs to be converted to a Dataset (e.g. a
# TensorDataset of tensors) first, since DataLoader fetches items by integer
# index — TODO confirm.
train_dataloader = DataLoader(df, batch_size=64, shuffle=True)

In [156]:
#for batch in train_dataloader:
#    print(batch.item())