In [1]:
# install
# !pip install pytorch-pretrained-bert pytorch-nlp keras scikit-learn matplotlib tensorflow

#https://towardsdatascience.com/bert-for-dummies-step-by-step-tutorial-fb90890ffe03 

In [1]:
# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.optim as optim
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

UsageError: Line magic function `%` not found.


In [2]:
# specify CPU or GPU as device
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(f'device = {device}')

device = cpu


In [None]:
# Load data
df = pd.read_csv(r'.\data\skills_description.csv', sep='\t', encoding='utf-8')
df = df.rename(columns={'preferredLabel':'query', 'description': 'documents'})
df = df[['query', 'documents']]

df.head()

Unnamed: 0,query,documents
0,lede musikalsk personale,Tildele og forvalte personaleopgaver på område...
1,føre tilsyn med fængselsprocedurer,Føre tilsyn med driften af et fængsel eller an...
2,anvende antioppressiv praksis,"Identificere undertrykkelse i samfund, økonomi..."
3,kontrollere overensstemmelse med jernbaneforsk...,"Inspicere rullende materiel, komponenter og sy..."
4,identificere tilgængelige tjenester,"Identificere de forskellige tjenester, der er ..."


### Add special tokens to sequences

In [None]:
# add special ColBERT tokens to queries and documents
queries = ["[CLS] " + query + " [SEP]" for query in df['query']]
documents =  ["[CLS] " + query + " [SEP]" for query in df['documents']]
print("Example of query:\n", queries[0])
print("\nExample of document:\n", documents[0])


Example of query:
 [CLS] lede musikalsk personale [SEP]

Example of document:
 [CLS] Tildele og forvalte personaleopgaver på områder såsom instrumentering, bearbejdning, reproduktion af musik og stemmetræning. [SEP]


### Load BERT tokenizer
The BERT tokenizer is very storage efficient way of splitting a sequence into words - or rather tokens of subwords. The tokenizer uses WordPiece which uses subwords. That is splitting words into multiple words in order to keep the vocabulary smaller. That way, the vocabulary does not need to keep both: "boy" and "boys" but only "boy" and "s" where "s" can be used in many other cases.

In [None]:
# Tokenize with BERT tokenizer
model_path = r'J:\VOA\MABI\Deep Learning\my_DTU_project\Models\danish_bert_uncased_v2'
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)

# Tokenize queries and documents
tokenized_texts = [tokenizer.tokenize(sent) for sent in queries]
tokenized_docs = [tokenizer.tokenize(doc) for doc in documents]

print(f'Tokenized first sentence: \n {tokenized_texts[0]}')
print (f'\nTokenized first document: \n {tokenized_docs[0]}')

Tokenized first sentence: 
 ['[CLS]', 'lede', 'musikalsk', 'personale', '[SEP]']

Tokenized first document: 
 ['[CLS]', 'tildele', 'og', 'forvalt', '##e', 'personale', '##opgaver', 'pa', 'om', '##rad', '##er', 'sas', '##om', 'instrumenter', '##ing', ',', 'bearbejdning', ',', 'reproduktion', 'af', 'musik', 'og', 'stemme', '##træning', '.', '[SEP]']


In [None]:
# Set the maximum query length. 
MAX_LEN_Q = 24

# Pad our input tokens
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
q_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
q_input_ids = pad_sequences(q_input_ids, maxlen=MAX_LEN_Q, dtype="long", truncating="post", padding="post")
print(f'Shape of query ids:\n q_input_ids.shape = {q_input_ids.shape}')


# Create query attention masks
q_attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in q_input_ids:
  seq_mask = [float(i>0) for i in seq]
  q_attention_masks.append(seq_mask)

print(f'Shape of query attention mask:\n q_attention_masks = {np.shape(q_attention_masks)}')

assert q_input_ids.shape == np.shape(q_attention_masks), 'dimensions of q_input_ids and q_attention_mask do not match' 

Shape of query ids:
 q_input_ids.shape = (13485, 24)
Shape of query attention mask:
 q_attention_masks = (13485, 24)


In [None]:
# Set the maximum document length. 
MAX_LEN_DOC = 128
# Pad our input tokens
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
d_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_docs]
d_input_ids = pad_sequences(d_input_ids, maxlen=MAX_LEN_DOC, dtype="long", truncating="post", padding="post")
print(f'Shape of input_ids.shape: {d_input_ids.shape}')


# Create attention masks for documents
d_attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in d_input_ids:
  seq_mask = [float(i>0) for i in seq]
  d_attention_masks.append(seq_mask)

print(f'Shape of d_attention_masks: {np.shape(d_attention_masks)}')

assert d_input_ids.shape == np.shape(d_attention_masks), 'dimensions of document d_input_ids and d_attention_mask do not match' 

Shape of input_ids.shape: (13485, 128)
Shape of d_attention_masks: (13485, 128)


# Import model
Queries and documents have now been tokenized to the vocabolary

In [3]:
from transformers import BertConfig
from transformers import BertModel


model_path = r'J:\VOA\MABI\Deep Learning\my_DTU_project\Models\danish_bert_uncased_v2'

config = BertConfig.from_pretrained(model_path + r'\bert_config.json')
bert_base = BertModel(config)

# Freeze parameters of BERT Base as I only will train last layer
for param in bert_base.parameters():
    param.requires_grad = False

In [11]:
#for param in bert_base.parameters():
#    print(param)  # param.requires_grad = False

#param_optimizer = list(bert_base.named_parameters())
#print(param_optimizer)


In [12]:
from torch import nn
import torch.nn.functional as F

class MyColBERT(nn.Module):
    def __init__(self):
          super(MyColBERT, self).__init__()
          self.bert = bert_base 
          ### New layers:
          self.linear1 = nn.Linear(768, 32) # 32 is "low" for faster computation of MaxSim (it is independent of sequence lentgh)
          

    def forward(self, ids, mask):
          sequence_output, pooled_output = self.bert(ids, attention_mask=mask) # sequence_output shape is: (batch_size, sequence_length, 768)
               
          # We apply the linear layer in line with ColBERT paper. The linear layer (which applies a linear transformation)
          # takes as input the hidden states of all tokens (so seq_len times a vector of size 768, each corresponding to
          # a single token in the input sequence) and outputs 32 numbers for every token
          # so the logits are of shape (batch_size, sequence_length, 32)
          sequence_output = self.linear1(sequence_output)
          sequence_output = F.softmax(sequence_output, dim=1)

          #linear2_output = self.linear2(linear2_output)

          return sequence_output

In summary, given a query sequence $q = q_0 q_1...q_l$ and a document sequence $d = d_0 d_1...d_n$, we compute the bags of embeddings $E_q$ and $E_d$ in the following manner:

* $E_q$ := Normalize( CNN( BERT(“[Q]$q_0 q_1...q_l$ ##...#”) ) )

* $E_d$ := Normalize( CNN( BERT(“[D]$d_0 d_1...d_l$ ...d_n”) ) )

where '#' refers to the [mask] tokens. In my implementation of ColBERT the output dimensions are as follow:
\begin{align*}
    dim(E_q) = [batch_{size} \times 24 \times 32] \\
    dim(E_d) = [batch_{size} \times 128 \times 32]
\end{align*}


The relevancy score, MaxSim, is defined as follows:

$$ S_{q,d} = \sum_{i \in ||E_q||} \max_{j \in ||E_d||} E_{q_i} * E_{d_j}^T$$

In [20]:

def MaxSim(q, D):
    '''Takes in a query, q, and return it's similarity score to
        all documents in  D.'''

    # repeat q for faster matrix multiplication (faster than loop)
    batch_size=D.shape[0]
    q_X = q.repeat(batch_size, 1, 1)
    
    # multiply the same query q against all documents (in D)
    batch_mm = torch.bmm(q_X, D.permute(0,2,1))
    
    maks, _ = torch.max(batch_mm, dim=2) # dim=1 or dim=2
    
    # Sum over maximum values --> return vector of length len(D)
    S_qD = torch.sum(maks, dim=1)
    
    return S_qD

In [21]:
most_similar_doc_score = []
most_similar_docID = []

 # Define D as all documents:
D = d_outputs

for q_no in tqdm(range(sample_size)):
    
    # Select one query
    q = q_outputs[q_no]

    # Compute similarity scores for all 
    S_qD = MaxSim(q, D)
    maks, maks_id = torch.max(S_qD, dim=0)

    most_similar_doc_score.append(float(maks))
    most_similar_docID.append(int(maks_id))

100%|██████████| 10/10 [00:00<00:00, 831.89it/s]


## Define criterion
$$ L(q, d_i) = \frac{exp(MaxSim(q, d_i))}{\sum_{j \in ||D||} exp(MaxSim(q, d_j))} = \frac{exp( MaxSim(q, D)[i] )}{\sum exp( MaxSim(q, D) )}$$

In [27]:
#criterion = 
loss = exp(MaxSim(q_outputs[k], D)[i]) / sum(exp(MaxSim(q_outputs[k], D)))
#loss = criterion(output, labels)  

NameError: name 'exp' is not defined

# Training / Fine-tuning of model 

In [13]:
## Define model, loss function (criterion) and optimizer
model = MyColBERT()
model.to(torch.device(device))
criterion = nn.CrossEntropyLoss() ## If required define your own criterion #TODO:
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))  # alternative: optim.SGD(model.parameters(), lr=0.01, momentum = 0.9)

In [24]:
#for param in bert_base.parameters():
#    print(param)  # param.requires_grad = False

#param_optimizer = list(model.named_parameters())
#print(param_optimizer[-1])

In [23]:
#model

In [None]:
from torch.autograd import Variable

num_epoch = 15  # Your code here!

for epoch in range(num_epoch):  # loop over the dataset multiple times

    running_loss = 0.0
    model.train()
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data

        # wrap them in Variable
        inputs, labels = Variable(inputs.to(device)), Variable(labels.to(device))

        # zero the parameter gradients
        # Your code here!
        optimizer.zero_grad()

        # forward + backward + optimize
        # Your code here!

        # Forward
        output = model(inputs)
        loss = criterion(output, labels)        
        
        # backward
        loss.backward()

        # optimize
        optimizer.step()

        # print statistics
        #running_loss += loss.data[0]
        running_loss += loss.item() 
        if i % 1000 == 999:    # print every 1000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 1000))
            running_loss = 0.0

print('Finished Training')

In [None]:


for epoch in epochs:
    for zip(q_batch, d_batch) in zip(q_data_loader, d_data_loader): ## If you have a DataLoader()  object to get the data.# TODO

        # assign batch of query and document data
        q_data = q_batch[0]
        targets = q_batch[1] ## assuming that data loader returns a tuple of data and its targets
         
        d_data = d_batch[0]
        #d_targets = d_batch[1] ## assuming that data loader returns a tuple of data and its targets
        # Target is the same for query and documents. # TODO

        optimizer.zero_grad()   

        ##### Queries
        #q_encoding = tokenizer.batch_encode_plus(q_data, return_tensors='pt', padding=True, truncation=True,max_length=50, add_special_tokens = True)
        q_outputs = model(q_input_ids, attention_mask=q_attention_masks)
        q_outputs = F.log_softmax(q_outputs, dim=1)
        #q_input_ids = q_encoding['input_ids']
        #q_attention_mask = q_encoding['attention_mask']
        
        ##### Documents
        d_outputs = model(d_input_ids, attention_mask=d_attention_masks)
        d_outputs = F.log_softmax(q_outputs, dim=1)
        
        maxsim = MaxSim(q_outputs, d_outputs) # vector of length batch_size
        loss = criterion(maxsim, targets)
               
        loss.backward()
        optimizer.step()

In [152]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(df, batch_size=64, shuffle=True)

In [156]:
#for batch in train_dataloader:
#    print(batch.item())