In [1]:
# install
# !pip install pytorch-pretrained-bert pytorch-nlp keras scikit-learn matplotlib tensorflow

#https://towardsdatascience.com/bert-for-dummies-step-by-step-tutorial-fb90890ffe03 

In [1]:
# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.optim as optim
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

UsageError: Line magic function `%` not found.


In [2]:
# specify CPU or GPU as device
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(f'device = {device}')

device = cpu


In [3]:
# Load data
df = pd.read_csv(r'.\data\skills_description.csv', sep='\t', encoding='utf-8')
df = df.rename(columns={'preferredLabel':'query', 'description': 'documents'})
df = df[['query', 'documents']]

df.head()

Unnamed: 0,query,documents
0,lede musikalsk personale,Tildele og forvalte personaleopgaver på område...
1,føre tilsyn med fængselsprocedurer,Føre tilsyn med driften af et fængsel eller an...
2,anvende antioppressiv praksis,"Identificere undertrykkelse i samfund, økonomi..."
3,kontrollere overensstemmelse med jernbaneforsk...,"Inspicere rullende materiel, komponenter og sy..."
4,identificere tilgængelige tjenester,"Identificere de forskellige tjenester, der er ..."


### Add special tokens to sequences

In [4]:
# add special ColBERT tokens to queries and documents
queries = ["[CLS] " + query + " [SEP]" for query in df['query']]
documents =  ["[CLS] " + query + " [SEP]" for query in df['documents']]
print("Example of query:\n", queries[0])
print("\nExample of document:\n", documents[0])


Example of query:
 [CLS] lede musikalsk personale [SEP]

Example of document:
 [CLS] Tildele og forvalte personaleopgaver på områder såsom instrumentering, bearbejdning, reproduktion af musik og stemmetræning. [SEP]


### Load BERT tokenizer
The BERT tokenizer is very storage efficient way of splitting a sequence into words - or rather tokens of subwords. The tokenizer uses WordPiece which uses subwords. That is splitting words into multiple words in order to keep the vocabulary smaller. That way, the vocabulary does not need to keep both: "boy" and "boys" but only "boy" and "s" where "s" can be used in many other cases.

In [5]:
# Tokenize with BERT tokenizer
model_path = r'J:\VOA\MABI\Deep Learning\my_DTU_project\Models\danish_bert_uncased_v2'
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)

# Tokenize queries and documents
tokenized_texts = [tokenizer.tokenize(sent) for sent in queries]
tokenized_docs = [tokenizer.tokenize(doc) for doc in documents]

print(f'Tokenized first sentence: \n {tokenized_texts[0]}')
print (f'\nTokenized first document: \n {tokenized_docs[0]}')

Tokenized first sentence: 
 ['[CLS]', 'lede', 'musikalsk', 'personale', '[SEP]']

Tokenized first document: 
 ['[CLS]', 'tildele', 'og', 'forvalt', '##e', 'personale', '##opgaver', 'pa', 'om', '##rad', '##er', 'sas', '##om', 'instrumenter', '##ing', ',', 'bearbejdning', ',', 'reproduktion', 'af', 'musik', 'og', 'stemme', '##træning', '.', '[SEP]']


In [6]:
# Set the maximum query length. 
MAX_LEN_Q = 24

# Pad our input tokens
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
q_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
q_input_ids = pad_sequences(q_input_ids, maxlen=MAX_LEN_Q, dtype="long", truncating="post", padding="post")
print(f'Shape of query ids:\n q_input_ids.shape = {q_input_ids.shape}')


# Create query attention masks
q_attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in q_input_ids:
  seq_mask = [float(i>0) for i in seq]
  q_attention_masks.append(seq_mask)

print(f'Shape of query attention mask:\n q_attention_masks = {np.shape(q_attention_masks)}')

assert q_input_ids.shape == np.shape(q_attention_masks), 'dimensions of q_input_ids and q_attention_mask do not match' 

Shape of query ids:
 q_input_ids.shape = (13485, 24)
Shape of query attention mask:
 q_attention_masks = (13485, 24)


In [7]:
# Set the maximum document length. 
MAX_LEN_DOC = 128
# Pad our input tokens
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
d_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_docs]
d_input_ids = pad_sequences(d_input_ids, maxlen=MAX_LEN_DOC, dtype="long", truncating="post", padding="post")
print(f'Shape of input_ids.shape: {d_input_ids.shape}')


# Create attention masks for documents
d_attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in d_input_ids:
  seq_mask = [float(i>0) for i in seq]
  d_attention_masks.append(seq_mask)

print(f'Shape of d_attention_masks: {np.shape(d_attention_masks)}')

assert d_input_ids.shape == np.shape(d_attention_masks), 'dimensions of document d_input_ids and d_attention_mask do not match' 

Shape of input_ids.shape: (13485, 128)
Shape of d_attention_masks: (13485, 128)


## Split into training and test datasets

In [8]:
# spliting index
index_spilt = 10000

# training data
train_q_input_ids = q_input_ids[:index_spilt] 
train_d_input_ids = d_input_ids[:index_spilt] 

train_q_attention_masks = q_attention_masks[:index_spilt] 
train_d_attention_masks = d_attention_masks[:index_spilt] 

# create labels (all are correct)
train_labels = torch.ones(train_q_input_ids.shape[0])


# validation data
val_q_input_ids = q_input_ids[index_spilt:] 
val_d_input_ids = d_input_ids[index_spilt:] 

val_q_attention_masks = q_attention_masks[index_spilt:] 
val_d_attention_masks = d_attention_masks[index_spilt:] 

val_labels = torch.ones(val_q_input_ids.shape[0])


## Create Pytorch DataLoader

In [9]:
dataset = TensorDataset(torch.tensor(train_q_input_ids), 
                        torch.tensor(train_d_input_ids), 
                        torch.tensor(train_q_attention_masks), 
                        torch.tensor(train_d_attention_masks), 
                        train_labels)

loader = DataLoader(
    dataset,
    batch_size=2
)

#for batch_idx, (x, y, z, a, l) in enumerate(loader):
#    print(x.shape, y.shape, z.shape, a.shape, l.shape)

In [103]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 16

# Create the DataLoader for our training set
train_data = TensorDataset(torch.tensor(train_q_input_ids), 
                        torch.tensor(train_d_input_ids), 
                        torch.tensor(train_q_attention_masks), 
                        torch.tensor(train_d_attention_masks), 
                        train_labels)

train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(torch.tensor(val_q_input_ids), 
                        torch.tensor(val_d_input_ids), 
                        torch.tensor(val_q_attention_masks), 
                        torch.tensor(val_d_attention_masks), 
                        val_labels)

val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Import BERT base model (Danish)
Queries and documents have now been tokenized to the vocabolary

In [104]:
from transformers import BertConfig
from transformers import BertModel


model_path = r'J:\VOA\MABI\Deep Learning\my_DTU_project\Models\danish_bert_uncased_v2'

config = BertConfig.from_pretrained(model_path + r'\bert_config.json')
bert_base = BertModel(config)


In [105]:
#for param in bert_base.parameters():
#    print(param)  # param.requires_grad = False

#param_optimizer = list(bert_base.named_parameters())
#print(param_optimizer)


In [106]:
from torch import nn
import torch.nn.functional as F

class MyColBERT(nn.Module):
      def __init__(self):
            super(MyColBERT, self).__init__()
            self.bert = bert_base 
            ### New layers:
            self.linear1 = nn.Linear(768, 32) # 32 is "low" for faster computation of MaxSim (it is independent of sequence lentgh)

            # Freeze parameters of BERT Base as I only will train last layer
            for param in bert_base.parameters():
                  param.requires_grad = False
          

      def forward(self, ids, mask):
            sequence_output, pooled_output = self.bert(ids, attention_mask=mask) # sequence_output shape is: (batch_size, sequence_length, 768)
            sequence_output = self.linear1(sequence_output)
            sequence_output = F.softmax(sequence_output, dim=1)

            #linear2_output = self.linear2(linear2_output)

            return sequence_output

In summary, given a query sequence $q = q_0 q_1...q_l$ and a document sequence $d = d_0 d_1...d_n$, we compute the bags of embeddings $E_q$ and $E_d$ in the following manner:

* $E_q$ := Normalize( CNN( BERT(“[Q]$q_0 q_1...q_l$ ##...#”) ) )

* $E_d$ := Normalize( CNN( BERT(“[D]$d_0 d_1...d_l$ ...d_n”) ) )

where '#' refers to the [mask] tokens. In my implementation of ColBERT the output dimensions are as follow:
\begin{align*}
    dim(E_q) = [batch_{size} \times 24 \times 32] \\
    dim(E_d) = [batch_{size} \times 128 \times 32]
\end{align*}


The relevancy score, MaxSim, is defined as follows:

$$ S_{q,d} = \sum_{i \in ||E_q||} \max_{j \in ||E_d||} E_{q_i} * E_{d_j}^T$$

In [107]:
def MaxSim(q, D):
    '''Takes in the embeddings of a query, q, and all documents' embeddings, D.
        Return a tensor of the query's similarity scores to all documents in D.'''

    # repeat q for faster matrix multiplication (faster than loop)
    batch_size=D.shape[0]
    q_X = q.repeat(batch_size, 1, 1)
    
    # multiply the same query q against all documents (in D)
    batch_mm = torch.bmm(q_X, D.permute(0,2,1))
    
    maks, _ = torch.max(batch_mm, dim=2) # dim=1 or dim=2
    #print(maks.shape) # should be (batch_size, 24)
    
    # Sum over maximum values --> return vector of length len(D)
    S_qD = torch.sum(maks, dim=1)
    
    return S_qD

In [163]:
def MaxSimQD(Q, D):
    '''Takes in the embeddings of all queries, Q, and all documents' embeddings, D.
        Return a tensor of IDs of the docs in D closest to the queries.'''

    # repeat q for faster matrix multiplication (faster than loop)
    batch_size=D.shape[0]

    output = []

    for q in Q:
        q_X = q.repeat(batch_size, 1, 1)
    
        # multiply the same query q against all documents (in D)
        batch_mm = torch.bmm(q_X, D.permute(0,2,1))
    
        maks, _ = torch.max(batch_mm, dim=2) # dim=1 or dim=2
        #print(maks.shape) # should be (batch_size, 24)
    
        # Sum over maximum values --> return vector of length len(D)
        S_qD = torch.sum(maks, dim=1)

        maks, maks_id = torch.max(S_qD, dim=0)

        output.append(maks_id)

    return torch.tensor(output)

In [164]:
pred = MaxSimQD(q_embeddings, d_embeddings)
print(pred)


tensor([13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13])


In [110]:
#        most_similar_doc_score = []
#        most_similar_docID = []
#        
#         # Define D as all documents:
#        D = d_outputs
#        
#        for q_no in tqdm(range(sample_size)):
#            
#            # Select one query
#            q = q_outputs[q_no]
#        
#            # Compute similarity scores for all 
#            S_qD = MaxSim(q, D)
#            maks, maks_id = torch.max(S_qD, dim=0)
#        
#            most_similar_doc_score.append(float(maks))
#            most_similar_docID.append(int(maks_id))

## Define criterion
$$ L(q, d_i) = \frac{exp(MaxSim(q, d_i))}{\sum_{j \in ||D||} exp(MaxSim(q, d_j))} = \frac{exp( MaxSim(q, D)[i] )}{\sum exp( MaxSim(q, D) )}$$

In [111]:
#criterion = 
#loss = exp(MaxSim(q_outputs[k], D)[i]) / sum(exp(MaxSim(q_outputs[k], D)))
#loss = criterion(output, labels)  

# Training / Fine-tuning of model 

In [112]:
## Define model, loss function (criterion) and optimizer
model = MyColBERT()
model.to(torch.device(device))
criterion = nn.CrossEntropyLoss() ## If required define your own criterion #TODO:
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))  # alternative: optim.SGD(model.parameters(), lr=0.01, momentum = 0.9)

#print(model)

In [114]:
step = 0
stide_len = batch_size # 100 or 10 for debugging


q_id    = torch.tensor(q_input_ids[step:step+stide_len]).to(torch.device(device)).to(torch.int64)
d_id    = torch.tensor(d_input_ids[step:step+stide_len]).to(torch.device(device)).to(torch.int64)
q_mask  = torch.tensor(np.array(q_attention_masks[step:step+stide_len])).to(torch.device(device)).to(torch.int64)
d_mask  = torch.tensor(np.array(d_attention_masks[step:step+stide_len])).to(torch.device(device)).to(torch.int64)

# Find Embeddings of documents and save to disk
q_outputs = model(q_id, mask=q_mask)
d_outputs = model(d_id, mask=d_mask)

In [153]:
def MaxSim(q, D):
    '''Takes in the embeddings of a query, q, and all documents' embeddings, D.
        Return a tensor of the query's similarity scores to all documents in D.'''

    # repeat q for faster matrix multiplication (faster than loop)
    batch_size=D.shape[0]
    q_X = q.repeat(batch_size, 1, 1)
    
    # multiply the same query q against all documents (in D)
    batch_mm = torch.bmm(q_X, D.permute(0,2,1))
    
    maks, _ = torch.max(batch_mm, dim=2) # dim=1 or dim=2
    #print(maks.shape) # should be (batch_size, 24)
    
    # Sum over maximum values --> return vector of length len(D)
    S_qD = torch.sum(maks, dim=1)
    
    return S_qD

In [143]:
from torch.autograd import Variable

num_epoch = 1 #15

for epoch in range(num_epoch):  # loop over the dataset multiple times

    running_loss = 0.0
    model.train()
    for i, data in enumerate(train_dataloader, 0):
        # get the inputs
        q_ids, d_ids, q_masks, d_masks, labels = data
        
        # send to gpu (or cpu)
        q_ids, d_ids, q_masks, d_masks, labels = q_ids.to(torch.int64).to(device), d_ids.to(torch.int64).to(device), q_masks.to(torch.int64).to(device), d_masks.to(torch.int64).to(device), labels.to(device)

#torch.Size([32, 24]) torch.Size([32, 128]) torch.Size([32, 24]) torch.Size([32, 128]) torch.Size([32])

        # wrap them in Variable
        #inputs, labels = Variable(inputs.to(device)), Variable(labels.to(device))

        # zero the parameter gradients
        optimizer.zero_grad()

        ## Forward
        q_embeddings = model(q_ids, mask=q_masks)
        d_embeddings = model(d_ids, mask=d_masks)

        # Compute score for all q and D in one output
       
        j = np.random.randint(batch_size)
        target = torch.zeros(batch_size, dtype=torch.long)
        target[j] = 1 # correct is always respective document ID 
        print(f'target: {target}')
        
        #S_qD = MaxSim(q_embeddings[j], d_embeddings)
        #maks, maks_id = torch.max(S_qD, dim=0)
        #pred[maks_id] = 1
        pred = F.softmax(MaxSim(q_embeddings[j], d_embeddings), dim=0)
        print(f'prediction: {pred}')

        loss = criterion(pred, target)        
    
        ## backward
        loss.backward()
        # optimize
        optimizer.step()
        # print statistics
        #running_loss += loss.data[0]
        running_loss += loss.item() 
        if i % 1000 == 999:    # print every 1000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 1000))
            running_loss = 0.0

print('Finished Training')

target: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])
prediction: tensor([0.0622, 0.0625, 0.0624, 0.0630, 0.0626, 0.0623, 0.0619, 0.0628, 0.0623,
        0.0621, 0.0623, 0.0628, 0.0624, 0.0638, 0.0623, 0.0622],
       grad_fn=<SoftmaxBackward>)


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [156]:
#for batch in train_dataloader:
#    print(batch.item())