# Make document embeddings using SkillsColBERT

In [2]:
# install
# !pip install pytorch-pretrained-bert pytorch-nlp keras scikit-learn matplotlib tensorflow

#https://towardsdatascience.com/bert-for-dummies-step-by-step-tutorial-fb90890ffe03 

In [1]:
# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
import time
% matplotlib inline

UsageError: Line magic function `%` not found.


In [2]:
# Load the tab-separated skills file and keep only the two columns used below,
# renamed to the names this notebook works with ('query' and 'documents').
# The relative path uses forward slashes so it resolves on Windows and POSIX
# alike (a backslash path only works on Windows).
df = (
    pd.read_csv('./data/skills_description.csv', sep='\t', encoding='utf-8')
    .rename(columns={'preferredLabel': 'query', 'description': 'documents'})
    [['query', 'documents']]
)

df.head()

Unnamed: 0,query,documents
0,lede musikalsk personale,Tildele og forvalte personaleopgaver på område...
1,føre tilsyn med fængselsprocedurer,Føre tilsyn med driften af et fængsel eller an...
2,anvende antioppressiv praksis,"Identificere undertrykkelse i samfund, økonomi..."
3,kontrollere overensstemmelse med jernbaneforsk...,"Inspicere rullende materiel, komponenter og sy..."
4,identificere tilgængelige tjenester,"Identificere de forskellige tjenester, der er ..."


In [3]:
# Choose the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

if device == 'cuda':
    # Report how many GPUs are visible and the name of the first one.
    n_gpu = torch.cuda.device_count()
    print(f' running on {device} with {n_gpu} number of GPUs. Name of GPU: {torch.cuda.get_device_name(0)}')
else:
    print(f'device = {device}')

Running on cpu. Be patient...


In [4]:
# Wrap every document text in the [CLS] ... [SEP] markers before tokenization.
documents = [
    "[CLS] " + doc_text + " [SEP]"
    for doc_text in df['documents']
]
print("\nExample of document:\n", documents[0])


Example of document:
 [CLS] Tildele og forvalte personaleopgaver på områder såsom instrumentering, bearbejdning, reproduktion af musik og stemmetræning. [SEP]


In [5]:
# Load the tokenizer stored with the local Danish BERT model (lower-casing on).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
model_path = r'J:\VOA\MABI\Deep Learning\my_DTU_project\Models\danish_bert_uncased_v2'
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)

# Split each marked-up document into tokens, one token list per document.
tokenized_docs = list(map(tokenizer.tokenize, documents))

print (f'\nTokenize the first document: \n {tokenized_docs[0]}')


Tokenize the first document: 
 ['[CLS]', 'tildele', 'og', 'forvalt', '##e', 'personale', '##opgaver', 'pa', 'om', '##rad', '##er', 'sas', '##om', 'instrumenter', '##ing', ',', 'bearbejdning', ',', 'reproduktion', 'af', 'musik', 'og', 'stemme', '##træning', '.', '[SEP]']


In [6]:
# Fixed number of token positions kept per document.
MAX_LEN_DOC = 128

# Convert each token list to vocabulary ids, then pad/truncate every sequence
# at its end so that all rows have exactly MAX_LEN_DOC entries.
# NOTE(review): with truncating="post", documents longer than MAX_LEN_DOC
# presumably lose their trailing [SEP] token -- confirm this is acceptable.
d_input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_docs],
    maxlen=MAX_LEN_DOC,
    dtype="long",
    truncating="post",
    padding="post",
)
print(f'Shape of input_ids.shape: {d_input_ids.shape}')

Shape of input_ids.shape: (13485, 128)


In [7]:
# Attention masks for the documents: 1.0 wherever the token id is positive,
# 0.0 elsewhere (i.e. on the zero-valued padding positions).
d_attention_masks = [
    [float(token_id > 0) for token_id in id_row]
    for id_row in d_input_ids
]

print(f'Shape of d_attention_masks: {np.shape(d_attention_masks)}')

# Every id must have exactly one mask entry.
assert d_input_ids.shape == np.shape(d_attention_masks), 'dimensions of document d_input_ids and d_attention_mask do not match' 

Shape of d_attention_masks: (13485, 128)


# Import model
Queries and documents have now been tokenized to the vocabulary

In [8]:
from transformers import BertConfig, BertModel

# Build the BERT encoder from the configuration file stored in the model folder.
# NOTE(review): BertModel(config) constructs the architecture from the config
# only; no pretrained weights appear to be loaded anywhere in this notebook.
# Confirm whether weights should be loaded (e.g. from_pretrained or a saved
# state dict) before the embeddings are trusted.
config = BertConfig.from_pretrained(model_path + r'\bert_config.json')
bert_base = BertModel(config=config)

In [11]:
from torch import nn
import torch.nn.functional as F

class SkillsColBERT(nn.Module):
    """BERT encoder wrapper that returns one vector per input token.

    Args:
        bert: Encoder module, called as ``bert(ids, attention_mask=mask)``, whose
            first output holds the per-token hidden states. When omitted, the
            notebook-level ``bert_base`` is used, so ``SkillsColBERT()`` behaves
            as before.
    """

    def __init__(self, bert=None):
        super().__init__()
        self.bert = bert_base if bert is None else bert
        ### New layers:
        # TODO:
        # self.finalLinear = nn.Linear(768, 32) # 32 is "low" for faster computation of MaxSim (it is independent of sequence length)

    def forward(self, ids, mask):
        """Encode a batch of token ids.

        Args:
            ids: Token-id tensor passed to the encoder as its first argument.
            mask: Attention mask passed to the encoder as ``attention_mask``.

        Returns:
            The encoder's first output (per-token hidden states, shape
            ``(batch_size, sequence_length, hidden_size)``) after a softmax
            over dimension 1.
        """
        # Take the first output by index rather than tuple-unpacking: indexing
        # works both for a plain tuple and for the dict-like output objects of
        # newer transformers releases, where unpacking would yield the keys.
        sequence_output = self.bert(ids, attention_mask=mask)[0]

        # Planned, in line with the ColBERT paper: a linear layer that maps every
        # token's hidden state to 32 numbers, giving shape
        # (batch_size, sequence_length, 32).
        # TODO:
        # sequence_output = self.finalLinear(sequence_output)

        # NOTE(review): dim=1 normalises across token positions (padding
        # positions included), not across the features of each token. The
        # ColBERT paper L2-normalises each token vector instead -- confirm that
        # this softmax is intended before changing it, since stored embeddings
        # depend on it.
        sequence_output = F.softmax(sequence_output, dim=1)

        return sequence_output

In [24]:
import os
#os.path.isdir(f'data/doc_embeddings/tensor_{str(1)}.pt')


True

In [12]:
# Number of documents encoded per forward pass (one saved file per batch).
stide_len = 100

# One folder, one path expression: it is used both for the "already computed?"
# check and for saving, and is created up front so the first save cannot fail
# on a missing directory.
embeddings_dir = os.path.join(os.getcwd(), 'doc_embeddings')
os.makedirs(embeddings_dir, exist_ok=True)

my_model  = SkillsColBERT()
my_model.to(torch.device(device))
# Inference only: eval() switches off dropout so that the stored embeddings are
# deterministic instead of varying from run to run.
my_model.eval()

# Total number of batches, rounded up so that the last partial batch counts.
n_batches = -(-len(df) // stide_len)

# Encode the first batch once up front; `doc_output` is kept available for
# inspection in later cells. no_grad() avoids building autograd graphs.
with torch.no_grad():
    d_id    = torch.tensor(d_input_ids[:stide_len]).to(torch.device(device)).to(torch.int64)
    d_mask  = torch.tensor(np.array(d_attention_masks[:stide_len])).to(torch.device(device)).to(torch.int64)
    doc_output = my_model(d_id, mask=d_mask)

step = 0
i = 0
start_time = time.time()
with torch.no_grad():
    while step < len(df):
        batch_file = os.path.join(embeddings_dir, f'tensor_{i}.pt')

        # Resume support: a batch whose file already exists is skipped.
        if not os.path.exists(batch_file):
            if step % (500)==0:
                print(f'batch {i} of size {stide_len} out of {n_batches}')

            d_id    = torch.tensor(d_input_ids[step:step+stide_len]).to(torch.device(device)).to(torch.int64)
            d_mask  = torch.tensor(np.array(d_attention_masks[step:step+stide_len])).to(torch.device(device)).to(torch.int64)

            # Compute the embeddings of this batch and save them to disk. They
            # are moved to the CPU first so the files can be loaded on machines
            # without a GPU.
            torch.save(my_model(d_id, mask=d_mask).cpu(), batch_file)

        # Advance to the next batch whether it was computed or skipped.
        step += stide_len
        i += 1

end_time = time.time()
print("total time taken this loop: ", end_time - start_time)

batch 0 of size 100 out of 134.85
batch 1 of size 100 out of 134.85
batch 2 of size 100 out of 134.85
batch 3 of size 100 out of 134.85
batch 4 of size 100 out of 134.85
batch 5 of size 100 out of 134.85
batch 6 of size 100 out of 134.85
batch 7 of size 100 out of 134.85
batch 8 of size 100 out of 134.85
batch 9 of size 100 out of 134.85
batch 10 of size 100 out of 134.85
batch 11 of size 100 out of 134.85
batch 12 of size 100 out of 134.85
batch 13 of size 100 out of 134.85
batch 14 of size 100 out of 134.85
batch 15 of size 100 out of 134.85
batch 16 of size 100 out of 134.85
batch 17 of size 100 out of 134.85
batch 18 of size 100 out of 134.85
batch 19 of size 100 out of 134.85
batch 20 of size 100 out of 134.85
batch 21 of size 100 out of 134.85
batch 22 of size 100 out of 134.85
batch 23 of size 100 out of 134.85
batch 24 of size 100 out of 134.85
batch 25 of size 100 out of 134.85
batch 26 of size 100 out of 134.85
batch 27 of size 100 out of 134.85
batch 28 of size 100 out of 13

In [42]:
# Reassemble the per-batch embedding files into one tensor, stacked along
# dimension 0. The first file is loaded unconditionally, so a missing
# tensor_0.pt raises immediately.
# NOTE: torch.load unpickles its input -- only load files produced by this notebook.
embedding_chunks = [torch.load(f'./doc_embeddings/tensor_{0}.pt')]

i = 1
while os.path.exists(os.path.join(os.getcwd(), 'doc_embeddings', f'tensor_{i}.pt')):
    print(f'loading document embedding {i}')
    embedding_chunks.append(torch.load(f'./doc_embeddings/tensor_{i}.pt'))
    i += 1

# Concatenate once at the end: calling torch.cat inside the loop would copy
# everything loaded so far again on every iteration.
load_doc_embeddings = torch.cat(embedding_chunks, 0)
del embedding_chunks

loading document embedding 101
loading document embedding 102
loading document embedding 103
loading document embedding 104
loading document embedding 105
loading document embedding 106
loading document embedding 107
loading document embedding 108
loading document embedding 109
loading document embedding 110
loading document embedding 111
loading document embedding 112
loading document embedding 113
loading document embedding 114
loading document embedding 115
loading document embedding 116
loading document embedding 117
loading document embedding 118
loading document embedding 119
loading document embedding 120
loading document embedding 121
loading document embedding 122
loading document embedding 123
loading document embedding 124
loading document embedding 125
loading document embedding 126
loading document embedding 127
loading document embedding 128
loading document embedding 129
loading document embedding 130
loading document embedding 131
loading document embedding 132
loading 

In [33]:
# Display the shape of the concatenated document-embedding tensor.
load_doc_embeddings.shape

torch.Size([10400, 128, 768])