In [27]:
from datasets import load_dataset
import numpy as np
from dataclasses import dataclass
from typing import List, Union, Dict, Optional
import torch
from transformers import DistilBertTokenizerFast, AutoModel, DistilBertPreTrainedModel, AutoTokenizer
from pathlib import Path

In [2]:
# Read the raw HDFS log with the generic `text` loader; the path is relative to the notebook's working directory.
hdfs1_dataset = load_dataset('text', data_files='../data/raw/HDFS1/HDFS.log', split='train')

Using custom data configuration default-f7d20bad4b8d075b
Reusing dataset text (/home/cernypro/.cache/huggingface/datasets/text/default-f7d20bad4b8d075b/0.0.0/44d63bd03e7e554f16131765a251f2d8333a5fe8a73f6ea3de012dbc49443691)


In [3]:
# Work on (at most) the first 1000 lines so the experiment stays fast.
# Clamping to the dataset length keeps the cell usable with log files shorter than 1000 lines.
small_raw_dataset = hdfs1_dataset.select(range(min(1000, len(hdfs1_dataset))))

In [4]:
def remove_timestamp(example):
    """
    Drop the first three space-separated fields of a log line.

    Args:
        example: mapping with a 'text' entry holding one raw log line.

    Returns:
        The same ``example`` object, with 'text' replaced by everything that
        follows the third space. Lines containing fewer than three spaces are
        returned unchanged.
    """
    # maxsplit=3 cuts at the first three spaces only, so the remainder of the
    # line (including any later spaces) is preserved verbatim in fields[3].
    fields = example['text'].split(' ', 3)
    # The previous nested str.find() version wrapped around when find() returned -1,
    # which chopped the first token off lines with exactly one space. Short lines
    # are now left as they are.
    if len(fields) == 4:
        example['text'] = fields[3]
    return example

# Apply remove_timestamp to every example of the small subset; the raw subset is kept under its own name.
small_cleaned_dataset = small_raw_dataset.map(remove_timestamp)

Loading cached processed dataset at /home/cernypro/.cache/huggingface/datasets/text/default-f7d20bad4b8d075b/0.0.0/44d63bd03e7e554f16131765a251f2d8333a5fe8a73f6ea3de012dbc49443691/cache-9078a3a0732e2ad5.arrow


In [5]:
# Base checkpoint name; reused below both for the tokenizer and for building the model classes.
pretrained_model_name = "distilbert-base-cased"
tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_model_name)

In [6]:
class ClsEncoderTower(torch.nn.Module):
    """
    Simple model on top of a BERT-like model.

    Applies a single linear layer to the hidden state at the first ([CLS])
    position of each input sequence.

    Args:
        pretrained_model_name_or_path: passed straight to ``AutoModel.from_pretrained``.
        output_encode_dimension: size of the produced encoding (default 512).
    """
    def __init__(self, pretrained_model_name_or_path, output_encode_dimension=512):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_name_or_path)
        # DistilBERT configs call the hidden width `dim`, while other BERT-like
        # configs call it `hidden_size`. Prefer the common name and fall back to
        # `dim` so any backbone AutoModel can load is supported.
        backbone_config = self.bert.config
        hidden_width = getattr(backbone_config, 'hidden_size', None)
        if hidden_width is None:
            hidden_width = backbone_config.dim
        self.linear = torch.nn.Linear(hidden_width, output_encode_dimension)

    def forward(self, input_ids, attention_mask):
        """
        Encode a batch of token id sequences.

        Returns the linear projection of the backbone's first output at sequence
        position 0, i.e. one ``output_encode_dimension``-sized row per input sequence.
        """
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Index 0 is the backbone's first output; [:, 0] selects sequence position 0 ([CLS]).
        cls_token_embedding = bert_output[0][:, 0]
        cls_encoding = self.linear(cls_token_embedding)
        return cls_encoding
    
class OneTowerICT(torch.nn.Module):
    """
    Network for the Inverse Cloze Task (ICT).

    A single shared encoder tower produces the encodings of both the target and
    the context sentences ("query" and "document" in the nomenclature of the
    original paper). Training uses a cross-entropy loss over the
    target-by-context similarity matrix.
    """

    def __init__(self, pretrained_model_name_or_path, output_encode_dimension=512):
        super().__init__()
        self.tower = ClsEncoderTower(pretrained_model_name_or_path, output_encode_dimension)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, target, target_mask, context, context_mask, correct_class):
        """
        Encode targets and contexts with the shared tower and score every pair.

        Returns a tuple ``(loss, target_cls_encode, context_cls_encode)``.
        """
        # Both inputs go through the very same tower (shared weights).
        target_cls_encode = self.tower(input_ids=target, attention_mask=target_mask)
        context_cls_encode = self.tower(input_ids=context, attention_mask=context_mask)

        # Dot product of every target encoding with every context encoding.
        similarity = target_cls_encode @ context_cls_encode.transpose(-2, -1)
        return self.loss_fn(similarity, correct_class), target_cls_encode, context_cls_encode

In [7]:
# Location of the trained checkpoint: <parent of the working directory>/models/<run name>/pytorch_model.bin
saved_model_dir = Path.cwd().parent.joinpath(
    'models',
    '1T_Eps_2_Lines_8000000_T-len_512_C-len_512_Tr-batch_64_Ev-b_64_O-dim_512',
)
saved_model_file = saved_model_dir.joinpath('pytorch_model.bin')

In [8]:
# Deserialize onto the CPU so the checkpoint loads regardless of which device it was
# saved from; the encoder is moved to the GPU explicitly in a later cell.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
state_dict = torch.load(saved_model_file, map_location='cpu')

In [9]:
# Build the architecture from the base checkpoint, then overwrite all weights with the saved state dict.
model = OneTowerICT(pretrained_model_name)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [10]:
# Only the encoder tower is needed for producing embeddings; place it on the GPU.
encoder = model.tower.to('cuda')

In [11]:
def encode(examples, tokenizer, encoder):
    """
    Embed a batch of text examples.

    Args:
        examples: mapping whose 'text' entry is a list of strings (one batch).
        tokenizer: callable turning the list of strings into a batch of PyTorch
            tensors that supports ``.to(device)`` and ``**`` unpacking.
        encoder: ``torch.nn.Module`` mapping the tokenized batch to one embedding
            row per example.

    Returns:
        ``{'embedding': [[float, ...], ...]}`` with one inner list per example.
    """
    # Send the inputs to wherever the encoder's weights live instead of assuming 'cuda'.
    device = next(encoder.parameters()).device
    batch = tokenizer(examples['text'], return_tensors='pt', truncation=True, padding=True).to(device)
    # Inference only: skip autograd bookkeeping so no graph is built and then thrown away.
    with torch.no_grad():
        embeddings = encoder(**batch)
    return {'embedding': embeddings.cpu().numpy().tolist()}

# Run encode over the cleaned lines in batches of 128; each call returns the 'embedding' values for its batch.
small_embedded_dataset = small_cleaned_dataset.map(encode, fn_kwargs={'tokenizer': tokenizer, 'encoder': encoder}, batched=True, batch_size=128)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




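# Transforming into a Hugging Face `PreTrainedModel`
An experiment: wrap the trained encoder in a `PreTrainedModel` subclass so it can be stored and restored with `save_pretrained` / `from_pretrained`.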

In [23]:
from transformers.file_utils import ModelOutput
from transformers import DistilBertModel

@dataclass
class EmbeddingOutput(ModelOutput):
    """
    Output container holding a single embedding tensor.

    Modelled on the ModelOutput conventions of the Hugging Face Transformers library;
    may be replaced by a suitable class from the library if one exists.
    """
    # Value returned by DistilBertForClsEmbedding.forward (the projected [CLS] hidden state).
    embedding: Optional[torch.FloatTensor] = None
        
class DistilBertForClsEmbedding(DistilBertPreTrainedModel):
    """
    DistilBertModel with a linear layer applied to the [CLS] token.

    Initialize using the ``.from_pretrained(path_or_model_name)`` method.

    The size of the produced embedding is read from
    ``config.task_specific_params['cls_embedding_dimension']`` (512 when absent).
    """
    def __init__(self, config):
        super().__init__(config)
        if config.task_specific_params is None:
            config.task_specific_params = dict()

        self.distilbert = DistilBertModel(config)
        # setdefault writes the default back into the config, so the dimension
        # actually used is recorded on the config object.
        embedding_dimension = config.task_specific_params.setdefault('cls_embedding_dimension', 512)
        self.cls_projector = torch.nn.Linear(config.dim, embedding_dimension)

        self.init_weights()

    def forward(self, input_ids, attention_mask=None):
        """
        Compute the projected [CLS] embedding for a batch of token id sequences.

        Args:
            input_ids: token ids forwarded to the DistilBERT backbone.
            attention_mask: optional mask forwarded to the backbone unchanged.

        Returns:
            ``EmbeddingOutput`` whose ``embedding`` is the linear projection of the
            hidden state at sequence position 0.
        """
        bert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # Positional access works whether the backbone returns a ModelOutput or a
        # plain tuple (return_dict disabled); element 0 is the last hidden state.
        cls_token_embedding = bert_output[0][:, 0]
        cls_encoding = self.cls_projector(cls_token_embedding)
        return EmbeddingOutput(embedding=cls_encoding)

In [28]:
@dataclass
class ICTOutput(ModelOutput):
    """
    Output container of DistilBertOneTowerICT.forward.
    """
    # Cross-entropy loss over the target/context logits.
    loss: Optional[torch.FloatTensor] = None
    # Encodings of the target inputs.
    target_cls_encode: Optional[torch.FloatTensor] = None
    # Encodings of the context inputs.
    context_cls_encode: Optional[torch.FloatTensor] = None
    

class DistilBertOneTowerICT(DistilBertPreTrainedModel):
    """
    One-tower Inverse Cloze Task model built on DistilBertForClsEmbedding.

    The same embedding tower encodes both the target and the context inputs;
    a cross-entropy loss is computed over the matrix of their dot products.
    """
    def __init__(self, config):
        super().__init__(config)
        self.embedding_tower = DistilBertForClsEmbedding(config)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, target, target_mask, context, context_mask, correct_class=None):
        """
        Encode targets and contexts and, when labels are given, compute the loss.

        Args:
            target, target_mask: input ids and attention mask of the target inputs.
            context, context_mask: input ids and attention mask of the context inputs.
            correct_class: labels passed to the cross-entropy loss; when omitted the
                returned ``loss`` is None.

        Returns:
            ``ICTOutput`` with ``loss``, ``target_cls_encode`` and ``context_cls_encode``.
        """
        # The tower returns an EmbeddingOutput; the tensor itself lives in `.embedding`.
        target_cls_encode = self.embedding_tower(input_ids=target, attention_mask=target_mask).embedding
        context_cls_encode = self.embedding_tower(input_ids=context, attention_mask=context_mask).embedding

        loss = None
        if correct_class is not None:
            # Dot product of every target encoding with every context encoding.
            logits = torch.matmul(target_cls_encode, context_cls_encode.transpose(-2, -1))
            loss = self.loss_fn(logits, correct_class)
        return ICTOutput(loss=loss,
                         target_cls_encode=target_cls_encode,
                         context_cls_encode=context_cls_encode)
        

In [62]:
# Instantiate the new class from the base checkpoint (its cls_projector starts out newly initialized,
# see the warning below), then replace both sub-modules with the trained ones from `encoder`.
# The assignments reuse the very same module objects, so `pretrainedTst` and `encoder` share weights.
pretrainedTst = DistilBertForClsEmbedding.from_pretrained(pretrained_model_name)
pretrainedTst.distilbert = encoder.bert
pretrainedTst.cls_projector = encoder.linear

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForClsEmbedding: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForClsEmbedding from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForClsEmbedding from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForClsEmbedding were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['cls_projector.weight', 'cls_projector.bias']
You should probably TRAIN this model on a down-stream

In [70]:
# Encode the same first ten lines that `enc_out` is computed from, so that the element-wise
# comparison of the two results is between identical batches (not a broadcast of one row).
pretrained_out = pretrainedTst(**tokenizer(small_cleaned_dataset[0:10]['text'], return_tensors='pt', truncation=True, padding=True).to('cuda')).embedding

In [21]:
# Reference output: the original encoder tower applied to the first ten cleaned lines.
enc_out = encoder(**tokenizer(small_cleaned_dataset[0:10]['text'], return_tensors='pt', truncation=True, padding=True).to('cuda'))

In [70]:
# True only if every element of the wrapped model's output matches the original encoder's output.
(pretrained_out == enc_out).all()

tensor(True, device='cuda:0')

In [15]:
# Scratch directory for the save/reload round trip, placed next to the trained model's directory.
tst_save_path = saved_model_dir.parent / 'PretrainedTestDir'

In [74]:
# Write the wrapped model to the scratch directory.
pretrainedTst.save_pretrained(tst_save_path)

In [24]:
# Reload from the scratch directory to check that the saved model can be restored.
pretrained2 = DistilBertForClsEmbedding.from_pretrained(tst_save_path)

In [25]:
# Move the reloaded model to the GPU, where the comparison inputs are placed.
pretrained2 = pretrained2.to('cuda')

In [26]:
# Embed the first ten cleaned lines with the reloaded model ...
pre2_out = pretrained2(
    **tokenizer(
        small_cleaned_dataset[0:10]['text'],
        return_tensors='pt',
        truncation=True,
        padding=True,
    ).to('cuda')
).embedding
# ... and check that the result matches the original encoder's output element for element.
(pre2_out == enc_out).all()

tensor(True, device='cuda:0')

In [79]:
# Request a 256-dimensional projection through task_specific_params, which __init__ reads
# 'cls_embedding_dimension' from. NOTE(review): not verified here that the resulting layer is 256 wide.
preTstConf = DistilBertForClsEmbedding.from_pretrained(pretrained_model_name, task_specific_params={'cls_embedding_dimension': 256})

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForClsEmbedding: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForClsEmbedding from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForClsEmbedding from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForClsEmbedding were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['cls_projector.weight', 'cls_projector.bias']
You should probably TRAIN this model on a down-stream

In [92]:
# Display the embeddings sub-module of the reloaded model.
pretrained2.distilbert.embeddings

Embeddings(
  (word_embeddings): Embedding(28996, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)