In [2]:
import sys
# Extend the import search path with the project's BERT source directory
# (presumably what makes the `encoders`, `ict` and `dataset_utils` imports in the next cell resolvable).
# NOTE(review): absolute, user-specific path -- the notebook will not run on another machine as-is.
sys.path.append('/home/cernypro/dev/source/ml4logs/src/bert')

In [4]:
from encoders import DistilBertForClsEmbedding
from ict import OneTowerICT, TwoTowerICT
from dataset_utils import my_caching_load_from_disk

In [6]:
import numpy as np
from dataclasses import dataclass
from typing import List, Union, Dict, Optional
import torch
from transformers import DistilBertTokenizerFast, AutoModel, DistilBertPreTrainedModel, AutoTokenizer
from datasets import load_dataset
from pathlib import Path

In [7]:
# Load the HDFS1 log file with the generic 'text' dataset builder; later cells read the records via the 'text' column.
# NOTE(review): absolute, user-specific path.
hdfs1_dataset = load_dataset('text', data_files='/home/cernypro/dev/source/ml4logs/data/interim/HDFS1/no_timestamps_test-data-HDFS1.log', split='train')

Using custom data configuration default-35c85ea2505ebc64


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /data/temporary/huggingface_2406238/datasets/text/default-35c85ea2505ebc64/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /data/temporary/huggingface_2406238/datasets/text/default-35c85ea2505ebc64/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


In [8]:
# Keep only the first 1000 records for this experiment.
small_raw_dataset = hdfs1_dataset.select(range(1000))

In [9]:
def remove_timestamp(example):
    """Return ``example`` unchanged; timestamp stripping is currently disabled.

    An earlier (admittedly non-robust) version sliced ``example['text']`` right
    after its third space in order to drop a leading timestamp. That logic is
    switched off, so this function is now an identity placeholder.
    """
    return example

# Run the cleaning step over every record (remove_timestamp currently returns its input unchanged).
small_cleaned_dataset = small_raw_dataset.map(remove_timestamp)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [10]:
# Base checkpoint name; it is used both for the tokenizer here and for building the models in later cells.
pretrained_model_name = "distilbert-base-cased"
tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_model_name)

In [14]:
class ClsEncoderTower(torch.nn.Module):
    """
    Simple model on top of a BERT-like model.
    It applies a linear layer to the first ([CLS]) token of each sequence from the underlying model.

    Args:
        pretrained_model_name_or_path: forwarded to ``AutoModel.from_pretrained``.
        output_encode_dimension: size of the produced encoding (default 512).
    """
    def __init__(self, pretrained_model_name_or_path, output_encode_dimension=512):
        super(ClsEncoderTower, self).__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_name_or_path)
        # The hidden size is most likely 768 for base-sized models.
        self.linear = torch.nn.Linear(self._hidden_size(self.bert.config), output_encode_dimension)

    @staticmethod
    def _hidden_size(config):
        """Return the transformer hidden size from ``config``.

        DistilBERT configs expose it as ``dim``; most other BERT-like configs use
        ``hidden_size``. ``dim`` is tried first so DistilBERT behaves exactly as before.

        Raises:
            AttributeError: if the config exposes neither attribute.
        """
        for attribute in ('dim', 'hidden_size'):
            size = getattr(config, attribute, None)
            if size is not None:
                return size
        raise AttributeError("Model config exposes neither 'dim' nor 'hidden_size'")

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return the linear projection of each sequence's first token."""
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # bert_output[0] holds the per-token hidden states; [:, 0] selects the first token of every sequence.
        cls_token_embedding = bert_output[0][:, 0]
        cls_encoding = self.linear(cls_token_embedding)
        return cls_encoding


class OneTowerICT(torch.nn.Module):
    """
    Inverse cloze task (ICT) network with a single, shared BERT tower.

    The same tower encodes both the target and the context sentences (query and
    document in the nomenclature of the original paper). Every target encoding is
    scored against every context encoding with a dot product, and the scores are
    trained with a cross-entropy loss; ``correct_class`` holds, for each target,
    the index of its matching context.

    ``forward`` returns ``(loss, target_encodings, context_encodings)``.
    """
    def __init__(self, pretrained_model_name_or_path, output_encode_dimension=512):
        super().__init__()
        self.tower = ClsEncoderTower(pretrained_model_name_or_path, output_encode_dimension)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, target, target_mask, context, context_mask, correct_class):
        target_vectors = self.tower(input_ids=target, attention_mask=target_mask)
        context_vectors = self.tower(input_ids=context, attention_mask=context_mask)

        # Row i of the score matrix compares target i against every context.
        similarity = target_vectors @ context_vectors.transpose(-1, -2)
        return self.loss_fn(similarity, correct_class), target_vectors, context_vectors
    
class TwoTowerICT(torch.nn.Module):
    """
    Network for the inverse cloze task with two separate BERT towers:
    one encodes the target sentences, the other the context sentences.
    Uses cross entropy loss over the target-vs-context dot-product scores.

    Args:
        target_tower_pretrained_model_name_or_path: checkpoint for the target tower (required).
        context_tower_pretrained_model_name_or_path: checkpoint for the context tower;
            when None, the target tower's checkpoint is used for it as well
            (the towers are still two independent module instances).
        output_encode_dimension: size of the encodings produced by both towers.
    """
    def __init__(self, target_tower_pretrained_model_name_or_path, context_tower_pretrained_model_name_or_path=None, output_encode_dimension=512):
        super(TwoTowerICT, self).__init__()
        assert target_tower_pretrained_model_name_or_path is not None, "Target tower pretrained model must be specified!"
        if context_tower_pretrained_model_name_or_path is None:
            context_tower_pretrained_model_name_or_path = target_tower_pretrained_model_name_or_path
        self.target_encoder = ClsEncoderTower(target_tower_pretrained_model_name_or_path, output_encode_dimension)
        self.context_encoder = ClsEncoderTower(context_tower_pretrained_model_name_or_path, output_encode_dimension)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, target, target_mask, context, context_mask, correct_class):
        """Return ``(loss, target_encodings, context_encodings)`` for one batch.

        ``correct_class`` gives, for every target, the index of its matching context.
        """
        target_cls_encode = self.target_encoder(input_ids=target, attention_mask=target_mask)
        context_cls_encode = self.context_encoder(input_ids=context, attention_mask=context_mask)

        # Score every target against every context; row i belongs to target i.
        logits = torch.matmul(target_cls_encode, context_cls_encode.transpose(-2, -1))
        loss = self.loss_fn(logits, correct_class)
        return loss, target_cls_encode, context_cls_encode

In [15]:
# Directory of the trained two-tower ICT run and the weight file inside it.
# NOTE(review): absolute, user-specific path.
saved_model_dir = Path('/home/cernypro/dev/source/ml4logs/models/ICT/2T_Eps_1_M_basic_chunked_10_Seed-42_T-len_512_C-len_512_Tr-batch_64_Ev-b_64_O-dim_100')
saved_model_file = saved_model_dir / 'pytorch_model.bin'

In [16]:
# Deserialize onto the CPU regardless of the device the checkpoint was saved from, so this
# cell also works on machines without CUDA; the encoder is moved to the target device later.
# NOTE(review): torch.load unpickles arbitrary objects -- only load checkpoints you trust.
state_dict = torch.load(saved_model_file, map_location=torch.device('cpu'))

In [17]:
# Rebuild the two-tower architecture (with no context checkpoint given, both towers start from
# `pretrained_model_name`) and overwrite its weights with the trained checkpoint.
# NOTE(review): output_encode_dimension=100 presumably has to match the checkpoint ("O-dim_100" in its directory name).
model = TwoTowerICT(pretrained_model_name, output_encode_dimension=100)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [19]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Only the context tower is used for producing embeddings in the rest of the notebook.
encoder = model.context_encoder
# Switch to evaluation mode explicitly (disables dropout etc.) instead of relying on the
# state the submodules happen to be in after loading; both calls return the module itself.
encoder = encoder.to(device).eval()

In [20]:
def encode(examples, tokenizer, encoder):
    """Embed a batch of texts and return them as nested Python lists.

    Args:
        examples: batch mapping with a 'text' entry holding the list of strings to embed.
        tokenizer: callable producing the encoder's keyword inputs; its result must support ``.to(device)``.
        encoder: ``torch.nn.Module`` mapping the tokenized batch to one embedding tensor.

    Returns:
        ``{'embedding': [[float, ...], ...]}`` with one inner list per input text.
    """
    # Put the inputs wherever the encoder's weights live, rather than relying on a notebook global.
    first_param = next(encoder.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    batch = tokenizer(examples['text'], return_tensors='pt', truncation=True, padding=True).to(target_device)
    # Pure inference: skip building the autograd graph for every batch.
    with torch.no_grad():
        embeddings = encoder(**batch)
    return {'embedding': embeddings.cpu().tolist()}

# Embed the whole subset in batches of 128 texts; `encode` receives the tokenizer and encoder through fn_kwargs.
small_embedded_dataset = small_cleaned_dataset.map(encode, fn_kwargs={'tokenizer': tokenizer, 'encoder': encoder}, batched=True, batch_size=128)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




# Transforming the encoder into a Hugging Face `PreTrainedModel`
An experiment.

In [63]:
from transformers.file_utils import ModelOutput
from transformers import DistilBertModel

@dataclass
class EmbeddingOutput(ModelOutput):
    """
    Output container for embedding models, following the Hugging Face Transformers ``ModelOutput`` convention.
    May be replaced by a suitable existing class from the library, if one exists.
    """
    # The embedding tensor; the DistilBertForClsEmbedding.forward defined in this notebook
    # fills it with the projected [CLS] vector.
    embedding: torch.FloatTensor = None
        
class DistilBertForClsEmbedding(DistilBertPreTrainedModel):
    """
    DistilBERT encoder followed by a linear projection of the first ([CLS]) token.

    Create instances with ``.from_pretrained(path_or_model_name)``. The embedding size is
    taken from ``config.task_specific_params['cls_embedding_dimension']``; pass
    ``task_specific_params={'cls_embedding_dimension': N}`` to choose it. When the key is
    missing, 512 is used and stored back into the config.
    """
    def __init__(self, config):
        super().__init__(config)
        if config.task_specific_params is None:
            config.task_specific_params = {}

        self.distilbert = DistilBertModel(config)
        # setdefault both reads the dimension and records the default in the config when it was absent.
        embedding_dim = config.task_specific_params.setdefault('cls_embedding_dimension', 512)
        self.cls_projector = torch.nn.Linear(config.dim, embedding_dim)

        self.init_weights()

    def forward(self, input_ids, attention_mask):
        hidden_states = self.distilbert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # The first position of every sequence is the [CLS] token.
        cls_vector = hidden_states[:, 0]
        return EmbeddingOutput(embedding=self.cls_projector(cls_vector))

In [21]:
# Build the wrapper with a 100-dimensional projection, then swap in the trained context
# tower's submodules so the wrapper shares the very same weights as `encoder`.
# NOTE(review): the execution counts suggest this cell first ran before the in-notebook class
# definition (In [63]), i.e. against the DistilBertForClsEmbedding imported from `encoders` --
# restart the kernel and run top to bottom to confirm the result.
pretrainedTst = DistilBertForClsEmbedding.from_pretrained(pretrained_model_name, task_specific_params={'cls_embedding_dimension': 100})
pretrainedTst.distilbert = encoder.bert
pretrainedTst.cls_projector = encoder.linear

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForClsEmbedding: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForClsEmbedding from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForClsEmbedding from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForClsEmbedding were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['cls_projector.weight', 'cls_projector.bias']
You should probably TRAIN this model on a down-stream

In [23]:
# Embed the first 10 texts with the wrapper model; ['embedding'] picks the tensor out of the returned output object.
pretrained_out = pretrainedTst(**tokenizer(small_cleaned_dataset[0:10]['text'], return_tensors='pt', truncation=True, padding=True).to(device))['embedding']

In [24]:
# Reference embeddings for the same 10 texts, computed directly by the original context tower.
enc_out = encoder(**tokenizer(small_cleaned_dataset[0:10]['text'], return_tensors='pt', truncation=True, padding=True).to(device))

In [25]:
# Exact element-wise comparison: the wrapper and the tower should produce identical embeddings.
torch.all(torch.eq(pretrained_out, enc_out))

tensor(True, device='cuda:0')

In [26]:
# Use `.name`, not `.stem`: the checkpoint path is a directory, and `.stem` would silently cut
# its name at the last '.' (e.g. a run called "..._lr_0.001" would lose ".001").
tst_save_path = saved_model_dir.parent / f'ContextEncoder_from_{saved_model_dir.name}'

In [27]:
# Display the resolved output directory.
tst_save_path

PosixPath('/home/cernypro/dev/source/ml4logs/models/ICT/ContextEncoder_from_2T_Eps_1_M_basic_chunked_10_Seed-42_T-len_512_C-len_512_Tr-batch_64_Ev-b_64_O-dim_100')

In [28]:
# Persist the wrapper model under tst_save_path via the library's save_pretrained.
pretrainedTst.save_pretrained(tst_save_path)

In [29]:
# Reload from disk to check the round trip. No task_specific_params are passed here, so the
# embedding size has to come from the saved config -- NOTE(review): confirm it is persisted there.
pretrained2 = DistilBertForClsEmbedding.from_pretrained(tst_save_path)

In [30]:
# Move the reloaded model to the same device the reference encoder was moved to.
pretrained2 = pretrained2.to(device)

In [32]:
# The reloaded model should reproduce the original tower's embeddings exactly for the same 10 texts.
pre2_out = pretrained2(**tokenizer(small_cleaned_dataset[0:10]['text'], return_tensors='pt', truncation=True, padding=True).to(device)).embedding
torch.all(torch.eq(pre2_out, enc_out))

tensor(True, device='cuda:0')