In [1]:

"""PyTorch b model."""


import math
import warnings
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.nn import functional as F

from transformers.file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from transformers import BloomConfig, BloomPreTrainedModel, BloomModel, AutoConfig, PreTrainedModel, AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutput, Seq2SeqLMOutput



logger = logging.get_logger(__name__)


import torch
import torch.nn as nn


class PrefixEncoder(torch.nn.Module):
    def __init__(self, config, transfromer):
        super().__init__()

        self.config = config
        self.dropout = torch.nn.Dropout(config.hidden_dropout)
        self.transfromer=transfromer

        word_embeddings = transfromer.embed_tokens
        
        tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
        
        init_token_ids = tokenizer(config.text, return_tensors='pt')['input_ids']
        print("Prefix sequence length: ", init_token_ids.shape[1])
        tokenizer=None

        self.embedding = torch.nn.Embedding(init_token_ids.shape[1], config.hidden_size)

        if config.transform==True:
            self.transform = nn.Linear(config.n_embd, config.n_embd, bias=False)
        else:
            self.transform=None
     
        init_token_ids = torch.LongTensor(init_token_ids).to(word_embeddings.weight.device)

        word_embedding_weights = word_embeddings(init_token_ids).detach().clone()
        word_embedding_weights = word_embedding_weights.to(torch.float32)
        #print('word_embedding_weights', word_embedding_weights.shape)
        #print('word_embedding_weights', word_embedding_weights.squeeze(0).shape)
        self.embedding.weight = torch.nn.Parameter(word_embedding_weights.squeeze(0))  
        global virtual_tokens 
        virtual_tokens = torch.arange(0, init_token_ids.shape[1])
        

    def forward(
        self,
        device=None,
        batch_size=None,

    ):


        inputs_embeds = self.embedding(virtual_tokens.to(device))
        inputs_embeds=self.dropout(inputs_embeds)
        outputs = self.transfromer(
            inputs_embeds=inputs_embeds.unsqueeze(0).repeat(batch_size, 1, 1)
        )        
        #print('working', outputs.past_key_values)
        #print('working', projection)
        past_key_values=outputs.past_key_values
        if config.transform==True:
        # Apply transformations
            transformed_key_values = []
            for layer in past_key_values:
                key, value = layer
                #print(key.shape, value.shape)
                # Transpose, transform, and transpose back for key
                transformed_key = self.transform(key.transpose(1, 2)).transpose(1, 2)
                transformed_key=self.dropout(transformed_key)
                # Transpose, transform, and transpose back for value
                transformed_value = self.transform(value)
                transformed_value = self.dropout(transformed_value)
                transformed_key_values.append((transformed_key, transformed_value))

            transformed_past_key_values = tuple(transformed_key_values)
        
            return  (transformed_past_key_values, inputs_embeds.shape[0])
        else:
            return  (past_key_values, inputs_embeds.shape[0])



class PrefixForSequenceClassification(PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        
        self.transformer =  AutoModel.from_pretrained(config._name_or_path)
        
        self.dropout = torch.nn.Dropout(config.hidden_dropout)
        self.score = torch.nn.Linear(config.hidden_size, config.num_labels)

        for param in self.transformer.parameters():
            param.requires_grad = False

        self.n_layer = config.num_hidden_layers
        self.n_head = config.n_head
        self.n_embd = config.hidden_size // config.n_head
        config.n_embd=self.n_embd

        #print('self.prefix_ids', self.prefix_ids)
        self.prompt_encoder = PrefixEncoder(config, self.transformer)
        self.config = config


    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size = input_ids.shape[0]
        
        #print('prefix_ids', prefix_ids)
        past_key_values, pre_length =  self.prompt_encoder(self.transformer.device, batch_size)
        #print('prompts', prompts.shape)
        #print('raw_tokens_embedding', raw_tokens_embedding)
        #print('batch_size', batch_size, self.pre_seq_len)
        #inputs_embeds = torch.cat((prompts, raw_tokens_embedding), dim=1)
        prompt_attention_mask = torch.ones(batch_size, pre_length).to(self.transformer.device)
        attention_mask = torch.cat((prompt_attention_mask, attention_mask), dim=1)

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            return_dict=return_dict,
            past_key_values=past_key_values,
        )

        
        hidden_states = self.dropout(outputs[0])

        logits = self.score(hidden_states)
        logits = torch.mean(logits, dim=1)


        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class PromptEncoder(torch.nn.Module):
    def __init__(self, config, word_embeddings):
        super().__init__()

        self.config = config
        
        tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
        
        init_token_ids = tokenizer(config.text, return_tensors='pt')['input_ids']
        print("Prompt sequence length: ", init_token_ids.shape[1])
        #print("config.pre_seq_len, config.hidden_size", config.pre_seq_len, config.hidden_size)
        tokenizer=None

        self.embedding = torch.nn.Embedding(init_token_ids.shape[1], config.hidden_size)
        self.dropout = torch.nn.Dropout(config.hidden_dropout)

        if config.transform==True:
            self.transform = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
        else:
            self.transform=None
            
        init_token_ids = torch.LongTensor(init_token_ids).to(word_embeddings.weight.device)

        word_embedding_weights = word_embeddings(init_token_ids).detach().clone()
        word_embedding_weights = word_embedding_weights.to(torch.float32)
        #print('word_embedding_weights', word_embedding_weights.shape)
        #print('word_embedding_weights', word_embedding_weights.squeeze(0).shape)
        self.embedding.weight = torch.nn.Parameter(word_embedding_weights.squeeze(0))  
        global virtual_tokens 
        virtual_tokens = torch.arange(0, init_token_ids.shape[1])
        

    def forward(
        self,
        device=None,
        batch_size=None,

    ):

        projection = self.embedding(virtual_tokens.to(device))
        projection=self.dropout(projection)
        
        if config.transform==True:
            projection = self.transform(projection)
            projection=self.dropout(projection)

        return projection.repeat(batch_size, 1, 1)


class PromptForSequenceClassification(PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        
        self.transformer =  AutoModel.from_pretrained(config._name_or_path)
        
        self.dropout = torch.nn.Dropout(config.hidden_dropout)
        #prefix_ids = config.tokenizer(config.prefix, return_tensors='pt')['input_ids']
        #print('prefix_ids', prefix_ids)
        self.score = torch.nn.Linear(config.hidden_size, config.num_labels)

        for param in self.transformer.parameters():
            param.requires_grad = False

        self.n_layer = config.num_hidden_layers
        self.n_head = config.n_head
        self.n_embd = config.hidden_size // config.n_head

        #print('self.prefix_ids', self.prefix_ids)
        self.prompt_encoder = PromptEncoder(config, self.transformer.embed_tokens)
        self.config = config


    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size = input_ids.shape[0]
        raw_tokens_embedding = self.transformer.embed_tokens(input_ids)
        #print('prefix_ids', prefix_ids)
        prompts =  self.prompt_encoder(self.transformer.device, batch_size)
        #print('prompts', prompts.shape)
        #print('raw_tokens_embedding', raw_tokens_embedding)
        #print('batch_size', batch_size, self.pre_seq_len)
        inputs_embeds = torch.cat((prompts, raw_tokens_embedding), dim=1)
        prompt_attention_mask = torch.ones(batch_size, prompts.shape[1]).to(self.transformer.device)
        attention_mask = torch.cat((prompt_attention_mask, attention_mask), dim=1)

        outputs = self.transformer(
            # input_ids,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            return_dict=return_dict,
            # past_key_values=past_key_values,
        )


        
        hidden_states = self.dropout(outputs[0])
        logits = self.score(hidden_states)
        logits = torch.mean(logits, dim=1)


        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [2]:
from datasets import load_dataset
import pandas as pd
snli = load_dataset("snli")

In [3]:
train = pd.DataFrame(snli["train"])
test = pd.DataFrame(snli["test"])

train = train[train["label"]!=-1].reset_index(drop=True)

test = test[test["label"]!=-1].reset_index(drop=True)

from datasets import Dataset
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test) 

In [4]:
from transformers import AutoTokenizer
model_name="mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [5]:
from transformers import AutoTokenizer, DataCollatorWithPadding
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
# col_to_delete = ['idx']
col_to_delete = ['premise','hypothesis']

def preprocessing_function(examples):
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, max_length=128)

tokenized_train_dataset = train_dataset.map(preprocessing_function, batched=True, remove_columns=col_to_delete)

tokenized_test_dataset = test_dataset.map(preprocessing_function, batched=True, remove_columns=col_to_delete)
# llama_tokenized_datasets = llama_tokenized_datasets.rename_column("target", "label")
tokenized_train_dataset.set_format("torch")
tokenized_test_dataset.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/549367 [00:00<?, ? examples/s]

Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup

import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm import tqdm

In [7]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_name)
config

MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [8]:
config._name_or_path=model_name
config.hidden_size=4096
config.num_hidden_layers=32
config.n_head=32
config.num_labels=3
config.pad_token_id=tokenizer.pad_token_id
config.hidden_dropout = 0.1
config.transform=False
config.text='Recognize the textual entailment from the text:'

In [9]:
#from falconSKT import  PrefixForTokenClassification

model = PrefixForSequenceClassification.from_pretrained(
    model_name,
    config=config,

)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Prefix sequence length:  14


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of PrefixForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.2 and are newly initialized: ['prompt_encoder.transfromer.layers.5.post_attention_layernorm.weight', 'prompt_encoder.transfromer.layers.17.self_attn.v_proj.weight', 'prompt_encoder.transfromer.layers.5.self_attn.o_proj.weight', 'transformer.layers.2.input_layernorm.weight', 'transformer.layers.21.self_attn.k_proj.weight', 'prompt_encoder.transfromer.layers.21.self_attn.o_proj.weight', 'prompt_encoder.transfromer.layers.13.mlp.up_proj.weight', 'transformer.layers.13.self_attn.o_proj.weight', 'prompt_encoder.transfromer.layers.3.input_layernorm.weight', 'transformer.layers.11.mlp.down_proj.weight', 'prompt_encoder.transfromer.layers.21.mlp.gate_proj.weight', 'transformer.layers.6.mlp.gate_proj.weight', 'transformer.layers.25.mlp.up_proj.weight', 'prompt_encoder.transfromer.layers.24.mlp.down_proj.weight', 'prompt_encoder.transfromer.layers.28.self_attn.k_proj

In [10]:
# Total number of parameters in the model
total_parameters = model.num_parameters()

# Total number of trainable parameters in the model
trainable_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

# Calculate the percentage of trainable parameters
percentage_trainable = (trainable_parameters / total_parameters) * 100

print(f"Total Parameters: {total_parameters}")
print(f"Trainable Parameters: {trainable_parameters}")
print(f"Percentage Trainable: {percentage_trainable:.20f}%")

Total Parameters: 7110729731
Trainable Parameters: 69635
Percentage Trainable: 0.00097929470862067290%


In [11]:
import evaluate
import numpy as np
from sklearn import metrics
import torch
import numpy as np

def compute_metrics(eval_pred):


    logits, labels = eval_pred # eval_pred is the tuple of predictions and labels returned by the model
    predictions = np.argmax(logits, axis=-1)
    
    precision = metrics.precision_score(labels, predictions, average="macro")
    recall = metrics.recall_score(labels, predictions, average="macro")
    f1 = metrics.f1_score(labels, predictions, average="macro")
    accuracy = metrics.accuracy_score(labels, predictions)
    
    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}

In [12]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./r_task',
    #learning_rate=1e-5,
    per_device_train_batch_size=15,
    per_device_eval_batch_size=15,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    save_steps=1000,
    logging_steps=1000,
   
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1000,0.5766,0.421197,0.851192,0.846119,0.842998,0.847516
2000,0.4165,0.386888,0.85935,0.851715,0.847399,0.853013




OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 47.54 GiB total capacity; 45.94 GiB already allocated; 19.12 MiB free; 46.45 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
!nvidia-smi