In [None]:
# Show the visible NVIDIA GPUs, driver version and current memory usage.
!nvidia-smi

In [None]:
# Print the version of the installed CUDA compiler (nvcc).
!nvcc --version

In [None]:
# Install into the kernel's own environment (%pip rather than !pip, which may
# resolve to a different interpreter). `evaluate` and `seqeval` are needed by
# the metric cell (`evaluate.load("seqeval")`) and were missing from this list.
%pip install transformers accelerate datasets scikit-learn sentencepiece evaluate seqeval

In [2]:

"""PyTorch b model."""


import math
import warnings
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
from torch.nn import functional as F

from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    BloomConfig,
    BloomModel,
    BloomPreTrainedModel,
    PreTrainedModel,
)
from transformers.file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
from transformers.modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    QuestionAnsweringModelOutput,
    Seq2SeqLMOutput,
    SequenceClassifierOutput,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging



logger = logging.get_logger(__name__)


import torch
import torch.nn as nn


class PrefixEncoder(torch.nn.Module):
    """Trainable prefix that the backbone turns into ``past_key_values``.

    A small embedding table is initialised from the backbone's word embeddings
    of the tokenised ``config.text``. On every forward pass these embeddings
    are run through the backbone and the resulting per-layer key/value pairs
    are returned, optionally passed through one shared bias-free linear map.

    Args:
        config: Model configuration. Reads ``hidden_dropout``,
            ``_name_or_path``, ``text``, ``hidden_size`` and ``transform``;
            ``n_embd`` is additionally read when ``transform`` is truthy.
        transfromer: Backbone module. Must expose ``word_embeddings`` and,
            when called with ``inputs_embeds=...``, return an object with a
            ``past_key_values`` attribute. The misspelt name is kept because
            it is part of the constructor signature and of the state-dict
            keys (``transfromer.*``).
    """

    def __init__(self, config, transfromer):
        super().__init__()

        self.config = config
        self.dropout = torch.nn.Dropout(config.hidden_dropout)
        self.transfromer = transfromer

        word_embeddings = transfromer.word_embeddings

        # The tokenizer is only needed once, to turn the seed text into ids.
        tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
        init_token_ids = tokenizer(config.text, return_tensors='pt')['input_ids']
        del tokenizer

        # Kept on the instance: the previous implementation stored the index
        # tensor in a module-level global that every encoder overwrote.
        self.num_virtual_tokens = init_token_ids.shape[1]
        print("Prefix sequence length: ", self.num_virtual_tokens)

        self.embedding = torch.nn.Embedding(self.num_virtual_tokens, config.hidden_size)

        if config.transform:
            self.transform = nn.Linear(config.n_embd, config.n_embd, bias=False)
        else:
            self.transform = None

        # Seed the trainable table with a float32 copy of the backbone's
        # embeddings of the seed text.
        init_token_ids = init_token_ids.long().to(word_embeddings.weight.device)
        word_embedding_weights = word_embeddings(init_token_ids).detach().clone()
        word_embedding_weights = word_embedding_weights.to(torch.float32)
        self.embedding.weight = torch.nn.Parameter(word_embedding_weights.squeeze(0))

    def forward(
        self,
        device=None,
        batch_size=None,

    ):
        """Compute the prefix key/value pairs for a batch.

        Args:
            device: Device on which to build the virtual-token indices.
                Defaults to the device of the prefix embedding table.
            batch_size: Number of copies of the prefix to feed the backbone.

        Returns:
            Tuple ``(past_key_values, prefix_length)`` where ``prefix_length``
            is the number of virtual tokens. ``past_key_values`` is what the
            backbone returned, or a tuple of transformed ``(key, value)``
            pairs when the linear transform is enabled.
        """
        if device is None:
            device = self.embedding.weight.device

        virtual_tokens = torch.arange(self.num_virtual_tokens, device=device)
        inputs_embeds = self.dropout(self.embedding(virtual_tokens))
        outputs = self.transfromer(
            inputs_embeds=inputs_embeds.unsqueeze(0).repeat(batch_size, 1, 1)
        )
        past_key_values = outputs.past_key_values
        prefix_length = inputs_embeds.shape[0]

        # Decide from the module that was actually built, not from a
        # notebook-global `config` as the previous implementation did.
        if self.transform is None:
            return (past_key_values, prefix_length)

        transformed_key_values = []
        for key, value in past_key_values:
            # Keys are transposed so the linear map acts on their last axis,
            # then transposed back; values are mapped as they are.
            transformed_key = self.transform(key.transpose(1, 2)).transpose(1, 2)
            transformed_key = self.dropout(transformed_key)
            transformed_value = self.dropout(self.transform(value))
            transformed_key_values.append((transformed_key, transformed_value))

        return (tuple(transformed_key_values), prefix_length)



class PrefixForSequenceClassification(PreTrainedModel):
    """Sequence classifier on a frozen backbone steered by a learned prefix.

    The backbone loaded from ``config._name_or_path`` has all of its
    parameters frozen. A ``PrefixEncoder`` supplies ``past_key_values`` for
    every forward pass, and a linear head scores each position; the
    per-position logits are then averaged over the sequence axis.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        backbone = AutoModel.from_pretrained(config._name_or_path)
        self.transformer = backbone

        self.dropout = torch.nn.Dropout(config.hidden_dropout)
        self.score = torch.nn.Linear(config.hidden_size, config.num_labels)

        # Only the prefix encoder and the scoring head remain trainable.
        for weight in backbone.parameters():
            weight.requires_grad = False

        self.n_layer = config.num_hidden_layers
        self.n_head = config.n_head
        per_head_size = config.hidden_size // config.n_head
        self.n_embd = per_head_size
        # PrefixEncoder sizes its optional linear transform from config.n_embd.
        config.n_embd = per_head_size

        self.prompt_encoder = PrefixEncoder(config, self.transformer)
        self.config = config


    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Classify a batch of sequences.

        Only ``input_ids``, ``attention_mask``, ``labels`` and ``return_dict``
        are used; the remaining parameters are accepted but ignored.

        Returns:
            ``SequenceClassifierOutput`` when ``return_dict`` resolves to a
            truthy value, otherwise a tuple ``(loss?, logits, *outputs[2:])``.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        batch_size = input_ids.shape[0]

        past_key_values, pre_length = self.prompt_encoder(self.transformer.device, batch_size)

        # Extend the mask on the left so the prefix positions are attended to.
        prefix_mask = torch.ones(batch_size, pre_length)
        prefix_mask = prefix_mask.to(self.transformer.device)
        attention_mask = torch.cat((prefix_mask, attention_mask), dim=1)

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            return_dict=return_dict,
            past_key_values=past_key_values,
        )

        per_position_logits = self.score(self.dropout(outputs[0]))
        logits = torch.mean(per_position_logits, dim=1)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                # Infer the task once from the label count and label dtype.
                integer_labels = labels.dtype == torch.long or labels.dtype == torch.int
                if self.num_labels == 1:
                    inferred = "regression"
                elif self.num_labels > 1 and integer_labels:
                    inferred = "single_label_classification"
                else:
                    inferred = "multi_label_classification"
                self.config.problem_type = inferred

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        output = (logits,) + outputs[2:]
        return output if loss is None else ((loss,) + output)


class PromptEncoder(torch.nn.Module):
    """Trainable soft prompt initialised from the embeddings of a seed text.

    ``config.text`` is tokenised with the tokenizer of
    ``config._name_or_path``; the given ``word_embeddings`` of those tokens
    become the initial weights of a trainable embedding table with one row per
    prompt token.

    Args:
        config: Model configuration. Reads ``_name_or_path``, ``text``,
            ``hidden_size``, ``hidden_dropout`` and ``transform``.
        word_embeddings: Embedding module used to look up the initial prompt
            vectors. It is only read during construction.
    """

    def __init__(self, config, word_embeddings):
        super().__init__()

        self.config = config

        # The tokenizer is only needed once, to turn the seed text into ids.
        tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
        init_token_ids = tokenizer(config.text, return_tensors='pt')['input_ids']
        del tokenizer

        # Kept on the instance: the previous implementation stored the index
        # tensor in a module-level global that every encoder overwrote.
        self.num_virtual_tokens = init_token_ids.shape[1]
        print("Prompt sequence length: ", self.num_virtual_tokens)

        self.embedding = torch.nn.Embedding(self.num_virtual_tokens, config.hidden_size)
        self.dropout = torch.nn.Dropout(config.hidden_dropout)

        if config.transform:
            self.transform = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
        else:
            self.transform = None

        # Seed the trainable table with a float32 copy of the embeddings of
        # the seed text.
        init_token_ids = init_token_ids.long().to(word_embeddings.weight.device)
        word_embedding_weights = word_embeddings(init_token_ids).detach().clone()
        word_embedding_weights = word_embedding_weights.to(torch.float32)
        self.embedding.weight = torch.nn.Parameter(word_embedding_weights.squeeze(0))

    def forward(
        self,
        device=None,
        batch_size=None,

    ):
        """Return the prompt embeddings repeated for every batch element.

        Args:
            device: Device on which to build the virtual-token indices.
                Defaults to the device of the prompt embedding table.
            batch_size: Number of copies of the prompt to return.

        Returns:
            Tensor of shape ``(batch_size, num_virtual_tokens, hidden_size)``.
        """
        if device is None:
            device = self.embedding.weight.device

        virtual_tokens = torch.arange(self.num_virtual_tokens, device=device)
        projection = self.dropout(self.embedding(virtual_tokens))

        # Decide from the module that was actually built, not from a
        # notebook-global `config` as the previous implementation did.
        if self.transform is not None:
            projection = self.dropout(self.transform(projection))

        return projection.repeat(batch_size, 1, 1)


class PromptForSequenceClassification(PreTrainedModel):
    """Sequence classifier on a frozen backbone with a learned soft prompt.

    The backbone loaded from ``config._name_or_path`` has all of its
    parameters frozen. The ``PromptEncoder`` output is concatenated in front
    of the input token embeddings, a linear head scores every position, and
    the per-position logits are averaged over the sequence axis.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        backbone = AutoModel.from_pretrained(config._name_or_path)
        self.transformer = backbone

        self.dropout = torch.nn.Dropout(config.hidden_dropout)
        self.score = torch.nn.Linear(config.hidden_size, config.num_labels)

        # Only the prompt encoder and the scoring head remain trainable.
        for weight in backbone.parameters():
            weight.requires_grad = False

        self.n_layer = config.num_hidden_layers
        self.n_head = config.n_head
        self.n_embd = config.hidden_size // config.n_head

        self.prompt_encoder = PromptEncoder(config, self.transformer.word_embeddings)
        self.config = config


    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Classify a batch of sequences.

        Only ``input_ids``, ``attention_mask``, ``labels`` and ``return_dict``
        are used. The remaining parameters are accepted but ignored, and any
        caller-supplied ``inputs_embeds`` is replaced by prompt + token
        embeddings.

        Returns:
            ``SequenceClassifierOutput`` when ``return_dict`` resolves to a
            truthy value, otherwise a tuple ``(loss?, logits, *outputs[2:])``.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        batch_size = input_ids.shape[0]
        raw_tokens_embedding = self.transformer.word_embeddings(input_ids)
        prompts = self.prompt_encoder(self.transformer.device, batch_size)

        # Prompt vectors go in front of the real tokens; the mask is extended
        # on the left by the same number of positions.
        inputs_embeds = torch.cat((prompts, raw_tokens_embedding), dim=1)
        prompt_mask = torch.ones(batch_size, prompts.shape[1])
        prompt_mask = prompt_mask.to(self.transformer.device)
        attention_mask = torch.cat((prompt_mask, attention_mask), dim=1)

        outputs = self.transformer(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            return_dict=return_dict,
        )

        per_position_logits = self.score(self.dropout(outputs[0]))
        logits = torch.mean(per_position_logits, dim=1)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                # Infer the task once from the label count and label dtype.
                integer_labels = labels.dtype == torch.long or labels.dtype == torch.int
                if self.num_labels == 1:
                    inferred = "regression"
                elif self.num_labels > 1 and integer_labels:
                    inferred = "single_label_classification"
                else:
                    inferred = "multi_label_classification"
                self.config.problem_type = inferred

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        output = (logits,) + outputs[2:]
        return output if loss is None else ((loss,) + output)


class PromptForTokenClassification(PreTrainedModel):
    """Token classifier on a frozen backbone with a learned soft prompt.

    The backbone loaded from ``config._name_or_path`` has all of its
    parameters frozen. The ``PromptEncoder`` output is concatenated in front
    of the input token embeddings; after the backbone runs, the prompt
    positions are dropped again so that logits line up with ``input_ids``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = AutoModel.from_pretrained(config._name_or_path)

        self.dropout = torch.nn.Dropout(config.hidden_dropout)
        self.score = torch.nn.Linear(config.hidden_size, config.num_labels)

        # Only the prompt encoder and the scoring head remain trainable.
        for param in self.transformer.parameters():
            param.requires_grad = False

        self.n_layer = config.num_hidden_layers
        self.n_head = config.n_head
        self.n_embd = config.hidden_size // config.n_head

        self.prompt_encoder = PromptEncoder(config, self.transformer.word_embeddings)
        self.config = config


    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Score every input token.

        Args:
            input_ids: Token ids of shape ``(batch, seq)``; required.
            attention_mask: Mask over ``input_ids``. When ``None`` every input
                position is treated as attended.
            token_type_ids, position_ids: Accepted but ignored.
            head_mask, output_attentions, output_hidden_states: Forwarded to
                the backbone unchanged.
            inputs_embeds: Ignored; replaced by prompt + token embeddings.
            labels: Optional ``(batch, seq)`` class ids for cross-entropy.
                Positions equal to ``CrossEntropyLoss``'s default
                ``ignore_index`` (-100) do not contribute to the loss.
            return_dict: Overrides ``config.use_return_dict`` when not ``None``.

        Returns:
            ``TokenClassifierOutput`` when ``return_dict`` resolves to a truthy
            value, otherwise a tuple ``(loss?, logits, *outputs[2:])``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size = input_ids.shape[0]
        raw_tokens_embedding = self.transformer.word_embeddings(input_ids)
        prompts = self.prompt_encoder(self.transformer.device, batch_size)
        prompt_length = prompts.shape[1]

        inputs_embeds = torch.cat((prompts, raw_tokens_embedding), dim=1)

        # Without a mask the concatenation below used to fail; fall back to
        # "attend to every input token".
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        prompt_attention_mask = torch.ones(batch_size, prompt_length, device=self.transformer.device)
        attention_mask = torch.cat((prompt_attention_mask, attention_mask), dim=1)

        outputs = self.transformer(
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Drop the prompt positions so logits align one-to-one with input_ids.
        hidden_states = outputs[0][:, prompt_length:, :]
        hidden_states = self.dropout(hidden_states)
        logits = self.score(hidden_states)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            batch_size, seq_length = labels.shape
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
            )

        if not return_dict:
            # Fixed: this previously referenced the undefined name
            # `transformer_outputs` and raised NameError on the tuple path.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )




In [None]:
# Diagnostic only: re-check GPU state at this point in the notebook.
!nvidia-smi

In [3]:
from datasets import load_dataset

# Load the English ("en") configuration of the WikiANN dataset.
dataset = load_dataset("wikiann","en")

In [4]:
# Tag names read from the training split's `ner_tags` feature schema.
# compute_metrics indexes this list with integer tag ids.
label_list = dataset["train"].features[f"ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [8]:
from transformers import AutoTokenizer
# Checkpoint used for the tokenizer here and for the model config/backbone later.
model_name="tiiuae/falcon-7b-instruct"
# NOTE(review): add_prefix_space=True is presumably needed because the inputs
# are passed pre-split into words (is_split_into_words=True) — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
# Use the end-of-sequence token as the padding token (both id and string form).
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [9]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to sub-word tokens.

    Each batch item in ``examples["tokens"]`` is tokenized (truncated to 128
    tokens). For every produced token the label is the tag of its source word
    if the token is the first piece of that word; otherwise, and for tokens
    that map to no word, the label is -100.

    Args:
        examples: Batch mapping with ``"tokens"`` (lists of words) and
            ``"ner_tags"`` (lists of integer tags, one per word).

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(
        examples["tokens"],
        max_length=128,
        truncation=True,
        is_split_into_words=True,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        token_word_ids = encoded.word_ids(batch_index=row)
        row_labels = []
        last_word = None
        for current_word in token_word_ids:
            # Only the first token of each word carries the word's tag.
            starts_new_word = current_word is not None and current_word != last_word
            row_labels.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        aligned_labels.append(row_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [10]:
# Apply tokenize_and_align_labels to every split in batches; this adds the
# tokenizer outputs and the aligned `labels` column.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [12]:
import evaluate

# Load the "seqeval" metric; compute_metrics calls seqeval.compute(...).
seqeval = evaluate.load("seqeval")

In [13]:
from transformers import AutoConfig

# Start from the checkpoint's own configuration and display it.
config = AutoConfig.from_pretrained(model_name)
config

FalconConfig {
  "_name_or_path": "tiiuae/falcon-7b-instruct",
  "alibi": false,
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "FalconForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "tiiuae/falcon-7b-instruct--configuration_falcon.FalconConfig",
    "AutoModel": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconModel",
    "AutoModelForCausalLM": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForCausalLM",
    "AutoModelForQuestionAnswering": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForQuestionAnswering",
    "AutoModelForSequenceClassification": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForSequenceClassification",
    "AutoModelForTokenClassification": "tiiuae/falcon-7b-instruct--modeling_falcon.FalconForTokenClassification"
  },
  "bias": false,
  "bos_token_id": 11,
  "eos_token_id": 11,
  "hidden_dropout": 0.0,
  "hidden_size": 4544,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "max

In [14]:
# Fields read by the Prompt*/Prefix* wrapper classes defined earlier.
# Checkpoint from which the wrappers load the frozen backbone and the tokenizer.
config._name_or_path=model_name
config.hidden_size=4544
config.num_hidden_layers=32
# The wrapper classes read `config.n_head` to derive the per-head size.
config.n_head=71
# Size of the classification head's output.
config.num_labels=7
config.pad_token_id=tokenizer.pad_token_id
# Dropout probability used by the wrappers' own dropout layers.
config.hidden_dropout = 0.1
# False: the prompt encoder does not build its optional linear transform.
config.transform=False
# Seed text whose token embeddings initialise the trainable prompt.
config.text='classify the token of the text:'

In [15]:
#from falconSKT import  PrefixForSequenceClassification, PromptForSequenceClassification

# Build the prompt-tuning token classifier. Note that the class constructor
# itself also loads the backbone via AutoModel.from_pretrained(config._name_or_path).
model = PromptForTokenClassification.from_pretrained(
    model_name,
    config=config,

)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Prompt sequence length:  8


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PromptForTokenClassification were not initialized from the model checkpoint at tiiuae/falcon-7b-instruct and are newly initialized: ['score.bias', 'score.weight', 'prompt_encoder.embedding.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Report how much of the model is trainable (the backbone's parameters have
# requires_grad=False, set in the model's __init__).
# Total number of parameters in the model
total_parameters = model.num_parameters()

# Total number of trainable parameters in the model
trainable_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

# Calculate the percentage of trainable parameters
percentage_trainable = (trainable_parameters / total_parameters) * 100

print(f"Total Parameters: {total_parameters}")
print(f"Trainable Parameters: {trainable_parameters}")
# Printed with 20 decimal places.
print(f"Percentage Trainable: {percentage_trainable:.20f}%")

In [17]:
import numpy as np


def compute_metrics(p):
    """Convert raw predictions to tag names and score them with seqeval.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores with the class on axis 2; ``labels`` holds integer tag ids,
            with -100 marking positions to skip.

    Returns:
        Dict with seqeval's overall ``precision``, ``recall``, ``f1`` and
        ``accuracy``.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        # Keep only positions that carry a real label.
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    report = {}
    for metric in ("precision", "recall", "f1", "accuracy"):
        report[metric] = results["overall_" + metric]
    return report

In [18]:
from transformers import DataCollatorForTokenClassification

# Collator passed to the Trainer to assemble batches from tokenized examples.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration. The learning rate is left at the TrainingArguments
# default (the explicit value below is commented out).
training_args = TrainingArguments(
    output_dir='./r_task',
    #learning_rate=1e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=5,
    weight_decay=0.01,
    # Evaluate and checkpoint on a step schedule; save/log every 500 steps and
    # keep at most 2 checkpoints on disk.
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    save_steps=500,
    logging_steps=500,
   
    load_best_model_at_end=True,
)

# NOTE(review): eval_dataset is the "test" split and load_best_model_at_end is
# enabled, so checkpoint selection is driven by the test split — confirm
# whether a separate validation split should be used for model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
 

    compute_metrics=compute_metrics,
)

# Run training; evaluation metrics come from compute_metrics.
trainer.train()


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
500,1.2569,0.847661,0.19888,0.278581,0.232079,0.720575
1000,0.7921,0.716461,0.252689,0.366789,0.299231,0.7595
1500,0.7039,0.646363,0.283837,0.404058,0.333442,0.781932
2000,0.6354,0.608301,0.300665,0.432405,0.354698,0.795207
2500,0.6314,0.584105,0.315594,0.445428,0.369435,0.80137
3000,0.5879,0.571816,0.327057,0.464926,0.383991,0.80782
3500,0.5739,0.559108,0.323874,0.463918,0.381448,0.810902
4000,0.5625,0.549083,0.33756,0.476869,0.3953,0.815181
4500,0.5409,0.541778,0.346495,0.489316,0.405703,0.818861
5000,0.5411,0.541678,0.344363,0.494208,0.405897,0.820308


Checkpoint destination directory ./r_task/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./r_task/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


In [None]:
# Inspect the token ids of the first training example.
tokenized_dataset["train"]['input_ids'][0]

In [None]:
# Diagnostic only: re-check GPU state at this point in the notebook.
!nvidia-smi

In [None]:
# Display the tokenized dataset object.
tokenized_dataset