In [4]:
from datasets import load_dataset

# GLUE benchmark, MRPC configuration (sentence pairs with a binary label).
dataset = load_dataset(path="glue", name="mrpc")

In [5]:
# Trailing expression: show the DatasetDict so the splits, columns and row counts are visible.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [16]:
from transformers import AutoTokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding
# Checkpoint identifier reused for the tokenizer, the config and the model weights.
model_name = "bigscience/bloomz-7b1"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# Pad with the EOS token. Assigning `pad_token` also determines `pad_token_id`,
# so the id does not need to be set separately.
# NOTE(review): the checkpoint's config reports pad_token_id 3 and eos_token_id 2,
# i.e. a dedicated pad id appears to exist already — confirm that overriding it
# with EOS is intentional (config.pad_token_id is later copied from this tokenizer).
tokenizer.pad_token = tokenizer.eos_token


In [17]:
# Append padding after the real tokens of each sequence.
# (The pad token itself is already set to EOS where the tokenizer is created,
# so it is not assigned again here.)
tokenizer.padding_side = 'right'

# NOTE(review): `add_eos_token` is only acted upon by some tokenizer classes; the
# training log reports a BloomTokenizerFast — confirm this flag has any effect here.
tokenizer.add_eos_token = True

# Raw text columns dropped after tokenization; 'label' and 'idx' stay in the dataset.
col_to_delete = ['sentence1','sentence2']

def preprocessing_function(examples):
    """Tokenize a batch of sentence pairs.

    Args:
        examples: Mapping with 'sentence1' and 'sentence2' entries; both are handed
            to the module-level ``tokenizer`` as the first and second text input.

    Returns:
        Whatever the tokenizer call returns for the paired inputs.
    """
    first_sentences = examples['sentence1']
    second_sentences = examples['sentence2']
    encoded = tokenizer(first_sentences, second_sentences)
    return encoded

# Tokenize every split in batches, drop the raw text columns, and have the
# resulting datasets return torch tensors.
tokenized_dataset = dataset.map(
    preprocessing_function,
    batched=True,
    remove_columns=col_to_delete,
).with_format("torch")

# Collator that pads each batch dynamically to the longest sequence in that batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup

import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm import tqdm

In [11]:
from transformers import AutoConfig

# Fetch the checkpoint's configuration so it can be adjusted before the model is built.
config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name)
# Trailing expression: render the loaded configuration as the cell output.
config

BloomConfig {
  "_name_or_path": "bigscience/bloomz-7b1",
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "BloomForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": true,
  "bias_dropout_fusion": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_dropout": 0.0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "masked_softmax_fusion": true,
  "model_type": "bloom",
  "n_head": 32,
  "n_inner": null,
  "n_layer": 30,
  "offset_alibi": 100,
  "pad_token_id": 3,
  "pretraining_tp": 4,
  "seq_length": 2048,
  "skip_bias_add": true,
  "skip_bias_add_qkv": false,
  "slow_but_exact": false,
  "transformers_version": "4.36.2",
  "unk_token_id": 0,
  "use_cache": true,
  "vocab_size": 250880
}

In [18]:
# Apply all overrides in one ordered pass; `update` sets each key as an attribute.
config.update({
    # These restate values already shown in the loaded config printout
    # (num_hidden_layers is presumably the alias of n_layer — TODO confirm).
    "_name_or_path": model_name,
    "hidden_size": 4096,
    "num_hidden_layers": 30,
    "n_head": 32,
    # Binary classification head; padding id follows the tokenizer.
    "num_labels": 2,
    "pad_token_id": tokenizer.pad_token_id,
    # Raised from the 0.0 shown in the loaded config.
    "hidden_dropout": 0.1,
    # Custom fields, presumably read by the bloomSKT model class — verify there.
    "transform": False,
    "text": 'Classify the textual equivalence from the text:',
})

In [19]:
from bloomSKT import  PrefixForSequenceClassification

# Instantiate the classifier from the checkpoint, using the customised config.
# NOTE(review): the recorded load warning lists `transformer.h.*` backbone weights
# (not only `prompt_encoder.*`) as "newly initialized" — confirm that the pretrained
# weights are actually being loaded into the model.
model = PrefixForSequenceClassification.from_pretrained(model_name, config=config)

Some weights of PrefixForSequenceClassification were not initialized from the model checkpoint at bigscience/bloomz-7b1 and are newly initialized: ['transformer.h.5.mlp.dense_4h_to_h.bias', 'prompt_encoder.transfromer.h.15.input_layernorm.bias', 'prompt_encoder.transfromer.h.9.post_attention_layernorm.bias', 'transformer.h.26.self_attention.query_key_value.bias', 'prompt_encoder.transfromer.h.20.mlp.dense_4h_to_h.weight', 'prompt_encoder.transfromer.h.2.input_layernorm.weight', 'transformer.h.29.post_attention_layernorm.bias', 'prompt_encoder.transfromer.h.29.mlp.dense_h_to_4h.weight', 'transformer.h.18.input_layernorm.weight', 'prompt_encoder.transfromer.h.18.self_attention.dense.bias', 'prompt_encoder.transfromer.h.29.mlp.dense_4h_to_h.bias', 'prompt_encoder.transfromer.h.21.mlp.dense_h_to_4h.bias', 'transformer.h.10.mlp.dense_h_to_4h.weight', 'transformer.h.22.post_attention_layernorm.weight', 'prompt_encoder.transfromer.h.23.self_attention.query_key_value.weight', 'transformer.h.3.

Prefix sequence length 8


In [20]:
# Parameters that will receive gradients (requires_grad is True).
trainable_parameters = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)

# Every parameter in the model, as reported by the model itself.
total_parameters = model.num_parameters()

# Share of the model that is trainable, expressed as a percentage.
percentage_trainable = (trainable_parameters / total_parameters) * 100

# One print call, three lines: same text as printing each line separately.
print(
    "\n".join(
        [
            f"Total Parameters: {total_parameters}",
            f"Trainable Parameters: {trainable_parameters}",
            f"Percentage Trainable: {percentage_trainable:.20f}%",
        ]
    )
)

Total Parameters: 7069057026
Trainable Parameters: 40962
Percentage Trainable: 0.00057945493789824743%


In [21]:
import evaluate
import numpy as np
from sklearn import metrics
import torch
import numpy as np

def compute_metrics(eval_pred):
    """Compute macro precision/recall/F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` as handed over by the Trainer.
            ``predictions`` is either an array of per-class scores (classes on the
            last axis) or a tuple whose first element is that array.

    Returns:
        dict with the keys "precision", "recall", "f1-score" and "accuracy".
    """
    logits, labels = eval_pred

    # When the model returns more than just logits, the predictions arrive as a
    # tuple; by convention the logits are its first element.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Predicted class = index of the highest score along the last axis.
    predictions = np.argmax(logits, axis=-1)

    # Precision, recall and F1 are macro-averaged, i.e. an unweighted mean over classes.
    macro_scorers = {
        "precision": metrics.precision_score,
        "recall": metrics.recall_score,
        "f1-score": metrics.f1_score,
    }
    results = {
        name: scorer(labels, predictions, average="macro")
        for name, scorer in macro_scorers.items()
    }
    results['accuracy'] = metrics.accuracy_score(labels, predictions)
    return results


In [23]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Where checkpoints and trainer state are written.
    output_dir='./r_task',
    # Optimisation. No learning_rate is passed, so the library default applies.
    num_train_epochs=20,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Step-based cadence: log and checkpoint every 200 steps. No eval_steps is
    # given; the recorded run shows evaluation also happening every 200 steps.
    evaluation_strategy="steps",
    logging_steps=200,
    save_strategy="steps",
    save_steps=200,
    # Keep at most two checkpoints on disk and reload the best one when training ends.
    save_total_limit=2,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Start fine-tuning; as the last expression, its result is shown as the cell output.
trainer.train()

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
200,0.6622,0.618242,0.651389,0.572685,0.563513,0.703431
400,0.6115,0.576853,0.707318,0.603109,0.602817,0.727941
600,0.54,0.529487,0.776679,0.646953,0.658353,0.762255
800,0.5147,0.487415,0.830595,0.661582,0.676658,0.779412
1000,0.4387,0.430975,0.785033,0.758106,0.76866,0.808824
1200,0.4148,0.395787,0.803811,0.77511,0.786456,0.823529
1400,0.3849,0.406774,0.849413,0.747437,0.77272,0.828431
1600,0.3705,0.374187,0.840325,0.791239,0.808783,0.845588
1800,0.3405,0.346826,0.866257,0.798116,0.820548,0.857843
2000,0.3314,0.36413,0.842738,0.801075,0.816783,0.85049




KeyboardInterrupt: 