In [3]:
from datasets import load_dataset

# Hugging Face hub identifier of the dataset used throughout this notebook.
dataset_name = "sst2"

# Fetch every published split of that dataset as a single DatasetDict.
dataset = load_dataset(path=dataset_name)

In [4]:
# Display the loaded DatasetDict: its splits, column names, and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [7]:
from transformers import AutoTokenizer
# Checkpoint identifier; the same name is later passed to the config and model loaders.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_prefix_space=True,
)

# Use the end-of-sequence token for padding (both the id and the token string are set),
# so batches can be padded later by the data collator.
# NOTE(review): presumably needed because the checkpoint ships without a pad token — confirm.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token


In [8]:

from transformers import AutoTokenizer, DataCollatorWithPadding

# Raw columns removed after tokenization; 'label' and the tokenizer outputs are kept.
# Defined once: the previous ['idx'] assignment was overwritten on the very next line.
col_to_delete = ['idx', 'sentence']

def preprocessing_function(examples):
    """Tokenize the 'sentence' entry of a batch; used as the `dataset.map` callback.

    Args:
        examples: Mapping that contains a 'sentence' entry. The notebook calls
            `map` with `batched=True`, so this is presumably a list of strings
            per call — confirm against the `datasets` documentation.

    Returns:
        Whatever the notebook-level `tokenizer` returns for those sentences,
        with truncation enabled and a maximum length of 128.
    """
    sentences = examples['sentence']
    encoded = tokenizer(sentences, truncation=True, max_length=128)
    return encoded

# Tokenize every split in batches and drop the raw columns listed in col_to_delete.
tokenized_datasets = dataset.map(
    preprocessing_function,
    batched=True,
    remove_columns=col_to_delete,
)
# Switch the output format of the tokenized splits to "torch".
tokenized_datasets.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup

import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm import tqdm

In [10]:
from transformers import AutoConfig

# Load the configuration associated with the checkpoint named by model_name.
config = AutoConfig.from_pretrained(model_name)
# Bare last expression: renders the loaded config as the cell output.
config

MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [11]:
# Architecture values; hidden_size and num_hidden_layers repeat what the loaded
# config already reports, and n_head matches its num_attention_heads (32).
config._name_or_path = model_name
config.hidden_size = 4096
config.num_hidden_layers = 32
config.n_head = 32

# Two output classes, and the tokenizer's padding id.
config.num_labels = 2
config.pad_token_id = tokenizer.pad_token_id

# Extra attributes that do not appear in the config printed by the previous cell.
# NOTE(review): presumably consumed by PrefixForSequenceClassification in the
# project-local mistralSKT module — confirm how each one is used there.
config.hidden_dropout = 0.1
config.transform = False
config.text = 'Classify the positive or negative sentiment from the text'

In [12]:
from mistralSKT import  PrefixForSequenceClassification

# Instantiate the project-local PrefixForSequenceClassification from the base
# checkpoint, using the config customised in the previous cell.
# NOTE(review): the loader log lists `transformer.layers.*` and
# `prompt_encoder.transfromer.layers.*` weights as newly initialized — confirm
# that the backbone really ends up with the pretrained weights.
model = PrefixForSequenceClassification.from_pretrained(model_name, config=config)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Prefix sequence length:  11


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of PrefixForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.2 and are newly initialized: ['prompt_encoder.transfromer.layers.14.self_attn.v_proj.weight', 'transformer.layers.21.post_attention_layernorm.weight', 'prompt_encoder.transfromer.layers.24.self_attn.v_proj.weight', 'prompt_encoder.transfromer.layers.14.input_layernorm.weight', 'prompt_encoder.transfromer.layers.1.post_attention_layernorm.weight', 'transformer.layers.14.mlp.down_proj.weight', 'transformer.layers.7.post_attention_layernorm.weight', 'prompt_encoder.transfromer.layers.12.mlp.gate_proj.weight', 'transformer.layers.17.self_attn.k_proj.weight', 'prompt_encoder.transfromer.layers.6.self_attn.v_proj.weight', 'transformer.layers.18.post_attention_layernorm.weight', 'transformer.layers.30.post_attention_layernorm.weight', 'prompt_encoder.transfromer.layers.27.post_attention_layernorm.weight', 'transformer.layers.9.self_attn.o_proj.weight', 'transforme

In [13]:
# Every parameter in the model, as reported by the model itself.
total_parameters = model.num_parameters()

# Only the parameters that require gradients.
trainable_parameters = 0
for param in model.parameters():
    if param.requires_grad:
        trainable_parameters += param.numel()

# Trainable share expressed as a percentage of all parameters.
percentage_trainable = (trainable_parameters / total_parameters) * 100

print(
    f"Total Parameters: {total_parameters}",
    f"Trainable Parameters: {trainable_parameters}",
    f"Percentage Trainable: {percentage_trainable:.20f}%",
    sep="\n",
)

Total Parameters: 7110713346
Trainable Parameters: 53250
Percentage Trainable: 0.00074887001358246003%


In [14]:
import evaluate
import numpy as np
from sklearn import metrics
import torch
import numpy as np

def compute_metrics(eval_pred):
    """Turn evaluation predictions into classification scores.

    Args:
        eval_pred: Pair `(logits, labels)`. The predicted class for each row
            is the argmax of the logits over the last axis.

    Returns:
        Dict with macro-averaged "precision", "recall" and "f1-score", plus
        "accuracy", all computed with `sklearn.metrics`.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    # The same averaging mode is shared by the three per-class metrics.
    macro = {"average": "macro"}
    return {
        "precision": metrics.precision_score(gold, predicted, **macro),
        "recall": metrics.recall_score(gold, predicted, **macro),
        "f1-score": metrics.f1_score(gold, predicted, **macro),
        'accuracy': metrics.accuracy_score(gold, predicted),
    }


In [15]:
from transformers import TrainingArguments, Trainer

# Evaluate, log, and checkpoint on the same 200-step cadence so that every saved
# checkpoint has a matching evaluation, which `load_best_model_at_end` relies on.
training_args = TrainingArguments(
    output_dir='./r_task',
    # The learning-rate override below is disabled, so the TrainingArguments default applies.
    #learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="steps",
    # Explicit rather than implied by logging_steps; must line up with save_steps.
    eval_steps=200,
    save_strategy="steps",
    save_total_limit=2,
    save_steps=200,
    logging_steps=200,
    # NOTE(review): the recorded run ended with "Could not locate the best model at
    # ./r_task/checkpoint-28600/pytorch_model.bin", i.e. the best checkpoint was never
    # reloaded. Presumably the checkpoints were written as sharded safetensors, which the
    # installed Trainer does not look for when reloading — confirm. Saving in the PyTorch
    # .bin format keeps the checkpoint layout the reload step searches for.
    save_safetensors=False,
    load_best_model_at_end=True,
)

# Wire the model, the tokenized train/validation splits, the padding collator,
# and the metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training; as the last expression of the cell, the return value is displayed.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
200,0.7039,0.520159,0.744365,0.743706,0.743033,0.743119
400,0.5298,0.412051,0.818641,0.806917,0.803554,0.805046
600,0.442,0.327493,0.876178,0.873137,0.87347,0.873853
800,0.3735,0.33107,0.8767,0.861823,0.861893,0.863532
1000,0.4076,0.268105,0.908819,0.908647,0.908255,0.908257
1200,0.3714,0.256878,0.924455,0.92416,0.924264,0.924312
1400,0.3177,0.341146,0.911654,0.911773,0.911683,0.911697
1600,0.3354,0.287382,0.921371,0.921245,0.920871,0.920872
1800,0.3017,0.322593,0.91465,0.912267,0.912641,0.912844
2000,0.3052,0.312483,0.923351,0.922992,0.923112,0.923165


Could not locate the best model at ./r_task/checkpoint-28600/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=42095, training_loss=0.22266500411626633, metrics={'train_runtime': 52490.6157, 'train_samples_per_second': 6.415, 'train_steps_per_second': 0.802, 'total_flos': 4.494459940300656e+17, 'train_loss': 0.22266500411626633, 'epoch': 5.0})