In [3]:
from datasets import load_dataset
import pandas as pd
# Load every available split of the "snli" dataset; later cells index it by split name.
snli = load_dataset(path="snli")

In [4]:
from datasets import Dataset

# Materialise each split as a DataFrame. `Dataset.to_pandas()` converts the
# split directly, instead of having pandas build the frame by iterating over
# the dataset one example at a time as `pd.DataFrame(dataset)` does.
train = snli["train"].to_pandas()
test = snli["test"].to_pandas()

# Drop rows labelled -1 (presumably pairs without a gold label -- confirm
# against the dataset card) and renumber the index so it is contiguous again.
train = train[train["label"] != -1].reset_index(drop=True)
test = test[test["label"] != -1].reset_index(drop=True)

# Convert the filtered frames back into `Dataset` objects for tokenization.
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

In [5]:
from transformers import AutoTokenizer
# Checkpoint identifier shared by the tokenizer, config and model cells.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_prefix_space=True,
)

# Reuse the end-of-sequence token for padding, both as an id and as a token
# string (presumably because the checkpoint defines no pad token -- confirm).
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [14]:

from transformers import AutoTokenizer, DataCollatorWithPadding
# Fresh tokenizer instance for the preprocessing step.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_prefix_space=True,
)
tokenizer.padding_side = 'right'            # pad on the right-hand side
tokenizer.pad_token = tokenizer.eos_token   # reuse EOS as the padding token
tokenizer.add_eos_token = True              # enable the tokenizer's add_eos_token option

# Raw text columns removed from the datasets once they have been tokenized.
col_to_delete = ['premise', 'hypothesis']

def preprocessing_function(examples):
    """Tokenize premise/hypothesis pairs with the notebook-level `tokenizer`.

    Args:
        examples: Mapping with 'premise' and 'hypothesis' entries. It is
            called through `Dataset.map(..., batched=True)` in this notebook.

    Returns:
        Whatever `tokenizer(premise, hypothesis)` returns for the pair input.
    """
    premises = examples['premise']
    hypotheses = examples['hypothesis']
    return tokenizer(premises, hypotheses)

# Tokenize both splits in batches and drop the raw text columns afterwards.
tokenized_train_dataset = train_dataset.map(
    preprocessing_function,
    batched=True,
    remove_columns=col_to_delete,
)
tokenized_test_dataset = test_dataset.map(
    preprocessing_function,
    batched=True,
    remove_columns=col_to_delete,
)

# Have both datasets return torch-formatted values when indexed.
tokenized_train_dataset.set_format(type="torch")
tokenized_test_dataset.set_format(type="torch")

# Collator that pads each batch of examples to the longest sequence in that batch.
data_collator = DataCollatorWithPadding(tokenizer)

Map:   0%|          | 0/549367 [00:00<?, ? examples/s]

Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup

import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from tqdm import tqdm

In [8]:
from transformers import AutoConfig

# Fetch the checkpoint's configuration and show it as the cell output.
config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name)
config

MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [9]:
# Apply all overrides in one call; `update` sets each key as an attribute on
# the config, in the order listed here.
config.update({
    "_name_or_path": model_name,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
    "n_head": 32,
    "num_labels": 3,
    "pad_token_id": tokenizer.pad_token_id,
    # The remaining keys are not part of the config printed for this
    # checkpoint; presumably they are read by the custom model class
    # loaded in a later cell -- TODO confirm.
    "hidden_dropout": 0.1,
    "transform": False,
    "text": 'Recognize the textual entailment from the text:',
})

In [11]:
from mistralSKT import  PrefixForSequenceClassification

# Instantiate the classifier from the checkpoint, using the customised config.
model = PrefixForSequenceClassification.from_pretrained(model_name, config=config)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Prefix sequence length:  14


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of PrefixForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.2 and are newly initialized: ['transformer.layers.15.mlp.down_proj.weight', 'prompt_encoder.transfromer.layers.9.mlp.gate_proj.weight', 'prompt_encoder.transfromer.layers.1.input_layernorm.weight', 'transformer.layers.30.mlp.gate_proj.weight', 'transformer.layers.29.input_layernorm.weight', 'prompt_encoder.transfromer.layers.18.input_layernorm.weight', 'prompt_encoder.transfromer.layers.9.self_attn.q_proj.weight', 'transformer.layers.21.self_attn.k_proj.weight', 'prompt_encoder.transfromer.layers.28.self_attn.k_proj.weight', 'prompt_encoder.transfromer.layers.14.post_attention_layernorm.weight', 'prompt_encoder.transfromer.layers.3.mlp.up_proj.weight', 'prompt_encoder.transfromer.layers.8.self_attn.o_proj.weight', 'prompt_encoder.transfromer.layers.21.mlp.down_proj.weight', 'prompt_encoder.transfromer.layers.23.mlp.up_proj.weight', 'prompt_encoder.transfrom

In [12]:
# Size of the whole model, as reported by the model itself.
total_parameters = model.num_parameters()

# Count only the elements of tensors that will receive gradients.
trainable_parameters = sum(
    param.numel()
    for param in model.parameters()
    if param.requires_grad
)

# Share of the model that is trainable, expressed as a percentage.
percentage_trainable = 100 * (trainable_parameters / total_parameters)

print(
    f"Total Parameters: {total_parameters}",
    f"Trainable Parameters: {trainable_parameters}",
    f"Percentage Trainable: {percentage_trainable:.20f}%",
    sep="\n",
)

Total Parameters: 7110729731
Trainable Parameters: 69635
Percentage Trainable: 0.00097929470862067290%


In [13]:
import evaluate
import numpy as np
from sklearn import metrics
import torch
import numpy as np

def compute_metrics(eval_pred):
    """Compute macro-averaged precision/recall/F1 plus accuracy.

    Args:
        eval_pred: Pair ``(logits, labels)`` of predictions and labels. The
            predicted class is the argmax of the logits along the last axis.

    Returns:
        dict with the keys "precision", "recall", "f1-score" and "accuracy".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # These three scorers share one call signature and are all macro-averaged.
    macro_scorers = {
        "precision": metrics.precision_score,
        "recall": metrics.recall_score,
        "f1-score": metrics.f1_score,
    }
    scores = {
        metric_name: scorer(labels, predictions, average="macro")
        for metric_name, scorer in macro_scorers.items()
    }
    scores['accuracy'] = metrics.accuracy_score(labels, predictions)
    return scores


In [15]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="./r_task",

    # Optimisation. No learning_rate is passed (a 1e-5 override was commented
    # out), so the library default applies.
    num_train_epochs=2,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    weight_decay=0.01,

    # Step-based logging, evaluation and checkpointing. No eval_steps is given;
    # the run log shows an evaluation every 1000 steps.
    logging_steps=1000,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,

    # NOTE(review): the Trainer in this notebook is given the tokenized *test*
    # split as eval_dataset, so the "best" checkpoint restored here is chosen
    # on test data -- consider evaluating on a validation split instead.
    load_best_model_at_end=True,
)

# Wire the model, data, padding collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)

# Start training; the return value is shown as the cell output.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1000,0.599,0.413689,0.84563,0.842677,0.843589,0.842732
2000,0.4413,0.359358,0.869451,0.866698,0.867495,0.866755
3000,0.4041,0.353223,0.875446,0.874954,0.873745,0.874796
4000,0.378,0.305617,0.894812,0.893922,0.893987,0.89373
5000,0.361,0.305054,0.896208,0.894588,0.893601,0.895358
6000,0.3535,0.315745,0.89688,0.892371,0.892592,0.891694
7000,0.3471,0.320114,0.899103,0.896306,0.895466,0.897191
8000,0.3329,0.291599,0.90082,0.900619,0.899787,0.901059
9000,0.3399,0.319963,0.899468,0.895172,0.894703,0.896173
10000,0.3351,0.277374,0.905945,0.90637,0.905942,0.906555




KeyboardInterrupt: 