In [1]:
from datasets import load_dataset

# GLUE / SST-2 dataset; later cells read its "sentence" column and its
# "train" and "validation" splits.
dataset = load_dataset(path="glue", name="sst2")

In [2]:
# Load the tokenizer for the "bert-base-uncased" checkpoint (no model is loaded here).
# NOTE(review): AutoModelForCausalLM is not used in any visible cell, and the
# next cell builds the same tokenizer again from `model_name`, so this cell
# looks redundant -- confirm before removing.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [3]:


from transformers import AutoTokenizer
# Checkpoint identifier; reused by later cells for AutoConfig and the model loader.
model_name = "bert-base-uncased"
# Module-level tokenizer that tokenize_function and prefix_function read as a
# global; it is also stored on the model config further down.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the ``sentence`` column of a batch with the module-level tokenizer.

    Sequences are truncated and padded using ``padding="max_length"`` with
    ``max_length=128``. Returns whatever the tokenizer call returns.
    """
    sentences = examples["sentence"]
    encoding_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(sentences, **encoding_options)

def prefix_function(examples):
    """Attach the tokenized instruction prefix to every row of a batch.

    Adds a ``prefix_ids`` column containing one entry per row, each entry
    being the token ids of the fixed prefix sentence. The row count is taken
    from the existing ``input_ids`` column, so the batch must already have
    been tokenized. The batch is modified in place and returned.
    """
    # NOTE(review): this prefix text is not the same string as `config.prefix`
    # set in a later cell -- confirm which one the model is meant to use.
    prefix_text = 'Classify this text whether positive or negative :-> '

    row_count = len(examples['input_ids'])
    prefix_token_ids = tokenizer(prefix_text)['input_ids']

    # Every row references the same id list, exactly like list repetition.
    examples["prefix_ids"] = [prefix_token_ids for _ in range(row_count)]
    return examples

# Two batched passes over the raw dataset: tokenize the sentences first, then
# add the prefix column (prefix_function needs the `input_ids` column that the
# first pass creates).
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .map(prefix_function, batched=True)
)


In [19]:

from bertSKT import  PrefixForSequenceClassification, PromptForSequenceClassification
from transformers import AutoConfig
config = AutoConfig.from_pretrained(model_name)

# Overrides applied, in this order, on top of the checkpoint's configuration.
# NOTE(review): `n_head`, `hidden_dropout`, `pooling`, `tokenizer` and `prefix`
# are presumably read by the bertSKT model classes rather than by stock BERT --
# verify against bertSKT.
config_overrides = {
    "_name_or_path": model_name,
    "hidden_size": 768,
    "num_hidden_layers": 12,
    "n_head": 12,
    "num_labels": 2,
    "pad_token_id": tokenizer.pad_token_id,
    "hidden_dropout": 0.1,
    "model_type": 'bert',
    "pooling": True,
    "tokenizer": tokenizer,
    "prefix": 'classify the text as positive or negative, text:',
}
for attr_name, attr_value in config_overrides.items():
    setattr(config, attr_name, attr_value)

In [20]:

# Build the prefix-tuning classifier from the checkpoint using the customised config.
# NOTE(review): the recorded output of this cell warns that
# `bert.transformer.encoder.*` weights were newly initialised rather than read
# from the checkpoint -- confirm the backbone weights are actually loaded.
model = PrefixForSequenceClassification.from_pretrained(model_name, config=config)


Some weights of PrefixForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.transformer.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.transformer.encoder.layer.10.attention.self.query.weight', 'bert.transformer.encoder.layer.9.attention.self.key.bias', 'bert.transformer.encoder.layer.7.attention.self.key.bias', 'bert.transformer.encoder.layer.3.attention.self.value.weight', 'bert.transformer.encoder.layer.1.attention.self.query.weight', 'bert.transformer.encoder.layer.5.attention.self.value.bias', 'bert.transformer.encoder.layer.2.output.dense.weight', 'bert.transformer.encoder.layer.5.intermediate.dense.bias', 'bert.transformer.encoder.layer.5.attention.output.LayerNorm.bias', 'bert.transformer.encoder.layer.5.attention.output.dense.weight', 'bert.transformer.encoder.layer.4.attention.output.LayerNorm.weight', 'bert.transformer.encoder.layer.4.intermediate.dense.bias', 'bert.transformer.encoder.layer

prefix sequence length:  12


In [21]:
# Parameter budget: overall size versus the part that will receive gradients.
total_parameters = model.num_parameters()
trainable_parameters = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)

# Share of the parameters that is trainable, as a percentage.
percentage_trainable = (trainable_parameters / total_parameters) * 100

print(
    f"Total Parameters: {total_parameters}",
    f"Trainable Parameters: {trainable_parameters}",
    f"Percentage Trainable: {percentage_trainable:.20f}%",
    sep="\n",
)

Total Parameters: 112359939
Trainable Parameters: 2877699
Percentage Trainable: 2.56114325587165003739%


In [22]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import mean_squared_error
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy
from sklearn.metrics import r2_score, accuracy_score, matthews_corrcoef
import numpy as np

def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (class logits, or a tuple whose
            first element is the logits) and ``label_ids`` (gold labels).

    Returns:
        dict: ``{"acc": accuracy}`` where accuracy is the fraction of
        examples whose arg-max class equals the label.
    """
    logits = p.predictions
    # When a model returns extra outputs next to the logits, predictions
    # arrive as a tuple; the logits are assumed to be its first element.
    if isinstance(logits, tuple):
        logits = logits[0]

    preds = np.argmax(logits, axis=-1)
    labels = p.label_ids

    return {"acc": accuracy_score(labels, preds)}

# Previously tried but currently disabled options:
#   learning_rate=0.001, bf16=True, optim="paged_adamw_8bit"
training_args = TrainingArguments(
    # Output locations.
    output_dir='./rfalcon_task_prefix',
    logging_dir='./logs',

    # Training length and batch sizes.
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    # Logging, evaluation and checkpointing all run every 100 steps.
    logging_strategy="steps",
    logging_steps=100,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,

    # Keep at most two checkpoints and reload the best one when training ends.
    save_total_limit=2,
    load_best_model_at_end=True,
)

In [23]:

# Early stopping configured with a patience of 7.
early_stopping = EarlyStoppingCallback(early_stopping_patience=7)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()

Step,Training Loss,Validation Loss,Acc
100,0.6887,0.686411,0.521789
200,0.6471,0.628674,0.619266
300,0.4062,0.341113,0.861239
400,0.3612,0.310025,0.877294
500,0.3022,0.293771,0.881881
600,0.3089,0.287002,0.883028
700,0.3178,0.278796,0.888761
800,0.2968,0.286029,0.877294
900,0.2994,0.273525,0.884174
1000,0.271,0.268625,0.892202


TrainOutput(global_step=3700, training_loss=0.2867033221270587, metrics={'train_runtime': 1018.6974, 'train_samples_per_second': 661.129, 'train_steps_per_second': 20.664, 'total_flos': 8048872156568832.0, 'train_loss': 0.2867033221270587, 'epoch': 1.76})