In [1]:
from datasets import load_dataset

# Fetch the "sst2" configuration of the "glue" dataset; every later cell reads from `dataset`.
dataset = load_dataset(path="glue", name="sst2")

In [2]:
# Load the tokenizer of the BERT checkpoint (no model is instantiated in this cell).
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)


In [3]:


from transformers import AutoTokenizer
# Single source of truth for the checkpoint name; the config and model cells reuse `model_name`.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

def tokenize_function(examples, max_length=128):
    """Tokenize the ``"sentence"`` entries of a batch to a fixed length.

    Args:
        examples: Batch mapping that contains a ``"sentence"`` entry, as
            passed by ``dataset.map(..., batched=True)``.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value that used to be hard-coded here.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

def prefix_function(examples, prefix_text='Classify this text whether positive or negative :-> '):
    """Attach the token ids of a fixed instruction prefix to every row of a batch.

    Must run after ``tokenize_function``: the number of rows is taken from the
    ``"input_ids"`` entry of the batch.

    Args:
        examples: Batch mapping that already contains ``"input_ids"``.
        prefix_text: Instruction text to tokenize. Defaults to the prompt that
            used to be hard-coded here.

    Returns:
        The same ``examples`` mapping, with a new ``"prefix_ids"`` entry holding
        one copy of the prefix token ids per row.
    """
    # The prefix is identical for every row, so tokenize it once per batch.
    prefix_token_ids = tokenizer(prefix_text)['input_ids']
    row_count = len(examples['input_ids'])

    # Give each row its own list instead of repeating one shared list object.
    examples["prefix_ids"] = [list(prefix_token_ids) for _ in range(row_count)]

    return examples

# Two batched passes over the raw `dataset`:
#   1. tokenize_function tokenizes the "sentence" entries;
#   2. prefix_function adds "prefix_ids" (it reads the "input_ids" produced by step 1).
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .map(prefix_function, batched=True)
)


In [4]:

from bertSKT import  PrefixForSequenceClassification, PromptForSequenceClassification
from transformers import AutoConfig
# Start from the checkpoint's own configuration, then apply all overrides in one call.
config = AutoConfig.from_pretrained(model_name)
config.update(
    {
        # Checkpoint identity and architecture
        "_name_or_path": model_name,
        "hidden_size": 768,
        "num_hidden_layers": 12,
        "n_head": 12,
        "model_type": 'bert',
        # Classification head / regularisation
        "num_labels": 2,
        "pad_token_id": tokenizer.pad_token_id,
        "hidden_dropout": 0.1,
        "pooling": True,
        # NOTE(review): `n_head`, `hidden_dropout`, `pooling`, `tokenizer` and `prompt`
        # are not standard BERT config field names; presumably they are read by the
        # bertSKT model classes — confirm against that module.
        "tokenizer": tokenizer,
        "prompt": 'classify the text as positive or negative, text:',
    }
)

In [5]:


# Build the prompt-based classifier from the `model_name` checkpoint with the customised config.
# NOTE(review): the recorded output of this cell lists `bert.transformer.*` encoder weights
# as "newly initialized" — confirm that the pretrained weights are really being loaded.
model = PromptForSequenceClassification.from_pretrained(model_name, config=config)


Some weights of PromptForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.transformer.encoder.layer.1.attention.self.query.weight', 'bert.transformer.encoder.layer.8.attention.output.LayerNorm.weight', 'bert.transformer.encoder.layer.0.attention.output.dense.weight', 'bert.transformer.encoder.layer.1.attention.output.dense.weight', 'bert.transformer.encoder.layer.9.output.LayerNorm.weight', 'bert.transformer.encoder.layer.1.intermediate.dense.bias', 'bert.transformer.encoder.layer.1.output.dense.weight', 'bert.transformer.encoder.layer.9.attention.self.query.bias', 'bert.transformer.encoder.layer.9.attention.output.dense.weight', 'bert.transformer.pooler.dense.bias', 'bert.transformer.encoder.layer.3.output.dense.bias', 'bert.transformer.encoder.layer.11.output.dense.bias', 'bert.transformer.encoder.layer.4.attention.self.key.bias', 'bert.transformer.encoder.layer.0.attention.self.value.bias', 'bert.transformer

prompt sequence length:  12


In [6]:
# Overall size of the model, as reported by the model itself
total_parameters = model.num_parameters()

# Count only the weights that still receive gradients
trainable_parameters = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)

# Share of the model that is updated during training, expressed in percent
percentage_trainable = 100 * (trainable_parameters / total_parameters)

print(f"Total Parameters: {total_parameters}")
print(f"Trainable Parameters: {trainable_parameters}")
print(f"Percentage Trainable: {percentage_trainable:.20f}%")

Total Parameters: 109604355
Trainable Parameters: 122115
Percentage Trainable: 0.11141436852577620009%


In [10]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import mean_squared_error
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy
from sklearn.metrics import r2_score, accuracy_score, matthews_corrcoef
import numpy as np

def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: Evaluation result exposing ``predictions`` (the class logits, or a
            tuple whose first element is the class logits) and ``label_ids``.

    Returns:
        ``{"acc": accuracy}``, where accuracy is the fraction of examples whose
        arg-max class equals the label.
    """
    logits = p.predictions
    # A model that returns extra outputs besides the logits yields a tuple here;
    # the logits come first, and arg-maxing the whole tuple would fail.
    if isinstance(logits, tuple):
        logits = logits[0]

    preds = np.argmax(logits, axis=-1)
    labels = p.label_ids

    accuracy = accuracy_score(labels, preds)

    return {"acc": accuracy}

# Logging, evaluation and checkpointing all share the same 100-step cadence, so every
# evaluation has a matching checkpoint that `load_best_model_at_end` can restore.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./rfalcon_task_prompt',
    logging_dir='./logs',
    # Training length and batch sizes
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Logging
    logging_strategy="steps",
    logging_steps=100,
    # Evaluation
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=100,
    # Checkpointing: keep at most two checkpoints and reload the best one when training ends
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
)

In [11]:

# Train on the SST-2 "train" split and evaluate on "validation"; early stopping is
# configured with a patience of 7.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=7)],
)

trainer.train()

Step,Training Loss,Validation Loss,Acc
100,0.6627,0.661219,0.604358
200,0.651,0.643838,0.629587
300,0.6239,0.618876,0.674312
400,0.5984,0.580747,0.716743
500,0.5675,0.542817,0.764908
600,0.5429,0.517421,0.772936
700,0.5159,0.481306,0.795872
800,0.4659,0.458912,0.795872
900,0.4642,0.445851,0.799312
1000,0.4318,0.414131,0.815367


TrainOutput(global_step=4500, training_loss=0.3769823650783963, metrics={'train_runtime': 1233.413, 'train_samples_per_second': 546.038, 'train_steps_per_second': 17.066, 'total_flos': 9483883696470528.0, 'train_loss': 0.3769823650783963, 'epoch': 2.14})