### Finetuning BERT

For VSCode (install the CUDA 12.6 build of PyTorch):\
`pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126`

In [2]:
# %pip install --upgrade transformers datasets accelerate deepspeed

import os
os.environ["WANDB_DISABLED"] = "true"

import torch
import transformers
import datasets
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments

### Load data and model

In [3]:
# Download (or load from the local cache) the Quora Question Pairs dataset.
qqp = datasets.load_dataset('SetFit/qqp')
print('\n')

# Show one "not duplicate" and one "duplicate" training example.
for sample_idx in (0, 3):
    print(f"Sample[{sample_idx}]:", qqp['train'][sample_idx])

Repo card metadata block was not found. Setting CardData to empty.




Sample[0]: {'text1': 'How is the life of a math student? Could you describe your own experiences?', 'text2': 'Which level of prepration is enough for the exam jlpt5?', 'label': 0, 'idx': 0, 'label_text': 'not duplicate'}
Sample[3]: {'text1': 'What can one do after MBBS?', 'text2': 'What do i do after my MBBS ?', 'label': 1, 'idx': 3, 'label_text': 'duplicate'}


In [4]:
# Checkpoint that has already been fine-tuned on QQP; tokenizer and model must come from the same name.
model_name = "gchhablani/bert-base-cased-finetuned-qqp"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

### Tokenize the data

In [5]:
MAX_LENGTH = 128


def preprocess_function(examples):
    """Tokenize a batch of question pairs into fixed-length model inputs.

    Args:
        examples: A batch from the dataset with 'text1', 'text2' and 'label' columns.

    Returns:
        The tokenizer output for the (text1, text2) pairs, padded/truncated to
        MAX_LENGTH tokens, with the batch's 'label' column copied in.
    """
    encoded = tokenizer(
        examples['text1'],
        examples['text2'],
        truncation=True,
        max_length=MAX_LENGTH,
        padding='max_length',
    )
    encoded['label'] = examples['label']
    return encoded


# Apply the tokenization to every split, processing examples in batches.
qqp_preprocessed = qqp.map(preprocess_function, batched=True)

In [6]:
# Print only the first 100 characters of the token-id list to keep the output short.
first_ids_repr = repr(qqp_preprocessed['train'][0]['input_ids'])
print(first_ids_repr[:100], "...")

[101, 1731, 1110, 1103, 1297, 1104, 170, 12523, 2377, 136, 7426, 1128, 5594, 1240, 1319, 5758, 136,  ...


### Evaluation

Let's take a quick look at our data first.

In [7]:
# One example per batch, in dataset order: enough for a quick look at a single sample.
val_set = qqp_preprocessed['validation']
val_loader = torch.utils.data.DataLoader(
    val_set,
    collate_fn=transformers.default_data_collator,
    shuffle=False,
    batch_size=1,
)

In [8]:
# Take just the first batch from the loader to see what the collator produces.
batch = next(iter(val_loader))
print("Sample batch:", batch)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    predicted = model(
        input_ids=batch['input_ids'],
        attention_mask=batch['attention_mask'],
        token_type_ids=batch['token_type_ids'],
    )

# Turn the logits into class probabilities; detach() replaces the legacy `.data` access.
class_probs = torch.softmax(predicted.logits, dim=1)
print('\nPrediction (probs):', class_probs.detach().numpy())

Sample batch: {'labels': tensor([0]), 'idx': tensor([0]), 'input_ids': tensor([[  101,  2009,  1132,  2170,   118,  4038,  1177,  2712,   136,   102,
          2009,  1132,  1117, 10224,  4724,  1177,  2712,   136,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,   

In [9]:
import multiprocessing

from tqdm import tqdm

# CPU count reported by the OS; used below as the DataLoader worker count.
cores = multiprocessing.cpu_count()
cores

32

In [10]:
# Run on the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Rebuild the validation loader for a full evaluation pass.
val_set = qqp_preprocessed['validation']
val_loader = torch.utils.data.DataLoader(
    val_set,
    batch_size=64,  # Larger batches give faster processing
    shuffle=False,
    collate_fn=transformers.default_data_collator,
    num_workers=cores,  # Load and collate batches in parallel worker processes
    # Pinned host memory only helps host-to-GPU copies; on a CPU-only machine
    # it is useless and makes the DataLoader emit a warning, so enable it
    # only when the target device is CUDA.
    pin_memory=(device.type == "cuda"),
)

In [11]:
# Measure validation accuracy
model.eval()  # Set model to evaluation mode

correct = 0
total = 0

# Mixed precision is enabled only on CUDA. No GradScaler is involved: loss
# scaling matters only for backward passes, and this loop never computes
# gradients.
use_amp = device.type == "cuda"

with torch.no_grad():  # Disable gradient calculation
    for batch in tqdm(val_loader, desc="Evaluating"):
        # Move the batch to the target device
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        token_type_ids = batch['token_type_ids'].to(device, non_blocking=True)
        labels = batch['labels'].to(device, non_blocking=True)

        # torch.autocast replaces the deprecated torch.cuda.amp.autocast();
        # with enabled=False it is a no-op, so one code path serves both devices.
        with torch.autocast(device_type=device.type, enabled=use_amp):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )

        # Get predictions and update the running counts
        preds = torch.argmax(outputs.logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

# Compute accuracy
accuracy = correct / total  # Validation accuracy, between 0 and 1
print(f"Validation Accuracy: {accuracy:.4f}")

  with torch.cuda.amp.autocast():
Evaluating: 100%|██████████| 632/632 [00:55<00:00, 11.32it/s]

Validation Accuracy: 0.9084





In [12]:
# Sanity check: the QQP-finetuned BERT checkpoint should land strictly inside this accuracy window.
lower_bound, upper_bound = 0.9, 0.91
assert lower_bound < accuracy < upper_bound

### Train the model 

In [13]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import numpy as np

In [14]:
# Load the `microsoft/deberta-base` tokenizer and model, with a two-class
# classification head (binary task: duplicate / not duplicate).
model_name = "microsoft/deberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# The tokenizer now comes from a different checkpoint than the one used above,
# so the raw dataset has to be tokenized again.
qqp = load_dataset("SetFit/qqp")


def preprocess_function(examples):
    """Tokenize a batch of question pairs for the new model.

    Args:
        examples: A batch from the dataset with "text1" and "text2" columns.

    Returns:
        The tokenizer output for the (text1, text2) pairs, truncated to at most
        128 tokens and left unpadded (padding=False).
    """
    return tokenizer(
        examples["text1"],
        examples["text2"],
        max_length=128,
        truncation=True,
        padding=False,
    )


# Tokenize every split, drop the columns that are no longer needed, and rename
# the target column from "label" to "labels".
qqp_preprocessed = qqp.map(
    preprocess_function,
    batched=True,
    remove_columns=["text1", "text2", "label_text", "idx"],
).rename_column("label", "labels")

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Repo card metadata block was not found. Setting CardData to empty.


In [15]:
# Pick the splits used for fine-tuning and for per-epoch evaluation.
train_set, val_set = qqp_preprocessed['train'], qqp_preprocessed['validation']


# Metric used for evaluation. You can write your own if you prefer.
from sklearn.metrics import accuracy_score


# Utility in the form expected by transformers.Trainer's `compute_metrics` hook.
def compute_metrics(eval_pred):
    """
    Turn raw model outputs into an accuracy score.

    Args:
        eval_pred (tuple): A (logits, labels) pair, unpacked in that order.
            The predicted class of each sample is the argmax of the logits
            over the last axis.

    Returns:
        dict: {"accuracy": value}, where value is what
        sklearn.metrics.accuracy_score returns for the labels and the
        predicted classes.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return {"accuracy": accuracy_score(references, predicted_classes)}

# Feel free not to use transformers.Trainer and write the code manually if you want.
# Hyperparameter guidance:
#   - A good starting learning rate is 2e-5; adjust it in steps of an order of
#     magnitude if necessary (e.g. 2e-4, 2e-3).
#   - A few epochs are enough for gently fine-tuning the model without it
#     'forgetting previous data'. This run uses 2.
#   - Use weight_decay (i.e. regularisation); 1e-2 is a good starting point.
#   - Accuracy is used as the metric for selecting the best checkpoint.

# No 'device' argument is passed: the Trainer handles device placement automatically.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",
    save_strategy="epoch",  # Must match eval_strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
    fp16=False,
)

# Initialize the Trainer. The tokenizer is passed as `processing_class`, because
# the `tokenizer=` keyword of Trainer is deprecated and emits a FutureWarning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Evaluate the model (the best checkpoint, since load_best_model_at_end=True)
eval_results = trainer.evaluate()
accuracy = eval_results["eval_accuracy"]
print(f"Validation Accuracy: {accuracy:.4f}")

# Save the final model to disk
trainer.save_model("./best_deberta_qqp")

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2406,0.235258,0.904551
2,0.1616,0.225324,0.912862


Validation Accuracy: 0.9129


In [16]:
# The fine-tuned model must clear 90% validation accuracy.
assert accuracy > 0.9

To be completely honest, we committed a small crime here. The validation part of the dataset is intended for tuning the hyperparameters, but for the sake of simplicity we omitted that logic here. You are free to pick the best hyperparameters and test the results on the `test` subsample if you wish.