Google Colab Setup

In [None]:
%pip install --upgrade \
    transformers \
    datasets \
    peft \
    accelerate \
    evaluate \
    fsspec \
    huggingface_hub

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec
  Downloading fssp

In [None]:
!rm -rf ~/.cache/huggingface/datasets
!rm -rf ~/.cache/huggingface/hub

LoRA fine-tuning of DistilBERT on GLUE SST-2 (self-implemented version)

In [None]:
import math
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    default_data_collator,
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score

# LoRA Linear Layer
class LoRALinear(nn.Module):
    """Linear layer with an optional additive low-rank (LoRA) branch.

    The layer computes ``x @ weight.T + bias`` and, when ``r > 0``, adds
    ``scale * (dropout(x) @ A.T @ B.T)`` where ``scale = alpha / r``.

    Args:
        in_features: Size of the last input dimension.
        out_features: Size of the last output dimension.
        r: Rank of the update. ``r <= 0`` disables the low-rank branch.
        alpha: Numerator of the scaling factor applied to the update.
        dropout: Dropout probability applied to the input of the low-rank
            branch only (the base projection sees the raw input).
    """

    def __init__(self, in_features, out_features, r=8, alpha=16, dropout=0.1):
        super().__init__()
        has_adapter = r > 0

        self.r = r
        # Despite its name, this attribute stores the already-divided scale.
        self.alpha = (alpha / r) if has_adapter else 1.0
        self.dropout = nn.Dropout(dropout)

        # Base projection; callers normally overwrite these with pretrained values.
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.zeros(out_features))

        if not has_adapter:
            self.A = None
            self.B = None
            return

        # A gets a Kaiming-uniform init and B starts at zero, so the low-rank
        # branch contributes nothing until B has been updated.
        down_proj = torch.empty(r, in_features)
        nn.init.kaiming_uniform_(down_proj, a=math.sqrt(5))
        self.A = nn.Parameter(down_proj)
        self.B = nn.Parameter(torch.zeros(out_features, r))

    def forward(self, x):
        """Return the base projection plus the scaled low-rank update (if any)."""
        out = nn.functional.linear(x, self.weight, self.bias)
        if self.r <= 0:
            return out
        hidden = torch.matmul(self.dropout(x), self.A.T)
        update = torch.matmul(hidden, self.B.T)
        return out + self.alpha * update

# Replace Linear Layers with LoRA
def replace_linear_with_lora(model, r=8, alpha=16, dropout=0.1, target_modules=("q_lin", "v_lin")):
    """Swap selected ``nn.Linear`` submodules of ``model`` for ``LoRALinear``, in place.

    Args:
        model: Module tree to modify.
        r: LoRA rank passed to every ``LoRALinear`` that is created.
        alpha: LoRA alpha passed to every ``LoRALinear`` that is created.
        dropout: Dropout probability for the low-rank branch.
        target_modules: Substrings to look for in a module's dotted name; a
            Linear layer is replaced when any of them occurs in its name. A
            single string is treated as a one-element collection.

    Returns:
        The same ``model`` object, with the matching layers replaced.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(target_modules, str):
        target_modules = (target_modules,)

    # Snapshot the tree first: the loop below rewrites the modules it walks over.
    for name, module in list(model.named_modules()):
        if not isinstance(module, nn.Linear):
            continue
        if not any(t in name for t in target_modules):
            continue

        lora = LoRALinear(
            in_features=module.in_features,
            out_features=module.out_features,
            r=r, alpha=alpha, dropout=dropout
        )
        lora.weight.data = module.weight.data.clone()
        # Linear(bias=False) has no bias tensor; LoRALinear's zero bias is then
        # left as is, which adds nothing to the output.
        if module.bias is not None:
            lora.bias.data = module.bias.data.clone()
        # Keep the new adapter matrices on the same device/dtype as the layer
        # they wrap, so this also works on a model that was already moved.
        lora.to(device=module.weight.device, dtype=module.weight.dtype)

        parent = model
        for attr in name.split('.')[:-1]:
            parent = getattr(parent, attr)
        setattr(parent, name.split('.')[-1], lora)
    return model

# Load dataset and tokenizer
dataset = load_dataset("glue", "sst2")
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def preprocess_function(example):
    """Tokenize the "sentence" field, truncating/padding every row to 128 tokens.

    Called by ``dataset.map(..., batched=True)`` below, so ``example`` is a
    batch of rows rather than a single one.
    """
    sentences = example["sentence"]
    return tokenizer(sentences, truncation=True, padding="max_length", max_length=128)

# Tokenize every split, then expose only the model inputs and label as tensors.
dataset = dataset.map(preprocess_function, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Only the training loader shuffles; validation order stays fixed.
train_loader = DataLoader(
    dataset["train"],
    batch_size=16,
    shuffle=True,
    collate_fn=default_data_collator,
)
eval_loader = DataLoader(
    dataset["validation"],
    batch_size=16,
    collate_fn=default_data_collator,
)

# Load model and apply LoRA
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
model = replace_linear_with_lora(model, r=8, alpha=16, dropout=0.1, target_modules=["q_lin", "v_lin"])

# Freeze base model
for param in model.parameters():
    param.requires_grad = False
# Re-enable gradients on the adapter matrices only. They are selected by module
# type rather than by checking whether the parameter name contains "A" or "B":
# that substring test would also unfreeze any parameter whose dotted path merely
# has a capital A or B in it (for example a "BatchNorm" submodule).
for module in model.modules():
    if isinstance(module, LoRALinear) and module.r > 0:
        module.A.requires_grad = True
        module.B.requires_grad = True
# NOTE(review): `pre_classifier` / `classifier` stay frozen at their fresh
# initialisation (see the from_pretrained warning). Confirm that training only
# the adapters, and not the head, is intended.

# Print trainable parameter ratio
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable} / {total} ({trainable / total * 100:.4f}%)")

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training
# Only parameters that still require gradients are handed to the optimizer.
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad], lr=2e-4, weight_decay=0.01
)
for epoch in range(3):
    print(f"\n[Epoch {epoch+1}] Training...")
    # Set train mode every epoch so dropout stays active even if an evaluation
    # pass is later interleaved between epochs.
    model.train()
    total_loss = 0.0
    num_steps = 0
    for step, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_steps += 1
        if (step + 1) % 100 == 0:
            print(f"Step {step+1} | Loss: {loss.item():.4f}")
    # Report the mean batch loss: the raw sum grows with the number of batches
    # and cannot be compared with the per-step values printed above.
    mean_loss = total_loss / max(num_steps, 1)
    print(f"[Epoch {epoch+1}] Mean loss: {mean_loss:.4f}\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable parameters: 147456 / 67102466 (0.2197%)

[Epoch 1] Training...
Step 100 | Loss: 0.6824
Step 200 | Loss: 0.1883
Step 300 | Loss: 0.3259
Step 400 | Loss: 0.3348
Step 500 | Loss: 0.4017
Step 600 | Loss: 0.2773
Step 700 | Loss: 0.3262
Step 800 | Loss: 0.2714
Step 900 | Loss: 0.2562
Step 1000 | Loss: 0.1994
Step 1100 | Loss: 0.1346
Step 1200 | Loss: 0.4310
Step 1300 | Loss: 0.3402
Step 1400 | Loss: 0.3141
Step 1500 | Loss: 0.1071
Step 1600 | Loss: 0.4481
Step 1700 | Loss: 0.1882
Step 1800 | Loss: 0.7653
Step 1900 | Loss: 0.1354
Step 2000 | Loss: 0.2645
Step 2100 | Loss: 0.2242
Step 2200 | Loss: 0.2963
Step 2300 | Loss: 0.1615
Step 2400 | Loss: 0.3994
Step 2500 | Loss: 0.4376
Step 2600 | Loss: 0.3895
Step 2700 | Loss: 0.1583
Step 2800 | Loss: 0.0865
Step 2900 | Loss: 0.5981
Step 3000 | Loss: 0.4097
Step 3100 | Loss: 0.3721
Step 3200 | Loss: 0.6079
Step 3300 | Loss: 0.4920
Step 3400 | Loss: 0.1231
Step 3500 | Loss: 0.3236
Step 3600 | Loss: 0.2936
Step 3700 | Loss: 0.2702
Step 3800 |

In [None]:
# Evaluation
model.eval()
all_preds, all_labels = [], []
# One no_grad scope for the whole pass: nothing in this loop needs gradients.
with torch.no_grad():
    for batch in eval_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(**batch).logits
        all_preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
        # Accept either key name for the targets, as the original lookup did.
        labels = batch.get("label", batch.get("labels"))
        all_labels.extend(labels.cpu().tolist())

acc = accuracy_score(all_labels, all_preds)
print(f"\nValidation Accuracy: {acc * 100:.2f}%")

# Save
# exist_ok=True replaces the check-then-create pair and is safe to re-run.
os.makedirs("./model", exist_ok=True)
# The full state dict is stored (base weights, adapters and classification head),
# not just the adapter matrices.
torch.save(model.state_dict(), "./model/lora_replay_distilbert_sst2.pt")
tokenizer.save_pretrained("./model/lora_replay_distilbert_sst2")
print("[\u2713] Model weights and tokenizer saved.")


Validation Accuracy: 88.76%
[✓] Model weights and tokenizer saved.


Predict

In [None]:
import os
import torch
import torch.nn as nn
import math
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

class LoRALinear(nn.Module):
    """Linear layer with an optional additive low-rank (LoRA) branch.

    Computes ``x @ weight.T + bias`` and, when ``r > 0``, adds
    ``(alpha / r) * (dropout(x) @ A.T @ B.T)``. Parameter names
    (``weight``, ``bias``, ``A``, ``B``) match the training cell's class, so
    its saved state dict loads into this one.

    Args:
        in_features: Size of the last input dimension.
        out_features: Size of the last output dimension.
        r: Rank of the update. ``r <= 0`` disables the low-rank branch.
        alpha: Numerator of the scaling factor applied to the update.
        dropout: Dropout probability applied to the input of the low-rank branch.
    """

    def __init__(self, in_features, out_features, r=8, alpha=16, dropout=0.1):
        super().__init__()
        self.r = r
        # Guard the division so r=0 builds a plain linear layer instead of
        # raising ZeroDivisionError, as the training-cell class does.
        self.alpha = alpha / r if r > 0 else 1.0
        self.dropout = nn.Dropout(dropout)
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.zeros(out_features))

        if r > 0:
            self.A = nn.Parameter(torch.empty(r, in_features))
            self.B = nn.Parameter(torch.empty(out_features, r))
            nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))
            # B starts at zero, so an untrained adapter leaves the output unchanged.
            nn.init.zeros_(self.B)
        else:
            self.A = None
            self.B = None

    def forward(self, x):
        """Return the base projection plus the scaled low-rank update (if any)."""
        base = nn.functional.linear(x, self.weight, self.bias)
        if self.r > 0:
            lora = self.dropout(x) @ self.A.T @ self.B.T
            return base + self.alpha * lora
        return base

def replace_linear_with_lora(model, r=8, alpha=16, dropout=0.1, target_modules=("q_lin", "v_lin")):
    """Swap selected ``nn.Linear`` submodules of ``model`` for ``LoRALinear``, in place.

    The default selection matches the layers the training cell wrapped
    ("q_lin" and "v_lin"), so the rebuilt model has the same parameter names
    as the saved checkpoint.

    Args:
        model: Module tree to modify.
        r: LoRA rank passed to every ``LoRALinear`` that is created.
        alpha: LoRA alpha passed to every ``LoRALinear`` that is created.
        dropout: Dropout probability for the low-rank branch.
        target_modules: Substrings to look for in a module's dotted name; a
            Linear layer is replaced when any of them occurs in its name. A
            single string is treated as a one-element collection. Pass
            ``None`` to wrap every Linear layer whose name does not contain
            "classifier" (the previous behaviour of this function).

    Returns:
        The same ``model`` object, with the matching layers replaced.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(target_modules, str):
        target_modules = (target_modules,)

    # Snapshot the tree first: the loop below rewrites the modules it walks over.
    for name, module in list(model.named_modules()):
        if not isinstance(module, nn.Linear):
            continue
        if target_modules is None:
            if 'classifier' in name:
                continue
        elif not any(t in name for t in target_modules):
            continue

        lora_layer = LoRALinear(
            in_features=module.in_features,
            out_features=module.out_features,
            r=r, alpha=alpha, dropout=dropout
        )
        lora_layer.weight.data = module.weight.data.clone()
        # Linear(bias=False) has no bias tensor; the zero bias is then kept.
        if module.bias is not None:
            lora_layer.bias.data = module.bias.data.clone()
        # Keep the adapter matrices on the same device/dtype as the wrapped layer.
        lora_layer.to(device=module.weight.device, dtype=module.weight.dtype)

        parent = model
        for attr in name.split('.')[:-1]:
            parent = getattr(parent, attr)
        setattr(parent, name.split('.')[-1], lora_layer)
    return model

def load_lora_model(model_dir, device="cpu"):
    """Rebuild the LoRA-wrapped DistilBERT classifier and load its saved weights.

    Args:
        model_dir: Directory the tokenizer was saved to. The weights are read
            from the sibling file ``<model_dir>.pt``, which is where the
            training cell writes them.
        device: Device to map the checkpoint to and to move the model onto.

    Returns:
        The model in eval mode on ``device``.

    Raises:
        RuntimeError: If the checkpoint has keys the model does not know, or
            lacks any non-adapter weight the model needs.
    """
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    model = replace_linear_with_lora(model, r=8, alpha=16, dropout=0.1)

    # Derive the path from model_dir instead of a hard-coded absolute location,
    # so the argument is honoured and the notebook is not tied to /content.
    weights_path = os.path.normpath(model_dir) + ".pt"
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state dict holds.
    state_dict = torch.load(weights_path, map_location=device, weights_only=True)

    # strict=False is kept, but the outcome is checked instead of ignored.
    result = model.load_state_dict(state_dict, strict=False)
    if result.unexpected_keys:
        raise RuntimeError(
            f"Checkpoint {weights_path} has keys the model does not define: "
            f"{list(result.unexpected_keys)}"
        )
    # A missing adapter matrix is harmless (B is initialised to zero, so that
    # adapter adds nothing); a missing base weight would be silently random.
    missing_base = [k for k in result.missing_keys if k.rsplit(".", 1)[-1] not in ("A", "B")]
    if missing_base:
        raise RuntimeError(
            f"Checkpoint {weights_path} is missing required weights: {missing_base}"
        )

    model.to(device)
    model.eval()
    return model

def predict(sentence, model_dir="./model/lora_replay_distilbert_sst2", device="cpu"):
    """Classify one sentence, print the result and return the predicted label.

    The tokenizer and model are reloaded on every call.

    Args:
        sentence: A single input sentence (the prediction is read with
            ``.item()``, so exactly one example is expected).
        model_dir: Directory holding the saved tokenizer; also passed to
            ``load_lora_model``.
        device: Device to run inference on.

    Returns:
        "Negative" or "Positive".
    """
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_dir)
    model = load_lora_model(model_dir, device)

    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    pred_id = torch.argmax(outputs.logits, dim=-1).item()

    label_map = {0: "Negative", 1: "Positive"}
    label = label_map[pred_id]
    print(f"Input: {sentence}")
    print(f"Prediction: {label}")
    # Returned as well as printed so callers can use the result programmatically.
    return label

if __name__ == "__main__":
    # Smoke test: classify one sentence, on the GPU when one is available.
    test_sentence = "I love you."
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    predict(test_sentence, device=device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  state_dict = torch.load(weights_path, map_location=device)


Input: I love you.
Prediction: Positive


Download

In [None]:
# Bundle the saved weights file and tokenizer directory under /content/model
# into a single archive in the current working directory.
!zip -r lora_replay_distilbert_sst2.zip /content/model

  adding: content/model/ (stored 0%)
  adding: content/model/lora_replay_distilbert_sst2.pt (deflated 8%)
  adding: content/model/lora_replay_distilbert_sst2/ (stored 0%)
  adding: content/model/lora_replay_distilbert_sst2/tokenizer.json (deflated 71%)
  adding: content/model/lora_replay_distilbert_sst2/vocab.txt (deflated 53%)
  adding: content/model/lora_replay_distilbert_sst2/tokenizer_config.json (deflated 75%)
  adding: content/model/lora_replay_distilbert_sst2/special_tokens_map.json (deflated 42%)


In [None]:
# Colab-only: ask the Colab frontend to download the zip archive to the local machine.
from google.colab import files
files.download('lora_replay_distilbert_sst2.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>