In [None]:
pip install datasets

In [None]:
pip install evaluate

In [None]:
# Mount Google Drive at /content/drive. The train/eval CSV paths configured in
# this notebook point into that mount, so this cell has to run before loading data.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
from transformers import DistilBertModel
from datasets import load_dataset

In [None]:
import os
# Set TOKENIZERS_PARALLELISM to 'false' for this process.
# NOTE(review): presumably this is here to silence the Hugging Face tokenizers
# parallelism/fork warning — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [None]:
# CSV files for the training and evaluation splits, located on the mounted Drive.
# NOTE(review): absolute, user-specific paths — adjust when running elsewhere.
train_file = '/content/drive/MyDrive/pa4/train.csv'
val_file = '/content/drive/MyDrive/pa4/eval.csv'

In [None]:
# Step 1: Full fine-tuning

In [None]:
# NOTE(review): this float16 encoder is never used — `model` is rebound to an
# AutoModelForSequenceClassification instance further down before anything
# reads it. Consider deleting this cell to avoid the extra load.
model = DistilBertModel.from_pretrained("distilbert-base-uncased", torch_dtype=torch.float16, attn_implementation="sdpa")

In [None]:
# Read both CSV files into a single dataset object with 'train' and 'eval' splits.
imdb_dataset = load_dataset(
    'csv',
    data_files={
        'train': train_file,
        'eval': val_file,
    },
)

In [None]:
# Load the tokenizer that belongs to the "distilbert-base-uncased" checkpoint.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def tokenize_helper(batch):
    """Tokenize the ``review`` column of one batch handed over by ``Dataset.map``.

    Every sequence is truncated and padded to the tokenizer's maximum length.
    ``padding=True`` would only pad to the longest review inside the current
    map batch, so rows coming from different map batches could end up with
    different lengths. The Trainer instances in this notebook are created
    without a padding data collator, so all rows must share one length to be
    stacked into a training batch.

    Args:
        batch: Mapping whose ``'review'`` entry holds the texts of the batch.

    Returns:
        The tokenizer output for the batch, with fixed-length encodings.
    """
    return tokenizer(batch['review'], padding='max_length', truncation=True)


tokenized_imdb_dataset = imdb_dataset.map(tokenize_helper, batched=True)

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Step 1 model: DistilBERT with a 2-label (binary) sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
print(model)

In [None]:
# Counting the number of trainable parameters
def count_trainable_parameters(model):
    """Return the total number of elements in parameters with ``requires_grad`` set.

    Args:
        model: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).

    Returns:
        int: Sum of ``numel()`` over all trainable parameters; 0 if there are none.
    """
    total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            total += tensor.numel()
    return total

# Report the trainable-parameter count of the Step 1 (full fine-tuning) model.
num_trainable_params = count_trainable_parameters(model)
print(f"Number of trainable parameters: {num_trainable_params}")


In [None]:
# Number of training epochs; the same value is passed to all three training runs.
epoch_num = 7

In [None]:
from transformers import TrainingArguments
# Step 1 training configuration: evaluate once per epoch, write outputs to ./results.
training_args = TrainingArguments(
    run_name='fine_tuned_distilbert',
    output_dir='./results',
    eval_strategy='epoch',
    num_train_epochs=epoch_num,
)
training_args

In [None]:
# Preparing for training
import evaluate

accuracy_scorer = evaluate.load('accuracy')


def evaluation_helper(eval_pred):
    """Compute accuracy for a ``(logits, labels)`` pair.

    The predicted class of each example is the argmax over the last axis of
    the logits; it is compared against the reference labels by
    ``accuracy_scorer``.

    Args:
        eval_pred: Two-element iterable ``(logits, labels)``.

    Returns:
        Whatever ``accuracy_scorer.compute`` returns for these predictions.
    """
    scores, references = eval_pred
    predicted_classes = scores.argmax(axis=-1)
    return accuracy_scorer.compute(
        references=references,
        predictions=predicted_classes,
    )

In [None]:
# Training the model
from transformers import Trainer
# Trainer for Step 1: trains `model` on the tokenized train split and reports
# accuracy on the tokenized eval split.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=evaluation_helper,
    eval_dataset=tokenized_imdb_dataset['eval'],
    train_dataset=tokenized_imdb_dataset['train'],
)

In [None]:

# Run the Step 1 (full fine-tuning) training loop.
trainer.train()

In [None]:
# Save the Step 1 model under ./fine_tuned_distilbert.
trainer.save_model("./fine_tuned_distilbert")

In [None]:
# Step 2: Tuning the final layers only

In [None]:
# Step 2 model: a fresh DistilBERT with a 2-label (binary) classification head.
model_freeze = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
print(model_freeze)

In [None]:
# Train the classification head only: a parameter stays trainable exactly when
# its name mentions the pre-classifier or classifier layer; all others are frozen.
for name, param in model_freeze.named_parameters():
    belongs_to_head = "pre_classifier" in name or "classifier" in name
    param.requires_grad = belongs_to_head


In [None]:
# Report the trainable-parameter count of the Step 2 model after freezing.
num_trainable_params_freeze = count_trainable_parameters(model_freeze)
print(f"Number of trainable parameters: {num_trainable_params_freeze}")

In [None]:
# Step 2 training configuration: same schedule as Step 1, separate output directory.
training_args_freeze = TrainingArguments(
    run_name='fine_tuned_distilbert_freeze',
    output_dir='./results_freeze_layers',
    eval_strategy='epoch',
    num_train_epochs=epoch_num,
)

# Trainer for the head-only model; data splits and metric match Step 1.
trainer_freeze = Trainer(
    args=training_args_freeze,
    model=model_freeze,
    compute_metrics=evaluation_helper,
    eval_dataset=tokenized_imdb_dataset['eval'],
    train_dataset=tokenized_imdb_dataset['train'],
)

In [None]:
# Run the Step 2 (head-only) training loop.
trainer_freeze.train()

In [None]:
# Save the Step 2 model under ./fine_tuned_distilbert_freeze.
trainer_freeze.save_model("./fine_tuned_distilbert_freeze")

In [None]:
# Step 3: Fine-tuning with LoRA

In [None]:
def extract_qv_layers(model):
    """Collect the query and value projection modules of every transformer block.

    Args:
        model: Module exposing ``distilbert.transformer.layer`` (a sized
            container of blocks), where each block has ``attention.q_lin``
            and ``attention.v_lin`` submodules.

    Returns:
        dict: Maps each dotted submodule path (relative to ``model``) to the
        module found there, ordered block by block with ``q_lin`` before
        ``v_lin``.
    """
    collected = {}
    block_count = len(model.distilbert.transformer.layer)
    for block_idx in range(block_count):
        attention_path = f'distilbert.transformer.layer.{block_idx}.attention'
        for projection in ('q_lin', 'v_lin'):
            full_path = f'{attention_path}.{projection}'
            collected[full_path] = model.get_submodule(full_path)
    return collected

In [None]:
# Step 3 base model; its query/value projections are collected for LoRA wrapping.
model_LoRA = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
qv_layers = extract_qv_layers(model_LoRA)

print("Extracted Q and V layers:", list(qv_layers))
print("The number of layers:", len(qv_layers))

In [None]:
def replace_layers(model, named_layers):
    """Assign each replacement object to its dotted attribute path on ``model``.

    Args:
        model: Root object; modified in place.
        named_layers: Mapping from dotted attribute path (relative to
            ``model``) to the object that should be stored at that path.
    """
    for dotted_path, replacement in named_layers.items():
        *parent_attrs, leaf_attr = dotted_path.split('.')
        # Walk down to the object that owns the final attribute.
        owner = model
        for attr in parent_attrs:
            owner = getattr(owner, attr)
        setattr(owner, leaf_attr, replacement)

In [None]:
import torch
import torch.nn as nn

# Implementing the LoRA layer
class LoRALayer(nn.Module):
    """Wrap a linear layer with an additive low-rank (LoRA) update.

    The forward pass returns ``W(x) + (alpha / r) * (x @ A.T) @ B.T``. ``B``
    starts at zero, so immediately after construction the wrapped layer's
    output is unchanged.

    This class does not freeze ``W``; the caller decides which parameters
    are trainable.

    Args:
        W: The linear layer to wrap. Must expose ``in_features``,
            ``out_features`` and ``weight``.
        r: Rank of the update (inner dimension shared by ``A`` and ``B``).
        alpha: Numerator of the scaling factor ``alpha / r``.
    """

    def __init__(self, W, r, alpha):
        super().__init__()
        self.W = W          # The original linear layer
        self.r = r          # Rank of the low-rank update
        self.alpha = alpha
        self.scaling = alpha / r  # Scaling factor applied to the update

        # Allocate the factors on the wrapped weight's device and dtype so the
        # layer also works when W has already been moved or cast.
        factory_kwargs = {"device": W.weight.device, "dtype": W.weight.dtype}
        self.A = nn.Parameter(torch.empty((r, W.in_features), **factory_kwargs))   # A: r x in_features
        self.B = nn.Parameter(torch.empty((W.out_features, r), **factory_kwargs))  # B: out_features x r

        # A gets small random values and B zeros, so B @ A is zero at the start.
        nn.init.normal_(self.A, mean=0.0, std=0.02)
        nn.init.zeros_(self.B)

    def forward(self, x):
        """Apply the wrapped layer and add the scaled low-rank update.

        Args:
            x: Tensor of shape ``(..., in_features)``. Any number of leading
                dimensions is accepted, and the tensor need not be contiguous.

        Returns:
            Tensor of shape ``(..., out_features)``.
        """
        # Right-multiplying by the transposed factors acts on the last
        # dimension only, so no flattening/reshaping of x is required.
        lora_update = (x @ self.A.T) @ self.B.T
        return self.W(x) + self.scaling * lora_update


In [None]:
rank = 64    # LoRA rank r (inner dimension of the A/B factors)
alpha = 32   # LoRA alpha; each layer scales its update by alpha / rank

# Build one LoRA wrapper per extracted projection, keyed by the same dotted path.
lora_layers = dict(
    (path, LoRALayer(module, r=rank, alpha=alpha))
    for path, module in qv_layers.items()
)


In [None]:
# Display the mapping of layer path -> LoRA wrapper for inspection.
lora_layers

In [None]:
replace_layers(model_LoRA, lora_layers)
print("Replaced original layers with LoRA layers.")

# Freeze every parameter except the LoRA factors. The match is on the last
# component of the qualified parameter name (exactly "A" or "B"), not a
# substring test, so a parameter whose path merely contains a capital "A" or
# "B" somewhere is not left trainable by accident.
# NOTE(review): this also freezes `pre_classifier` / `classifier`, which Step 2
# treats as the trainable head — confirm that training only A/B is intended.
for name, param in model_LoRA.named_parameters():
    if name.rsplit('.', 1)[-1] not in ("A", "B"):
        param.requires_grad = False


In [None]:
# Report the trainable-parameter count of the Step 3 model after freezing.
num_trainable_params_LoRA = count_trainable_parameters(model_LoRA)
print(f"Number of trainable parameters with LoRA: {num_trainable_params_LoRA}")

In [None]:
# Step 3 training configuration: same schedule as the other runs, separate output directory.
training_args_lora = TrainingArguments(
    run_name='fine_tuned_distilbert_lora',
    output_dir='./results_lora',
    eval_strategy='epoch',
    num_train_epochs=epoch_num,
)

# Trainer for the LoRA-wrapped model; data splits and metric match the other runs.
trainer_lora = Trainer(
    args=training_args_lora,
    model=model_LoRA,
    compute_metrics=evaluation_helper,
    eval_dataset=tokenized_imdb_dataset['eval'],
    train_dataset=tokenized_imdb_dataset['train'],
)

In [None]:
# Run the Step 3 (LoRA) training loop.
trainer_lora.train()

In [None]:
# Save the Step 3 model under ./fine_tuned_distilbert_lora.
trainer_lora.save_model("./fine_tuned_distilbert_lora")

## Summary
*   **Full Fine-Tuning** (Step 1) achieves the best accuracy but is computationally expensive.
*   **Tuning Final Layers Only** (Step 2) is the fastest but achieves lower accuracy, making it suitable for quick prototyping.
*   **Fine-Tuning with LoRA** (Step 3) strikes a balance: it achieves near full fine-tuning performance while significantly reducing the number of trainable parameters, and it is more computationally efficient than full fine-tuning.

