# Step 0: Preparations

In [59]:
import pickle
import torch
from torch import nn
from torch.utils.data import DataLoader
import numpy as np

In [60]:
!git clone https://github.com/MahdiTheGreat/Intro-to-language-modeling.git

fatal: destination path 'Intro-to-language-modeling' already exists and is not an empty directory.


In [61]:
! pip install evaluate



In [62]:
! pip install datasets



In [63]:
# Data
dataset='/content/Intro-to-language-modeling/pa4'
zip_file = f"{dataset}.zip"
! unzip -q {zip_file}
! rm {zip_file}

unzip:  cannot find or open /content/Intro-to-language-modeling/pa4.zip, /content/Intro-to-language-modeling/pa4.zip.zip or /content/Intro-to-language-modeling/pa4.zip.ZIP.
rm: cannot remove '/content/Intro-to-language-modeling/pa4.zip': No such file or directory


In [64]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu'))
print(f'Using device: {device}')

Using device: cuda


In [65]:
import os
os.environ['WANDB_DISABLED'] = 'true'

# Step 1: Full fine tuning

### Preprocessing

In [66]:
from datasets import load_dataset
imdb_dataset = load_dataset('csv', data_files = {'train': '/content/train.csv', 'eval': '/content/eval.csv'})

In [67]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize_helper(batch):
    return tokenizer(batch['review'], padding=True, truncation=True)
tokenized_imdb_dataset = imdb_dataset.map(tokenize_helper, batched=True)

### Creating classifier

In [68]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
# Sanity check
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


### Counting the number of trainable paramters

In [70]:
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [71]:
# Sanity check
count_trainable_parameters(model)

66955010

### Preparing for training

In [72]:
from transformers import TrainingArguments
training_arguments = TrainingArguments(output_dir = 'out', num_train_epochs=10, eval_strategy='epoch')

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [73]:
import evaluate

accuracy_scorer = evaluate.load('accuracy')

def evaluation_helper(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy_scorer.compute(predictions=predictions, references=labels)

### Training the model

In [74]:
from transformers import Trainer
model.to(device)
trainer = Trainer(model=model, args=training_arguments, train_dataset=tokenized_imdb_dataset['train'], eval_dataset=tokenized_imdb_dataset['eval'], compute_metrics=evaluation_helper)

In [75]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.348259,0.894
2,0.338300,0.482767,0.892
3,0.338300,0.460919,0.888
4,0.090200,0.612165,0.9
5,0.090200,0.742427,0.868
6,0.020500,0.646446,0.9
7,0.020500,0.72622,0.894
8,0.006100,0.736991,0.894
9,0.006100,0.751541,0.902
10,0.002200,0.760752,0.904


TrainOutput(global_step=2500, training_loss=0.09145046510696411, metrics={'train_runtime': 1147.6014, 'train_samples_per_second': 17.428, 'train_steps_per_second': 2.178, 'total_flos': 2649347973120000.0, 'train_loss': 0.09145046510696411, 'epoch': 10.0})

In [76]:
# Save model to avoid rerunning
trainer.save_model('fully-finetuned.model')

# Step 2: Fine tuning final layers only

In [77]:
adapted_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Remove gradient computation for all but classification layer
for param in adapted_model.parameters():
    param.requires_grad = False
for param in adapted_model.classifier.parameters():
    param.requires_grad = True
for param in adapted_model.pre_classifier.parameters():
    param.requires_grad = True

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [78]:
# Sanity check
count_trainable_parameters(adapted_model)

592130

In [79]:
adapted_model.to(device)
adapted_trainer = Trainer(model=adapted_model, args=training_arguments, train_dataset=tokenized_imdb_dataset['train'], eval_dataset=tokenized_imdb_dataset['eval'], compute_metrics=evaluation_helper)

In [80]:
adapted_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.596484,0.676
2,0.590600,0.500001,0.77
3,0.590600,0.434207,0.814
4,0.449100,0.402174,0.83
5,0.449100,0.401504,0.826
6,0.400000,0.389323,0.832
7,0.400000,0.377066,0.838
8,0.379000,0.368555,0.842
9,0.379000,0.36632,0.842
10,0.376200,0.364697,0.84


TrainOutput(global_step=2500, training_loss=0.4389877624511719, metrics={'train_runtime': 410.7837, 'train_samples_per_second': 48.687, 'train_steps_per_second': 6.086, 'total_flos': 2649347973120000.0, 'train_loss': 0.4389877624511719, 'epoch': 10.0})

In [81]:
adapted_trainer.save_model('adapted.model')

# Step 3: Fine tuning with LoRA

In [183]:
lora_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Utilities for modifying models

In [194]:
def extract_qv_layers(model):
  dictionary = {}
  for name in model.state_dict():
    if 'q' in name or 'v' in name:
      dictionary[name] = model.state_dict()[name]
  return dictionary


In [195]:
def replace_layers(model, named_layers):
    for name, layer in named_layers.items():
        components = name.split('.')
        submodule = model
        for component in components[:-1]:
            submodule = getattr(submodule, component)
        setattr(submodule, components[-1], layer)

### Implementing the LoRA layer

In [196]:
class LinearBlockWithLoRA(nn.Module):
    def __init__(self, W, r, alpha = 0.01):
        """
        Initializes the LinearBlockWithLoRA.

        Args:
            W (torch.Tensor): Pre-trained weight matrix.
            r (int): Rank of the low-rank approximation.
        """
        super().__init__()
        self.r = r
        self.alpha = alpha

        # Store the pre-trained weight matrix
        self.W = W  # Frozen pre-trained weights

        # Get the dimensions of the pre-trained weight matrix
        out_dim, in_dim  = W.shape

        # Initialize the low-rank matrices A and B
        #self.A = nn.Linear(in_features=in_dim, out_features=r, bias=False)  # Low-rank adaptation A
        #self.B = nn.Linear(in_features=r, out_features=out_dim, bias=False)   # Low-rank adaptation B
        self.A = nn.Parameter(torch.randn(in_dim, r))
        self.B = nn.Parameter(torch.randn(r, out_dim))

        # Initialize the weights of the low-rank matrices
        nn.init.normal_(self.A)
        nn.init.zeros_(self.B)

    def forward(self, X):
        """
        Forward pass for the LinearBlockWithLoRA.

        Args:
            X (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor after applying W and LoRA.
        """
        # Compute the output with the pre-trained weight matrix
        W_out = self.W(X)  # Using frozen weights

        # Compute the low-rank adaptation
        a_out = self.A(X) # (batch_size x in_dim) @ (in_dim x r) @ (r x out_dim)
        b_out = self.B(a_out) # (batch_size x in_dim) @ (in_dim x r) @ (r x out_dim

        # Add scaled adaptation to the pre-trained weights' output
        scaled_b_out = (self.alpha/self.r)*b_out

        return W_out + scaled_b_out


### Fine tuning with LoRA

In [197]:
# Wrap linear layers with LinearBlockWithLoRA
relevant_layers = extract_qv_layers(lora_model)

for name, layer in relevant_layers.items():
    layer.requires_grad = False
    if 'weight' in name:
      relevant_layers[name] = LinearBlockWithLoRA(layer, r=8)


#print(relevant_layers)
replace_layers(lora_model, relevant_layers)

TypeError: cannot assign '__main__.LinearBlockWithLoRA' as parameter 'weight' (torch.nn.Parameter or None expected)

In [190]:
# Wrap linear layers with LinearBlockWithLoRA
relevant_layers = extract_qv_layers(lora_model)

for name, layer in relevant_layers.items():
    layer.requires_grad = False
    if 'weight' in name:
        # Create LinearBlockWithLoRA instance
        lora_block = LinearBlockWithLoRA(layer, r=8)
        # Assign the pre-trained weights directly to LinearBlockWithLoRA's W
        lora_block.W = layer
        relevant_layers[name] = lora_block  # Assign the entire lora_block instance to relevant_layers

#print(relevant_layers)
replace_layers(lora_model, relevant_layers) # Now relevant_layers contains entire modules

TypeError: cannot assign '__main__.LinearBlockWithLoRA' as parameter 'weight' (torch.nn.Parameter or None expected)

In [111]:
# Sanity check
count_trainable_parameters(lora_model)

66955010

In [None]:
lora_model.to(device)
lora_trainer = Trainer(model=lora_model, args=training_arguments, train_dataset=tokenized_imdb_dataset['train'], eval_dataset=tokenized_imdb_dataset['eval'], compute_metrics=evaluation_helper)

In [None]:
lora_trainer.train()

In [None]:
lora_trainer.save_model('lora.model')

# From LoRA demo

In [87]:
pretrained = torch.load('s7_pretrained.model')
pretrained

  pretrained = torch.load('s7_pretrained.model')


FileNotFoundError: [Errno 2] No such file or directory: 's7_pretrained.model'

In [None]:
def batcher(batch):
    X = torch.as_tensor([x for x, _ in batch])
    Y = 1.0*torch.as_tensor([y for _, y in batch])
    return X, Y

In [None]:
def eval_model(model):
    dl = DataLoader(list(zip(books_X_te, books_Y_te)), batch_size=32, shuffle=False, collate_fn=batcher)
    n_corr = 0
    for Xb, Yb in dl:
        with torch.no_grad():
            model_out = model(Xb)
        preds = model_out[:, 0] > 0
        gold = Yb > 0
        n_corr += sum(preds == gold).item()
    return n_corr / len(books_Y_te)

In [None]:
eval_model(pretrained)

# Basic fine-tuning

We create a new model where we copy the weights from the pre-trained model.

In [None]:
torch.manual_seed(0)

finetuned = nn.Sequential(
    nn.Linear(in_features=768, out_features=512),
    nn.ReLU(),
    nn.Linear(in_features=512, out_features=1)
)

# pretrained = torch.load('s7_pretrained.model')

finetuned[0].weight.data = pretrained[0].weight.data.clone()
finetuned[0].bias.data = pretrained[0].bias.data.clone()
finetuned[2].weight.data = pretrained[2].weight.data.clone()
finetuned[2].bias.data = pretrained[2].bias.data.clone()

In [None]:
eval_model(finetuned)

In [None]:
def train(model, n_epochs=10):
    dl = DataLoader(list(zip(books_X_tr, books_Y_tr)), batch_size=32, shuffle=True, collate_fn=batcher)

    # NOTE!
    params = [ p for p in model.parameters() if p.requires_grad_ ]

    optimizer = torch.optim.Adam(params, lr=1e-3)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    for epoch in range(n_epochs):
        total_loss = 0
        for Xb, Yb in dl:
            model_out = model(Xb)[:, 0]
            loss = loss_fn(model_out, Yb)
            total_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        mean_loss = total_loss / len(dl)
        acc = eval_model(model)
        print(f'loss = {mean_loss:.4f}, acc = {acc:.4f}')

Your task:
- Complete `count_trainable_parameters` below.
- Count the total number of trainable parameters in the model you fine-tuned.
- Use the function `train` to fine-tune the cloned model.

# Implementing LoRA

Your task:
- Complete `LinearBlockWithLoRA` above
- Set up a model using this new block to replace the first linear layer. Initialize parameters from the pre-trained model. (Don't forget to switch off gradient computation for `W`.)
- Count the parameters in the new model.
- Train the new model.

In [None]:

torch.manual_seed(0)

lora_model = nn.Sequential(
    LinearBlockWithLoRA(pretrained[0], r=8),
    nn.ReLU(),
    nn.Linear(in_features=512, out_features=1)
)

lora_model[0].W.weight.data = pretrained[0].weight.data.clone()
lora_model[0].W.bias.data = pretrained[0].bias.data.clone()
lora_model[2].weight.data = pretrained[2].weight.data.clone()
lora_model[2].bias.data = pretrained[2].bias.data.clone()

lora_model[0].W.requires_grad = False

train(lora_model, n_epochs=10)

print(count_trainable_parameters(lora_model))