# Step 0: Preparations

In [1]:
import pickle
import torch
from torch import nn
from torch.utils.data import DataLoader
import numpy as np

In [2]:
!git clone https://github.com/MahdiTheGreat/Intro-to-language-modeling.git

Cloning into 'Intro-to-language-modeling'...
remote: Enumerating objects: 145, done.[K
remote: Counting objects: 100% (145/145), done.[K
remote: Compressing objects: 100% (144/144), done.[K
remote: Total 145 (delta 78), reused 2 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (145/145), 34.86 MiB | 15.88 MiB/s, done.
Resolving deltas: 100% (78/78), done.


In [3]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [4]:
! pip install datasets



In [5]:
# Data
dataset='/content/Intro-to-language-modeling/pa4'
zip_file = f"{dataset}.zip"
! unzip -q {zip_file}
! rm {zip_file}

In [6]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu'))
print(f'Using device: {device}')

Using device: cuda


In [7]:
import os
os.environ['WANDB_DISABLED'] = 'true'

# Step 1: Full fine tuning

### Preprocessing

In [8]:
from datasets import load_dataset
imdb_dataset = load_dataset('csv', data_files = {'train': '/content/train.csv', 'eval': '/content/eval.csv'})

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize_helper(batch):
    return tokenizer(batch['review'], padding=True, truncation=True)
tokenized_imdb_dataset = imdb_dataset.map(tokenize_helper, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

### Creating classifier

In [10]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Sanity check
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


### Counting the number of trainable paramters

In [12]:
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [13]:
# Sanity check
count_trainable_parameters(model)

66955010

### Preparing for training

In [14]:
from transformers import TrainingArguments
training_arguments = TrainingArguments(output_dir = 'out', num_train_epochs=10, eval_strategy='epoch')

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [15]:
import evaluate

accuracy_scorer = evaluate.load('accuracy')

def evaluation_helper(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy_scorer.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

### Training the model

In [16]:
from transformers import Trainer
model.to(device)
trainer = Trainer(model=model, args=training_arguments, train_dataset=tokenized_imdb_dataset['train'], eval_dataset=tokenized_imdb_dataset['eval'], compute_metrics=evaluation_helper)

In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.378053,0.862
2,0.349200,0.425832,0.888
3,0.349200,0.503376,0.886
4,0.080700,0.592409,0.892
5,0.080700,0.61321,0.888
6,0.026400,0.705788,0.892
7,0.026400,0.748682,0.89
8,0.003500,0.764992,0.894
9,0.003500,0.786568,0.896
10,0.003100,0.793798,0.892


TrainOutput(global_step=2500, training_loss=0.0926034369468689, metrics={'train_runtime': 1060.4595, 'train_samples_per_second': 18.86, 'train_steps_per_second': 2.357, 'total_flos': 2649347973120000.0, 'train_loss': 0.0926034369468689, 'epoch': 10.0})

In [18]:
# Save model to avoid rerunning
trainer.save_model('fully-finetuned.model')

# Step 2: Fine tuning final layers only

In [19]:
adapted_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Remove gradient computation for all but classification layer
for param in adapted_model.parameters():
    param.requires_grad = False
for param in adapted_model.classifier.parameters():
    param.requires_grad = True
for param in adapted_model.pre_classifier.parameters():
    param.requires_grad = True

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Sanity check
count_trainable_parameters(adapted_model)

592130

In [21]:
adapted_model.to(device)
adapted_trainer = Trainer(model=adapted_model, args=training_arguments, train_dataset=tokenized_imdb_dataset['train'], eval_dataset=tokenized_imdb_dataset['eval'], compute_metrics=evaluation_helper)

In [22]:
adapted_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.596484,0.676
2,0.590600,0.500001,0.77
3,0.590600,0.434207,0.814
4,0.449100,0.402174,0.83
5,0.449100,0.401504,0.826
6,0.400000,0.389323,0.832
7,0.400000,0.377066,0.838
8,0.379000,0.368555,0.842
9,0.379000,0.36632,0.842
10,0.376200,0.364697,0.84


TrainOutput(global_step=2500, training_loss=0.4389877624511719, metrics={'train_runtime': 387.1623, 'train_samples_per_second': 51.658, 'train_steps_per_second': 6.457, 'total_flos': 2649347973120000.0, 'train_loss': 0.4389877624511719, 'epoch': 10.0})

In [23]:
adapted_trainer.save_model('adapted.model')

# Step 3: Fine tuning with LoRA

In [24]:
lora_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Utilities for modifying models

In [25]:
def extract_qv_layers(model):
  dictionary = {}
  for name, module in model.named_modules():
    if 'q_lin' in name or 'v_lin' in name:
      dictionary[name] = module
  return dictionary


In [26]:
# Sanity check
len(extract_qv_layers(lora_model))

12

In [27]:
def replace_layers(model, named_layers):
    for name, layer in named_layers.items():
        components = name.split('.')
        submodule = model
        for component in components[:-1]:
            submodule = getattr(submodule, component)
        setattr(submodule, components[-1], layer)

### Implementing the LoRA layer

In [28]:
class LinearBlockWithLoRA(nn.Module):
    def __init__(self, W, r, alpha = 0.01):
        """
        Initializes the LinearBlockWithLoRA.

        Args:
            W (torch.Tensor): Pre-trained weight matrix.
            r (int): Rank of the low-rank approximation.
        """
        super().__init__()
        self.r = r
        self.alpha = alpha

        # Store the pre-trained weight matrix
        self.W = W  # Frozen pre-trained weights

        # Get the dimensions of the pre-trained weight matrix
        out_dim, in_dim  = W.shape

        # Initialize the low-rank matrices A and B
        self.A = nn.Linear(in_features=in_dim, out_features=r, bias=False)  # Low-rank adaptation A
        self.B = nn.Linear(in_features=r, out_features=out_dim, bias=False)   # Low-rank adaptation B

        # Initialize the weights of the low-rank matrices
        nn.init.normal_(self.A.weight)
        nn.init.zeros_(self.B.weight)

    def forward(self, X):
        """
        Forward pass for the LinearBlockWithLoRA.

        Args:
            X (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor after applying W and LoRA.
        """
        # Compute the output with the pre-trained weight matrix
        W_out = X @ self.W  # Using frozen weights

        # Compute the low-rank adaptation
        a_out = self.A(X) # (batch_size x in_dim) @ (in_dim x r) @ (r x out_dim)
        b_out = self.B(a_out) # (batch_size x in_dim) @ (in_dim x r) @ (r x out_dim

        # Add scaled adaptation to the pre-trained weights' output
        scaled_b_out = (self.alpha/self.r)*b_out

        return W_out + scaled_b_out


### Fine tuning with LoRA

In [29]:
# Wrap linear layers with LinearBlockWithLoRA
relevant_layers = extract_qv_layers(lora_model)

for name, layer in relevant_layers.items():
    lora_layer = LinearBlockWithLoRA(layer.weight, r=8)
    lora_layer.W.requires_grad = False
    relevant_layers[name] = lora_layer

replace_layers(lora_model, relevant_layers)

In [33]:
for param in lora_model.parameters():
    param.requires_grad = False

for name, lora_layer in relevant_layers.items():
    lora_layer.A.weight.requires_grad = True
    lora_layer.B.weight.requires_grad = True


In [34]:
for name, param in lora_model.named_parameters():
    if param.requires_grad:
        print(name, param.shape)

distilbert.transformer.layer.0.attention.q_lin.A.weight torch.Size([8, 768])
distilbert.transformer.layer.0.attention.q_lin.B.weight torch.Size([768, 8])
distilbert.transformer.layer.0.attention.v_lin.A.weight torch.Size([8, 768])
distilbert.transformer.layer.0.attention.v_lin.B.weight torch.Size([768, 8])
distilbert.transformer.layer.1.attention.q_lin.A.weight torch.Size([8, 768])
distilbert.transformer.layer.1.attention.q_lin.B.weight torch.Size([768, 8])
distilbert.transformer.layer.1.attention.v_lin.A.weight torch.Size([8, 768])
distilbert.transformer.layer.1.attention.v_lin.B.weight torch.Size([768, 8])
distilbert.transformer.layer.2.attention.q_lin.A.weight torch.Size([8, 768])
distilbert.transformer.layer.2.attention.q_lin.B.weight torch.Size([768, 8])
distilbert.transformer.layer.2.attention.v_lin.A.weight torch.Size([8, 768])
distilbert.transformer.layer.2.attention.v_lin.B.weight torch.Size([768, 8])
distilbert.transformer.layer.3.attention.q_lin.A.weight torch.Size([8, 768])

In [35]:
# Sanity check
count_trainable_parameters(lora_model)

147456

In [36]:
lora_model.to(device)
lora_trainer = Trainer(model=lora_model, args=training_arguments, train_dataset=tokenized_imdb_dataset['train'], eval_dataset=tokenized_imdb_dataset['eval'], compute_metrics=evaluation_helper)

In [None]:
lora_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.470035,0.834


In [None]:
lora_trainer.save_model('lora.model')