## LoRA

The primary goal of this project is to demonstrate the efficacy of **PEFT (parameter-efficient fine-tuning) methods**, specifically LoRA, in improving the performance of the `distilbert-base-uncased` model on the `ag_news` dataset, a popular benchmark for text classification tasks. The focus is on enhancing model accuracy while significantly reducing the number of trainable parameters, thereby making the training process more efficient.

In [1]:
# Environment setup (currently commented out).
# The recorded output of this cell lists the versions that were installed:
#   accelerate 0.25.0, datasets 2.15.0, evaluate 0.4.1, peft 0.7.1
# Pin those versions when reproducing the numbers reported further down.
# !pip3 install transformers[torch] datasets peft evaluate accelerate

Installing collected packages: pyarrow-hotfix, dill, responses, multiprocess, accelerate, datasets, peft, evaluate
Successfully installed accelerate-0.25.0 datasets-2.15.0 dill-0.3.7 evaluate-0.4.1 multiprocess-0.70.15 peft-0.7.1 pyarrow-hotfix-0.6 responses-0.18.0


In [2]:
import torch
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import numpy as np
from datasets import load_dataset
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score
from tqdm import tqdm

In [31]:
# Shell escape: report the GPU model, driver/CUDA version and current memory usage of the runtime
!nvidia-smi

Wed Dec 20 09:07:38 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0              32W /  70W |   4263MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [26]:
# Fetch the AG News dataset; the cells below read its 'train' and 'test' splits
dataset = load_dataset(
    'ag_news',
)

In [5]:
# Count the distinct class ids present in the training split;
# the result sizes the classification head when the model is created below
n_labels = len(np.unique(dataset['train']['label']))
n_labels

4

In [6]:
# Identifier of the base checkpoint; both the tokenizer and the classifier are loaded from it
model_checkpoint = 'distilbert-base-uncased'

In [21]:
# Tokenizer and sequence classifier both come from the same base checkpoint.
# NOTE(review): `add_prefix_space` is usually a byte-level BPE tokenizer option —
# confirm it has any effect for this checkpoint's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    add_prefix_space=True,
)

# The classification head gets one output per class found in the training split
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=n_labels,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Model-1 (Without any fine-tuning)

In [27]:
# Baseline: score the model on the AG News test split before any fine-tuning.
#
# The tokenised split stays on the CPU and only the current batch is moved to
# `device`, so the whole split never has to fit in GPU memory and no tensor is
# transferred twice. Batch variables use their own names so the full-split
# tensors are not rebound inside the loop.
encodings = tokenizer(dataset['test']['text'], truncation=True, padding=True)
test_input_ids = torch.tensor(encodings['input_ids'])
test_attention_masks = torch.tensor(encodings['attention_mask'])
test_labels = torch.tensor(dataset['test']['label'])

# Batches are served in dataset order (no shuffling), 32 rows at a time
tensor_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
dataloader = DataLoader(tensor_dataset, batch_size=32)

model.to(device)
model.eval()  # Set model to evaluation mode

# Predicted and reference class ids, accumulated across batches
predictions = []
true_labels = []

# Gradients are not needed for scoring
with torch.no_grad():
    for batch_input_ids, batch_attention_mask, batch_labels in tqdm(dataloader, desc="Processing batches"):
        logits = model(
            batch_input_ids.to(device),
            attention_mask=batch_attention_mask.to(device),
        ).logits
        # The predicted class is the index of the largest logit in each row
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(batch_labels.tolist())

# Calculate F1-scores
macro_f1 = f1_score(true_labels, predictions, average='macro')
micro_f1 = f1_score(true_labels, predictions, average='micro')

print(f"Macro F1-Score: {macro_f1}")
print(f"Micro F1-Score: {micro_f1}")


Processing batches: 100%|██████████| 238/238 [01:04<00:00,  3.67it/s]

Macro F1-Score: 0.20035313957085607
Micro F1-Score: 0.26105263157894737





## Model-2 (Fine-tuned with LoRA)

In [8]:
# LoRA settings for the sequence-classification task
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,                        # rank of the low-rank update matrices
    lora_alpha=32,              # LoRA scaling numerator
    lora_dropout=0.01,          # dropout applied inside the adapter path
    target_modules=['q_lin'],   # adapters attach only to modules with this name
                                # (presumably the attention query projections — confirm)
)

In [10]:
# Wrap the base model with LoRA adapters exactly once.
# `model = get_peft_model(model, ...)` is self-referential: re-running the cell
# would wrap an already-wrapped model, so skip the call when `model` is
# already a PeftModel.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, peft_config)

# Report how many parameters remain trainable after wrapping
model.print_trainable_parameters()

trainable params: 630,532 || all params: 67,587,080 || trainable%: 0.9329179482232403


In [12]:
# Tokenise every training text in one call: over-long inputs are truncated and
# all rows are padded to a common length so they can be stacked into tensors
train_encodings = tokenizer(
    dataset['train']['text'],
    truncation=True,
    padding=True,
)

In [13]:
# NOTE(review): this `AdamW` rebinds the `torch.optim.AdamW` name imported in the
# first import cell; `transformers.AdamW` is deprecated upstream — confirm which
# implementation is intended.
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

# Build the training tensors on the CPU. The training loop already moves each
# batch to `device`, so copying the full training split to the GPU up front only
# costs GPU memory.
input_ids = torch.tensor(train_encodings['input_ids'])
attention_masks = torch.tensor(train_encodings['attention_mask'])
labels = torch.tensor(dataset['train']['label'])

In [None]:
# Move the model to the selected device before the optimizer is created in the next cell
model.to(device)

In [16]:
 # Create a DataLoader
train_dataset = TensorDataset(input_ids, attention_masks, labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
num_epochs = 3  # Set the number of epochs

# Define the optimizer and scheduler.
# Only parameters that still require gradients are handed to the optimizer, so
# frozen weights are never tracked. `torch.optim.AdamW` is named explicitly
# instead of the deprecated `transformers.AdamW`; `eps` and `weight_decay` are
# set to that class's defaults so the hyperparameters stay the same.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=5e-5, eps=1e-6, weight_decay=0.0)

# Linear decay to zero over every optimizer step of the run, with no warm-up
total_training_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)




In [None]:
# Training Loop
# Batch tensors get their own names: unpacking into `labels` would overwrite the
# full training-label tensor defined earlier and break re-running the
# DataLoader cell.
model.train()
for epoch in range(num_epochs):
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        # Move batch to device
        batch_input_ids, batch_masks, batch_labels = (t.to(device) for t in batch)

        # Forward pass; passing labels makes the model return the loss
        outputs = model(batch_input_ids, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per optimizer step

    # One checkpoint per epoch, named with the zero-based epoch index
    torch.save(model, f'epoch{epoch}.pth')

In [18]:
# The training loop saves 'epoch0.pth' ... f'epoch{num_epochs - 1}.pth' into the
# working directory, so the last checkpoint has index num_epochs - 1. The
# previous hard-coded '/content/epoch3.pth' is never written by that loop.
final_checkpoint = f'epoch{num_epochs - 1}.pth'

# NOTE: the file is a fully pickled model object; unpickling can execute
# arbitrary code, so only load checkpoints produced by this notebook.
# `weights_only=False` states that intent explicitly.
model = torch.load(final_checkpoint, map_location=device, weights_only=False)

In [19]:
# Tokenise the test split the same way as the training data (truncate, then pad)
encodings = tokenizer(
    dataset['test']['text'],
    truncation=True,
    padding=True,
)

In [20]:
# Score the fine-tuned model on the test split tokenised in the previous cell.
#
# The TensorDataset is bound to `test_dataset`: assigning it to `dataset` would
# overwrite the loaded AG News dataset and make this cell, and every cell that
# reads dataset['train'] / dataset['test'], fail on re-run.
# Tensors stay on the CPU; only the current batch is moved to `device`.
test_input_ids = torch.tensor(encodings['input_ids'])
test_attention_masks = torch.tensor(encodings['attention_mask'])
test_labels = torch.tensor(dataset['test']['label'])

# Create a DataLoader (dataset order, 32 rows per batch)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
dataloader = DataLoader(test_dataset, batch_size=32)

model.to(device)
model.eval()  # Set model to evaluation mode

# Predicted and reference class ids, accumulated across batches
predictions = []
true_labels = []

# Gradients are not needed for scoring
with torch.no_grad():
    for batch_input_ids, batch_attention_mask, batch_labels in tqdm(dataloader, desc="Processing batches"):
        logits = model(
            batch_input_ids.to(device),
            attention_mask=batch_attention_mask.to(device),
        ).logits
        # The predicted class is the index of the largest logit in each row
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(batch_labels.tolist())

# Calculate F1-scores
macro_f1 = f1_score(true_labels, predictions, average='macro')
micro_f1 = f1_score(true_labels, predictions, average='micro')

print(f"Macro F1-Score: {macro_f1}")
print(f"Micro F1-Score: {micro_f1}")


Processing batches: 100%|██████████| 238/238 [01:05<00:00,  3.61it/s]

Macro F1-Score: 0.9190492680404317
Micro F1-Score: 0.9190789473684211



