In [None]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15


# Mount Google Drive to access the training data

In [None]:
from google.colab import drive

# Directory inside the Colab VM under which Google Drive is made available
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Imports

In [None]:
import numpy as np
import pandas as pd
import math
import textwrap

from IPython.display import display, clear_output
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn import Parameter
from torch.optim import AdamW

from transformers import AutoModelForCausalLM, AutoTokenizer

from datasets import load_dataset

model_name = "gpt2-medium"

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


# Load the fine-tuning dataset
I got the data for this example from https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset?rvi=1

In [None]:
# Set up the tokenizer. GPT-2 has no padding token of its own and "<PAD>" is
# not part of its vocabulary, so the end-of-text token is reused for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

path_to_file = "" # path to file in google drive

# Keep only the "text" column of the CSV (load_dataset puts it in the 'train' split)
dataset = load_dataset('csv', data_files=path_to_file)['train']["text"]

print("{} Entries in dataset.".format(len(dataset)))
print("\n")
print("Examples: ")
# Preview at most 10 entries so that a short dataset does not raise an IndexError
for i in range(min(10, len(dataset))):
    print(dataset[i])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


20000 Entries in dataset.


Examples: 
TITLE: Why Do Fall Leaves Change Color? DESCRIPTION: Fall foliage delights leaf-peeping tourists, but how does the change in color benefit trees? As scientists explain, there is a reason for the season.
TITLE: Falling Oil Hits Europe; Dollar Bounces DESCRIPTION:  LONDON (Reuters) - Most European stock markets followed  Wall Street lower Wednesday as crude oil's slide to three-month  lows hit heavily weighted oil shares, although a recovering  dollar was a plus for the region's exporters.
TITLE: Linksys goes dual-band on Wi-Fi (MacCentral) DESCRIPTION: MacCentral - With its eyes on the future of home entertainment and a relatively uncluttered band of radio spectrum, Cisco Systems Inc.'s Linksys division on Wednesday unveiled a line of IEEE 802.11g/a wireless LAN products.
TITLE: Chirac hits out at international community's inaction in Middle East (AFP) DESCRIPTION: AFP - French President Jacques Chirac sharply criticized the international community

# Functions for fine-tuning a GPT2-style model

In [None]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Prints the trainable count, the total count and the trainable percentage
    (two decimals), framed by two separator lines and followed by blank lines.
    """
    separator = "-----------------------------------------------------------------------"

    # (element count, is trainable) for every named parameter of the model
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)

    print(separator)
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}")
    print(separator)
    print("\n")

def fine_tune(model, epochs=1, batch_size=8, learning_rate=1e-5):
    """
    Fine-tune a causal language model such as GPT-2 on the notebook's texts.

    Relies on the notebook-level `dataset` (texts), `tokenizer` and `device`.

    Args:
        model: Causal LM that accepts `input_ids`, `attention_mask` and
            `labels` and returns an output exposing `.loss`.
        epochs: Number of passes over the dataset.
        batch_size: Number of texts per optimisation step.
        learning_rate: Learning rate passed to AdamW.
    """
    # Only trainable parameters go to the optimizer (relevant once layers are frozen)
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable_params, lr=learning_rate)

    model.train()

    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Appended to every text so the end of an entry is an explicit training target
    eos_token = tokenizer.eos_token or ""

    for epoch in range(epochs):
        print(f"EPOCH: {epoch} " + '=' * 20)
        with tqdm(loader, total=len(loader)) as progress_bar:
            for batch in progress_bar:
                optimizer.zero_grad()

                texts = [text + eos_token for text in batch]
                inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
                input_ids = inputs['input_ids'].to(device)
                attention_mask = inputs['attention_mask'].to(device)

                # Padding positions must not contribute to the loss:
                # -100 is the label value ignored by the LM loss.
                labels = input_ids.masked_fill(attention_mask == 0, -100)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

                # Backward pass and optimization
                loss.backward()
                optimizer.step()

                progress_bar.set_description(f"Loss: {loss.item():.4f}")


# Disadvantage of fully fine-tuning a model

Full fine-tuning of an LLM typically requires extensive resources. The following example illustrates this. Using Google Colab's free tier, which provides access to a single T4 GPU, I attempted to fine-tune GPT2-Medium (~355 million parameters) but ran out of GPU memory after only a few training steps.

In [None]:
import gc

# Download Model
model_name = "gpt2-medium"
full_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Print Params
print_trainable_parameters(full_model)

# Fine-tune model. On a small GPU (e.g. Colab's T4) this is expected to run out
# of memory; the error is caught and shown so the remaining cells still run
# when the notebook is executed top to bottom.
oom_message = None
try:
    fine_tune(full_model, epochs=1)
except torch.cuda.OutOfMemoryError as err:
    oom_message = str(err)

if oom_message is not None:
    print(f"OutOfMemoryError: {oom_message}")
    # Drop the fully trainable model and hand its GPU memory back
    del full_model
    gc.collect()
    torch.cuda.empty_cache()

-----------------------------------------------------------------------
trainable params: 354823168 || all params: 354823168 || trainable%: 100.00
-----------------------------------------------------------------------




Loss: 2.8686:   1%|          | 25/2500 [00:19<31:26,  1.31it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 468.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 57.06 MiB is free. Process 31228 has 14.69 GiB memory in use. Of the allocated memory 13.81 GiB is allocated by PyTorch, and 765.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# LoRA Model
Implementation of LoRA (Low-Rank Adaptation) for fine-tuning a pre-trained causal language model like GPT-2. It initializes a pre-trained model, defines a custom LoRA_Linear module, and replaces the modules related to the attention mechanism in the model with LoRA_Linear. Afterwards, the model is fine-tuned. The results show that the LoRA model can be fine-tuned without any memory issues.

This specific implementation of LoRA was inspired by https://github.com/tsmatz/finetune_llm_with_lora



In [None]:
# Load the pre-trained model that will receive the LoRA layers. It is moved to
# `device` further down, after the target modules have been replaced.
lora_model = AutoModelForCausalLM.from_pretrained(model_name)


class LoRA_Linear(nn.Module):
    """
    Linear layer with a frozen base projection plus a trainable low-rank update.

    The forward pass returns `linear(x) + x @ lora_right @ lora_left`.
    `lora_left` starts at zero, so a freshly built layer reproduces the
    wrapped projection exactly.

    Args:
        weight: Base weight of shape (out_features, in_features).
        bias: Base bias of shape (out_features,), or None for no bias.
        lora_dim: Rank of the low-rank update.
    """
    def __init__(self, weight, bias, lora_dim):
        super().__init__()

        out, inp = weight.shape

        # Create every tensor with the dtype/device of the wrapped weight so the
        # layer also works for non-float32 or non-CPU weights.
        factory_kwargs = {"device": weight.device, "dtype": weight.dtype}

        # Set up linear layer with old weight and bias
        self.linear = nn.Linear(inp, out, bias=bias is not None, **factory_kwargs)
        state = {"weight": weight}
        if bias is not None:
            state["bias"] = bias
        self.linear.load_state_dict(state)

        # The base projection is never trained; only the LoRA factors are
        for param in self.linear.parameters():
            param.requires_grad = False

        # Set up new LoRA weights
        self.lora_right = nn.Parameter(torch.zeros(inp, lora_dim, **factory_kwargs))
        nn.init.kaiming_uniform_(self.lora_right, a=math.sqrt(5))
        self.lora_left = nn.Parameter(torch.zeros(lora_dim, out, **factory_kwargs))

    def forward(self, input):
        """Return the frozen projection of `input` plus the low-rank correction."""
        frozen_output = self.linear(input)
        LoRA_output = input @ self.lora_right @ self.lora_left
        return frozen_output + LoRA_output


lora_dim = 8

# Qualified names of every module whose path contains "attn.c_attn"
targets = [
    module_name
    for module_name, _ in lora_model.named_modules()
    if "attn.c_attn" in module_name
]

# Swap each target module for a LoRA-wrapped equivalent
for name in targets:
    *parent_path, child_name = name.split(".")

    # Walk down from the root model to the module that owns the target
    parent = lora_model
    for attr in parent_path:
        parent = getattr(parent, attr)
    original_layer = getattr(parent, child_name)

    # The old weight is transposed so LoRA_Linear receives it as (out, in)
    lora = LoRA_Linear(
        weight=torch.transpose(original_layer.weight, 0, 1),
        bias=original_layer.bias,
        lora_dim=lora_dim,
    )

    # Re-register the child on its parent under the same attribute name
    setattr(parent, child_name, lora)

# Only the LoRA factors stay trainable; every other parameter is frozen
for param_name, param in lora_model.named_parameters():
    is_lora_param = "lora_right" in param_name or "lora_left" in param_name
    param.requires_grad = is_lora_param

lora_model = lora_model.to(device)

print_trainable_parameters(lora_model)
fine_tune(lora_model, epochs=1)

-----------------------------------------------------------------------
trainable params: 786432 || all params: 355609600 || trainable%: 0.22
-----------------------------------------------------------------------




Loss: 2.3973: 100%|██████████| 2500/2500 [16:07<00:00,  2.58it/s]


# Evaluation functions

In [None]:
def choose_from_top(probs, n=5):
    """
    Selects one token ID from the top n probable token IDs in a given
    probability distribution.

    Args:
        probs: 1-D array-like of probabilities, indexed by token ID.
        n: Number of most probable candidates to sample from. Values larger
            than the number of entries in `probs` are clamped to that number.

    Returns:
        The sampled token ID as a Python int.

    Raises:
        ValueError: If `probs` is empty or `n` is smaller than 1.
    """
    probs = np.asarray(probs)
    # Never ask for more candidates than there are tokens
    n = min(n, probs.shape[0])
    if n < 1:
        raise ValueError("choose_from_top needs a non-empty distribution and n >= 1")

    # Indices of the n largest probabilities (in no particular order)
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

def generate_example(model, query, max_new_tokens=2000, top_n=5):
    """
    This function generates text by repeatedly predicting the next token
    using a given model and a starting query. Generated tokens are printed as
    they are produced, with a line break after every 20 tokens.

    Relies on the notebook-level `tokenizer`, `device` and `choose_from_top`.

    Args:
        model: Causal LM returning an output with `.logits`.
        query: Prompt text the completion starts from.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_n: Number of most probable tokens to sample from at each step.
    """
    eos_token_id = tokenizer.eos_token_id

    # Longest sequence the model can embed (None if the config does not say)
    max_context = getattr(getattr(model, "config", None), "n_positions", None)

    cur_ids = torch.tensor(tokenizer.encode(query)).unsqueeze(0).to(device)
    for i in range(max_new_tokens):
        # Feed only the most recent tokens once the positional limit is reached
        context = cur_ids if max_context is None else cur_ids[:, -max_context:]

        # Inference only: no autograd graph and no loss computation needed
        with torch.no_grad():
            logits = model(context).logits
        softmax_logits = torch.softmax(logits[0, -1], dim=0)

        next_token_id = choose_from_top(softmax_logits.cpu().numpy(), n=top_n)
        next_token = torch.tensor([[next_token_id]], device=device)
        cur_ids = torch.cat([cur_ids, next_token], dim=1)

        if next_token_id == eos_token_id:
            break

        print(tokenizer.decode([next_token_id]), end='')
        if i % 20 == 0 and i != 0:
            print("\n", end='')


def evaluate_model(model, prompt="TITLE: Big News at University of Freiburg! DESCRIPTION:"):
    """
    Print a prompt followed by the model's sampled completion of it.

    Args:
        model: Model handed to `generate_example`.
        prompt: Text the model should continue; defaults to the demo headline.
    """
    print("Model Input: {}".format(prompt))
    print("")
    print("Model Completion: ")
    generate_example(model, prompt)

# Example of the fine-tuned LoRA Model

In [None]:
# Switch the model to evaluation mode, then print a sampled completion of the demo prompt
lora_model.eval()
evaluate_model(lora_model)

Model Input: TITLE: Big News at University of Freiburg! DESCRIPTION:

Model Completion: 
 The University of Freiburg has released a new version of its software to improve the speed of the system
, which is designed to help students study for exams faster.