In [None]:
!pip install transformers==4.14.1 -q
!pip install bitsandbytes-cuda111==0.26.0 -q
!pip install datasets==1.16.1 -q

[K     |████████████████████████████████| 3.4 MB 6.4 MB/s 
[K     |████████████████████████████████| 3.3 MB 67.7 MB/s 
[K     |████████████████████████████████| 182 kB 74.2 MB/s 
[K     |████████████████████████████████| 880 kB 76.5 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[31mERROR: Could not find a version that satisfies the requirement bitsandbytes-cuda111==0.26.0 (from versions: 0.26.0.post2)[0m
[31mERROR: No matching distribution found for bitsandbytes-cuda111==0.26.0[0m
[K     |████████████████████████████████| 298 kB 6.1 MB/s 
[K     |████████████████████████████████| 212 kB 58.6 MB/s 
[K     |████████████████████████████████| 132 kB 55.8 MB/s 
[?25h

In [12]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.2


### Fine-tuning 6-Billion GPT-J (& other models) in colab with LoRA and 8-bit compression

This notebook is a simple example for fine-tuning [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6B) with limited memory. A detailed explanation of how it works can be found in [this model card](https://huggingface.co/hivemind/gpt-j-6B-8bit). It is heavily based on [this Colab](https://colab.research.google.com/drive/1ft6wQU0BhqG5PRlwgaZJv2VukKKjU4Es#scrollTo=vfdLQHOuEU7h). Huge thanks to Hivemind!

You can also finetune [GPT-Neo-2.7B](https://huggingface.co/gustavecortal/gpt-neo-2.7B-8bit), [French GPT-J (Cedille's Boris)](https://huggingface.co/gustavecortal/fr-boris-8bit) and [T0-3B](https://huggingface.co/gustavecortal/T0_3B-8bit) with limited memory.

Twitter: [@gustavecortal](https://twitter.com/gustavecortal)

In [1]:
from sklearn.model_selection import train_test_split

import transformers

import pandas as pd

import torch
import torch.nn.functional as F
from torch import nn
from torch.cuda.amp import custom_fwd, custom_bwd

from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise

from tqdm.auto import tqdm

In [None]:
# from google.colab import drive
# drive._mount("/content/drive")

Mounted at /content/drive


## Converting the model to 8 bits

In [4]:
class FrozenBNBLinear(nn.Module):
    def __init__(self, weight, absmax, code, bias=None):
        assert isinstance(bias, nn.Parameter) or bias is None
        super().__init__()
        self.out_features, self.in_features = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None
        self.bias = bias

    def forward(self, input):
        output = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)
        if self.adapter:
            output += self.adapter(input)
        return output

    @classmethod
    def from_linear(cls, linear: nn.Linear) -> "FrozenBNBLinear":
        weights_int8, state = quantize_blockise_lowmemory(linear.weight)
        return cls(weights_int8, *state, linear.bias)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.in_features}, {self.out_features})"


# class DequantizeAndLinear(torch.autograd.Function):
#     @staticmethod
#     @custom_fwd
#     def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,
#                 absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):
#         weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
#         ctx.save_for_backward(input, weights_quantized, absmax, code)
#         ctx._has_bias = bias is not None
#         return F.linear(input, weights_deq, bias)

#     @staticmethod
#     @custom_bwd
#     def backward(ctx, grad_output: torch.Tensor):
#         assert not ctx.needs_input_grad[1] and not ctx.needs_input_grad[2] and not ctx.needs_input_grad[3]
#         input, weights_quantized, absmax, code = ctx.saved_tensors
#         # grad_output: [*batch, out_features]
#         weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
#         grad_input = grad_output @ weights_deq
#         grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None
#         return grad_input, None, None, None, grad_bias
    
class DequantizeAndLinear(torch.autograd.Function):
    @staticmethod
    @custom_fwd
    def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,
                absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        ctx.save_for_backward(input, weights_quantized, absmax, code)
        ctx._has_bias = bias is not None
        return F.linear(input, weights_deq, bias)

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output: torch.Tensor):
        assert not ctx.needs_input_grad[1] and not ctx.needs_input_grad[2] and not ctx.needs_input_grad[3]
        input, weights_quantized, absmax, code = ctx.saved_tensors
        # grad_output: [*batch, out_features]
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        grad_input = grad_output @ weights_deq.t()  # .t() for transpose
        grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None
        return grad_input.clone(), None, None, None, grad_bias.clone() if grad_bias is not None else None


class FrozenBNBEmbedding(nn.Module):
    def __init__(self, weight, absmax, code):
        super().__init__()
        self.num_embeddings, self.embedding_dim = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None

    def forward(self, input, **kwargs):
        with torch.no_grad():
            # note: both quantuized weights and input indices are *not* differentiable
            weight_deq = dequantize_blockwise(self.weight, absmax=self.absmax, code=self.code)
            output = F.embedding(input, weight_deq, **kwargs)
        if self.adapter:
            output += self.adapter(input)
        return output

    @classmethod
    def from_embedding(cls, embedding: nn.Embedding) -> "FrozenBNBEmbedding":
        weights_int8, state = quantize_blockise_lowmemory(embedding.weight)
        return cls(weights_int8, *state)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})"


def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):
    assert chunk_size % 4096 == 0
    code = None
    chunks = []
    absmaxes = []
    flat_tensor = matrix.view(-1)
    for i in range((matrix.numel() - 1) // chunk_size + 1):
        input_chunk = flat_tensor[i * chunk_size: (i + 1) * chunk_size].clone()
        quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code)
        chunks.append(quantized_chunk)
        absmaxes.append(absmax_chunk)

    matrix_i8 = torch.cat(chunks).reshape_as(matrix)
    absmax = torch.cat(absmaxes)
    return matrix_i8, (absmax, code)


def convert_to_int8(model):
    """Convert linear and embedding modules to 8-bit with optional adapters"""
    for module in list(model.modules()):
        for name, child in module.named_children():
            if isinstance(child, nn.Linear):
                print(name, child)
                setattr(
                    module,
                    name,
                    FrozenBNBLinear(
                        weight=torch.zeros(child.out_features, child.in_features, dtype=torch.uint8),
                        absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
                        code=torch.zeros(256),
                        bias=child.bias,
                    ),
                )
            elif isinstance(child, nn.Embedding):
                setattr(
                    module,
                    name,
                    FrozenBNBEmbedding(
                        weight=torch.zeros(child.num_embeddings, child.embedding_dim, dtype=torch.uint8),
                        absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
                        code=torch.zeros(256),
                    )
                )

You have to Monkey-Patch GPT-J before loading:

In [5]:
class GPTJBlock(transformers.models.gptj.modeling_gptj.GPTJBlock):
    def __init__(self, config):
        super().__init__(config)

        convert_to_int8(self.attn)
        convert_to_int8(self.mlp)


class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)


class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)


transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock

If you're using another 8-bit quantized model (e.g. T0-3B), remember to Monkey-Patch the model using convert_to_int8()

In [6]:
class T5ForConditionalGeneration(transformers.models.t5.modeling_t5.T5ForConditionalGeneration):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)

transformers.models.t5.modeling_t5.T5ForConditionalGeneration = T5ForConditionalGeneration

In [5]:
# config = transformers.GPTJConfig.from_pretrained("EleutherAI/gpt-j-6B", bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
# tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B", bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')

config = transformers.GPTNeoConfig.from_pretrained("EleutherAI/gpt-neo-125m")
tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")

In [9]:
# config.pad_token_id = config.eos_token_id
# tokenizer.pad_token = config.pad_token_id

In [8]:
# gpt = GPTJForCausalLM.from_pretrained("hivemind/gpt-j-6B-8bit", low_cpu_mem_usage=True).cuda()
gpt = transformers.GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125m", low_cpu_mem_usage=True)

# # Get the hidden size of the model
# hidden_size = gpt.config.hidden_size

# # Get the vocabulary size from the tokenizer
# vocab_size = tokenizer.vocab_size

# # Check the dimensions
# print(f"Vocabulary size: {vocab_size}")
# print(f"Hidden size: {hidden_size}")

# # Create a new embeddings matrix with the correct dimensions
# new_output_embeddings = torch.nn.Embedding(vocab_size, hidden_size)

# # Initialize the embeddings matrix (optional)
# torch.nn.init.normal_(new_output_embeddings.weight, mean=0.0, std=gpt.config.initializer_range)

# # Set the new embeddings matrix to the model's output embeddings
# gpt.set_output_embeddings(new_output_embeddings)

# # Check if the embeddings have been set correctly
# print(gpt.get_output_embeddings())

# gpt.to('cpu')
# gpt.resize_token_embeddings(len(tokenizer))
# gpt.set_output_embeddings(len(tokenizer))
# print(gpt.get_output_embeddings)
# print(len(tokenizer))
# print(tokenizer)
#gpt = GPTJForCausalLM.from_pretrained("gustavecortal/fr-boris-8bit", low_cpu_mem_usage=True) French GPT-J Cedille's Boris

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

## LoRA fine-tuning example

You can load my very small dataset composed of philosophical sentences:

In [9]:
!gdown --id 1Q1WMjny26VHLKb71iTCHIS5zvdm9c-wz

Downloading...
From: https://drive.google.com/uc?id=1Q1WMjny26VHLKb71iTCHIS5zvdm9c-wz
To: /home/jupyter/pgbp_example.csv
100%|██████████████████████████████████████| 41.0k/41.0k [00:00<00:00, 89.6MB/s]


In [9]:
data = pd.read_csv('pgbp_example.csv')
data['sentence'] = 'Quote: ' + data['sentence']
train, test = train_test_split(data, test_size=0.01)
train.to_csv('train_pgbp_example.csv', index=False)
test.to_csv('test_pgbp_example.csv', index=False)

# torch.manual_seed(42)
# texts = pd.read_csv('data.csv')
# texts = texts.dropna()
# texts = texts.dropna(subset=['Quest Description'])
# max_length = max([len(tokenizer.encode('<|startoftext|>' + 'Title: ' + title + ' Description: ' + descr + '<|endoftext|>')) for title, descr in zip(texts['Quest Description'], texts['Quest Title'])])
# train, test = train_test_split(texts, test_size=0.1, random_state=42)

# data_train = pd.DataFrame()
# data_test = pd.DataFrame()

# data_train['data'] = '<|startoftext|>' + 'Title: ' + train['Quest Title'] + ' Description: ' + train['Quest Description'] + '<|endoftext|>'
# data_test['data'] = '<|startoftext|>' + 'Title: ' + test['Quest Title'] + ' Description: ' + test['Quest Description'] + '<|endoftext|>'

# data_train.to_csv('datasets/wow_1_train.csv', index=False)
# data_test.to_csv('datasets/wow_1_test.csv', index=False)

In [10]:
# from datasets import load_dataset
# dataset = load_dataset('csv', data_files={'train': 'datasets/wow_1_train.csv',
#                                               'test': 'datasets/wow_1_test.csv'})
from datasets import load_dataset
dataset = load_dataset('csv', data_files={'train': 'train_pgbp_example.csv',
                                              'test': 'test_pgbp_example.csv'})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [11]:
# print(max_length)
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding=True, truncation=True, max_length=128) #max_length = 175

tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence"])
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/305 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [12]:
from torch.utils.data import DataLoader

full_train_dataset = tokenized_datasets["train"]
train_dataloader = DataLoader(full_train_dataset, shuffle=False, batch_size=8)

Add adapters to Embedding/MLP/Attention/LMHead layers

In [13]:
def add_adapters(model, adapter_dim=4, p = 0.1):
    assert adapter_dim > 0

    for name, module in model.named_modules():
      if isinstance(module, FrozenBNBLinear):
          if "attn" in name or "mlp" in name or "head" in name:
              print("Adding adapter to", name)
              module.adapter = nn.Sequential(
                nn.Linear(module.in_features, adapter_dim, bias=False),
                nn.Dropout(p=p),
                nn.Linear(adapter_dim, module.out_features, bias=False),
            )
              print("Initializing", name)
              nn.init.zeros_(module.adapter[2].weight)

          else:
              print("Not adding adapter to", name)
      elif isinstance(module, FrozenBNBEmbedding):
          print("Adding adapter to", name)
          module.adapter = nn.Sequential(
                nn.Embedding(module.num_embeddings, adapter_dim),
                nn.Dropout(p=p),
                nn.Linear(adapter_dim, module.embedding_dim, bias=False),
            )
          print("Initializing", name)
          nn.init.zeros_(module.adapter[2].weight)

add_adapters(gpt)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
gpt.to(device)
# gpt.to('cuda')

Adding adapter to transformer.wte
Initializing transformer.wte
Adding adapter to transformer.h.0.attn.k_proj
Initializing transformer.h.0.attn.k_proj
Adding adapter to transformer.h.0.attn.v_proj
Initializing transformer.h.0.attn.v_proj
Adding adapter to transformer.h.0.attn.q_proj
Initializing transformer.h.0.attn.q_proj
Adding adapter to transformer.h.0.attn.out_proj
Initializing transformer.h.0.attn.out_proj
Adding adapter to transformer.h.0.mlp.fc_in
Initializing transformer.h.0.mlp.fc_in
Adding adapter to transformer.h.0.mlp.fc_out
Initializing transformer.h.0.mlp.fc_out
Adding adapter to transformer.h.1.attn.k_proj
Initializing transformer.h.1.attn.k_proj
Adding adapter to transformer.h.1.attn.v_proj
Initializing transformer.h.1.attn.v_proj
Adding adapter to transformer.h.1.attn.q_proj
Initializing transformer.h.1.attn.q_proj
Adding adapter to transformer.h.1.attn.out_proj
Initializing transformer.h.1.attn.out_proj
Adding adapter to transformer.h.1.mlp.fc_in
Initializing transfor

GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): FrozenBNBEmbedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-27): 28 x GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): FrozenBNBLinear(4096, 4096)
          (v_proj): FrozenBNBLinear(4096, 4096)
          (q_proj): FrozenBNBLinear(4096, 4096)
          (out_proj): FrozenBNBLinear(4096, 4096)
        )
        (mlp): GPTJMLP(
          (fc_in): FrozenBNBLinear(4096, 16384)
          (fc_out): FrozenBNBLinear(16384, 4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): FrozenBNBLinear(4096, 50400)
)

In [13]:
from bitsandbytes.optim import Adam8bit

gpt.gradient_checkpointing_enable()
optimizer = Adam8bit(gpt.parameters(), lr=1e-5, weight_decay=0.01)

In [14]:
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)

In [15]:
lr_scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer, int(num_training_steps*0.1), num_training_steps
)

In [16]:
filepath = 'TEST/gpt-neo-125M/model.pt'

In [20]:
from tqdm.auto import tqdm
tokenizer.pad_token = tokenizer.eos_token
device = "cuda:0"
gpt.to(device)

scaler = torch.cuda.amp.GradScaler()
progress_bar = tqdm(range(num_training_steps))
gpt.train()
gpt.gradient_checkpointing_enable()
k = 0

for epoch in range(num_epochs):
    for batch in train_dataloader:

        k = k + 1
        if k % 500 == 0:
          print(k)
          state = {'k' : k, 'epoch': num_epochs, 'lr_scheduler': lr_scheduler.state_dict(), 'state_dict': gpt.state_dict(), 'optimizer': optimizer.state_dict()}
          torch.save(state, filepath)

        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
          out = gpt.forward(**batch,)

          loss = F.cross_entropy(out.logits[:, :-1, :].flatten(0, -2), batch['input_ids'][:, 1:].flatten(),
                                reduction='mean', label_smoothing=0.1)

        print(loss)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(gpt.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        lr_scheduler.step()
        progress_bar.update(1)

  0%|          | 0/195 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


tensor(7.7104, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.4244, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.8602, device='cuda:0', grad_fn=<AddBackward0>)
tensor(7.4109, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.6984, device='cuda:0', grad_fn=<AddBackward0>)
tensor(6.8747, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.2149, device='cuda:0', grad_fn=<AddBackward0>)
tensor(7.0586, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.3411, device='cuda:0', grad_fn=<AddBackward0>)
tensor(7.5782, device='cuda:0', grad_fn=<AddBackward0>)
tensor(8.0346, device='cuda:0', grad_fn=<AddBackward0>)
tensor(7.1785, device='cuda:0', grad_fn=<AddBackward0>)
tensor(7.7267, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.9468, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.9718, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.2170, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.9039, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.9524, device='cuda:0', grad_fn=<AddBack

## Text generation example

In [22]:
gpt.eval()
print('xd')
with torch.no_grad():
  prompt = tokenizer("Title:", truncation=True, padding=True, max_length=128, return_tensors='pt')
  prompt = {key: value.to('cuda') for key, value in prompt.items()}
  out = gpt.generate(**prompt, max_length=128, top_k=50, top_p=0.9, temperature=1.0, do_sample=True, repetition_penalty = 1.2, num_beams=1)
  print(tokenizer.decode(out[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


xd
Title:<|endoftext|>


In [53]:
import torch

# Example tensors
tensor = torch.randn(10, 3)  # A tensor with shape (10, 3)
indices = torch.tensor([0, 1, 9])  # Indices for selection, all within range

# Check if any index is out of bounds
if torch.any(indices >= tensor.size(0)):
    raise ValueError(f"Index out of bounds. Tensor size: {tensor.size(0)}, Indices: {indices}")

# If indices are valid, perform the selection
result = torch.index_select(tensor, 0, indices)
print(result)


tensor([[ 0.7440,  0.1540, -1.3190],
        [ 0.5355,  0.9364, -2.4590],
        [ 0.5001,  1.0870, -1.1419]])
