## Отдельный блокнот для отдельных ресурсов)

### Part 2: Parameter Efficient Fine-Tuning
In this notebook, you're gonna fine-tune large language models within limited GPU memory.

In [1]:
!pip install --quiet transformers accelerate sentencepiece optimum peft bitsandbytes

import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from tqdm.auto import tqdm, trange

assert torch.cuda.is_available(), "you need cuda for this part"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m424.1/424.1 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install accelerate bitsandbytes



In [3]:
!nvidia-smi

Wed Dec 11 20:10:23 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P8              10W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
from transformers import AutoModelForCausalLM, LlamaTokenizer, BitsAndBytesConfig
import torch

model_name = 'Enoch/llama-7b-hf'

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float32,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = LlamaTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    torch_dtype=torch.float32,
    quantization_config=quantization_config
)

for param in model.parameters():
    param.requires_grad = False

model.gradient_checkpointing_enable()
model.enable_input_require_grads()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/218 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


config.json:   0%|          | 0.00/511 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/33 [00:00<?, ?it/s]

pytorch_model-00001-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00002-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00003-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00004-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00005-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00006-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00007-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00008-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00009-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00010-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00011-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00012-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00013-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00014-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00015-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00016-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00017-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00018-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00019-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00020-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00021-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00022-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00023-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00024-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00025-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00026-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00027-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00028-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00029-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00030-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00031-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00032-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00033-of-00033.bin:   0%|          | 0.00/524M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

In [5]:
prompt = "The first discovered martian lifeform looks like"
batch = tokenizer([prompt], return_tensors='pt', return_token_type_ids=False).to(device)
print("Input batch (encoded):", batch)

output_tokens = model.generate(**batch, max_new_tokens=64, do_sample=True, temperature=0.8)
# greedy inference:                                        do_sample=False)
# beam search for highest probability:                     num_beams=4)

print("\nOutput:", tokenizer.decode(output_tokens[0].cpu()))

Input batch (encoded): {'input_ids': tensor([[    1,   450,   937, 10943, 14436,   713,  2834,   689,  3430,   763]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

Output: <s>The first discovered martian lifeform looks like something that fell from space.
It’s a squiggly little bugger, no bigger than a finger, and it seems to be thriving on Mars.
Scientists in Russia have discovered a bug that lives in a Martian lava tube called L’Ecuyer, and this


### Adapter basics: LoRA (1 points)

When training on more serious tasks, you can use low-rank adapters based on the [LoRA paper](https://arxiv.org/pdf/2106.09685.pdf).

The core idea is to add low-rank adapters __in parallel with existing linear layers,__ like this:
<center><img src="https://i.imgur.com/6bQLNiG.png" width=240px></center>

In the original LoRA paper, the adapters were only added to attention projection matrices. However, [subsequent works](https://arxiv.org/abs/2305.14314) show that it is useful to adapt FFNs as well. But before we do any training, we need to implement the basic LoRA layer.

In [6]:
class LoRALayer(nn.Module):
    """Wraps a linear layer with LoRA-like adapter. Wraps an existing OPT (или LLaMA) linear layer"""
    def __init__(self, module: nn.Linear, rank: int):
        super().__init__()
        self.module = module  # pre-trained (frozen) linear layer
        self.adapter_A = nn.Parameter(torch.empty(module.in_features, rank, device=module.weight.device))
        nn.init.kaiming_uniform_(self.adapter_A, a=5 ** 0.5)
        self.adapter_B = nn.Parameter(torch.zeros(rank, module.out_features, device=module.weight.device))

    def forward(self, input):
        # Основная идея LoRA: output = W*x + (A*B)*x, где W - замороженный вес, A и B - обучаемые низкоранговые матрицы
        # self.module - это исходный линейный слой (W*x)
        # adapter_A.shape = [in_features, rank]
        # adapter_B.shape = [rank, out_features]

        # Выполняем обычную линейную трансформацию
        original_out = self.module(input)
        # Добавляем вклад от адаптеров:
        # (input * A) дает размер [batch, seq_len, rank]
        # затем (input*A)*B дает размер [batch, seq_len, out_features]
        lora_out = input @ self.adapter_A @ self.adapter_B
        return original_out + lora_out

In [7]:
# test your implementation
test_linear = nn.Linear(128, 128)
test_linear.weight.data[...] = torch.eye(128)
test_adapter = LoRALayer(test_linear, rank=8)
assert torch.allclose(test_adapter(torch.ones(1, 1, 128)), test_linear.bias + 1), "please check your forward pass"

test_adapter.adapter_A.data[...] = torch.linspace(0.1, -0.5, 128 * 8).view(128, 8)
test_adapter.adapter_B.data[...] = torch.linspace(0.5, -0.1, 128 * 8).view(8, 128)
test_linear.bias.data[...] = torch.linspace(1., -1., 128)

dummy_loss = F.mse_loss(test_adapter(torch.ones(1, 128) / 128).squeeze(), torch.linspace(-1, 1, 128))
assert torch.allclose(dummy_loss, torch.tensor(1.3711389), rtol=0, atol=1e-4)
dummy_loss.backward()
assert all(w.grad is not None for w in [test_adapter.adapter_A, test_adapter.adapter_B]), "some adapter weights have no grad"
assert torch.allclose(test_adapter.adapter_A.grad.sum(), torch.tensor(-0.60158), rtol=0, atol=1e-4), "bad grad w.r.t. A"
assert torch.allclose(test_adapter.adapter_B.grad.sum(), torch.tensor(0.9931), rtol=0, atol=1e-4), "bad grad w.r.t. B"
del dummy_loss, test_linear, test_adapter
print("All tests passed!")

All tests passed!


### Apply LoRA to the model

The code below applies LoRA adapters on top of Q/K/V linear layers in Llama attention. You may also choose to modify other layers:
* self_attn.o_proj - attention output projection
* mlp.up_proj, mlp.gate_proj, mlp.down_proj - transformer feedforward layers
* lm_head - output LM head

__Note:__ please scroll down for the homework task

In [8]:
lora_rank = 8

for name, module in model.model.layers.named_modules():
    # Ищем слои, соответствующие линейным преобразованиям Q/K/V в LlamaAttention
    if 'LlamaDecoderLayer' in repr(type(module)):
        module.self_attn.q_proj = LoRALayer(module.self_attn.q_proj, rank=lora_rank).to(device)
        module.self_attn.k_proj = LoRALayer(module.self_attn.k_proj, rank=lora_rank).to(device)
        module.self_attn.v_proj = LoRALayer(module.self_attn.v_proj, rank=lora_rank).to(device)

assert sum(isinstance(module, LoRALayer) for module in model.modules()) == 96  # for Llama-7B

In [9]:
batch = tokenizer("This model wants to share its greatest secret:", return_tensors='pt', return_token_type_ids=False).to(device)
with torch.cuda.amp.autocast(dtype=torch.float32):
    out = model.forward(**batch)
    (out.logits.norm() / 100).backward()

for i, module in enumerate(model.modules()):
    if isinstance(module, LoRALayer):
        assert module.adapter_B.grad is not None
        assert module.adapter_B.grad.norm().item() > 0

model.zero_grad(set_to_none=True)
print("Grad check successful, well done!")

  with torch.cuda.amp.autocast(dtype=torch.float32):


Grad check successful, well done!


### Toy task: the story of a fox (1 point)

![img](https://i.imgur.com/Ux3qQAu.png) (source: theodd1souts.fandom.com)

In [10]:
prompt = 'A quick brown fox'
batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)

for i in range(10):
    with torch.no_grad():
        next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
    batch['input_ids'] = torch.cat([batch['input_ids'], next_token], dim=-1)
    batch['attention_mask'] = torch.cat([batch['attention_mask'], torch.ones_like(next_token)], dim=-1)

print("\nOutput before training:", tokenizer.decode(batch['input_ids'][0].cpu().numpy().tolist()))


the_truth = "A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it anyway!"
batch = tokenizer(the_truth, return_tensors='pt', return_token_type_ids=False).to(device)
outputs = model(**batch)
next_word_logits = outputs.logits[:, :-1]
true_next_tokens = batch['input_ids'][:, 1:]
loss = F.cross_entropy(next_word_logits.flatten(0, 1), true_next_tokens.flatten(0, 1))
print("Initial Loss:", loss.item())


# Тренируем модель на этой фразе, чтобы уменьшить лосс
opt = torch.optim.Adam(model.parameters(), lr=2e-4)  # только LoRA параметры обучаются

for epoch in range(20):  # несколько шагов, чтобы уменьшить лосс
    opt.zero_grad()
    outputs = model(**batch)
    next_word_logits = outputs.logits[:, :-1]
    true_next_tokens = batch['input_ids'][:, 1:]
    loss = F.cross_entropy(next_word_logits.flatten(0, 1), true_next_tokens.flatten(0, 1))
    loss.backward()
    opt.step()
    if epoch % 5 == 0:
        print(f"Epoch {epoch}, loss = {loss.item()}")

# Проверяем, что loss достаточно снизился
assert loss.item() <= 0.1
print("Good job!")

# Проверяем генерацию после обучения
prompt = 'A quick brown fox'
batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)

for i in range(15):
    with torch.no_grad():
        next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
    batch['input_ids'] = torch.cat([batch['input_ids'], next_token], dim=-1)
    batch['attention_mask'] = torch.cat([batch['attention_mask'], torch.ones_like(next_token)], dim=-1)

print("\nOutput after training:", tokenizer.decode(batch['input_ids'][0].cpu().numpy().tolist()))


Output before training: <s>A quick brown fox jumps over the lazy dog.
A quick
Initial Loss: 3.0561904907226562
Epoch 0, loss = 3.0561904907226562
Epoch 5, loss = 0.7767133116722107
Epoch 10, loss = 0.1826687455177307
Epoch 15, loss = 0.07677486538887024
Good job!

Output after training: <s>A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it


### Note: using HuggingFace PEFT

[`peft`](https://huggingface.co/docs/peft/index) is a transformer's sister library that allows you to apply various __p__arameter __e__fficient __f__ine-__t__uning methods to pre-trained transformers. The library imlements both LoRA, prompt tuning, prefix tuning, as well as several other adapter-based techniques under a common interface.

You can find the basic tutorial for using PEFT here: https://huggingface.co/docs/peft/task_guides/clm-prompt-tuning . You may (or may not) choose to use this library in the next assignment.


### (example) How to train your model with HF Trainer

The example below shows how to train the LoRA adapters on a dummy dataset. You will need to run a _similar_ training task later.

__Note:__ please scroll down for the homework task

In [11]:
# reload model to forget the previous training run
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name, device_map='auto', low_cpu_mem_usage=True, offload_state_dict=True,
    load_in_4bit=True, torch_dtype=torch.float32,  # weights are 4-bit; layernorms and activations are fp32
)
for param in model.parameters():
    param.requires_grad=False
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

In [12]:
# checking if the model can learn. Change max_steps for proper training
import datasets
data = datasets.load_dataset("Abirate/english_quotes", split="train[:32]") # 32 lines
data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)
model._hf_peft_config_loaded = True  # silence a warning from HF trainer

trainer = transformers.Trainer(
    model=model, train_dataset=data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2, gradient_accumulation_steps=1,
        # note: if you want larger batch size, increase gradient_accumulation_steps
        warmup_steps=250, max_steps=100, learning_rate=2e-4, fp16=True,
        logging_steps=1, output_dir='outputs', report_to=None),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
# if you see cache warnings, set `model.config.use_cache = False` to silence them. Please re-enable for inference!

trainer.train()

# NOTE: this is just an example! you do not have to wait for this progressbar to finish :)

README.md:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

quotes.jsonl:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


AssertionError: No inf checks were recorded for this optimizer.

### Final task: *actually* train the model (4 points)

Your task is to fine-tune the model to _generate python code_. Please use the above examples for inspiration. More specifically,

* __dataset:__ use [codeparrot-clean](https://huggingface.co/datasets/codeparrot/codeparrot-clean) or any other data containing python code. Since you do not need much data for this excercise, it is enough to use just shorter validation subset of `codeparrots`
* __preprocessing:__ select python code based on file extentions (.py)  (may skip in case of codeparrot - it is 100% python)
* __short lines:__ please take the first 512 characters of each line
* __adapter type:__ please use LoRA as defined above __plus at least one of:__
   - extra adapter on lm_head
   - extra adapter on MLP components (mlp.*)
   - trainable input embeddings (requires tweaking memory usage)

* __training:__ you do not have to train to convergence. If all goes well, your model should `.generate` code after 500 steps. Please use batch size of at least 4 (4 x 1 x 512 tokens) using `gradient_accumulation_steps=4`. **Please make sure you reload model and reset adapters before training**. Your previous model is too concerned about a quick brown fox jumping over the lazy dog.


__Alternative assignment:__ Instead of doing python code, feel free to substitute the task with any other dataset, e.g. your favorite artist or podcast, as long as it's ethical. If you choose your own task, please show examples of what your model learned - or did not learn, akin to the code examples below.

In [13]:
!pip install --quiet transformers accelerate sentencepiece optimum peft bitsandbytes datasets

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import LlamaTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from IPython.display import HTML, display

device = torch.device('cuda')
model_name = 'Enoch/llama-7b-hf'
tokenizer = LlamaTokenizer.from_pretrained(model_name, device_map=device)
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', low_cpu_mem_usage=True, offload_state_dict=True, load_in_4bit=True, torch_dtype=torch.float32)
for p in model.parameters():
    p.requires_grad=False
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class LoRALayer(nn.Module):
    def __init__(self, module: nn.Linear, rank: int):
        super().__init__()
        self.module = module
        self.adapter_A = nn.Parameter(torch.empty(module.in_features, rank, device=module.weight.device))
        nn.init.kaiming_uniform_(self.adapter_A, a=5**0.5)
        self.adapter_B = nn.Parameter(torch.zeros(rank, module.out_features, device=module.weight.device))
    def forward(self, x):
        return self.module(x) + (x @ self.adapter_A @ self.adapter_B)

lora_rank = 8
for layer in model.model.layers:
    layer.self_attn.q_proj = LoRALayer(layer.self_attn.q_proj, lora_rank).to(device)
    layer.self_attn.k_proj = LoRALayer(layer.self_attn.k_proj, lora_rank).to(device)
    layer.self_attn.v_proj = LoRALayer(layer.self_attn.v_proj, lora_rank).to(device)
    layer.mlp.up_proj = LoRALayer(layer.mlp.up_proj, lora_rank).to(device)

original_prompts = ['', 'import', 'from', 'while', 'try', 'if', 'for', 'torch']
def generate_samples(model, prompts):
    samples = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors='pt').to(device)
        with torch.no_grad():
            out = model.generate(**inputs, max_new_tokens=64, do_sample=True, temperature=0.8)
        text = tokenizer.decode(out[0], skip_special_tokens=True)
        samples.append(text)
    return samples

before = generate_samples(model, original_prompts)

ds = load_dataset("codeparrot/codeparrot-clean-validated", split='validation')

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

DatasetNotFoundError: Dataset 'codeparrot/codeparrot-clean-validated' doesn't exist on the Hub or cannot be accessed.

In [19]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

Было сложно авторизоваться, выбрать нужный датасет и заполучить нужный токен, но я справился!)

In [20]:
from datasets import load_dataset

# Загрузка датасета The Stack с Python-кодом
ds = load_dataset("bigcode/the-stack", data_dir="data/python", split="train[:1%]")  # 1% данных

Resolving data files:   0%|          | 0/206 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/206 [00:00<?, ?files/s]

train-00000-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00001-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00002-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00003-of-00206.parquet:   0%|          | 0.00/387M [00:00<?, ?B/s]

train-00004-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00005-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00006-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00007-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00008-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00009-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00010-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00011-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00012-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00013-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00014-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00015-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00016-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00017-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00018-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00019-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00020-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00021-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00022-of-00206.parquet:   0%|          | 0.00/387M [00:00<?, ?B/s]

train-00023-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00024-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00025-of-00206.parquet:   0%|          | 0.00/377M [00:00<?, ?B/s]

train-00026-of-00206.parquet:   0%|          | 0.00/376M [00:00<?, ?B/s]

train-00027-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00028-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00029-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00030-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00031-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00032-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00033-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00034-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00035-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00036-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00037-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00038-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00039-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00040-of-00206.parquet:   0%|          | 0.00/377M [00:00<?, ?B/s]

train-00041-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00042-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00043-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00044-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00045-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00046-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00047-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00048-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00049-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00050-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00051-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00052-of-00206.parquet:   0%|          | 0.00/376M [00:00<?, ?B/s]

train-00053-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00054-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00055-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00056-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00057-of-00206.parquet:   0%|          | 0.00/376M [00:00<?, ?B/s]

train-00058-of-00206.parquet:   0%|          | 0.00/389M [00:00<?, ?B/s]

train-00059-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00060-of-00206.parquet:   0%|          | 0.00/390M [00:00<?, ?B/s]

train-00061-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00062-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00063-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00064-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00065-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00066-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00067-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00068-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00069-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00070-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00071-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00072-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00073-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00074-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00075-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00076-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00077-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00078-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00079-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00080-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00081-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00082-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00083-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00084-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00085-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00086-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00087-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00088-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00089-of-00206.parquet:   0%|          | 0.00/387M [00:00<?, ?B/s]

train-00090-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00091-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00092-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00093-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00094-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00095-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00096-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00097-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00098-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00099-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00100-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00101-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00102-of-00206.parquet:   0%|          | 0.00/377M [00:00<?, ?B/s]

train-00103-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00104-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00105-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00106-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00107-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00108-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00109-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00110-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00111-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00112-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00113-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00114-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00115-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00116-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00117-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00118-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00119-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00120-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00121-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00122-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00123-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00124-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00125-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00126-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00127-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00128-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00129-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00130-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00131-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00132-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00133-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00134-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00135-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00136-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00137-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00138-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00139-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00140-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00141-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00142-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00143-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00144-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00145-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00146-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00147-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00148-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00149-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00150-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00151-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00152-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00153-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00154-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00155-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00156-of-00206.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00157-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00158-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00159-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00160-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00161-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00162-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00163-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00164-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00165-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00166-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00167-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00168-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00169-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00170-of-00206.parquet:   0%|          | 0.00/378M [00:00<?, ?B/s]

train-00171-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00172-of-00206.parquet:   0%|          | 0.00/379M [00:00<?, ?B/s]

train-00173-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00174-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00175-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00176-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00177-of-00206.parquet:   0%|          | 0.00/388M [00:00<?, ?B/s]

train-00178-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00179-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00180-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00181-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00182-of-00206.parquet:   0%|          | 0.00/390M [00:00<?, ?B/s]

train-00183-of-00206.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00184-of-00206.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00185-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00186-of-00206.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00187-of-00206.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]



train-00188-of-00206.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]



train-00189-of-00206.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

Oh nooooo, not enough memory... Well everythinh from the very beginning in the 3rd part this case

In [21]:
def preprocess(example):
    text = example['content'][:512]
    return {'input_ids': tokenizer(text, truncation=True, max_length=512)['input_ids']}
ds = ds.map(preprocess, batched=False)
ds = ds.filter(lambda x: len(x['input_ids'])>1)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
training_args = TrainingArguments(
    output_dir="./checkpoints",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=500,
    fp16=True,
    logging_steps=100,
    save_steps=500,
    report_to="none"
)
for p in model.parameters():
    if p.requires_grad:
        p.data = p.data.float()

opt_params = []
for n,m in model.named_modules():
    if isinstance(m, LoRALayer):
        opt_params.extend([m.adapter_A, m.adapter_B])
optimizer = torch.optim.Adam(opt_params, lr=2e-4)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    data_collator=data_collator,
    optimizers=(optimizer, None)
)
trainer.train()

after = generate_samples(model, original_prompts)

row_template = '''  <tr>
    <td style="width:20%; border:1px solid black"><pre align="left">`{}`</pre></td>
    <td style="width:40%; border:1px solid black"><pre align="left">{}</pre></td>
    <td style="width:40%; border:1px solid black"><pre align="left">{}</pre></td>
  </tr>'''
rows = []
for p,b,a in zip(original_prompts,before,after):
    rows.append(row_template.format(p, b, a))
table_template = """<table style="border:1px solid black" >
  <tr>
    <th style="text-align: center; border:1px solid black">PROMPT</th>
    <th style="text-align: center; border:1px solid black">BEFORE</th>
    <th style="text-align: center; border:1px solid black">AFTER</th>
  </tr>
{}
</table>"""
display(HTML(table_template.format('\n'.join(rows))))

NameError: name 'ds' is not defined

If you reach this: congratulations! you've completed everything in this practice session.

If you want to dig deeper, try to implement prompt-tuning (for bonus points!).
You can read more about prompt tuning variants in paper [1](https://arxiv.org/abs/2104.08691) or paper [2](https://arxiv.org/abs/2101.00190). Both versions can be implemented by passing trainable prompts as `model.forward(..., past_key_values=your_prompts)`.



### Read more

* How post-training quantization works: https://arxiv.org/abs/2208.07339
* An overview of running large models: https://huggingface.co/docs/accelerate/package_reference/big_modeling
* A general library for different adapter types: https://adapterhub.ml/


### [extra info] Running other models.

This notebook's code can run with other models of similar size, such as [Falcon-7B](https://huggingface.co/tiiuae/falcon-7b), [OPT-6.7B](https://huggingface.co/facebook/opt-6.7b) or [BLOOM-7.1B](https://huggingface.co/bigscience/bloom-7b1). However, they will require minor code tweaks:
1. change the model name in `AutoModelForCausalLM.from_pretrained()` __and__ `AutoTokenizer`
2. In the prompt tuning code, change `model.model.embed_tokens` to refer to the target model's word embeddings. Simply `print(model)` to navigate to them.
3. Change code to add Lora layers - specifically where you what the transformer block components, since those components now have different names.