# QLoRA and Gradient Accumulation

### Dependencies

In [None]:
# Install the fine-tuning stack used below. Only accelerate is pinned here;
# peft and bitsandbytes resolve to whatever version pip picks.
# NOTE(review): transformers, datasets and torch are imported later but not
# installed here — presumably they ship with the runtime; confirm.
!pip install peft bitsandbytes accelerate==0.34.0 -q

In [None]:
import os
import gc
from typing import List, Dict, Any

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

from datasets import load_dataset
from peft import PeftModel, get_peft_model, LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig, AutoConfig, set_seed

# Seed the RNGs through transformers' helper.
# NOTE(review): the second positional argument is presumably the
# `deterministic` flag of `set_seed` — confirm against the installed version.
set_seed(12, True)

# Environment switches consumed by third-party libraries at runtime:
# the first targets the Weights & Biases integration, the second configures
# the cuBLAS workspace (PyTorch asks for this when deterministic algorithms
# are enabled on CUDA — TODO confirm the exact value required).
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"

In [None]:
# Ask PyTorch to use deterministic algorithm implementations, then reseed
# torch's RNG. NOTE: this replaces the torch seed installed by the earlier
# `set_seed(12, True)` call, so everything below starts from seed 0.
torch.use_deterministic_algorithms(True)
torch.manual_seed(0)

### Gradient Accumulation 

In [None]:
# Baseline: fit one linear layer to a single fixed batch with plain SGD,
# taking one optimizer step per full-batch forward/backward pass.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_size, output_size = 512, 256
batch_size = 64
gradient_accumulation_steps = 4  # declared for symmetry; this baseline loop does not use it

model = nn.Linear(input_size, output_size).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)

# One fixed random batch of inputs and regression targets, moved to `device`
# after sampling. The same batch is reused at every step.
x = torch.randn(batch_size, input_size).to(device)
y = torch.randn(batch_size, output_size).to(device)
loss_fn = nn.MSELoss()

num_steps = 1000
for i in range(num_steps):
    optimizer.zero_grad()
    output = model(x)
    loss = loss_fn(output, y)
    loss.backward()
    optimizer.step()

# Loss measured on the last iteration, before that iteration's update.
print(loss.item())

In [None]:
# Gradient accumulation: perform the same full-batch SGD update while only
# ever pushing `mini_batch_size` samples through the model at a time.

# Reseed with the value used right before the full-batch run
# (`torch.manual_seed(0)`), so both runs start from the same weights and the
# same data and their printed losses can be compared directly.
torch.manual_seed(0)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
input_size = 512
output_size = 256
batch_size = 64
gradient_accumulation_steps = 4

if batch_size % gradient_accumulation_steps != 0:
    # Integer division would silently drop the trailing samples, and the
    # accumulated gradient would no longer match the full-batch gradient.
    raise ValueError("batch_size must be divisible by gradient_accumulation_steps")
mini_batch_size = batch_size // gradient_accumulation_steps

model = nn.Linear(input_size, output_size).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)

x = torch.randn(batch_size, input_size).to(device)
y = torch.randn(batch_size, output_size).to(device)
loss_fn = nn.MSELoss()

for i in range(1000):
    optimizer.zero_grad()
    accumulated_loss = 0.0
    for j in range(gradient_accumulation_steps):
        start = j * mini_batch_size
        x_mini_batch = x[start:start + mini_batch_size]
        y_mini_batch = y[start:start + mini_batch_size]
        output = model(x_mini_batch)
        # MSELoss averages within the mini-batch; dividing by the number of
        # (equally sized) mini-batches makes the summed gradients equal to
        # the gradient of the mean loss over the whole batch.
        loss = loss_fn(output, y_mini_batch) / gradient_accumulation_steps
        loss.backward()
        accumulated_loss += loss.item()
    # Single parameter update using the gradients accumulated above.
    optimizer.step()

# Report the loss of the whole batch (sum of the scaled mini-batch losses),
# not just the last scaled mini-batch term.
print(accumulated_loss)

### QLoRA

In [None]:
# Checkpoint identifier; reused later in the file when the model weights are loaded.
model_name = "NousResearch/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# IMDB reviews. Later code reads the "train" and "test" splits and the
# "text" / "label" fields of each sample.
imdb = load_dataset("imdb")

In [None]:
def collate_fn(batch: List[Dict[str, Any]]):
    """Build a padded causal-LM batch that is supervised only on the answer.

    Each sample is rendered as ``"<text> |||  <class name> <eos>"`` and the
    batch is padded to its longest sequence. Only the last two real
    (non-padding) tokens of every row are kept as targets; all other
    positions get the label ``-100``.
    NOTE(review): ``-100`` is presumably the index the loss ignores, and the
    two kept tokens are expected to be the class word and the EOS token
    (each class name tokenizing to one token) — confirm for the tokenizer in use.

    Args:
        batch: Samples with a ``"text"`` string and an integer ``"label"``
            (0 for negative, 1 for positive).

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``
        tensors, all of the same shape.
    """
    class_mapping = {0: "negative", 1: "positive"}
    num_label_tokens = 2
    texts = [
        sample["text"] + " ||| " + f" {class_mapping[sample['label']]} {tokenizer.eos_token}"
        for sample in batch
    ]
    tokenized_values = tokenizer(texts, padding='longest', return_tensors='pt')
    input_ids = tokenized_values['input_ids']
    attention_mask = tokenized_values['attention_mask']

    # For every position, count the real tokens from that position to the end
    # of the row. The answer tokens are the real tokens for which that count
    # is at most `num_label_tokens`. Unlike slicing the last two columns, this
    # stays correct whichever side the tokenizer pads on.
    real_tokens_remaining = attention_mask.flip(dims=[1]).cumsum(dim=1).flip(dims=[1])
    is_label = attention_mask.bool() & (real_tokens_remaining <= num_label_tokens)

    # masked_fill returns a new tensor, so input_ids itself is left untouched.
    labels_ids = input_ids.masked_fill(~is_label, -100)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels_ids
    }

# Smoke check: collate three training reviews and verify that the only
# supervised tokens are each review's class name followed by the EOS token.
samples = [imdb["train"][idx] for idx in (0, 12505, 2)]
res = collate_fn(samples)

supervised_ids = res["input_ids"][res["labels"] != -100]
assert tokenizer.decode(supervised_ids) == "negative</s> positive</s> negative</s>"

In [None]:
# Free memory before loading the large model. Run Python's garbage collector
# first so that tensors kept alive only by unreachable objects are released,
# and only then ask PyTorch to hand its now-unused cached blocks back.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# bfloat16 is used for the 4-bit compute dtype, the quantized storage dtype
# and the dtype requested for the model itself.
bf16 = torch.bfloat16

# 4-bit NF4 quantization with double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bf16,
    bnb_4bit_quant_storage=bf16,
)

# Load the base causal LM with the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=bf16,
    quantization_config=bnb_config,
)

# NOTE(review): presumably needed so gradients can flow from the inputs when
# gradient checkpointing is used with frozen base weights — confirm.
model.enable_input_require_grads()

In [None]:
# LoRA adapters of rank 4 on the attention projection layers.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",  # lets PEFT pick its causal-LM wrapper for the model
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    fan_in_fan_out=False,
    bias="none"
)
# Use the public factory instead of instantiating PeftModel directly, so the
# wrapper class is chosen from `task_type`.
model = get_peft_model(model, peft_config)

In [None]:
# Make exactly the parameters whose name contains "lora" trainable, freeze
# every other parameter, and report how many tensors were left trainable.
num_true = 0
for param_name, weight in model.named_parameters():
    is_adapter_weight = "lora" in param_name
    weight.requires_grad = is_adapter_weight
    if is_adapter_weight:
        num_true += 1
print(num_true)

In [None]:
# Short demonstration run: 10 optimizer steps, each accumulating 4 batches of
# 4 sequences (16 sequences per update per device).
# NOTE(review): mixed precision here is fp16 while the model was loaded with
# bfloat16 compute/storage dtypes — confirm this mix is intended for the
# target GPU.
args = TrainingArguments(
    output_dir='experiments',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    max_steps=10,
    logging_steps=5,
    save_steps=5,
    save_total_limit=3,
    fp16=True,
    # The string "none" disables every logging integration; passing the
    # Python value None instead falls back to the library default.
    report_to="none",
    # Keep the raw "text"/"label" columns: collate_fn reads them itself.
    remove_unused_columns=False,
    torch_empty_cache_steps=10,
    gradient_checkpointing=True
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=imdb["train"],
    tokenizer=tokenizer,
    data_collator=collate_fn,
)

# Start training
trainer.train()

In [None]:
# Qualitative check on one test review: build the same prompt prefix used in
# training and let the model produce the two answer tokens.
input_text = imdb["test"][0]["text"] + " ||| "
label = imdb["test"][0]["label"]

# Switch to evaluation mode so LoRA dropout (lora_dropout=0.1) is not applied
# while generating.
model.eval()

# Place the inputs on whatever device the model lives on instead of
# hard-coding CUDA.
x = tokenizer(input_text, return_tensors="pt").to(model.device)

print(label)
# Greedy decoding of exactly two new tokens.
g = model.generate(**x, max_new_tokens=2, do_sample=False)
print(tokenizer.decode(g[0].tolist()))