In [None]:
from transformers import (
    AutoConfig, 
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TextStreamer, 
    GenerationConfig, 
    logging,
    TrainingArguments,
    Trainer,
)
import datasets
import json
import pandas as pd
from pathlib import Path
import torch
import transformers
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

## Setup Dataset

In [None]:
# _OPENFUNCTIONS_TEST = "datasets/gorilla_openfunctions/test.jsonl"
# _OPENFUNCTIONS_TRAIN = "datasets/gorilla_openfunctions/train.jsonl"

Base Zephyr Model Prompt Template:
```text
<|system|>
You are a friendly chatbot who always responds in the style of a pirate.</s>
<|user|>
How many helicopters can a human eat in one sitting?</s>
<|assistant|>
Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food!
```

In [None]:
# test_data = pd.read_json(_OPENFUNCTIONS_TEST, lines=True)
# train_data = pd.read_json(_OPENFUNCTIONS_TRAIN, lines=True)

In [None]:
# column_types = {
#     'question': 'string',
#     'function': 'string',
#     'model_answer': 'string',
# }
# test_data = test_data.astype(column_types)
# train_data = train_data.astype(column_types)

In [None]:
# train_data['Functions'][432]

## Train Model

In [None]:
# Set the verbosity of the `transformers` logging utility (imported in the first cell) to INFO.
logging.set_verbosity_info()

In [None]:
# Local directory the base model, its config and its tokenizer are loaded from
# (relative to the notebook's working directory).
_BASE_MODEL_PATH = Path('../models/zephyr-7b-beta/')
# Directory used both as the Trainer's output_dir and as the save location of the LoRA adapter.
_LORA_OUTPUT_PATH = Path('../models/loras/')

In [None]:
# Load the tokenizer (with use_fast=False) and the model config from the local base checkpoint.
base_model_tokenizer = AutoTokenizer.from_pretrained(_BASE_MODEL_PATH, use_fast=False)
base_model_config = AutoConfig.from_pretrained(_BASE_MODEL_PATH)

In [None]:
# Override the dtype recorded on the config with float16; the model-loading cell
# passes this value on as `torch_dtype`.
base_model_config.torch_dtype = torch.float16

In [None]:
# Instantiate the base causal-LM from the local checkpoint, using the config
# (and the float16 dtype set on it) prepared in the previous cells.
base_model = AutoModelForCausalLM.from_pretrained(
    _BASE_MODEL_PATH,
    config=base_model_config, 
    device_map='auto', 
    torch_dtype=base_model_config.torch_dtype,
    low_cpu_mem_usage=True
)

In [None]:
# Freeze every weight of the base model: only the LoRA adapter attached later
# is meant to be trained, so the base parameters must not collect gradients.
# The trailing semicolon keeps the notebook from echoing the returned module.
base_model.requires_grad_(False);

In [None]:
# Turn off `use_cache` on the model config before training
# (gradient checkpointing is enabled on this model in a later cell).
base_model.config.use_cache = False

In [None]:
# Display the tokenizer's special tokens as a tuple, in the order (bos, pad, eos, unk).
tuple(
    getattr(base_model_tokenizer, f"{role}_token")
    for role in ("bos", "pad", "eos", "unk")
)

### [Gradient Accumulation](https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-accumulation)
> The idea behind gradient accumulation is to compute the gradients in smaller steps instead of for the whole batch at once. We do this by iterating over smaller batches, running a forward and backward pass through the model for each one and accumulating the gradients along the way. Once enough gradients have been accumulated, we run the model’s optimization step. This way we can easily increase the overall batch size to numbers that would never fit into the GPU’s memory. In turn, however, the added forward and backward passes can slow down the training a bit.

### [Gradient Checkpointing](https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing)
> Even when we set the batch size to 1 and use gradient accumulation, we can still run out of memory when working with large models. To compute the gradients during the backward pass, all activations from the forward pass are normally saved, which can create a big memory overhead. Alternatively, one could discard all activations during the forward pass and recompute them on demand during the backward pass; this, however, would add a significant computational overhead and slow down training.
>
> Gradient checkpointing strikes a compromise between the two approaches: it saves strategically selected activations throughout the computational graph, so only a fraction of the activations need to be recomputed for the gradients. The guide linked in the heading above points to an article explaining the ideas behind gradient checkpointing.

In [None]:
# Enable gradient checkpointing on the base model (see the "Gradient Checkpointing" notes above).
base_model.gradient_checkpointing_enable()
# NOTE(review): presumably needed so gradients can still reach the adapter through the
# frozen, checkpointed base model — confirm against the Transformers/PEFT documentation.
base_model.enable_input_require_grads()

In [None]:
def print_trainable_parameters(model: nn.Module):
    """
    Print how many of the model's parameters are trainable.

    Sums the element counts of every parameter tensor of ``model`` and of the
    subset whose ``requires_grad`` flag is set, then prints both totals together
    with the trainable share expressed as a percentage.

    Args:
        model: Module whose parameters are counted.

    Returns:
        None. The summary line is written to stdout via ``print``.
    """
    trainable_params = 0
    all_param = 0
    for param in model.parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A module without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable(%): {trainable_pct}"
    )

Reference on the `LoraConfig` parameters used below: https://medium.com/@manyi.yim/more-about-loraconfig-from-peft-581cf54643db

In [None]:
# LoRA adapter configuration for causal language modelling.
# Commented-out entries list further LoraConfig parameters that are left at their defaults;
# the `#!` comments appear to record alternative values to experiment with.
lora_config = LoraConfig(
    # peft_type: str | PeftType = None,
    # auto_mapping: dict | None = None,
    # base_model_name_or_path: str = None,
    # revision: str = None,
    task_type = TaskType.CAUSAL_LM,
    # inference_mode: bool = False,
    r = 64, #! 8, 16, 32, 64
    # Only the modules with these names receive LoRA adapters.
    target_modules = ["q_proj", "v_proj"],
    lora_alpha = 16, #! 8, 16, 32
    lora_dropout = 0.1, #! 0.05
    # fan_in_fan_out: bool = False,
    bias = "none",
    # modules_to_save: List[str] | None = None,
    # init_lora_weights: bool = True,
    # layers_to_transform: List[int] | int | None = None,
    # layers_pattern: str | None = None
)
# Attach the adapter to the base model and report how many parameters are trainable.
peft_model = get_peft_model(base_model, lora_config)
print_trainable_parameters(peft_model)

In [None]:
# Training hyper-parameters for the LoRA fine-tuning run.
# NOTE(review): the variable name is misspelled ("trainig_parms"); it is kept as-is
# because the Trainer cell below references it by this name.
trainig_parms = TrainingArguments(
    # Trainer output goes to the LoRA output directory.
    output_dir=_LORA_OUTPUT_PATH,
    num_train_epochs=1,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=4,
    
    logging_steps=25, # Default: 500
    # fp16=True,
    
    save_steps=25,
    save_safetensors=True,
    report_to="tensorboard",
)

In [None]:
# Sample dataset
# Load "Abirate/english_quotes" and tokenize its 'quote' column in batches with the
# base-model tokenizer; no truncation or padding options are passed here.
data = datasets.load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: base_model_tokenizer(samples['quote']), batched=True)

In [None]:
# Keep only the first 100 rows of the train split, then display the dataset summary.
data['train'] = data['train'].select(range(100))
data

In [None]:
# Build the Trainer for the adapter-wrapped model on the 100-row train subset.
# No evaluation dataset or callbacks are passed (those arguments are commented out).
trainer = Trainer(
    model=peft_model,
    train_dataset=data['train'],
    # eval_dataset=data['validation'],
    args=trainig_parms,
    tokenizer=base_model_tokenizer,
    # callbacks=[],
    # mlm=False selects the non-masked (causal) language-modelling collation.
    data_collator=transformers.DataCollatorForLanguageModeling(base_model_tokenizer, mlm=False),
)

In [None]:
# Run the fine-tuning loop with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Generation sanity check on training sample 76: decode the model output next to the reference quote.
# NOTE(review): `get_peft_model` was called on `base_model`, so this presumably runs with
# the LoRA layers attached rather than the untouched base weights — confirm.
# NOTE(review): the prompt is the full tokenized quote and no `max_new_tokens` is passed,
# so the output length is governed by the default generation settings — confirm this is intended.

# Training leaves the model in train mode; switch to eval so dropout
# (lora_dropout=0.1) is inactive while generating.
base_model.eval()
prompt = data['train'][76]['input_ids']
# Add a batch dimension and move the prompt to the model's device: the model was placed
# with device_map='auto', and a CPU prompt can trigger a device-mismatch warning or error.
prompt = torch.tensor(prompt).unsqueeze(0).to(base_model.device)
outputs = base_model.generate(prompt)
base_model_tokenizer.decode(outputs[0], skip_special_tokens=True), data['train'][76]['quote']

In [None]:
# Save the trained adapter-wrapped model, then its config, into the LoRA output directory.
peft_model.save_pretrained(_LORA_OUTPUT_PATH)
peft_model.config.save_pretrained(_LORA_OUTPUT_PATH)
# Reload a model from the directory just written, with the same config, dtype and
# device placement options that were used for the base model.
# NOTE(review): this rebinds `peft_model` to whatever AutoModelForCausalLM returns for that
# directory; presumably this relies on the Transformers PEFT integration resolving the
# adapter files — confirm for the installed library versions.
peft_model = AutoModelForCausalLM.from_pretrained(
    _LORA_OUTPUT_PATH,
    config=base_model_config,
    device_map='auto',
    torch_dtype=base_model_config.torch_dtype,
    low_cpu_mem_usage=True
)

# Same `use_cache` setting that was applied to the base model before training.
peft_model.config.use_cache = False


In [None]:
# Tokenizer for the reloaded model, loaded from the same base checkpoint directory.
peft_model_tokenizer = AutoTokenizer.from_pretrained(_BASE_MODEL_PATH, use_fast=False)

# Prompt with only the first five tokens of training sample 76 and let the model continue.
prompt = data['train'][76]['input_ids'][:5]
# Add a batch dimension and move the prompt to the model's device: the model was loaded
# with device_map='auto', and a CPU prompt can trigger a device-mismatch warning or error.
prompt = torch.tensor(prompt).unsqueeze(0).to(peft_model.device)
outputs = peft_model.generate(prompt)
# Show the decoded generation next to the reference quote.
peft_model_tokenizer.decode(outputs[0], skip_special_tokens=True), data['train'][76]['quote']
