## Modules

In [1]:
import torch
import os
import re
import pandas as pd
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    GenerationConfig
)
from peft.tuners.lora import LoraLayer

from trl import SFTTrainer

## Config

In [2]:
TESTING = False
os.environ["WANDB_DISABLED"] = "true"
MAX_SEQ_LENGTH = None
QUANTIZE = True
model_path = "/mnt/shared/tibor/Llama-2-13b-chat-hf"
EPOCHS = 6
OUTPUT_DIR = f"adapters/adapter_13b_4bit_quant_{EPOCHS}_epochs_noansw_no_modansw"

## Prepare trainer and tokenizer

In [3]:
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, add_eos_token=True)
#Create a new token and add it to the tokenizer
tokenizer.add_special_tokens({"pad_token":"<pad>"})
tokenizer.padding_side = 'left'
print(f"{tokenizer.model_max_length=}")

tokenizer.model_max_length=1000000000000000019884624838656


In [4]:
print(f"{tokenizer.eos_token_id=}")
inputs = tokenizer("Ez egy teszt", return_tensors="pt")
input_ids = inputs["input_ids"].cuda()
input_ids

tokenizer.eos_token_id=2


tensor([[    1, 18817,  5524,   260, 23293,     2]], device='cuda:0')

In [5]:
# compute_dtype = getattr(torch, "float16")
compute_dtype = "bfloat16"

# 8 bit qunatised 7b + LoRa > 20 GB

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
        # load_in_8bit=True,
        # bnb_8bit_quant_type="nf8",
        # bnb_8bit_compute_dtype=compute_dtype,
        # bnb_8bit_use_double_quant=True,
)

if QUANTIZE:
    model = AutoModelForCausalLM.from_pretrained(
              model_path, quantization_config=bnb_config, device_map={"": 0}
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
              model_path, torch_dtype=torch.bfloat16,  device_map={"": 0}
    )

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
#Resize the embeddings
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=16) # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc

Embedding(32016, 5120)

In [7]:
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False # Gradient checkpointing is used by default but not compatible with caching

In [8]:
model_modules = str(model.modules)
pattern = r'\((\w+)\): Linear'
linear_layer_names = re.findall(pattern, model_modules)

names = []
# Print the names of the Linear layers
for name in linear_layer_names:
    names.append(name)
target_modules_all_linear_layers = list(set(names))
target_modules_all_linear_layers

['v_proj',
 'up_proj',
 'k_proj',
 'q_proj',
 'gate_proj',
 'o_proj',
 'lm_head',
 'down_proj']

In [9]:
model = prepare_model_for_kbit_training(model) # ?
peft_config = LoraConfig(
        lora_alpha=32,
        # lora_alpha=16,
        lora_dropout=0.1,
        r=8,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules= ["q_proj","v_proj"]
        # target_modules=target_modules_all_linear_layers
)

In [10]:

training_arguments=None
if TESTING:
    training_arguments = TrainingArguments(
        output_dir=OUTPUT_DIR,
        evaluation_strategy="steps",
        do_eval=True,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        per_device_eval_batch_size=4,
        log_level="debug",
        optim="paged_adamw_32bit",
        save_steps=2, #change to 500, test: 2
        logging_steps=1, #change to 100, test: 1
        learning_rate=1e-4,
        eval_steps=5, #change to 200, test: 5
        bf16=True,
        max_grad_norm=0.3,
        # num_train_epochs=3, # remove "#"
        max_steps=10, #remove this
        warmup_ratio=0.03,
        lr_scheduler_type="constant",
)
else:
    training_arguments = TrainingArguments(
            output_dir=OUTPUT_DIR,
            evaluation_strategy="steps",
            do_eval=True,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=1,
            per_device_eval_batch_size=4,
            log_level="debug",
            optim="paged_adamw_32bit",
            save_steps=500, #change to 500, test: 2
            logging_steps=100, #change to 100, test: 1
            learning_rate=1e-4,
            eval_steps=200, #change to 200, test: 5
            bf16=True,
            max_grad_norm=0.3,
            num_train_epochs=EPOCHS, # remove "#"
            # max_steps=10, #remove this
            warmup_ratio=0.03,
            lr_scheduler_type="constant",
            report_to=None
    )

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [11]:
dataset = load_dataset("csv", 
                       data_files={'train': 'data/train_w_noansw.csv', 'eval': 'data/eval_w_noansw.csv'},
                       delimiter=";",
                       column_names=['question', 'context', 'answer', 'text'])

### Check for longest input

In [12]:
inputs = tokenizer("Ez egy teszt", return_tensors="pt")
input_ids = inputs["input_ids"].cuda()
print(f"{input_ids=}")


input_ids=tensor([[    1, 18817,  5524,   260, 23293,     2]], device='cuda:0')


In [13]:
train_df = pd.read_csv("data/train.csv", sep=';')
eval_df = pd.read_csv("data/test.csv", sep=';')
test_df = pd.read_csv("data/test.csv", sep=';')

all_data=[train_df, eval_df, test_df]
longest_embedding=0


for df in all_data:
    for index, row in df.loc[:].iterrows():

        inputs = tokenizer(row["text"], return_tensors="pt")
        input_ids = inputs["input_ids"].cuda()
        
        if len(input_ids[0]) > longest_embedding:
            longest_embedding = len(input_ids[0])

    print(f"{longest_embedding=}")
    longest_embedding=0
        

longest_embedding=985
longest_embedding=965
longest_embedding=965


## Train

In [14]:
trainer = SFTTrainer(
        model=model,
        train_dataset=dataset['train'],
        eval_dataset=dataset['eval'],
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=min(tokenizer.model_max_length, 1024),  # default: min(tokenizer.model_max_length, 1024),
        tokenizer=tokenizer,
        args=training_arguments,
)



Map:   0%|          | 0/3188 [00:00<?, ? examples/s]

Map:   0%|          | 0/684 [00:00<?, ? examples/s]

[codecarbon INFO @ 10:06:30] [setup] RAM Tracking...
[codecarbon INFO @ 10:06:30] [setup] GPU Tracking...
[codecarbon INFO @ 10:06:30] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 10:06:30] [setup] CPU Tracking...
[codecarbon INFO @ 10:06:32] CPU Model on constant consumption mode: AMD EPYC-Rome Processor
[codecarbon INFO @ 10:06:32] >>> Tracker's metadata:
[codecarbon INFO @ 10:06:32]   Platform system: Linux-5.15.0-41-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 10:06:32]   Python version: 3.10.12
[codecarbon INFO @ 10:06:32]   CodeCarbon version: 2.2.3
[codecarbon INFO @ 10:06:32]   Available RAM : 31.354 GB
[codecarbon INFO @ 10:06:32]   CPU count: 8
[codecarbon INFO @ 10:06:32]   CPU model: AMD EPYC-Rome Processor
[codecarbon INFO @ 10:06:32]   GPU count: 1
[codecarbon INFO @ 10:06:32]   GPU model: 1 x GRID A100-20C


In [15]:
trainer.train()

Currently training with a batch size of: 4
***** Running training *****
  Num examples = 3,188
  Num Epochs = 6
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 4,782
  Number of trainable parameters = 6,553,600
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
200,1.1973,1.178835
400,1.1505,1.145909
600,1.1291,1.125832
800,1.1094,1.105048
1000,1.0372,1.091474
1200,1.0381,1.072575
1400,1.0234,1.055258
1600,1.0047,1.037315
1800,0.9086,1.020952
2000,0.9158,1.000045


[codecarbon INFO @ 10:06:48] Energy consumed for RAM : 0.000049 kWh. RAM Power : 11.757657051086426 W
[codecarbon INFO @ 10:06:48] Energy consumed for all GPUs : 0.000000 kWh. Total GPU Power : 0.0 W
[codecarbon INFO @ 10:06:48] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:06:48] 0.000226 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:07:03] Energy consumed for RAM : 0.000098 kWh. RAM Power : 11.757657051086426 W
[codecarbon INFO @ 10:07:03] Energy consumed for all GPUs : 0.000000 kWh. Total GPU Power : 0.0 W
[codecarbon INFO @ 10:07:03] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:07:03] 0.000452 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:07:18] Energy consumed for RAM : 0.000147 kWh. RAM Power : 11.757657051086426 W
[codecarbon INFO @ 10:07:18] Energy consumed for all GPUs : 0.000000 kWh. Total GPU Power : 0.0 W
[codecarbon INFO @ 10:07:18] Energy 

TrainOutput(global_step=4782, training_loss=0.8513053820652724, metrics={'train_runtime': 15625.5374, 'train_samples_per_second': 1.224, 'train_steps_per_second': 0.306, 'total_flos': 9.468150134956032e+17, 'train_loss': 0.8513053820652724, 'epoch': 6.0})