In [1]:
%load_ext autoreload
%autoreload 2

## Modules

In [2]:
import torch
import os
import re
import pandas as pd
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    GenerationConfig
)
from peft.tuners.lora import LoraLayer
from trl import SFTTrainer
from tools import prep_tokenizer, prepare_model
from numba import cuda

## Config

In [3]:
# %load_ext autoreload
# %autoreload 2
# elejére

TESTING = True
os.environ["WANDB_DISABLED"] = "true"
MAX_SEQ_LENGTH = None
QUANTIZE = True
model_path = "/mnt/shared/tibor/Llama-2-7b-chat-hf"
EPOCHS = 6
# OUTPUT_DIR = f"adapters/adapter_7b_4bit_quant_{EPOCHS}_epochs_noansw_no_modansw"
OUTPUT_DIR = f"adapters/adapter_7b_4bit_quant_TEST"

## Prepare trainer and tokenizer

In [4]:
tokenizer = prep_tokenizer(model_path=model_path, add_eos_token=True)

In [5]:
print(f"{tokenizer.model_max_length=}")

tokenizer.model_max_length=1000000000000000019884624838656


In [6]:
# Test
print(f"{tokenizer.eos_token_id=}")
inputs = tokenizer("Ez egy teszt", return_tensors="pt")
input_ids = inputs["input_ids"].cuda()
input_ids

tokenizer.eos_token_id=2


tensor([[    1, 18817,  5524,   260, 23293,     2]], device='cuda:0')

In [7]:
model = prepare_model(model_path=model_path, tokenizer=tokenizer, quantize=True, load_in_4bit=True, load_in_8bit=False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
peft_config = LoraConfig(
        lora_alpha=32,
        # lora_alpha=16,
        lora_dropout=0.1,
        r=8,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules= ["q_proj","v_proj"]
        # target_modules=target_modules_all_linear_layers
)

In [9]:

training_arguments=None
if TESTING:
    training_arguments = TrainingArguments(
        output_dir=OUTPUT_DIR,
        evaluation_strategy="steps",
        do_eval=True,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        per_device_eval_batch_size=4,
        log_level="debug",
        optim="paged_adamw_32bit",
        save_steps=2, #change to 500, test: 2
        logging_steps=1, #change to 100, test: 1
        learning_rate=1e-4,
        eval_steps=5, #change to 200, test: 5
        bf16=True,
        max_grad_norm=0.3,
        # num_train_epochs=3, # remove "#"
        max_steps=10, #remove this
        warmup_ratio=0.03,
        lr_scheduler_type="constant",
)
else:
    training_arguments = TrainingArguments(
            output_dir=OUTPUT_DIR,
            evaluation_strategy="steps",
            do_eval=True,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=1,
            per_device_eval_batch_size=4,
            log_level="debug",
            optim="paged_adamw_32bit",
            save_steps=500, #change to 500, test: 2
            logging_steps=100, #change to 100, test: 1
            learning_rate=1e-4,
            eval_steps=200, #change to 200, test: 5
            bf16=True,
            max_grad_norm=0.3,
            num_train_epochs=EPOCHS, # remove "#"
            # max_steps=10, #remove this
            warmup_ratio=0.03,
            lr_scheduler_type="constant",
            report_to=None
    )

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [10]:
dataset = load_dataset("csv", 
                       data_files={'train': 'data/train_w_noansw.csv', 'eval': 'data/eval_w_noansw.csv'},
                       delimiter=";",
                       column_names=['question', 'context', 'answer', 'text'])

## Check for longest inputs

In [11]:
train_df = pd.read_csv("data/train.csv", sep=';')
eval_df = pd.read_csv("data/test.csv", sep=';')
test_df = pd.read_csv("data/test.csv", sep=';')

all_data=[train_df, eval_df, test_df]
longest_embedding=0


for df in all_data:
    for index, row in df.loc[:].iterrows():

        inputs = tokenizer(row["text"], return_tensors="pt")
        input_ids = inputs["input_ids"].cuda()
        
        if len(input_ids[0]) > longest_embedding:
            longest_embedding = len(input_ids[0])

    print(f"{longest_embedding=}")
    longest_embedding=0
        

longest_embedding=985
longest_embedding=965
longest_embedding=965


## Train

In [12]:
trainer = SFTTrainer(
        model=model,
        train_dataset=dataset['train'],
        eval_dataset=dataset['eval'],
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=min(tokenizer.model_max_length, 1024),  # default: min(tokenizer.model_max_length, 1024),
        tokenizer=tokenizer,
        args=training_arguments,
)

max_steps is given, it will override any value given in num_train_epochs
[codecarbon INFO @ 13:45:21] [setup] RAM Tracking...
[codecarbon INFO @ 13:45:22] [setup] GPU Tracking...
[codecarbon INFO @ 13:45:22] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 13:45:22] [setup] CPU Tracking...
[codecarbon INFO @ 13:45:24] CPU Model on constant consumption mode: AMD EPYC-Rome Processor
[codecarbon INFO @ 13:45:24] >>> Tracker's metadata:
[codecarbon INFO @ 13:45:24]   Platform system: Linux-5.15.0-41-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 13:45:24]   Python version: 3.10.12
[codecarbon INFO @ 13:45:24]   CodeCarbon version: 2.2.3
[codecarbon INFO @ 13:45:24]   Available RAM : 31.354 GB
[codecarbon INFO @ 13:45:24]   CPU count: 8
[codecarbon INFO @ 13:45:24]   CPU model: AMD EPYC-Rome Processor
[codecarbon INFO @ 13:45:24]   GPU count: 1
[codecarbon INFO @ 13:45:24]   GPU model: 1 x GRID A100-20C


In [13]:
trainer.train()

Currently training with a batch size of: 4
***** Running training *****
  Num examples = 3,188
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 10
  Number of trainable parameters = 4,194,304
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
5,1.8666,1.918341
10,1.7049,1.750651


Saving model checkpoint to adapters/adapter_7b_4bit_quant_TEST/checkpoint-2
tokenizer config file saved in adapters/adapter_7b_4bit_quant_TEST/checkpoint-2/tokenizer_config.json
Special tokens file saved in adapters/adapter_7b_4bit_quant_TEST/checkpoint-2/special_tokens_map.json
Saving model checkpoint to adapters/adapter_7b_4bit_quant_TEST/checkpoint-4
tokenizer config file saved in adapters/adapter_7b_4bit_quant_TEST/checkpoint-4/tokenizer_config.json
Special tokens file saved in adapters/adapter_7b_4bit_quant_TEST/checkpoint-4/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 684
  Batch size = 4
[codecarbon INFO @ 13:45:40] Energy consumed for RAM : 0.000049 kWh. RAM Power : 11.757657051086426 W
[codecarbon INFO @ 13:45:40] Energy consumed for all GPUs : 0.000000 kWh. Total GPU Power : 0.0 W
[codecarbon INFO @ 13:45:40] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 13:45:40] 0.000226 kWh of electricity used since the 

TrainOutput(global_step=10, training_loss=1.9409354090690614, metrics={'train_runtime': 182.6829, 'train_samples_per_second': 0.219, 'train_steps_per_second': 0.055, 'total_flos': 1017129075671040.0, 'train_loss': 1.9409354090690614, 'epoch': 0.01})