In [None]:
import os

import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    default_data_collator,
)
from trl import SFTTrainer

Weights & Biases (wandb) logging will be added later.

In [None]:
# Hub id of the base checkpoint. Kept in its own name so `model` only ever
# refers to the loaded model object.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# SECURITY: never commit a Hugging Face token. The token that used to be
# hardcoded here must be revoked; export a fresh one as HF_TOKEN instead.
# When HF_TOKEN is unset this is None and the hub client falls back to the
# credentials stored by `huggingface-cli login`.
access_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
# Padding to a fixed length requires a pad token; reuse EOS when the
# checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# QLoRA config: 4-bit NF4 weights with double quantization, fp16 compute.
torch_dtype = torch.float16
attn_implementation = "eager"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load the quantized base model (the repo is gated, so the token is passed).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
    token=access_token,
)

In [None]:
# LoRA config: rank-16 adapters on every attention and MLP projection.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

# The base model is loaded in 4-bit, so run PEFT's k-bit preparation step
# before attaching the adapters. It has to happen here because the model is
# wrapped in this cell rather than by the trainer.
model = prepare_model_for_kbit_training(model)

# NOTE: this rebinds `model`; re-running the cell without reloading the base
# model would wrap an already wrapped model.
model = get_peft_model(model, peft_config)

Datasets preparation

In [None]:
# Locations of the JSON-lines files for each split.
# NOTE(review): train, test and validation all resolve to the same file
# (test.jsonl). This looks like a placeholder; give each split its own path
# before a real run, otherwise evaluation is performed on the training data.
train_dataset_url = test_dataset_url = validation_dataset_url = (
    "datasets_train_valid_test/test.jsonl"
)

Datasets loading

In [None]:
# Map each split name to its JSON-lines file and load them in one call.
data_files = dict(
    train=train_dataset_url,
    test=test_dataset_url,
    validation=validation_dataset_url,
)
dataset = load_dataset('json', data_files=data_files)

# Pull the individual splits out under their own names for the cells below.
train_dataset, test_dataset, validation_dataset = (
    dataset[split_name] for split_name in ('train', 'test', 'validation')
)

Datasets tokenization

In [None]:
def tokenize_function(examples, max_length=2048, text_tokenizer=None):
    """Tokenize a batch of prompt/response pairs for causal-LM fine-tuning.

    A decoder-only model is trained on one sequence, so each example becomes
    ``prompt tokens + response tokens + EOS``. ``labels`` mirrors
    ``input_ids`` position by position, with prompt and padding positions
    set to -100 so the loss only covers the response. (Tokenizing the target
    separately inside ``as_target_tokenizer()`` is a seq2seq pattern and
    produces labels that do not line up with ``input_ids``.)

    Args:
        examples: batch mapping with an ``'input'`` list (prompts) and an
            ``'output'`` list (responses) of equal length.
        max_length: every returned row is truncated or right-padded to
            exactly this many tokens.
        text_tokenizer: tokenizer to use; defaults to the notebook-level
            ``tokenizer``. It must be callable on a string, accept
            ``add_special_tokens`` and expose ``pad_token_id`` and
            ``eos_token_id``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``, each a
        list of ``max_length``-long integer lists.

    Raises:
        ValueError: if the tokenizer defines neither a pad nor an EOS token.
    """
    tok = tokenizer if text_tokenizer is None else text_tokenizer
    ignore_index = -100  # label value skipped by the cross-entropy loss

    eos_id = tok.eos_token_id
    pad_id = tok.pad_token_id if tok.pad_token_id is not None else eos_id
    if pad_id is None:
        raise ValueError("Tokenizer has neither a pad token nor an EOS token to pad with.")

    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt, answer in zip(examples['input'], examples['output']):
        prompt_ids = list(tok(prompt)["input_ids"])
        answer_ids = list(tok(answer, add_special_tokens=False)["input_ids"])
        if eos_id is not None:
            # Teach the model where the response ends.
            answer_ids.append(eos_id)

        # If the prompt alone fills max_length, every label is masked and the
        # row contributes nothing to the loss.
        input_ids = (prompt_ids + answer_ids)[:max_length]
        labels = ([ignore_index] * len(prompt_ids) + answer_ids)[:max_length]

        pad_len = max_length - len(input_ids)
        batch["input_ids"].append(input_ids + [pad_id] * pad_len)
        batch["attention_mask"].append([1] * len(input_ids) + [0] * pad_len)
        batch["labels"].append(labels + [ignore_index] * pad_len)

    return batch

# Tokenize every split in batched mode, in the order train -> validation -> test.
trained_data, validation_data, test_data = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, validation_dataset, test_dataset)
)

Training arguments

In [None]:
# Hyperparameters and output locations for the run.
batch_size = 5
epochs = 5
# NOTE(review): `max_length` is not passed to TrainingArguments below, so it
# has no effect on training from this cell; confirm whether it is still needed.
max_length = 512
output_dir = 't5_datasets_class1/results'
logs_dir = 't5_datasets_class1/logs'

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    optim="paged_adamw_32bit",
    num_train_epochs=epochs,
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument.
    eval_strategy="steps",
    eval_steps=200,
    logging_strategy="steps",
    logging_steps=200,
    logging_dir=logs_dir,
    warmup_steps=100,
    learning_rate=2e-4,
    # Mixed-precision training is left off.
    fp16=False,
    bf16=False,
)

Evaluation metrics

In [None]:
from evaluate import load
import numpy as np

# Load the "perplexity" metric object through `evaluate.load`.
# NOTE(review): this metric appears to score raw text strings with a model it
# loads itself from a hub id -- confirm that is what is wanted before feeding
# it anything derived from logits.
perplexity = load("perplexity", module_type="metric")
def compute_metrics(eval_pred):
    """Compute mean per-sequence perplexity from evaluation logits and labels.

    Perplexity is derived directly from the logits handed to this callback.
    Taking the argmax token ids and passing them to a text-scoring metric
    would not measure this model, and would require loading a second copy
    of it.

    Args:
        eval_pred: pair that unpacks to ``(logits, labels)``. ``logits`` must
            be indexable as ``(batch, seq_len, vocab)`` (a tuple of outputs is
            accepted, in which case its first element is used) and ``labels``
            as ``(batch, seq_len)`` integer ids, with -100 marking positions
            to ignore.

    Returns:
        ``{"mean_perplexity": float}``: the average over sequences of
        ``exp(mean next-token negative log-likelihood)``. Sequences with no
        scored position are skipped; the value is NaN if none can be scored.

    Note:
        Full-vocabulary logits for a whole evaluation set are very large.
        Each sequence is processed on its own here to keep temporaries small,
        but the caller still has to hold the logits in memory.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    labels = np.asarray(labels)

    sequence_perplexities = []
    for seq_logits, seq_labels in zip(logits, labels):
        # Position t predicts token t + 1, so drop the last logit row and the
        # first label.
        step_logits = np.asarray(seq_logits[:-1], dtype=np.float64)
        targets = seq_labels[1:]

        scored = targets != -100
        if not scored.any():
            continue
        step_logits = step_logits[scored]
        targets = targets[scored]

        # Numerically stable log-sum-exp over the vocabulary axis.
        peak = step_logits.max(axis=-1, keepdims=True)
        log_norm = peak[:, 0] + np.log(np.exp(step_logits - peak).sum(axis=-1))
        target_logit = step_logits[np.arange(len(targets)), targets]

        mean_nll = np.mean(log_norm - target_logit)
        sequence_perplexities.append(float(np.exp(mean_nll)))

    if not sequence_perplexities:
        return {"mean_perplexity": float("nan")}
    return {"mean_perplexity": float(np.mean(sequence_perplexities))}


Training

In [None]:
# Build the trainer on the tokenized splits.
# - Train on `trained_data` and evaluate on `validation_data`; the test split
#   stays held out for the final evaluation.
# - `model` is already wrapped by `get_peft_model`, so no `peft_config` is
#   passed here.
# - `default_data_collator` is passed explicitly so the collator in use is
#   pinned in this cell; the rows are already padded to a fixed length and
#   carry their own `labels`.
trainer = SFTTrainer(
    model=model,
    train_dataset=trained_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    args=training_arguments,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the trainer configured above; as the last expression
# in the cell, the value returned by `train()` is shown as the cell output.
trainer.train()

Saving the model

In [None]:
# Write the model and the tokenizer into the same directory so the two can
# be reloaded together from one path.
model_path = 't5_data/model'
model.save_pretrained(save_directory=model_path)
tokenizer.save_pretrained(save_directory=model_path)