In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv

load_dotenv()

True

In [28]:
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token 

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

Loading weights: 100%|██████████| 201/201 [00:00<00:00, 597.02it/s, Materializing param=model.norm.weight]                              


In [29]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [30]:
model = get_peft_model(model, lora_config)

In [31]:
model.print_trainable_parameters()

trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.4079


In [32]:
data = load_dataset("json", data_files=r"Dataset\train2.jsonl")["train"]

data

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 100
})

In [33]:
def format_example(example):
        prompt = f"""### Instruction:
{example['instruction']}

### Input:
{example['input']}

### Response:
{example['output']}<|endoftext|>"""
        return prompt

In [45]:
training_args = SFTConfig(
    output_dir="./finetuned_improved",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    warmup_steps=20,
    weight_decay=0.01,
    fp16=False,  
    bf16=False,  
)

In [46]:
trainer = SFTTrainer(
    model=model,
    args=training_args,  
    train_dataset=data,
    formatting_func=format_example,
)

In [47]:
trainer.processing_class = tokenizer

In [48]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
  super().__init__(loader)


Step,Training Loss
10,2.423492
20,1.493382
30,0.458161


  super().__init__(loader)
  super().__init__(loader)


TrainOutput(global_step=39, training_loss=1.1937886812748053, metrics={'train_runtime': 4248.8443, 'train_samples_per_second': 0.071, 'train_steps_per_second': 0.009, 'total_flos': 129120842907648.0, 'train_loss': 1.1937886812748053})

In [49]:
model.save_pretrained("./finetuned_improved")

tokenizer.save_pretrained("./finetuned_improved")

('./finetuned_improved\\tokenizer_config.json',
 './finetuned_improved\\chat_template.jinja',
 './finetuned_improved\\tokenizer.json')