In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer , AutoModelForCausalLM , TrainingArguments, Trainer , BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map = 'auto',
    trust_remote_code = True
)

tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True)


In [3]:
lora_config = LoraConfig(
    r=8,
    lora_alpha = 16,
    target_modules=['q_proj','v_proj'],
    lora_dropout=0.05,
    bias = 'none',
    task_type = TaskType.CAUSAL_LM
)
model=get_peft_model(model,lora_config)

In [4]:
data = load_dataset('openai/gsm8k','main',split='train[:200]')

In [5]:
def tokenize(batch):
    texts = [
        f'### Instruction:\n{question}\n### Response:\n{output}'
        for question, output in zip(batch['question'], batch['answer'])
    ]
    tokens = tokenizer(
        texts,
        padding='max_length',
        max_length=512,
        truncation=True,
        return_tensors='pt'
    )
    tokens['labels'] = tokens['input_ids'].clone()
    return tokens

In [6]:
tokenized_data = data.map(tokenize,batched=True,remove_columns=data.column_names)

Map: 100%|██████████| 200/200 [00:00<00:00, 1031.81 examples/s]


In [9]:
training_args = TrainingArguments(
    output_dir='./lora-tinyllama-gsm8k',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-3,
    num_train_epochs=50,
    fp16=True,
    logging_steps=20,
    save_strategy='epoch',
    report_to='none',
    remove_unused_columns=False,
    label_names=['labels']
)

In [10]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset=tokenized_data,
    processing_class= tokenizer
)

In [11]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss
20,2.8441
40,0.4552
60,0.3912
80,0.3457
100,0.2997
120,0.2663
140,0.228
160,0.1841
180,0.1629
200,0.123


TrainOutput(global_step=650, training_loss=0.1843914624131643, metrics={'train_runtime': 870.9587, 'train_samples_per_second': 11.482, 'train_steps_per_second': 0.746, 'total_flos': 3.181482344448e+16, 'train_loss': 0.1843914624131643, 'epoch': 50.0})

In [13]:
model.save_pretrained('./lora-tinyllama-gsm8k-after-train')
tokenizer.save_pretrained('./lora-tinyllama-gsm8k-after-train')

('./lora-tinyllama-gsm8k-after-train\\tokenizer_config.json',
 './lora-tinyllama-gsm8k-after-train\\special_tokens_map.json',
 './lora-tinyllama-gsm8k-after-train\\chat_template.jinja',
 './lora-tinyllama-gsm8k-after-train\\tokenizer.json')