In [1]:
import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
# Maximum sequence length; the same value is passed to the SFTTrainer further down.
max_seq_length = 2048

# Load the 4-bit (bitsandbytes) Llama 3.1 8B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="LLMs/Meta-Llama-3.1-8B-bnb-4bit",
    load_in_4bit=True,
    max_seq_length=max_seq_length,
    # No dtype is forced here; presumably Unsloth picks one for the GPU
    # (the load banner reports Bfloat16 = TRUE) — TODO confirm.
    dtype=None,
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 24.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [3]:
# Attach LoRA adapters (rank 16, alpha 16, no dropout, rank-stabilized LoRA enabled)
# to the attention (q/k/v/o) and MLP (gate/up/down) projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj",
        "up_proj", "down_proj",
        "o_proj", "gate_proj",
    ],
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
)


Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [18]:
# Rebind the tokenizer to a ChatML-templated version. The mapping names the
# keys and role labels used by the dataset: messages store the role under
# "from" and the text under "value", with "human"/"gpt" as user/assistant.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
    mapping={
        "role": "from",
        "content": "value",
        "user": "human",
        "assistant": "gpt",
    },
)

def apply_template(examples):
    """Render every conversation in a batch into one chat-formatted string.

    Relies on the module-level ``tokenizer`` and its chat template.

    Args:
        examples: Batch mapping whose "conversations" entry holds one message
            list per example.

    Returns:
        A dict with a single "text" key containing one rendered string per
        conversation, in input order.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # Ask for text rather than token ids, and do not append a generation prompt.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Sources tried previously in this cell: "Fine_tuning_data/FineTome-100k" and "CoT_data".
# The active source is the local JSONL file below.
data_files = "Fine_tuning_data/CoT_data.jsonl"

# Read the JSONL file as the train split, then add the chat-formatted "text"
# column by running apply_template over the data in batches.
dataset = load_dataset("json", data_files=data_files, split="train").map(
    apply_template,
    batched=True,
)



Map:   0%|          | 0/1685 [00:00<?, ? examples/s]

In [19]:
# Train in bf16 when the GPU supports it; otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

# Optimizer, schedule and logging settings for a single-epoch run.
training_args = TrainingArguments(
    output_dir="output",
    seed=0,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # 8 x 2 = total batch size of 16 on one GPU
    learning_rate=3e-4,
    lr_scheduler_type="linear",
    warmup_steps=10,
    optim="adamw_8bit",
    weight_decay=0.01,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,  # report the training loss at every step
)

# Supervised fine-tuning on the "text" column, with packing enabled.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=True,
    dataset_num_proc=2,
)

# Left as the last expression so the notebook displays the returned TrainOutput.
trainer.train()


Generating train split: 0 examples [00:00, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 700 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 2
\        /    Total batch size = 16 | Total steps = 44
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.0211
2,1.0129
3,0.996
4,0.9379
5,0.9142
6,0.8983
7,0.8162
8,0.8507
9,0.7462
10,0.7365


TrainOutput(global_step=44, training_loss=0.71166385168379, metrics={'train_runtime': 1202.804, 'train_samples_per_second': 0.582, 'train_steps_per_second': 0.037, 'total_flos': 6.49151373115392e+16, 'train_loss': 0.71166385168379, 'epoch': 1.0})

In [24]:
# Put the fine-tuned model into Unsloth's inference mode before generating.
model = FastLanguageModel.for_inference(model)

# Single-turn prompt, using the same "from"/"value" keys as the training data.
messages = [{"from": "human", "value": "Is 9.11 larger than 9.9?"}]

# Render the prompt through the chat template (the generation prompt opens the
# assistant turn), tokenize it to a PyTorch tensor, and move it to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Stream the decoded tokens to the cell output as they are produced; the
# tensor returned by generate() is deliberately discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=1280,
    use_cache=True,
)


<|im_start|>user
Is 9.11 larger than 9.9?<|im_end|>
<|im_start|>assistant
Agent 1 (Reasoning): Okay, let's break down this question. We need to compare 9.11 and 9.9. Since we're dealing with numbers, we can use the comparison operator "greater than" to determine which one is larger.

Agent 2 (Verification): Hold on, let's make sure we're on the right track. We're comparing two decimal numbers, right?

Agent 1 (Reasoning): That's correct. We have 9.11 and 9.9, and we need to determine which one is larger.

Agent 2 (Verification): Okay, so what's the first step in comparing these numbers?

Agent 1 (Reasoning): Well, we can convert them to fractions. 9.11 can be written as 911/100, and 9.9 can be written as 990/100.

Agent 2 (Verification): That makes sense. Now, what's the next step?

Agent 1 (Reasoning): We can compare the fractions. Since 911/100 is larger than 990/100, we can conclude that 9.11 is larger than 9.9.

Agent 2 (Verification): I see. And what about the comparison operator?

In [8]:
# Merge the LoRA weights into the base model and save the result as 16-bit
# weights, together with the tokenizer, under the local "model" directory.
model.save_pretrained_merged(
    "model",
    tokenizer,
    save_method="merged_16bit",
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 1.94 out of 15.51 RAM for saving.


100%|██████████| 32/32 [00:09<00:00,  3.48it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


In [None]:
# GGUF quantization levels to export and upload.
quant_methods = ["q2_k", "q3_k_m", "q4_k_m", "q5_k_m", "q6_k", "q8_0"]

# Pass the whole list in one call instead of calling push_to_hub_gguf once per
# level. Unsloth accepts a list for `quantization_method` and documents this
# form as the faster way to export several formats, since each separate call
# would otherwise repeat the export pipeline from the start.
# NOTE(review): the target repo must live in a namespace your Hugging Face
# credentials can write to — confirm the repo id before running this cell.
model.push_to_hub_gguf(
    "mlabonne/FineLlama-3.1-8B-GGUF",
    tokenizer,
    quantization_method=quant_methods,
)

In [2]:
# NOTE(review): this cell carries execution count 2 yet sits at the end of the
# notebook. Running it rebinds `dataset` to the FineTome-100k train split
# (without the chat-formatted "text" column), replacing the dataset prepared
# for training in the earlier cell — confirm whether it is still needed.
dataset = load_dataset(
    "Fine_tuning_data/FineTome-100k",
    split="train",
)