In [1]:
# Modules for fine-tuning
from unsloth import FastLanguageModel
import torch # Import PyTorch
from trl import SFTTrainer # Trainer for supervised fine-tuning (SFT)
from unsloth import is_bfloat16_supported # Checks if the hardware supports bfloat16 precision
from transformers import TrainingArguments # Defines training hyperparameters
from datasets import load_dataset # Lets you load fine-tuning datasets

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
from transformers.utils import logging
from huggingface_hub import HfApi, HfFolder, hf_hub_download

import os
os.environ["TRANSFORMERS_OFFLINE"] = "1"  # <- ini penting


In [3]:
model_path = r"C:\Users\Kentdry\Documents\VSCODE\TA1(Deepseek)\models--unsloth--llama-3.2-1b-instruct-unsloth-bnb-4bit\snapshots\0a4436e20494a6504464ce35274b7e53fb7883d0"  # lengkapin path-nya
max_seq_length = 2048  # Maximum number of tokens processed at once
dtype = None  # Default data type (adjusts automatically)
load_in_4bit = True  # Enable 4-bit quantization to save memory
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_path,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"cuda:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.5.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3050 6GB Laptop GPU. Num GPUs = 1. Max memory: 6.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
# Define a system prompt under prompt_style 
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
{}

### Question:
{}

### Response:
{}"""

In [5]:
# Creating a test question for inference
instruction = "tell me who is this person"
question = "Gibran Rakabuming Raka"

# Enable optimized inference mode for Unsloth models (improves speed and efficiency)
FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!

# Format the question using the structured prompt (`prompt_style`) and tokenize it
inputs = tokenizer([prompt_style.format(instruction,question, "")], return_tensors="pt").to("cuda")  # Convert input to PyTorch tensor & move to GPU

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
outputs = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
tell me who is this person

### Question:
Gibran Rakabuming Raka

### Response:
Gibran Rakabuming Raka is a Chilean poet, writer, and artist. He is known for his lyrical and mystical poetry, often exploring themes of love, nature, and spirituality. Raka's work has been widely translated and admired globally, and he has received numerous awards and honors for his contributions to literature.<|eot_id|>


In [6]:
# We need to format the dataset to fit our prompt training style 
EOS_TOKEN = tokenizer.eos_token  # Define EOS_TOKEN which the model when to stop generating text during training
EOS_TOKEN

'<|eot_id|>'

In [11]:
chat_templates="""Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question 
    ### Instruction:
    {}
    ### Input:
    {}
    ### Response:
    {}"""

def formatting_prompts_func(example):
    instructions = example["instruction"]
    inputs       = example["input"]
    outputs      = example["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = chat_templates.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass


dataset_path = r"C:\Users\Kentdry\Documents\VSCODE\TA1(Deepseek)\alpaca_cleaned.json"
from datasets import load_dataset

dataset = load_dataset("json", data_files=dataset_path, split="train")
dataset = dataset.map(formatting_prompts_func, batched = True,)
alpaca_dataset = dataset.train_test_split(test_size=0.01, seed=42)["train"]  # Ambil 10% data acak
print(dataset)

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 51760
})


In [13]:
print(alpaca_dataset[1])
print(alpaca_dataset[5000])
# print(alpaca_dataset[5000])

{'instruction': 'Tell me the title and author of a book that won the Pulitzer Prize in 2019', 'input': '', 'output': 'The book that won the Pulitzer Prize for Fiction in 2019 is "The Overstory" by Richard Powers.', 'text': 'Below is an instruction that describes a task, paired with an input that provides further context. \nWrite a response that appropriately completes the request. \nBefore answering, think carefully about the question \n    ### Instruction:\n    Tell me the title and author of a book that won the Pulitzer Prize in 2019\n    ### Input:\n    \n    ### Response:\n    The book that won the Pulitzer Prize for Fiction in 2019 is "The Overstory" by Richard Powers.<|eot_id|>'}
{'instruction': 'Identify two potential interpretations of the sentence.', 'input': 'I said goodbye to my parents.', 'output': '1. The speaker physically spoke the words "goodbye" to their parents as a farewell.\n2. The speaker metaphorically said goodbye to their parents, meaning they parted ways or end

In [14]:
# Apply LoRA (Low-Rank Adaptation) fine-tuning to the model 
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # LoRA rank: Determines the size of the trainable adapters (higher = more parameters, lower = more efficiency)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", 
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=64,  # Scaling factor for LoRA updates (higher values allow more influence from LoRA layers)
    lora_dropout=0,  # Dropout rate for LoRA layers (0 means no dropout, full retention of information)
    bias="none",  # Specifies whether LoRA layers should learn bias terms (setting to "none" saves memory)
    use_gradient_checkpointing="unsloth",  # Saves memory by recomputing activations instead of storing them (recommended for long-context fine-tuning)
    random_state=3407,  # Sets a seed for reproducibility, ensuring the same fine-tuning behavior across runs
    use_rslora=False,  # Whether to use Rank-Stabilized LoRA (disabled here, meaning fixed-rank LoRA is used)
    loftq_config=None,  # Low-bit Fine-Tuning Quantization (LoFTQ) is disabled in this configuration
)
model.print_trainable_parameters()

Unsloth 2025.5.7 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


trainable params: 22,544,384 || all params: 1,258,358,784 || trainable%: 1.7916


Now, we initialize `SFTTrainer`, a supervised fine-tuning trainer from `trl` (Transformer Reinforcement Learning), to fine-tune our model efficiently on a dataset.

In [15]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = alpaca_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 1,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        # max_steps = 60,
        num_train_epochs=1,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "lora_modely",
        report_to = "none",
    ),
)

Unsloth: Tokenizing ["text"]: 100%|██████████| 51242/51242 [00:08<00:00, 6256.22 examples/s]


## Step 4 — Model training! 

This should take around 30 to 40 minutes — we can then check out our training results on Weights and Biases

In [16]:
# Start the fine-tuning process
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 51,242 | Num Epochs = 1 | Total steps = 6,405
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 22,544,384/1,000,000,000 (2.25% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.7997
20,1.1443
30,1.0804
40,1.0805
50,0.9844
60,1.0773
70,1.0063
80,1.0474
90,1.0407
100,0.9524


In [17]:
# model.save_pretrained("lora_model")  # Local saving
trainer.model.save_pretrained("lora_modely")
tokenizer.save_pretrained("lora_modely")
trainer.save_model("lora_modely")

In [None]:
from unsloth import FastLanguageModel
if True:   
    # Set parameters
    model_path = r"C:\Users\Kentdry\Documents\VSCODE\TA1(Deepseek)\models--unsloth--llama-3.2-1b-instruct-unsloth-bnb-4bit\snapshots\0a4436e20494a6504464ce35274b7e53fb7883d0"  # lengkapin path-nya
    max_seq_length = 2048  # Maximum number of tokens processed at once
    dtype = None  # Default data type (adjusts automatically)
    load_in_4bit = True  # Enable 4-bit quantization to save memory
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_path,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
model.load_adapter("lora_modely", adapter_name="default")
model.set_adapter("default")

FastLanguageModel.for_inference(model)


==((====))==  Unsloth 2025.5.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3050 6GB Laptop GPU. Num GPUs = 1. Max memory: 6.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Identity()
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=32, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=32, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_features=512, bias=False)
            (lora_dropout): ModuleDict(
              (

In [19]:
chat_templates="""Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question 
    ### Instruction:
    {}
    ### Input:
    {}
    ### Response:
    {}"""

In [20]:

# FastLanguageModel.for_inference(model)
instruction = "describe him as a bad person"
question = "Angelo"

inputs = tokenizer(
    chat_templates.format(instruction,question,"")# output - leave this blank for generation!
    , return_tensors = "pt").to("cuda")
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
outputs = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question 
    ### Instruction:
    describe him as a bad person
    ### Input:
    Angelo
    ### Response:
     I'm sorry, but as an AI, I do not have enough information about Angelo to accurately describe him as a bad person. Can you please provide more details or context about Angelo's behavior or actions that would help me better understand his character and describe him as a bad person?<|eot_id|>


In [21]:
inputs = tokenizer(
[
    chat_templates.format(
        "how many student studied at SMA Katolik Yos Sudarso Batam?",        
        "",
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)



<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question 
    ### Instruction:
    how many student studied at SMA Katolik Yos Sudarso Batam?
    ### Input:
    
    ### Response:
     According to my knowledge, SMA Katolik Yos Sudarso Batam is a school affiliated with the Roman Catholic Diocese of Riau in Riau Island, which is located in the southeast of the Indonesian archipelago. The school is based in RIAU's administrative capital, Pekanbaru in KALBAR, and has a student population of around 1,500 students, divided among its different schools and campuses. However, please note that the exact number of students may change over time and I couldn't find any up-to-date information on this specific number.<|eot_id|>
