In [1]:
!pip install -q transformers torch accelerate trl peft unsloth ollama datasets evaluate

In [2]:
import torch, unsloth, trl, transformers, bitsandbytes, datasets, accelerate, peft

# Report the installed version of every core library, one line per package,
# so the environment behind the recorded outputs can be reconstructed.
for lib_label, lib in (
    ("torch :", torch),
    ("unsloth:", unsloth),
    ("trl   :", trl),
    ("trans :", transformers),
    ("bnb   :", bitsandbytes),
    ("datasets:", datasets),
    ("peft:", peft),
    ("accelerate:", accelerate),
):
    print(lib_label, lib.__version__)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
torch : 2.7.0+cu126
unsloth: 2025.4.7
trl   : 0.15.2
trans : 4.51.3
bnb   : 0.45.5
datasets: 3.5.1
peft: 0.15.2
accelerate: 1.6.0


In [3]:
# Authenticate against the Hugging Face Hub.
# SECURITY: access tokens must never be written into the notebook, not even in a
# comment. A token that was previously committed here should be revoked on the Hub.
import os
from getpass import getpass

from huggingface_hub import login

# Prefer the HF_TOKEN environment variable; otherwise prompt without echoing the value.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [7]:
import unsloth
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import (
    Trainer,
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    default_data_collator
)
from unsloth import is_bfloat16_supported
import os, math
from datasets import load_dataset, Features, Value
import evaluate
from trl import SFTTrainer, SFTConfig
from unsloth import FastLanguageModel

In [8]:
# --- Base model configuration -------------------------------------------------
max_seq_length = 2048  # also passed to both SFTTrainer instances further down
dtype = None           # forwarded unchanged; presumably lets Unsloth pick the dtype — confirm
load_in_4bit = True    # the checkpoint below is a bnb 4-bit export

model_id = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"

# Load the model and its tokenizer through Unsloth.
# The Hub token is read from the environment instead of being hard-coded in the
# notebook; when HF_TOKEN is unset, None is passed so the library can fall back
# to credentials stored by a previous login.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = os.environ.get("HF_TOKEN") or None,
)


==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


### **ADD LORA**

In [14]:
# Wrap the base model with LoRA adapters (rank 16, alpha 16, no dropout, no bias).
# The target list is the concatenation of the two groups below, in this order.
attn_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_modules = ["gate_proj", "up_proj", "down_proj"]

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    target_modules = attn_modules + mlp_modules,
    use_gradient_checkpointing = "unsloth",
    use_rslora = False,
    loftq_config = None,
    random_state = 3407,
)

Unsloth: Already have LoRA adapters! We shall skip this step.


# **Pre-Train GSM8K**

In [16]:
# Load GSM8K ("main" configuration) and keep a seeded, shuffled 25% of its
# train split for the first fine-tuning stage.
raw = load_dataset("gsm8k", "main")
subset_size = int(len(raw["train"]) * 0.25)
small_train = raw["train"].shuffle(seed=42).select(range(subset_size))

In [17]:
def preprocess_answer(batch):
    """Keep only the worked reasoning of each answer in a batch.

    Every string in ``batch["answer"]`` is cut at the first ``####`` marker and
    stripped of surrounding whitespace; strings without the marker are only
    stripped. The batch dict is updated in place and also returned.
    """
    rationales = []
    for full_answer in batch["answer"]:
        reasoning, _marker, _final = full_answer.partition("####")
        rationales.append(reasoning.strip())
    batch["answer"] = rationales
    return batch

# Apply preprocess_answer to the subset in batches, using 4 worker processes
# (tune num_proc to the number of CPU cores available).
small_train = small_train.map(preprocess_answer, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/1868 [00:00<?, ? examples/s]

In [18]:
def make_messages(example):
    """Turn one question/answer row into a three-turn chat.

    Returns a dict with a single ``"messages"`` key holding, in order, a fixed
    system prompt, the row's ``"question"`` as the user turn and its
    ``"answer"`` as the assistant turn.
    """
    system_turn = {"role": "system", "content": "You are a helpful math tutor. Show your work step by step."}
    user_turn = {"role": "user", "content": example["question"]}
    assistant_turn = {"role": "assistant", "content": example["answer"]}
    return {"messages": [system_turn, user_turn, assistant_turn]}

# Add a "messages" column (system / user / assistant turns) to every row.
ds = small_train.map(make_messages)

Map:   0%|          | 0/1868 [00:00<?, ? examples/s]

In [19]:
def format_conversation(example):
    """Render the chat stored under ``example["messages"]`` as one string.

    The module-level ``tokenizer`` applies its chat template with
    ``tokenize=False``, so the result is raw text; it is returned under the
    ``"text"`` key.

    NOTE: a function with this same name is defined again further down in the
    notebook (reading ``conversation['data']``); once that cell has run, it
    replaces this definition.
    """
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered}

# Render every row to chat-template text and drop all pre-existing columns,
# leaving only the new "text" column.
formatted_gsm = ds.map(
    format_conversation,
    remove_columns = ds.column_names,
)

Map:   0%|          | 0/1868 [00:00<?, ? examples/s]

In [20]:
# Display the GSM8K subset's summary (column names and row count).
small_train

Dataset({
    features: ['question', 'answer'],
    num_rows: 1868
})

In [21]:
# Inspect the first rendered training example to check the chat-template output.
formatted_gsm[0]

{'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a helpful math tutor. Show your work step by step.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nMimi picked up 2 dozen seashells on the beach.  Kyle found twice as many shells as Mimi and put them in his pocket. Leigh grabbed one-third of the shells that Kyle found.  How many seashells did Leigh have?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMimi has 2 x 12 = <<2*12=24>>24 sea shells.\nKyle has 24 x 2 = <<24*2=48>>48 sea shells.\nLeigh has 48 / 3 = <<48/3=16>>16 sea shells.<|eot_id|>'}

In [22]:
# Stage 1: supervised fine-tuning on the rendered GSM8K subset.
# Mixed precision follows hardware support: bf16 when available, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

gsm_training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    num_train_epochs = 2,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 2,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    weight_decay = 0.01,
    optim = "adamw_8bit",
    bf16 = use_bf16,
    fp16 = not use_bf16,
    logging_steps = 1,
    report_to = "none", # Use this for WandB etc
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = formatted_gsm,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = gsm_training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/1868 [00:00<?, ? examples/s]

In [23]:
# Run the GSM8K fine-tuning stage; whatever train() returns is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,868 | Num Epochs = 2 | Total steps = 466
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.3749
2,2.138
3,1.7689
4,1.5229
5,1.1919
6,0.9868
7,0.9904
8,0.8781
9,0.9111
10,1.0517


# **Import datasets (public CoMTA + self-made)**

In [24]:
import glob
import json

from google.colab import drive

# Mount Google Drive at /content/drive so the dataset files can be read from it.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
# Public CoMTA conversations. The path is absolute (rooted at the Drive mount
# point) so loading does not depend on the kernel's current working directory.
DATASET_PATH = '/content/drive/MyDrive/dataset2/CoMTA_dataset.json'

with open(DATASET_PATH, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

In [26]:
# Load the self-made dataset: every JSON file matching the pattern is expected
# to hold a list of conversation records, which are concatenated into dataset2.
# The pattern is absolute (rooted at the Drive mount point) so it does not
# depend on the kernel's current working directory.
DATASET_PATH2 = '/content/drive/MyDrive/dataset2/dataset - *.json'

# glob returns matches in arbitrary order; sorting keeps the record order (and
# therefore any seeded shuffling downstream) reproducible across runs.
dataset2_files = sorted(glob.glob(DATASET_PATH2))
if not dataset2_files:
    # Fail loudly instead of silently training without the self-made data.
    raise FileNotFoundError(f"No dataset files match {DATASET_PATH2!r}")

dataset2 = []

for filename in dataset2_files:
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
        dataset2.extend(data)

# **Concatenate the public and self-made datasets**

In [27]:
# Report the number of conversations and preview only the first record;
# printing the whole list floods the notebook output.
print(f"test: {len(dataset)}")
print(dataset[:1])

test: 188
[{'expected_result': 'Answer Accepted', 'math_level': 'Elementary', 'data': [{'role': 'user', 'content': "I'm struggling with multiplying decimal numbers, can you guide me through the process?"}, {'role': 'assistant', 'content': "Absolutely! Let's begin with a basic problem. How about \\(0.2 \\times 0.3\\)? Try to solve it and let me know your answer. Remember, you can initially treat the decimals as if they were whole numbers!"}, {'role': 'user', 'content': '0.6'}, {'role': 'assistant', 'content': "Let's reconsider this. If we treat \\(0.2\\) and \\(0.3\\) as \\(2\\) and \\(3\\), we get \\(2 \\times 3 = 6\\). But these are not whole numbers, they're tenths! So, what should we do with the decimal point in our result?"}, {'role': 'user', 'content': 'what is the rule on where to put the decimal?'}, {'role': 'assistant', 'content': "Excellent question! Here's a tip: count the total number of digits after the decimal in the original numbers. How many are there in total?"}, {'role

In [28]:
# Report the number of self-made conversations and preview only the first
# record; printing the whole list floods the notebook output.
print(f"test: {len(dataset2)}")
print(dataset2[:1])

test: 238
[{'data': [{'role': 'user', 'content': 'Saya penasaran dengan materi lingkaran di matematika. Apa itu lingkaran sebenarnya?'}, {'role': 'assistant', 'content': 'Lingkaran itu seperti lukisan sempurna di alam semesta! Bayangkan semua titik yang jaraknya sama dari satu titik pusat. Nah, menurutmu, apa saja komponen dasar lingkaran yang sudah kamu ketahui?'}, {'role': 'user', 'content': 'Kalau tidak salah ada jari-jari dan diameter?'}, {'role': 'assistant', 'content': 'Tepat sekali! Jari-jari (radius) adalah jarak dari pusat ke tepi, sedangkan diameter adalah... bisakah kamu menyelesaikan definisi ini?'}, {'role': 'user', 'content': 'Diameter itu dua kali jari-jari, kan? Jadi kalau radius = r, diameter = 2r'}, {'role': 'assistant', 'content': 'Benar! Sekarang, ada komponen magis dalam lingkaran bernama π (phi). Menurutmu, apa peran π dalam rumus lingkaran?'}, {'role': 'user', 'content': 'Katanya π itu 22/7 atau 3.14, tapi kenapa angka itu penting?'}, {'role': 'assistant', 'conte

In [29]:
# Append the self-made conversations to the public ones, in place, so later
# cells keep working with `dataset`.
# Guard: if the tail of `dataset` already equals `dataset2`, this cell has been
# run before; appending again would silently duplicate training examples.
if dataset2 and dataset[-len(dataset2):] != dataset2:
    dataset.extend(dataset2)

In [30]:
# Confirm the merged size, then look at the last record of the combined list.
merged_count = len(dataset)
print(f"test: {merged_count}")
last_record = dataset[-1]
print(last_record)

test: 426
{'data': [{'role': 'user', 'content': 'Kerangka limas alas persegi panjang 16 cm \\times 12 cm, tinggi 24 cm. Berapa panjang kawat yang diperlukan?'}, {'role': 'assistant', 'content': 'Kerangka limas terdiri dari keliling alas dan 4 rusuk tegak. Pertama, hitung keliling alas: \\(2 \\times (16 + 12)\\). Berapa hasilnya?'}, {'role': 'user', 'content': '2 \\times 28 = 56 cm.'}, {'role': 'assistant', 'content': 'Benar! Sekarang, hitung panjang rusuk tegak. Pusat alas ke sudut = \\(\\sqrt{(8)^2 + (6)^2} = 10\\) cm. Rusuk tegak = \\(\\sqrt{10^2 + 24^2}\\). Berapa?'}, {'role': 'user', 'content': '\\(\\sqrt{100 + 576} = \\sqrt{676} = 26\\) cm.'}, {'role': 'assistant', 'content': 'Tepat! Total kawat = 56 + (4 \\times 26) = ...?'}, {'role': 'user', 'content': '56 + 104 = 160 cm.'}, {'role': 'assistant', 'content': 'Sempurna! Panjang kawat yang dibutuhkan 160 cm. Lanjut ke soal lain?'}, {'role': 'user', 'content': 'Limas alas persegi panjang 18 cm \\times 32 cm, tinggi 42 cm. Berapa vol

# **Example DATASET**

In [31]:
# Take the first conversation of the combined dataset as a worked example.
sample = dataset[0]

In [32]:
# Display the example record to show its structure (metadata fields plus a 'data' list of turns).
sample

{'expected_result': 'Answer Accepted',
 'math_level': 'Elementary',
 'data': [{'role': 'user',
   'content': "I'm struggling with multiplying decimal numbers, can you guide me through the process?"},
  {'role': 'assistant',
   'content': "Absolutely! Let's begin with a basic problem. How about \\(0.2 \\times 0.3\\)? Try to solve it and let me know your answer. Remember, you can initially treat the decimals as if they were whole numbers!"},
  {'role': 'user', 'content': '0.6'},
  {'role': 'assistant',
   'content': "Let's reconsider this. If we treat \\(0.2\\) and \\(0.3\\) as \\(2\\) and \\(3\\), we get \\(2 \\times 3 = 6\\). But these are not whole numbers, they're tenths! So, what should we do with the decimal point in our result?"},
  {'role': 'user', 'content': 'what is the rule on where to put the decimal?'},
  {'role': 'assistant',
   'content': "Excellent question! Here's a tip: count the total number of digits after the decimal in the original numbers. How many are there in tot

# **Preprocess the combined (public + self-made) dataset**

In [33]:
from datasets import Dataset

def format_conversation(conversation):
    """Render one tutoring conversation as a single chat-template string.

    The list of turns under ``conversation['data']`` is passed to the
    module-level ``tokenizer``'s chat template with ``tokenize=False``, and the
    resulting raw text is returned under the ``"text"`` key.
    """
    return {
        "text": tokenizer.apply_chat_template(conversation['data'], tokenize=False)
    }

In [34]:
# Render every conversation to text, then wrap the list of {"text": ...}
# records in a Dataset for the trainer.
formatted_data = list(map(format_conversation, dataset))
formatted_train = Dataset.from_list(formatted_data)

In [35]:
# Inspect the first rendered conversation of the combined training set.
formatted_train[0]

{'text': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nI'm struggling with multiplying decimal numbers, can you guide me through the process?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAbsolutely! Let's begin with a basic problem. How about \\(0.2 \\times 0.3\\)? Try to solve it and let me know your answer. Remember, you can initially treat the decimals as if they were whole numbers!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n0.6<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nLet's reconsider this. If we treat \\(0.2\\) and \\(0.3\\) as \\(2\\) and \\(3\\), we get \\(2 \\times 3 = 6\\). But these are not whole numbers, they're tenths! So, what should we do with the decimal point in our result?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nwhat is the rule on where to put the decimal?<|eot_id|><|start_header_

In [36]:
# Inspect the last rendered conversation of the combined training set.
formatted_train[-1]

{'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nKerangka limas alas persegi panjang 16 cm \\times 12 cm, tinggi 24 cm. Berapa panjang kawat yang diperlukan?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nKerangka limas terdiri dari keliling alas dan 4 rusuk tegak. Pertama, hitung keliling alas: \\(2 \\times (16 + 12)\\). Berapa hasilnya?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n2 \\times 28 = 56 cm.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nBenar! Sekarang, hitung panjang rusuk tegak. Pusat alas ke sudut = \\(\\sqrt{(8)^2 + (6)^2} = 10\\) cm. Rusuk tegak = \\(\\sqrt{10^2 + 24^2}\\). Berapa?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\\(\\sqrt{100 + 576} = \\sqrt{676} = 26\\) cm.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nTepat! Total kawat = 56 + (4 \\times 26) = ...?<|eot_id|><|s

In [37]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Stage 2: continue fine-tuning the same model on the rendered tutoring
# conversations. Training length is capped by max_steps (60) with 5 warmup
# steps; num_train_epochs is deliberately left unset.
# Mixed precision follows hardware support: bf16 when available, fp16 otherwise.
bf16_available = is_bfloat16_supported()

tutor_training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    max_steps = 60,
    warmup_steps = 5,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    weight_decay = 0.01,
    optim = "adamw_8bit",
    bf16 = bf16_available,
    fp16 = not bf16_available,
    logging_steps = 1,
    report_to = "none", # Use this for WandB etc
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = formatted_train,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = tutor_training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/426 [00:00<?, ? examples/s]

In [38]:
# Run the second fine-tuning stage; whatever train() returns is kept in trainer_stats
# (this overwrites the value stored by the GSM8K stage).
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 426 | Num Epochs = 2 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Step,Training Loss
1,1.1222
2,1.4313
3,1.2707
4,1.0599
5,0.9627
6,1.0581
7,0.9646
8,0.9281
9,0.998
10,0.9293


# **TESTING INFERENCE**

In [49]:
# Multi-turn check: the model should continue an ongoing tutoring dialogue in Indonesian.
FastLanguageModel.for_inference(model)
messages = [
    {"role": "system","content": "Gunakan Bahasa Indonesia untuk semua jawaban."},
    {"role": "user", "content": "1/3x + 4<7"},
    # Raw string: in a normal string literal "\f" of "\frac" is parsed as a
    # form-feed character (corrupting the LaTeX sent to the model), and "\(" is
    # an invalid escape sequence.
    {"role": "assistant", "content": r"""Tentu! Mari kita analisis:

1. Kita punya persamaan \( \frac{1}{3}x + 4 < 7 \).
2. Langkah pertama: Kita harus memisahkan \( x \) dari konstanta.
3. Coba kamu ubah persamaan tersebut. Apa yang kamu lakukan?"""},
    {"role": "user", "content": "membagi kedua ruas dengan 3?"},
]
# Render the chat with a trailing generation prompt and move the token ids to the GPU.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# Stream the reply as it is generated; skip_prompt=True keeps the prompt out of the output.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids, streamer = text_streamer, pad_token_id = tokenizer.eos_token_id)

Tepat! Sekarang, kita memiliki:
\[ x + 12 < 21 \]
4. Langkah kedua: Kita harus mengurangi \( 12 \) dari kedua ruas.
5. Coba kamu lakukan itu. Apa hasilnya?<|eot_id|>


In [50]:
from transformers import TextStreamer

# Single-turn check: a plain definition question ("what is pi").
FastLanguageModel.for_inference(model)

messages = [{"role": "user", "content": "apa itu pi"}]

# Render the chat with a trailing generation prompt, then move the ids to the GPU.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
)
input_ids = input_ids.to("cuda")

# Stream the reply as it is generated, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids,
    streamer = text_streamer,
    pad_token_id = tokenizer.eos_token_id,
)

Pi (π) adalah konstanta matematika yang menggambarkan perbandingan keliling lingkaran dengan jari-jarinya. Nilainya adalah \(3.14159\ldots\). Pi digunakan dalam berbagai aplikasi matematika, seperti menghitung luas dan keliling lingkaran, sudut pusat, dan banyak lagi.<|eot_id|>


In [41]:
from transformers import TextStreamer

# Single-turn check: a direct computational question (slope of a line).
FastLanguageModel.for_inference(model)

messages = [{"role": "user", "content": "Berapa slope dari 3x+2y = 1"}]

# Render the chat with a trailing generation prompt, then move the ids to the GPU.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
)
input_ids = input_ids.to("cuda")

# Stream the reply as it is generated, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids,
    streamer = text_streamer,
    pad_token_id = tokenizer.eos_token_id,
)

Mari kita analisis bersama! Apa yang kamu lakukan untuk menemukan slope dari persamaan garis?<|eot_id|>


In [42]:
from transformers import TextStreamer

# Multi-turn check: a word problem where the student finally proposes an answer.
FastLanguageModel.for_inference(model)

problem_turn = {"role": "user", "content": "Pada acara Dies Natalis di sekolah, kelas kalian berniat membuat stand minuman. Minuman yang dijual adalah jus buah seharga Rp10.000,00 setiap gelas. Karena standard bazar dilombakan, maka harus dihias dengan bagus yang menghabiskan biaya Rp80.000,00. Selesai bazar ternyata mendapatkan uang Rp600.000,00. Jika keuntungan didapat dari hasil penjualan dikurangi biaya menghias stand, berapa banyak jus buah yang laku terjual?"}
tutor_opening = {"role": "assistant", "content": "Ah, sebuah soal yang menarik! Mari kita analisisnya. Kita memiliki dua informasi penting: \n\n1. Biaya untuk menghias stand minuman adalah Rp80.000,00.\n2. Keuntungan yang didapat setelah menjual jus buah dan menghias stand minuman adalah Rp600.000,00.\n\nKita ingin tahu berapa banyak jus buah yang laku terjual. Bagaimana kita bisa mengetahuinya?"}
student_stuck = {"role": "user", "content": "tidak tau, gimana cara menentukannya"}
tutor_hint = {"role": "assistant", "content": "Ah, tidak masalah! Mari kita mulai dari awal. Kita memiliki biaya Rp80.000,00 untuk menghias stand minuman. Setelah itu, kita mendapatkan uang Rp600.000,00. Bagaimana kita bisa menghitung berapa banyak jus buah yang laku terjual jika kita tahu harga setiap gelas?"}
student_attempt = {"role": "user", "content": "600rb dikurang 80rb = 520rb, dibagi 10rb menjadi 52 gelas?"}

messages = [problem_turn, tutor_opening, student_stuck, tutor_hint, student_attempt]

# Render the chat with a trailing generation prompt, then move the ids to the GPU.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
)
input_ids = input_ids.to("cuda")

# Stream the reply as it is generated, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids,
    streamer = text_streamer,
    pad_token_id = tokenizer.eos_token_id,
)

Tepat! Kita memiliki keuntungan Rp600.000,00 dan biaya Rp80.000,00. Jika kita kurangi biaya dari keuntungan, maka:

600.000 - 80.000 = 520.000

Karena harga satu gelas adalah Rp10.000,00, maka:

520.000 ÷ 10.000 = 52 gelas

Jadi, **52 gelas jus buah yang laku terjual**.<|eot_id|>


In [43]:
from transformers import TextStreamer

# Single-turn check: a geometry question (inscribed angle vs. central angle).
FastLanguageModel.for_inference(model)

messages = [
    {"role": "user", "content": "Diketahui sudut keliling PAQ = 130°, terus disuruh cari sudut pusat POQ."}
]

# Render the chat with a trailing generation prompt, then move the ids to the GPU.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
)
input_ids = input_ids.to("cuda")

# Stream the reply as it is generated, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    input_ids,
    streamer = text_streamer,
    pad_token_id = tokenizer.eos_token_id,
)

Oke, mari kita pelajari. Sudut keliling dan sudut pusat itu hubungannya seperti apa?<|eot_id|>


### **SAVING**

In [44]:
# Destination folder (under the /content/drive mount point) for the fine-tuned artifacts.
SAVE_PATH = "/content/drive/MyDrive/SKRIPSI/pretrained_all"

In [45]:
# Persist the fine-tuned model and the tokenizer to SAVE_PATH.
# NOTE(review): the model was wrapped with LoRA adapters earlier in the notebook, so
# this presumably writes adapter weights rather than a merged full model — confirm
# before relying on this folder for deployment.
# The tokenizer call is kept last so the cell displays the paths it returns.
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

('/content/drive/MyDrive/SKRIPSI/pretrained_all/tokenizer_config.json',
 '/content/drive/MyDrive/SKRIPSI/pretrained_all/special_tokens_map.json',
 '/content/drive/MyDrive/SKRIPSI/pretrained_all/tokenizer.json')