# Preparing Dataset

Load Dataset

In [2]:
# Raw Q&A corpus: a header row followed by one "question;answer" pair per line.
file_path = "dataset.txt"

# Read with an explicit encoding so this preview decodes the file the same way
# txt_qa_to_jsonl does (its `encoding` parameter defaults to "utf-8"). Without
# it, open() uses the platform's locale encoding, which can mis-decode or
# reject non-ASCII characters.
with open(file_path, "r", encoding="utf-8") as file:
    data = file.read().splitlines()

# Show the header plus the first four pairs as the cell output.
data[:5]

['User;Response',
 'Apa itu rendang?;Rendang adalah masakan daging berasal dari Minangkabau yang dimasak lama dengan santan dan rempah hingga kering.',
 'Apa bahan utama membuat rendang sapi?;Bahan utamanya adalah daging sapi, santan kelapa tua, dan campuran bumbu halus serta rempah daun.',
 'Bagian daging sapi apa yang terbaik untuk rendang?;Paha belakang (knuckle) adalah yang terbaik karena teksturnya padat dan tidak mudah hancur.',
 'Mengapa rendang dimasak sangat lama?;Tujuannya agar santan terkaramelisasi menjadi minyak dan bumbu meresap sempurna ke dalam serat daging.']

Konversi ke format JSONL

In [5]:
import json

def txt_qa_to_jsonl(
    input_path: str,
    output_path: str,
    encoding: str = "utf-8",
    has_header: bool = True,
) -> None:
    """Convert a ``question;answer`` text file into an instruction-style JSONL file.

    Each usable input line becomes one JSON object on its own output line, of
    the form ``{"text": "### Instruction:\\n<question>\\n\\n### Response:\\n<answer>"}``.

    Args:
        input_path: Path of the text file to read, one pair per line with the
            question and answer separated by the first ``;`` on the line.
        output_path: Path of the JSONL file to write (overwritten if present).
        encoding: Text encoding used for both reading and writing.
        has_header: When True (the default), the first line is treated as a
            column header and dropped. Pass False for files without a header.

    Lines without a ``;``, or whose question or answer is empty after
    stripping whitespace, are skipped silently.
    """
    # Read everything up front so the input is fully consumed before the
    # output file is opened for writing.
    with open(input_path, "r", encoding=encoding) as f:
        lines = f.read().splitlines()

    if has_header:
        lines = lines[1:]

    with open(output_path, "w", encoding=encoding) as f:
        for line in lines:
            # Split on the first ';' only, so answers may contain semicolons.
            instruction, separator, response = line.partition(";")
            if not separator:
                continue

            instruction = instruction.strip()
            response = response.strip()
            if not instruction or not response:
                continue

            text = (
                "### Instruction:\n"
                f"{instruction}\n\n"
                "### Response:\n"
                f"{response}"
            )

            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            f.write(json.dumps({"text": text}, ensure_ascii=False) + "\n")


In [6]:
# Build the JSONL training file from the raw semicolon-separated Q&A text.
txt_qa_to_jsonl("dataset.txt", "dataset.jsonl")

# Load Model

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The same checkpoint id supplies both the tokenizer and the model weights.
checkpoint = "flax-community/gpt2-small-indonesian"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(checkpoint)
# Kept as the cell's final expression so the notebook displays the module tree.
model.to(device)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


# Tokenization

In [11]:
from datasets import load_dataset

# Load the generated JSONL file as a single "train" split.
dataset = load_dataset("json", data_files="dataset.jsonl", split="train")

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch, truncating to at most 256 tokens.

    No padding is applied here. The training cell builds its batches with
    DataCollatorForLanguageModeling, which pads each batch to its longest
    sequence, so padding every example to 256 tokens up front would only add
    pad positions the model has to process and then ignore.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=256,
    )

# batched=True hands tokenize_function lists of texts instead of single rows.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 349/349 [00:00<00:00, 1417.29 examples/s]


In [12]:
# Display the first five formatted texts to check the instruction/response template.
dataset[:5]

{'text': ['### Instruction:\nApa itu rendang?\n\n### Response:\nRendang adalah masakan daging berasal dari Minangkabau yang dimasak lama dengan santan dan rempah hingga kering.',
  '### Instruction:\nApa bahan utama membuat rendang sapi?\n\n### Response:\nBahan utamanya adalah daging sapi, santan kelapa tua, dan campuran bumbu halus serta rempah daun.',
  '### Instruction:\nBagian daging sapi apa yang terbaik untuk rendang?\n\n### Response:\nPaha belakang (knuckle) adalah yang terbaik karena teksturnya padat dan tidak mudah hancur.',
  '### Instruction:\nMengapa rendang dimasak sangat lama?\n\n### Response:\nTujuannya agar santan terkaramelisasi menjadi minyak dan bumbu meresap sempurna ke dalam serat daging.',
  '### Instruction:\nApa perbedaan gulai, kalio, dan rendang?\n\n### Response:\nGulai masih berkuah encer, kalio berkuah kental berminyak, dan rendang sudah kering serta berwarna gelap.']}

# Setup LoRA

In [13]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter configuration for causal language modelling.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,  # adapter weights stay trainable
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    # Adapt only the attention projection named "c_attn" in each block.
    target_modules=["c_attn"],
    # The model printout shows c_attn is a Conv1D layer, which stores its
    # weight as (fan_in, fan_out); declare that explicitly rather than relying
    # on PEFT to detect and correct it.
    fan_in_fan_out=True,
)

# NOTE(review): this rebinds `model` to the wrapped model, so re-running the
# cell without reloading the base model would wrap an already-wrapped model.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364




# Training

In [14]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# mlm=False selects causal (next-token) language modelling rather than masked LM.
# NOTE(review): pad_token was set to eos_token earlier; this collator is
# expected to exclude pad-token positions from the loss, which would also
# exclude any EOS tokens - confirm if the model should learn when to stop.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./gpt2-rendang",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,  # 1 x 16 = 16 examples per optimizer step
    num_train_epochs=8,
    learning_rate=2e-4,
    logging_steps=10,
    save_strategy="epoch",
    # Follow the device chosen when the model was loaded instead of always
    # forcing CPU, so an available GPU is actually used for training.
    use_cpu=(device == "cpu"),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

print("Mulai training...")
trainer.train()

# `model` is the PEFT-wrapped model, so this stores the trained adapter.
model.save_pretrained("./gpt2-rendang-final")
print("Selesai! Model tersimpan di folder 'gpt2-rendang-final'")

Mulai training...


Step,Training Loss
10,4.1466
20,3.9245
30,3.6733
40,3.3482
50,3.0338
60,2.5932
70,2.3363
80,2.2523
90,2.1498
100,2.0541


Selesai! Model tersimpan di folder 'gpt2-rendang-final'


# Eval

In [18]:
import torch
import math
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Hub id of the pretrained base model and the local folder holding the saved adapter.
base_model_path = "flax-community/gpt2-small-indonesian"
adapter_path = "./gpt2-rendang-final"

print("Sedang memuat model hasil training...")
# Rebuild the fine-tuned model: fresh base weights with the saved adapter attached.
model_eval = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(base_model_path),
    adapter_path,
)

# Place the model on the active device and switch it to evaluation mode.
model_eval.to(device)
model_eval.eval()

# The tokenizer comes from the base checkpoint, with EOS reused as padding.
tokenizer_eval = AutoTokenizer.from_pretrained(base_model_path)
tokenizer_eval.pad_token = tokenizer_eval.eos_token

def calculate_perplexity(text_list, model, tokenizer, stride=512):
    """Compute sliding-window perplexity of ``model`` over the given texts.

    The texts are joined with blank lines, tokenized once, and scored in
    overlapping windows of at most ``model.config.n_positions`` tokens. In
    each window only the tokens not scored by a previous window contribute to
    the loss; the earlier tokens serve as context only.

    Args:
        text_list: Iterable of strings to evaluate.
        model: Causal LM that accepts ``model(input_ids, labels=...)`` and
            returns an object with a mean ``loss`` over the non-ignored labels.
        tokenizer: Callable returning an object with an ``input_ids`` tensor
            of shape ``(1, seq_len)`` when called with ``return_tensors="pt"``.
        stride: Number of new tokens scored per window. Values larger than
            the context window are clamped to it so no token range is skipped.

    Returns:
        The perplexity as a Python float: ``exp`` of the mean negative
        log-likelihood per scored token.

    Raises:
        ValueError: If ``stride`` is not positive, or if the input contains
            fewer than two tokens (nothing can be predicted).
    """
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    encodings = tokenizer("\n\n".join(text_list), return_tensors="pt")
    token_ids = encodings.input_ids
    seq_len = token_ids.size(1)

    max_length = model.config.n_positions
    # A stride wider than the window would leave unscored gaps between windows.
    stride = min(stride, max_length)

    # Send inputs to wherever the model lives rather than relying on a global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    total_nll = 0.0
    total_tokens = 0
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # tokens not yet scored by earlier windows
        input_ids = token_ids[:, begin_loc:end_loc].to(model_device)
        target_ids = input_ids.clone()
        # -100 marks context-only positions that the loss must ignore.
        target_ids[:, :-trg_len] = -100

        # The model shifts labels left by one, so the first position of a
        # window is never predicted. Count the labels that really enter the
        # loss, so each window's mean loss is weighted by the right number.
        num_loss_tokens = int((target_ids[:, 1:] != -100).sum().item())
        if num_loss_tokens > 0:
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
            total_nll += outputs.loss.item() * num_loss_tokens
            total_tokens += num_loss_tokens

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if total_tokens == 0:
        raise ValueError("need at least two tokens to compute perplexity")

    mean_nll = torch.tensor(total_nll / total_tokens, dtype=torch.float64)
    return torch.exp(mean_nll).item()

# Score the first 50 formatted instruction/response texts.
# NOTE(review): `dataset` appears to be the same data the model was trained on,
# so this score reflects fit to seen examples rather than generalization -
# confirm, and consider evaluating on a held-out split instead.
texts_to_eval = dataset["text"][:50]

print("Sedang menghitung Perplexity...")
ppl_score = calculate_perplexity(texts_to_eval, model_eval, tokenizer_eval)

print(f"✅ Perplexity Score: {ppl_score:.2f}")
print("Catatan: Semakin rendah skor (mendekati 1), semakin baik modelnya.")


Sedang memuat model hasil training...
Sedang menghitung Perplexity...
✅ Perplexity Score: 24.92
Catatan: Semakin rendah skor (mendekati 1), semakin baik modelnya.


# Test

In [24]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# 1. Model setup (same as in the evaluation cell)
# NOTE: this rebinds the notebook-level names `device`, `model` and `tokenizer`.
base_model_path = "flax-community/gpt2-small-indonesian"
adapter_path = "./gpt2-rendang-final"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Memuat model untuk tes...")
# Fresh base weights with the saved adapter attached on top.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(base_model_path),
    adapter_path,
)
# Place on the active device and switch to evaluation mode.
model.to(device)
model.eval()

# Tokenizer from the base checkpoint, with EOS reused as padding.
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
tokenizer.pad_token = tokenizer.eos_token

def generate_resep(pertanyaan, max_new_tokens=150):
    """Generate the model's answer to a single question.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        pertanyaan: The question text to place in the instruction slot.
        max_new_tokens: Upper bound on the number of generated tokens, to
            keep answers from rambling.

    Returns:
        The generated answer with surrounding whitespace stripped. Sampling
        is enabled, so repeated calls may return different text.
    """
    # The prompt format must match the training template exactly.
    prompt = f"### Instruction:\n{pertanyaan}\n\n### Response:\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[1]

    # Generate the answer.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,          # sample so answers vary between calls
            temperature=0.4,         # low values are rigid, values near 1.0 are erratic
            top_k=50,                # sample from the 50 most likely tokens
            top_p=0.95,              # ...within 95% cumulative probability
            repetition_penalty=1.2,  # discourage repeated words
            pad_token_id=tokenizer.eos_token_id,
        )

    # The output sequence starts with the prompt tokens. Decode only what
    # follows them, instead of searching the decoded text for the response
    # marker, which breaks if the marker is missing, altered, or repeated.
    jawaban = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    return jawaban.strip()

Memuat model untuk tes...


In [25]:
# Ask one sample question and print it alongside the model's answer.
separator = "-" * 30
pertanyaan_kamu = "Apakah membuat rendang memerlukan bawang?"

print(separator)
print(f"Pertanyaan: {pertanyaan_kamu}")
print("Model sedang berpikir...")
print(separator)

jawaban_model = generate_resep(pertanyaan_kamu)
print(f"Jawaban Model:\n{jawaban_model}")
print(separator)

------------------------------
Pertanyaan: Apakah membuat rendang memerlukan bawang?
Model sedang berpikir...
------------------------------
Jawaban Model:
Untuk memasak rendang, kita perlu menggunakan bawang putih. Bawang putih memiliki kandungan sulfur yang tinggi dan bisa membantu proses pematangan bumbu. Selain itu, bawang putih juga mengandung antioksidan yang cukup tinggi.
------------------------------
