In [1]:
!pip install datasets
!pip install seacrowd accelerate peft bitsandbytes wandb

Collecting seacrowd
  Downloading seacrowd-0.2.2-py3-none-any.whl.metadata (1.1 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting loguru>=0.5.3 (from seacrowd)
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting bioc>=1.3.7 (from seacrowd)
  Downloading bioc-2.1-py3-none-any.whl.metadata (4.6 kB)
Collecting black~=22.0 (from seacrowd)
  Downloading black-22.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flake8>=3.8.3 (from seacrowd)
  Downloading flake8-7.2.0-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting isort>=5.0.0 (from seacrowd)
  Downloading isort-6.0.1-py3-none-any.whl.metadata (11 kB)
Collecting pre-commit>=2.19.0 (from seacrowd)
  Downloading pre_commit-4.2.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting jso

In [2]:
%%capture
!pip install unsloth vllm
!pip install triton==3.1.0
!pip install -U pynvml
# Install latest Hugging Face for Gemma-3!
!pip install --no-deps git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3

In [3]:
import pandas as pd
import os
import json
import seaborn as sns
import matplotlib.pyplot as plt
import random 
import torch
import datasets

from tqdm import tqdm


In [4]:
# Detect train, dev, and test files
DATASET_ROOT = '/kaggle/input/indosum/indosum'

files_id_dir = os.listdir(DATASET_ROOT)
train_files = []
dev_files = []
test_files = []

for filename in files_id_dir:
    if 'train' in filename:
        train_files.append(filename)
    elif 'dev' in filename:
        dev_files.append(filename)
    elif 'test' in filename:
        test_files.append(filename)

train_files, dev_files, test_files

(['train.01.jsonl',
  'train.05.jsonl',
  'train.03.jsonl',
  'train.04.jsonl',
  'train.02.jsonl'],
 ['dev.01.jsonl',
  'dev.05.jsonl',
  'dev.04.jsonl',
  'dev.03.jsonl',
  'dev.02.jsonl'],
 ['test.05.jsonl',
  'test.04.jsonl',
  'test.02.jsonl',
  'test.03.jsonl',
  'test.01.jsonl'])

In [5]:
train_files = ['train.01.jsonl']
test_files = ['test.01.jsonl']
dev_files = ['dev.01.jsonl']

In [6]:
def load_file_to_json_list(filename):
    file = os.path.join(DATASET_ROOT, filename)
    data = []
    with open(file, 'r') as f:
        json_list = list(f)
        for json_str in tqdm(json_list, desc=f'Loading data {filename}'):
            d = json.loads(json_str)
            data.append(d)
    return data

def label_to_dict_str(label_list):
    label_dict = {} # key = paragraph_id : value = label list 
    for i, label in enumerate(label_list[:]):
        label_dict[i] = label

    json_str = json.dumps(label_dict)
    num = len(label_dict)
    return json_str, num

def paragraph_to_dict_str(paragraph_list):
    paragraph_dict = {} # key = paragraph_id : value = paragraph list 
    for i, paragraph in enumerate(paragraph_list):
        new_paragraph = []
        for sentence in paragraph:
            sentence = ' '.join(sentence)
            new_paragraph.append(sentence)
        paragraph_dict[i] = new_paragraph

    json_str = json.dumps(paragraph_dict)
    num = len(paragraph_dict)
    return json_str, num
def paragraph_to_text(raw_paragraph_list):
    new_paragraph_list = []
    for i, paragraph in enumerate(raw_paragraph_list):
        paragraph_list = []
        for sentence in paragraph:
            sentence = ' '.join(sentence)
            paragraph_list.append(sentence)
        
        new_paragraph = ' '.join(paragraph_list)
        new_paragraph_list.append(new_paragraph)

    paragraph_str = ' '.join(new_paragraph_list)
    return paragraph_str
def summary_to_dict_str(summary_list):
    summary_dict = {} # key = summary_id : value = summary sentence 
    for i, summary in enumerate(summary_list):
        summary_dict[i] = ' '.join(summary)

    json_str = json.dumps(summary_dict)
    num = len(summary_dict)
    return json_str, num
def summary_to_text(raw_summary_list):
    summary_list = []
    for i, summary in enumerate(raw_summary_list):
        summary_list.append(' '.join(summary))

    summary_str = ' '.join(summary_list)
    return summary_str
def alter_json_data(json_list_data, filename=''):
    new_json_list = []
    for json_data in tqdm(json_list_data, desc=f'Altering json data {filename}'):
        json_data = json_data.copy()
        json_data['gold_labels'], _ = label_to_dict_str(json_data['gold_labels'])
        json_data['news_text'] = paragraph_to_text(json_data['paragraphs'])
        json_data['paragraphs'], num_paragraph = paragraph_to_dict_str(json_data['paragraphs'])
        json_data['num_of_paragraphs'] = num_paragraph
        json_data['summary_text'] = summary_to_text(json_data['summary'])
        json_data['summary'], num_summary = summary_to_dict_str(json_data['summary'])
        json_data['num_of_summary'] = num_summary
        
        new_json_list.append(json_data)
    
    return new_json_list
def create_dataset(jsonl):
    header = list(jsonl[0].keys())
    dataset_list = []
    for json_data in jsonl:
        row = []
        for h in header:
            row.append(json_data[h])
        dataset_list.append(row)
    
    return header, dataset_list
def create_dataset_from_files(file_list):
    df_header = None
    dataset_list = []
    for filename in file_list:
        json_l = load_file_to_json_list(filename)
        new_json_l = alter_json_data(json_l, filename)
        header, dataset_part = create_dataset(new_json_l)
        
        if not df_header: df_header = header
        dataset_list.extend(dataset_part)
        
    df_full = pd.DataFrame().from_records(dataset_list)
    df_full = df_full.rename(columns=dict(enumerate(header)))
    return df_full
df_train = create_dataset_from_files(train_files)
df_dev = create_dataset_from_files(dev_files)
df_test = create_dataset_from_files(test_files)

Loading data train.01.jsonl: 100%|██████████| 14262/14262 [00:02<00:00, 6759.21it/s]
Altering json data train.01.jsonl: 100%|██████████| 14262/14262 [00:00<00:00, 14493.54it/s]
Loading data dev.01.jsonl: 100%|██████████| 750/750 [00:00<00:00, 4215.42it/s]
Altering json data dev.01.jsonl: 100%|██████████| 750/750 [00:00<00:00, 14699.80it/s]
Loading data test.01.jsonl: 100%|██████████| 3762/3762 [00:00<00:00, 10146.53it/s]
Altering json data test.01.jsonl: 100%|██████████| 3762/3762 [00:00<00:00, 14689.18it/s]


In [7]:
from datasets import Dataset, DatasetDict

# Konversi DataFrame ke Dataset
train_dataset = Dataset.from_pandas(df_train[['news_text', 'summary_text']])
dev_dataset = Dataset.from_pandas(df_dev[['news_text', 'summary_text']])
test_dataset = Dataset.from_pandas(df_test[['news_text', 'summary_text']])

# Gabungkan menjadi DatasetDict
dataset = DatasetDict({
    "train": train_dataset,
    "validation": dev_dataset,
    "test": test_dataset
})

# Cek struktur dataset
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['news_text', 'summary_text'],
        num_rows: 14262
    })
    validation: Dataset({
        features: ['news_text', 'summary_text'],
        num_rows: 750
    })
    test: Dataset({
        features: ['news_text', 'summary_text'],
        num_rows: 3762
    })
})


In [8]:
df_train_h = df_train[df_train['category'] == 'hiburan']
df_dev_h = df_dev[df_dev['category'] == 'hiburan']
df_test_h = df_test[df_test['category'] == 'hiburan']

train_dataset = Dataset.from_pandas(df_train_h[['news_text', 'summary_text']])
dev_dataset = Dataset.from_pandas(df_dev_h[['news_text', 'summary_text']])
test_dataset = Dataset.from_pandas(df_test_h[['news_text', 'summary_text']])

In [9]:
from unsloth import FastModel
import torch

fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",

    # Other popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-1b-it-bnb-4bit",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-06 07:50:19 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Gemma3 patching. Transformers: 4.50.0.dev0. vLLM: 0.8.3.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors:   0%|          | 0.00/965M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

In [10]:
"""# Akses embed_tokens dan lm_head langsung dari base_model
model.base_model.model.embed_tokens = model.base_model.model.embed_tokens.to(torch.float32)
model.lm_head = model.lm_head.to(torch.float32)
"""

'# Akses embed_tokens dan lm_head langsung dari base_model\nmodel.base_model.model.embed_tokens = model.base_model.model.embed_tokens.to(torch.float32)\nmodel.lm_head = model.lm_head.to(torch.float32)\n'

In [11]:
from unsloth.chat_templates import standardize_data_formats
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

In [12]:
def convert_to_conversations(example):
    return {
        "conversations": [
            {
                "role": "user",
                "content": f"Ringkaskan teks berikut:\n\n{example['news_text']}"
            },
            {
                "role": "assistant",
                "content": example["summary_text"]
            }
        ]
    }

# Kalau train_data bertipe Dataset (bukan DatasetDict)
train_dataset = train_dataset.map(convert_to_conversations)
test_dataset = test_dataset.map(convert_to_conversations)
dev_dataset = dev_dataset.map(convert_to_conversations)

Map:   0%|          | 0/1372 [00:00<?, ? examples/s]

Map:   0%|          | 0/355 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

In [13]:
# Ambil hanya kolom 'conversations'
train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names if col != "conversations"])
test_dataset = test_dataset.remove_columns([col for col in test_dataset.column_names if col != "conversations"])
dev_dataset = dev_dataset.remove_columns([col for col in dev_dataset.column_names if col != "conversations"])

train_dataset[0]

{'conversations': [{'content': 'Ringkaskan teks berikut:\n\nJakarta , CNN Indonesia - - Dinas Pariwisata Provinsi Bengkulu kembali menggelar kegiatan Bimbingan Teknis ( Bimtek ) SDM Kepariwisataan dalam menyongson " Visit 2020 Wonderful Bengkulu " . Kegiatan yang berlangsung pada 8 hingga 10 November kemarin tersebut sebagai bagian dari upaya Pemerintah Provinsi Bengkulu dalam Hadir sebagai pemateri kegiatan pada 8 - 10 November itu adalah Plt. Asdep Strategi Pemasaran Pariwisata Nusantara , Deputi Bidang Pengembangan Pemasaran Pariwisata Nusantara Hariyanto serta perwakilan dari Deputi Bidang Pengembangan Kelembagaan Kementerian Pariwisata , Faizal . Kepala Dinas Pariwisata Provinsi Bengkulu Yudi Satria mengatakan , kegiatan Bimtek diikuti 250 peserta yang terdiri dari aparatur Pemerintah Provinsi , ASN Kabupaten / Kota , Kelompok Sadar Wisata serta pihak terkait sektor pariwisata di Bengkulu . " Kegiatan ini dimaksudkan untuk memberikan pembekalan kepada peserta di bidang kepariwisat

In [14]:
def apply_chat_template(examples):
    texts = tokenizer.apply_chat_template(examples["conversations"], tokenize=False)
    return { "text" : texts }
pass
train_dataset = train_dataset.map(apply_chat_template, batched = True)
test_dataset = test_dataset.map(apply_chat_template, batched = True)
dev_dataset = dev_dataset.map(apply_chat_template, batched = True)

Map:   0%|          | 0/1372 [00:00<?, ? examples/s]

Map:   0%|          | 0/355 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

In [15]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # SHould leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model` require gradients


In [16]:
"""!pip install evaluate
!pip install rouge_score
"""

'!pip install evaluate\n!pip install rouge_score\n'

In [17]:
"""import evaluate
import numpy as np

rouge = evaluate.load("rouge")
"""

'import evaluate\nimport numpy as np\n\nrouge = evaluate.load("rouge")\n'

In [18]:
"""def compute_metrics(eval_preds):
    predictions, labels = eval_preds

    # ROUGE hanya menerima teks, jadi perlu decoding
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Kadang label pakai -100 untuk padding, ubah dulu ke tokenizer.pad_token_id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Bersihkan whitespace
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # Hitung ROUGE
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"],
        "rougeLsum": result["rougeLsum"],
    }
"""

'def compute_metrics(eval_preds):\n    predictions, labels = eval_preds\n\n    # ROUGE hanya menerima teks, jadi perlu decoding\n    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)\n    \n    # Kadang label pakai -100 untuk padding, ubah dulu ke tokenizer.pad_token_id\n    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n\n    # Bersihkan whitespace\n    decoded_preds = [pred.strip() for pred in decoded_preds]\n    decoded_labels = [label.strip() for label in decoded_labels]\n\n    # Hitung ROUGE\n    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)\n    \n    return {\n        "rouge1": result["rouge1"],\n        "rouge2": result["rouge2"],\n        "rougeL": result["rougeL"],\n        "rougeLsum": result["rougeLsum"],\n    }\n'

In [19]:
import wandb
wandb.login(key='ea5b934c345990bf66ca82b76040cf0748acdb7a')

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjauharulumam[0m ([33mjauharulumam-uin-walisongo[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [20]:
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = dev_dataset, # Can set up evaluation!
    args = SFTConfig(
        evaluation_strategy="steps",   # ← aktifkan evaluasi
        eval_steps=10,                  # ← setiap berapa langkah mau evaluasi
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        num_train_epochs = 3, # Set this for 1 full training run.
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 5,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "wandb",
        logging_dir = "./logs",
        output_dir = "./results",  # direktori checkpoint
        save_strategy = "epoch",  # Simpan tiap epoch
        save_total_limit = 3,# Use this for WandB etc
        save_safetensors=True,
    ),
)



Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/1372 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/76 [00:00<?, ? examples/s]

In [21]:
"""from transformers import TrainerCallback
import wandb

class WandbModelCheckpoint(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        artifact = wandb.Artifact(f"model-epoch-{state.epoch:.0f}", type="model")
        artifact.add_dir(args.output_dir)
        wandb.log_artifact(artifact)

trainer.add_callback(WandbModelCheckpoint())
"""

'from transformers import TrainerCallback\nimport wandb\n\nclass WandbModelCheckpoint(TrainerCallback):\n    def on_save(self, args, state, control, **kwargs):\n        artifact = wandb.Artifact(f"model-epoch-{state.epoch:.0f}", type="model")\n        artifact.add_dir(args.output_dir)\n        wandb.log_artifact(artifact)\n\ntrainer.add_callback(WandbModelCheckpoint())\n'

In [22]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=4):   0%|          | 0/1372 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/76 [00:00<?, ? examples/s]

In [23]:
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

'<bos><bos><start_of_turn>user\nRingkaskan teks berikut:\n\nMerdeka.com - Thilafushi , mungkin nama ini masih asing terdengar . Thilafushi adalah salah satu dari gugusan pulau-pulau bagian dari wilayah negeri Maladewa . Seperti telah banyak diketahui , Maladewa terkenal dengan industri pariwisatanya . Negeri ini memiliki sejumlah pantai tropis dengan air lair berwarna kehijauan , pastinya indah . Namun Thilafushi menampilkan kenyataan yang jauh berbeda dengan pulau-pulau lainnya di Maladewa . Thilafushi adalah pulau buatan hasil reklamasi . Jika di tempat lain terdapat pantai-pantai yang indah , pulau ini merupakan pusat pembuangan limbah . Bahkan pulau ini pun dibuat dari tumpukan sampah . Dulunya Thilafushi merupakan sebuah laguna . Kemajuan industri pariwisata memaksa pemerintah Maladewa untuk mereklamasi laguna ini , karena kebutuhan akan tempat pembuangan semakin mendesak . Berawal pada Desember 1991 penggalian lubang penampungan limbah mulai dilakukan . Sampah - sampah berdatanga

In [24]:
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

'                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            Thilafushi adalah salah satu dari gugusan pulau-pulau bagian dari wilayah negeri Maladewa . Namun Thilafushi menampilkan kenyataan yang jauh berbeda dengan pulau-pulau lainnya di Maladewa . Thilafushi adalah pulau buatan hasil reklamasi . Jika di tempat lain terdapat pantai-pantai yang indah , pulau ini merupakan pusat pembuangan limbah . Bahkan pulau ini pun dibuat dari tumpukan sampah .<end_of_turn>\n'

In [25]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
1.473 GB of memory reserved.


In [26]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,372 | Num Epochs = 3 | Total steps = 255
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 6,522,880/1,000,000,000 (0.65% trained)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
10,1.074,0.864247
20,0.7368,0.761549
30,0.7039,0.723947
40,0.582,0.706793
50,0.6367,0.694774
60,0.6754,0.687771
70,0.6059,0.685369
80,0.7579,0.68437
90,0.5437,0.689673
100,0.5471,0.705891


Unsloth: Not an error, but Gemma3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [27]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

2529.5644 seconds used for training.
42.16 minutes used for training.
Peak reserved memory = 2.238 GB.
Peak reserved memory for training = 0.765 GB.
Peak reserved memory % of max memory = 15.182 %.
Peak reserved memory for training % of max memory = 5.19 %.


In [28]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "Ringkaskan teks berikut:\n\n Thilafushi , mungkin nama ini masih asing terdengar . Thilafushi adalah salah satu dari gugusan pulau-pulau bagian dari wilayah negeri Maladewa . Seperti telah banyak diketahui , Maladewa terkenal dengan industri pariwisatanya . Negeri ini memiliki sejumlah pantai tropis dengan air lair berwarna kehijauan , pastinya indah . Namun Thilafushi menampilkan kenyataan yang jauh berbeda dengan pulau-pulau lainnya di Maladewa . Thilafushi adalah pulau buatan hasil reklamasi . Jika di tempat lain terdapat pantai-pantai yang indah , pulau ini merupakan pusat pembuangan limbah . Bahkan pulau ini pun dibuat dari tumpukan sampah . Dulunya Thilafushi merupakan sebuah laguna . Kemajuan industri pariwisata memaksa pemerintah Maladewa untuk mereklamasi laguna ini , karena kebutuhan akan tempat pembuangan semakin mendesak . Berawal pada Desember 1991 penggalian lubang penampungan limbah mulai dilakukan . Sampah - sampah berdatangan dari seluruh penjuru Maldives . Diendapkan ke dalam lubang berukuran 1060 meter kubik hingga penuh . Bagian atasnya ditutup dengan puing-puing bangunan , merata dengan ketinggian tanah di sekitarnya . Terakhir , kemudian bagian permukaannya ditutup dengan pasir pantai . Proyek reklamasi tersebut berjalan dengan baik sampai sekarang . Bahkan sebagian wilayahnya kini menjadi daerah industri dan pemukiman . Saat ini , setidaknya ada lebih dari 30 pabrik berdiri di Pulau Thilafushi . Mulai dari pabrik pengemasan semen dan gas , manufaktur perahu , hingga pergudangan . Pulau ini juga menjadi tempat tinggal sekitar 150 imigran asal Bangladesh yang sehari-hari bekerja memilah sampah . Pulau Sampah Thilafushi , Maladewa 2014 Merdeka.com / Populer Mechanics Reklamasi Thilafushi pun masih berjalan sampai sekarang dengan sampah - sampah yang terus berdatangan . Membuat luas pulau pembuangan ini bertambah 1 meter persegi setiap harinya . Sayangnya , belakangan sampah - sampah di pulau ini terhanyut diterjang ombak . Mengotori laut dan mencemari keindahan spot - spot diving di pulau wisata sekitar . Meski reklamasi sempat dihentikan , pemerintah Maldives melanjutkannya kembali . Sebagian jenis sampah kini diekspor ke India untuk didaur ulang .",
    }]
}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt = True, # Must add for generation
)
outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 128, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
)
tokenizer.batch_decode(outputs)

['<bos><bos><start_of_turn>user\nRingkaskan teks berikut:\n\n Thilafushi , mungkin nama ini masih asing terdengar . Thilafushi adalah salah satu dari gugusan pulau-pulau bagian dari wilayah negeri Maladewa . Seperti telah banyak diketahui , Maladewa terkenal dengan industri pariwisatanya . Negeri ini memiliki sejumlah pantai tropis dengan air lair berwarna kehijauan , pastinya indah . Namun Thilafushi menampilkan kenyataan yang jauh berbeda dengan pulau-pulau lainnya di Maladewa . Thilafushi adalah pulau buatan hasil reklamasi . Jika di tempat lain terdapat pantai-pantai yang indah , pulau ini merupakan pusat pembuangan limbah . Bahkan pulau ini pun dibuat dari tumpukan sampah . Dulunya Thilafushi merupakan sebuah laguna . Kemajuan industri pariwisata memaksa pemerintah Maladewa untuk mereklamasi laguna ini , karena kebutuhan akan tempat pembuangan semakin mendesak . Berawal pada Desember 1991 penggalian lubang penampungan limbah mulai dilakukan . Sampah - sampah berdatangan dari selur

In [30]:
!zip -r /kaggle/working/my_model_checkpoint_86.zip /kaggle/working/results/checkpoint-86
!zip -r /kaggle/working/my_model_checkpoint_172.zip /kaggle/working/results/checkpoint-172
!zip -r /kaggle/working/my_model_checkpoint_255.zip /kaggle/working/results/checkpoint-255

  adding: kaggle/working/results/checkpoint-86/ (stored 0%)
  adding: kaggle/working/results/checkpoint-86/trainer_state.json (deflated 76%)
  adding: kaggle/working/results/checkpoint-86/tokenizer.model (deflated 52%)
  adding: kaggle/working/results/checkpoint-86/rng_state.pth (deflated 25%)
  adding: kaggle/working/results/checkpoint-86/adapter_model.safetensors (deflated 7%)
  adding: kaggle/working/results/checkpoint-86/README.md (deflated 66%)
  adding: kaggle/working/results/checkpoint-86/tokenizer_config.json (deflated 96%)
  adding: kaggle/working/results/checkpoint-86/optimizer.pt (deflated 10%)
  adding: kaggle/working/results/checkpoint-86/scheduler.pt (deflated 56%)
  adding: kaggle/working/results/checkpoint-86/tokenizer.json (deflated 83%)
  adding: kaggle/working/results/checkpoint-86/adapter_config.json (deflated 56%)
  adding: kaggle/working/results/checkpoint-86/added_tokens.json (stored 0%)
  adding: kaggle/working/results/checkpoint-86/special_tokens_map.json (defl