In [None]:
!pip install transformers
!pip install transformers datasets accelerate



In [None]:
!pip install --upgrade transformers



In [None]:
!pip install -U transformers datasets




# 1. Install Library

In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from datasets import load_dataset, Dataset, DatasetDict
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [None]:
# Load CSV dengan pandas
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")

# Konversi ke Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Gabungkan jika ingin buat DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

In [None]:
def format_example(example):
    return {'text': f"{example['input']} {example['output']}"}

dataset = dataset.map(format_example)

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
# Load model GPT-2 yang sudah di-train dengan bahasa Indonesia
tokenizer = GPT2Tokenizer.from_pretrained("cahya/gpt2-small-indonesian-522M")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("cahya/gpt2-small-indonesian-522M")
model.config.pad_token_id = tokenizer.pad_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/894k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/452k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [None]:
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    save_steps=50,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=20,
    logging_dir='./logs',
    report_to="tensorboard",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

In [None]:
import transformers
print(transformers.__version__)

4.52.4


In [None]:
trainer.train()

Step,Training Loss
20,0.7635
40,0.5367
60,0.4368
80,0.3687
100,0.3727
120,0.3421
140,0.3091
160,0.2836
180,0.2393


TrainOutput(global_step=180, training_loss=0.40583419534895154, metrics={'train_runtime': 161.4985, 'train_samples_per_second': 4.396, 'train_steps_per_second': 1.115, 'total_flos': 185517342720000.0, 'train_loss': 0.40583419534895154, 'epoch': 10.0})

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# (Optional) STEP 8: Simpan model & tokenizer
trainer.save_model("/content/drive/MyDrive/LASKAR/Gpt-2")
tokenizer.save_pretrained("/content/drive/MyDrive/LASKAR/Gpt-2")

('/content/drive/MyDrive/LASKAR/Gpt-2/tokenizer_config.json',
 '/content/drive/MyDrive/LASKAR/Gpt-2/special_tokens_map.json',
 '/content/drive/MyDrive/LASKAR/Gpt-2/vocab.json',
 '/content/drive/MyDrive/LASKAR/Gpt-2/merges.txt',
 '/content/drive/MyDrive/LASKAR/Gpt-2/added_tokens.json')

In [None]:
model_path = "/content/drive/MyDrive/LASKAR/Gpt-2"

tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

In [None]:
from transformers import pipeline

def generate_text(
    model,
    tokenizer,
    prompt,
    max_length=150,
    do_sample=True,
    temperature=1.0,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1,
    repetition_penalty=1.0,
    eos_token_id=None
):
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

    outputs = generator(
        prompt,
        max_length=max_length,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        num_return_sequences=num_return_sequences,
        repetition_penalty=repetition_penalty,
        eos_token_id=eos_token_id,
    )

    return [out["generated_text"] for out in outputs]


In [None]:
prompt = (
    "ojol"
)


results = generate_text(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_length=10,
    temperature=0.7,
    top_k=30,
    top_p=0.92,
    num_return_sequences=3,
    repetition_penalty=1.5,
    eos_token_id=tokenizer.eos_token_id
)

for i, res in enumerate(results):
    print(f"\n=== Generated Text {i+1} ===\n{res}")


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=10) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



=== Generated Text 1 ===
ojol Pengendara ojol sedang berada di jalur Anda. Ada motor dengan pengemudi ojol mendekat.


=== Generated Text 2 ===
ojol Waspada, ojek online berhenti di dekat Anda.


=== Generated Text 3 ===
ojol Ojek online terlihat, harap berhati-hati. Jangan menyeberang dulu di tempat yang aman.

