In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

def prepare_model_and_tokenizer(model_name="gpt2"):
    """Load a pretrained GPT-2 checkpoint with its tokenizer and register haiku markers.

    Args:
        model_name: Checkpoint identifier handed to both ``from_pretrained``
            calls. Defaults to ``"gpt2"``; larger variants such as
            ``"gpt2-medium"`` or ``"gpt2-large"`` can be passed instead.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer has the pad/bos/eos
        tokens below registered, and the model's token embeddings have been
        resized to ``len(tokenizer)``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Special tokens act like punctuation that lets the model pick up the
    # structure of the generated text (a haiku).
    special_tokens = {
        'pad_token': '<|pad|>',  # filler so sequences can be padded to a common length
        'bos_token': '<|startoftext|>',  # marks the beginning of a haiku
        'eos_token': '<|endoftext|>',  # marks that the haiku is finished
    }
    tokenizer.add_special_tokens(special_tokens)

    # Keep the embedding table in step with the tokenizer's updated length.
    model.resize_token_embeddings(len(tokenizer))

    return model, tokenizer

def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` from a text file.

    Args:
        file_path: Path of the training text file, forwarded to ``TextDataset``.
        tokenizer: Tokenizer forwarded to ``TextDataset``.
        block_size: Forwarded as ``TextDataset``'s ``block_size``. Defaults to
            128; adjust it to the maximum haiku length.

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

def train_model(model, train_dataset, tokenizer, output_dir):
    """Fine-tune ``model`` on ``train_dataset`` and persist the result.

    Args:
        model: Model handed to ``Trainer``.
        train_dataset: Training dataset handed to ``Trainer``.
        tokenizer: Tokenizer used by the language-modeling collator; it is
            also saved next to the model once training finishes.
        output_dir: Directory used for checkpoints and for the final
            model/tokenizer files.

    Returns:
        The ``Trainer`` instance after training has completed.
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        save_steps=500,
        save_total_limit=2,
        logging_steps=100,
        learning_rate=5e-5,
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # causal language modeling, not masked language modeling
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()

    # The periodic ``save_steps`` checkpoints are intermediate snapshots only,
    # so write the final weights explicitly. The tokenizer is saved alongside
    # them so that both can be reloaded from ``output_dir`` together.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return trainer
    
def generate_haiku(model, tokenizer, prompt="<|startoftext|>", max_length=64):
    """Sample one haiku from the fine-tuned model.

    Args:
        model: Model exposing ``generate``, ``device``, ``training``,
            ``eval`` and ``train``.
        tokenizer: Tokenizer used to encode the prompt and decode the output;
            its ``pad_token_id`` and ``eos_token_id`` are passed to ``generate``.
        prompt: Text the generation starts from. Defaults to the
            ``<|startoftext|>`` marker.
        max_length: Forwarded as ``max_length`` to ``model.generate``.

    Returns:
        The decoded first generated sequence with special tokens stripped.
    """
    # Put the prompt on the same device as the model; after training the model
    # may no longer be on the CPU.
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
    # A single, unpadded prompt: every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    # Sample in eval mode so dropout is disabled, then restore whatever mode
    # the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    finally:
        model.train(was_training)

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Configuration: set TRAIN_FILE to the plain-text haiku corpus before running.
TRAIN_FILE = ""  # TODO: add dataset path
OUTPUT_DIR = "./haiku_model"

# Fail fast, before the pretrained weights are loaded, when no corpus is configured.
if not TRAIN_FILE:
    raise ValueError("TRAIN_FILE is empty: set it to the path of the training text file.")

# Setup
model, tokenizer = prepare_model_and_tokenizer()

# Load dataset
train_dataset = load_dataset(TRAIN_FILE, tokenizer)

# Train model
train_model(model, train_dataset, tokenizer, OUTPUT_DIR)

# Generate sample haiku
generated_haiku = generate_haiku(model, tokenizer)
print(generated_haiku)