In [8]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import numpy as np
from torch.utils.data import Dataset
import os

In [9]:
messages = [
        'Kamu ngapain begini begitu',
        'Kelazz vroo',
        'Bukti himnel tidak bersalah',
        'Mantap bang sudah selesai',
        'Ayo main game bareng',
        'Jangan lupa istirahat ya',
        'Besok kita ketemu dimana?',
        'Sudah makan belum?',
        'Keren banget projectnya',
        'Jangan lupa deadline besok',
        'Mau ikut gathering tidak?',
        'Update progress dong',
        'Gimana kabarnya hari ini?',
        'Sudah selesai tugasnya?',
        'Main game yuk nanti malam',
        'Besok ada acara apa?',
        'Kumpul dimana nih?',
        'Mantap jiwa bro',
        'Sip dah beres semua',
        'Nanti kabarin lagi ya',
        'Semangat pagi ini!',
        'Meeting jam berapa nanti?',
        'Project kita udah bagus banget.',
        'Jangan lupa minum air putih ya.',
        'Kita bisa ngobrol lagi malam ini.',
        'Selamat atas kesuksesanmu!',
        'Ada update progress gak?',
        'Besok kita keluar bareng ya.',
        'Terima kasih sudah bantu.',
        'Kamu ada rencana akhir pekan?',
        'Bagaimana progres tugasmu?',
        'Jangan terlalu keras dalam bekerja.',
        'Kita bisa diskusikan lagi nanti.',
        'Semoga harimu menyenangkan!',
        'Ayo selesaikan ini bareng-bareng.'
    ]

In [10]:
with open('chat_data.txt', 'w', encoding='utf-8') as f:
    for message in messages:
        f.write(message + '\n')

In [11]:
tokenizer = GPT2Tokenizer.from_pretrained('openai-community/gpt2')
model = GPT2LMHeadModel.from_pretrained('openai-community/gpt2')

In [12]:
tokenizer.pad_token = tokenizer.eos_token

In [13]:
class ChatDataset(Dataset):
    def __init__(self, txt_file, tokenizer):
        self.tokenizer = tokenizer
        with open(txt_file, 'r', encoding='utf-8') as f:
            self.lines = [line.strip() for line in f.readlines()]

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        line = self.lines[idx]
        encodings = self.tokenizer(line, truncation=True, padding='max_length',
                                 max_length=64, return_tensors='pt')

        item = {key: torch.squeeze(val) for key, val in encodings.items()}
        item['labels'] = item['input_ids'].clone()

        return item

In [14]:
dataset = ChatDataset('chat_data.txt', tokenizer)

In [15]:
training_args = TrainingArguments(
    output_dir="./chat-model",
    num_train_epochs=50,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=100,
    learning_rate=5e-5,
    report_to='none'
)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=lambda data: {'input_ids': torch.stack([f['input_ids'] for f in data]),
                               'attention_mask': torch.stack([f['attention_mask'] for f in data]),
                               'labels': torch.stack([f['labels'] for f in data])}
)

In [17]:
trainer.train()

Step,Training Loss
100,0.6358
200,0.0846
300,0.0571
400,0.0522


TrainOutput(global_step=450, training_loss=0.19042477501763239, metrics={'train_runtime': 999.43, 'train_samples_per_second': 1.751, 'train_steps_per_second': 0.45, 'total_flos': 57157632000000.0, 'train_loss': 0.19042477501763239, 'epoch': 50.0})

In [18]:
model.save_pretrained('./chat-model-final')
tokenizer.save_pretrained('./chat-model-final')

('./chat-model-final\\tokenizer_config.json',
 './chat-model-final\\special_tokens_map.json',
 './chat-model-final\\vocab.json',
 './chat-model-final\\merges.txt',
 './chat-model-final\\added_tokens.json')

In [70]:
def generate_response(prompt, model, tokenizer, max_length=50):
    encoded_input = tokenizer.encode_plus(
        prompt,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length,
        return_attention_mask=True
    )
    
    outputs = model.generate(
        input_ids=encoded_input['input_ids'],
        attention_mask=encoded_input['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id 
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


In [82]:
def test_model():
    model = GPT2LMHeadModel.from_pretrained('./chat-model-final')
    tokenizer = GPT2Tokenizer.from_pretrained('./chat-model-final')
    
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id
    
    test_prompts = [
        "Halo",
        "udah",
        "aku baik"
    ]
    
    print("Testing chat model responses:")
    print("-" * 50)
    for prompt in test_prompts:
        response = generate_response(prompt, model, tokenizer)
        print(f"Input: {prompt}")
        print(f"Response: {response}")
        print("-" * 50)

In [83]:
if __name__ == "__main__":
    test_model()

Testing chat model responses:
--------------------------------------------------
Input: Halo
Response: Halo belum air putih ya.
--------------------------------------------------
Input: udah
Response: udah makan belum?
--------------------------------------------------
Input: aku baik
Response: aku baikut gathering tidak?
--------------------------------------------------
