## Wannakadee-Generator-CLM

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM,pipeline
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [2]:
# Identifier of the pretrained checkpoint shared by the tokenizer and the model.
PRETRAINED_NAME = "tupleblog/generate-thai-lyrics"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)
model = AutoModelForCausalLM.from_pretrained(PRETRAINED_NAME)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
class WannakadeeDataset(Dataset):
    r"""Dataset of verse stanzas read from a UTF-8 text file.

    A stanza starts at a line beginning with "๏" and runs until the next such
    line or the end of the file.  Each non-blank line is stripped and followed
    by " \n " when the stanza text is assembled.  Blank lines are skipped and
    any text before the first "๏" line is discarded.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Callable tokenizer; it is invoked with the stanza text and
            the ``add_special_tokens``, ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keyword arguments and must
            return a mapping holding ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every encoded sample is padded or truncated to.

    Attributes:
        lines: List of stanza strings, one per sample.
    """

    STANZA_MARKER = "๏"
    LINE_SEPARATOR = " \n "

    def __init__(self, file_path, tokenizer, max_length):
        self.file_path = file_path
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.lines = []

        # ``stanza`` stays None until the first marker line is seen, so any
        # preamble in front of the first stanza is dropped.
        stanza = None
        with open(self.file_path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                if line.startswith(self.STANZA_MARKER):
                    if stanza is not None:
                        self.lines.append("".join(stanza))
                    stanza = []
                if stanza is not None:
                    stanza.append(line + self.LINE_SEPARATOR)

        # Flush the final stanza; no marker line follows it to trigger the
        # append inside the loop.
        if stanza is not None:
            self.lines.append("".join(stanza))

    def __len__(self):
        """Return the number of stanzas."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Tokenize stanza ``idx``.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'``, each with the
            leading batch dimension of the tokenizer output removed.
        """
        encoded = self.tokenizer(
            self.lines[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            # Pad AND truncate to max_length so every sample has the same
            # size and the default DataLoader collation can stack them.
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # squeeze(0) removes only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when max_length == 1.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
        }

In [None]:
# Settings shared by the training and validation splits.
MAX_LENGTH = 400
BATCH_SIZE = 8

train_dataset = WannakadeeDataset(
    "Dataset/phra_aphai-train.txt", tokenizer, max_length=MAX_LENGTH
)
valid_dataset = WannakadeeDataset(
    "Dataset/phra_aphai-val.txt", tokenizer, max_length=MAX_LENGTH
)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

In [None]:
from transformers import AdamW
from tqdm import tqdm

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)

# torch.optim.AdamW is used in place of the deprecated transformers.AdamW.
# eps and weight_decay are set explicitly to the deprecated class's defaults
# (1e-6 and 0.0) so the optimisation hyperparameters stay the same.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Number of passes the training loop makes over the training data.
num_epochs = 3

In [None]:
def _mask_padding(input_ids, attention_mask):
    """Return LM labels: a copy of ``input_ids`` with padded positions set to -100.

    Padding is identified through ``attention_mask`` rather than by comparing
    against ``tokenizer.pad_token_id``: in this notebook the pad token is the
    EOS token, so an id comparison would also exclude genuine EOS tokens from
    the loss.  -100 is the label value the loss ignores.
    """
    return input_ids.masked_fill(attention_mask == 0, -100)


for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = _mask_padding(input_ids, attention_mask)
        output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = output.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Mean of the per-batch losses.
    train_loss /= len(train_loader)

    # ---- validation pass (no gradients, no parameter updates) ----
    model.eval()
    valid_loss = 0
    with torch.no_grad():
        for batch in tqdm(valid_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = _mask_padding(input_ids, attention_mask)
            output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            valid_loss += output.loss.item()
    valid_loss /= len(valid_loader)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}')

# Write the trained weights and config to the "model" directory.
model.save_pretrained("model")

# Test: generate a sample continuation from a prompt

In [None]:
# Prompt the model and print the first generated continuation.
text = "๏ สัมผัสเส้นขอบฟ้าชลาลัย"
generation_options = {"max_length": 140, "top_k": 25, "temperature": 1}

nlp = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
generated_text = nlp(text, **generation_options)

print(f"Input: {text}")
print(f"Output:\n {generated_text[0]['generated_text']}")