In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer

# Dataset

In [None]:
# Pull only the training split of the Thai food corpus from the Hugging Face Hub.
train_dataset = load_dataset(path="pythainlp/thai_food_v1.0", split="train")

In [None]:
def formatting_func(example):
    """Render a batch of dataset rows into plain-text training samples.

    Each sample has a "name" heading followed by the row's ``name`` value and
    a "text" heading followed by the row's ``text`` value (the headings
    themselves are written in Thai).

    Args:
        example: A batched mapping of column name to a sequence of values.
            It must provide ``"name"`` and ``"text"`` columns of equal length.

    Returns:
        A list containing one formatted string per row of the batch; an
        empty batch yields an empty list.
    """
    # len(example) would count the columns of the batch rather than its rows,
    # so walk the column values directly to cover every row exactly once.
    return [
        f"# ชื่อ\n{name}\n\n# ข้อความ\n{text}"
        for name, text in zip(example["name"], example["text"])
    ]

# Tokenizer and Model

In [None]:
# Hub identifier of the base model to fine-tune.
model_checkpoint = "scb10x/typhoon-7b"
# The tokenizer is loaded from the same repository as the model weights;
# point this elsewhere only if a different tokenizer is really intended.
tokenizer_checkpoint = model_checkpoint

In [None]:
# Load the tokenizer and the causal language model. The model is placed on
# the GPU when CUDA is available and on the CPU otherwise.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
)

# A padding token is needed to batch sequences of different lengths.
# Tokenizer and model config are checked independently: requiring both to be
# unset would leave one of them without a padding token whenever only the
# other already defines one.
if tokenizer.pad_token is None:
    # Reuse end-of-sequence as padding when the tokenizer ships without one.
    tokenizer.pad_token = tokenizer.eos_token
if model.config.pad_token_id is None:
    # Keep the model config in agreement with whatever the tokenizer pads with.
    model.config.pad_token_id = tokenizer.pad_token_id

# Train

## Config

In [None]:
# Training configuration: only the output directory is set explicitly, so
# every other option keeps the default defined by SFTConfig.
args = SFTConfig(output_dir="tmp_trainer/thai_food")

## Run

In [None]:
# Wire the model, tokenizer, dataset, and prompt formatter into the
# supervised fine-tuning trainer, using the configuration built above.
trainer = SFTTrainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    formatting_func=formatting_func,
)

# Start fine-tuning.
trainer.train()