# MedMT: Training & Inference on Colab
โน้ตบุ๊กนี้สำหรับรันเทรนและอินเฟอเรนซ์โมเดล MedMT บน Google Colab

## 1. ติดตั้ง dependencies
ติดตั้งไลบรารีที่จำเป็นทั้งหมดสำหรับโปรเจกต์นี้

In [None]:
!pip install torch>=1.13 transformers==4.40.0 datasets>=2.19.0 sentencepiece sacrebleu pandas scikit-learn accelerate tqdm openpyxl jupyter safetensors huggingface_hub peft deepspeed tenacity>=8.2.0 python-dotenv>=1.0.0 openai>=1.0.0

## 2. อัปโหลดไฟล์โค้ดและไฟล์ข้อมูล
อัปโหลดไฟล์ในโฟลเดอร์ src/ และไฟล์ข้อมูลที่จำเป็น เช่น config.yaml, train.csv, test.csv

In [None]:
# Upload the project sources and data files into the Colab runtime.
from google.colab import files
# The result of files.upload() is kept in `uploaded`.
# NOTE(review): presumably a mapping of uploaded file names to their contents — confirm.
uploaded = files.upload()

## 3. เทรนโมเดล (train.py)
ตัวอย่างโค้ดสำหรับเทรนโมเดล MedMT

In [None]:
import argparse
import yaml
from src.data_loader import load_data
from src.model import MedMTModel
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from tqdm import tqdm
import os
from huggingface_hub import HfApi, HfFolder, Repository, create_repo

class MedDataset(Dataset):
    """Map-style dataset yielding ``(source, target)`` pairs from a DataFrame.

    The wrapped frame is indexed positionally and must expose ``'source'``
    and ``'target'`` columns.
    """

    def __init__(self, df):
        # Only a reference is stored; rows are looked up on access.
        self.df = df

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` values at position *idx*."""
        row = self.df.iloc[idx]
        return row['source'], row['target']

def train(config):
    """Fine-tune the MedMT model and save it to ``config['model_save_path']``.

    Args:
        config: Mapping providing ``train_data``, ``pretrained_model``,
            ``learning_rate``, ``batch_size``, ``epochs``,
            ``max_seq_length`` and ``model_save_path``.

    The model and tokenizer are written with ``save_pretrained``; the weights
    are requested in safetensors format.
    """
    df = load_data(config['train_data'], is_train=True)
    model = MedMTModel(config['pretrained_model'])
    optimizer = AdamW(model.model.parameters(), lr=config['learning_rate'])
    loader = DataLoader(MedDataset(df), batch_size=config['batch_size'], shuffle=True)
    pad_token_id = model.tokenizer.pad_token_id
    model.model.train()
    for epoch in range(config['epochs']):
        loss_sum = 0.0
        n_batches = 0
        for src, tgt in tqdm(loader, desc=f"Epoch {epoch+1}"):
            inputs = model.tokenizer(
                list(src), return_tensors="pt", padding=True, truncation=True,
                max_length=config['max_seq_length'],
            ).to(model.device)
            # text_target makes the tokenizer apply its target-side settings
            # (for example a target language code) instead of the source-side ones.
            labels = model.tokenizer(
                text_target=list(tgt), return_tensors="pt", padding=True, truncation=True,
                max_length=config['max_seq_length'],
            ).input_ids.to(model.device)
            if pad_token_id is not None:
                # -100 is the index the loss ignores; without this the model is
                # also trained to predict padding.
                labels[labels == pad_token_id] = -100
            outputs = model.model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss_sum += loss.item()
            n_batches += 1
        if n_batches:
            # Report the mean over the epoch rather than only the last batch.
            print(f"Epoch {epoch+1} Loss: {loss_sum / n_batches:.4f}")
        else:
            print(f"Epoch {epoch+1}: no training batches")
    save_path = config['model_save_path']
    # Create the target directory itself; os.path.dirname() is '' for a bare
    # directory name and os.makedirs('') raises.
    os.makedirs(save_path, exist_ok=True)
    # save_pretrained writes model.safetensors in the layout from_pretrained
    # expects. A second raw save_file() of the state dict into the same file
    # would overwrite it without that layout, so it is not done here.
    model.model.save_pretrained(save_path, safe_serialization=True)
    model.tokenizer.save_pretrained(save_path)
    print(f"Model weights saved as .safetensors at {save_path}")

## 4. อินเฟอเรนซ์ (inference.py)
ตัวอย่างโค้ดสำหรับรันอินเฟอเรนซ์โมเดล MedMT

In [None]:
import argparse
import yaml
import pandas as pd
from src.data_loader import load_data
from src.model import MedMTModel
from tqdm import tqdm
import os

def inference(config, input_path, output_path):
    """Translate every ``source`` row of a CSV file and write the results.

    Args:
        config: Mapping providing ``model_save_path`` (the trained model).
        input_path: CSV file passed to ``load_data(..., is_train=False)``.
        output_path: Destination CSV with columns ``id`` and ``translation``.
    """
    df = load_data(input_path, is_train=False)
    model = MedMTModel(config['model_save_path'])
    # Sentences are translated one at a time, as before.
    preds = [
        model.translate([src])[0]
        for src in tqdm(df['source'].tolist(), desc="Translating")
    ]
    out_dir = os.path.dirname(output_path)
    if out_dir:
        # dirname is '' for a bare file name such as 'output.csv', and
        # os.makedirs('') raises, so only create a directory when one is given.
        os.makedirs(out_dir, exist_ok=True)
    pd.DataFrame({'id': range(len(preds)), 'translation': preds}).to_csv(output_path, index=False)

## 5. ตัวอย่างการใช้งาน
ตัวอย่างการเรียกใช้ฟังก์ชัน train และ inference

In [None]:
# Load the configuration
with open('config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)
# Train the model
# train(config)
# Run inference
# inference(config, 'data/test.csv', 'output.csv')

In [None]:
# --- src/data_loader.py ---
import pandas as pd
import random
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi, HfFolder, create_repo, Repository
import os
from typing import Optional

def load_data(path: str, is_train: bool = True):
    """Read a CSV file and check that the required columns are present.

    Args:
        path: Location of the CSV file, passed to ``pandas.read_csv``.
        is_train: When true, ``target`` is required in addition to
            ``context`` and ``source``.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If a required column is missing. An explicit exception is
            used instead of ``assert`` so the check survives ``python -O``
            and names the missing columns.
    """
    df = pd.read_csv(path)
    required = {'context', 'source', 'target'} if is_train else {'context', 'source'}
    missing = sorted(required.difference(df.columns))
    if missing:
        raise ValueError(f"{path} is missing required column(s): {', '.join(missing)}")
    return df

def generate_synthetic_medical_dialogue(n_samples=1000, seed=42):
    """Build a small synthetic doctor-patient dataset.

    Each row has the fixed ``context`` plus one ``(source, target)`` pair
    drawn at random. Source and target are drawn together as a pair so that a
    row never combines a source with the target of a different sentence.

    Args:
        n_samples: Number of rows to generate.
        seed: Seed for a private random generator; the module-level
            ``random`` state is left untouched.

    Returns:
        A ``pandas.DataFrame`` with columns ``context``, ``source`` and
        ``target`` (also present when ``n_samples`` is 0).
    """
    context = "แพทย์: คุณมีอาการอะไรบ้าง?\nผู้ป่วย: ฉันปวดหัว"
    # Index-aligned sentence pairs: each source belongs with exactly one target.
    pairs = [
        ("คุณกินยาหรือยัง?", "คุณกินยาแล้วหรือยัง?"),
        ("คุณต้องพักผ่อน", "คุณควรพักผ่อน"),
        ("คุณต้องตรวจเพิ่มเติม", "คุณต้องตรวจร่างกาย"),
        ("คุณมีไข้ไหม?", "คุณมีไข้ไหม?"),
    ]
    rng = random.Random(seed)
    data = []
    for _ in range(n_samples):
        source, target = rng.choice(pairs)
        data.append({"context": context, "source": source, "target": target})
    return pd.DataFrame(data, columns=["context", "source", "target"])

def save_dataset_to_hub(df, repo_name, split_name="train", private=False):
    """Upload a DataFrame to the Hugging Face Hub as a dataset split.

    Args:
        df: ``pandas.DataFrame`` to upload.
        repo_name: Hub repository id to create or reuse.
        split_name: Name of the split the rows are stored under.
        private: Whether the repository is created as private.

    Raises:
        RuntimeError: If no Hugging Face token is stored locally.
    """
    token = HfFolder.get_token()
    if token is None:
        raise RuntimeError("Please login to Hugging Face CLI: huggingface-cli login")
    # repo_type="dataset" so the data does not land in a model repository.
    repo_url = create_repo(
        repo_name, token=token, private=private, repo_type="dataset", exist_ok=True
    )
    dataset = Dataset.from_pandas(df)
    # push_to_hub uploads the split directly. The previous flow wrote files into
    # a fresh local folder and then tried to clone the repository into that
    # same, already non-empty folder.
    dataset.push_to_hub(
        repo_name,
        split=split_name,
        private=private,
        token=token,
        commit_message="Add synthetic dataset",
    )
    # create_repo's return value is formatted directly as the URL.
    print(f"Dataset pushed to Hugging Face Hub: {repo_url}")

In [None]:
# --- src/model.py ---
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from safetensors.torch import save_file as save_safetensors
from typing import Optional
import os

class MedMTModel:
    """Bundle a seq2seq model, its tokenizer and the device they run on."""

    def __init__(self, model_name_or_path: str, device: Optional[str] = None):
        """Load tokenizer and model and move the model to *device*.

        Args:
            model_name_or_path: Identifier or directory passed to
                ``from_pretrained``.
            device: Target device; defaults to ``"cuda"`` when available,
                otherwise ``"cpu"``.
        """
        # NOTE(review): trust_remote_code=True allows code shipped with the
        # checkpoint to run while loading; only use checkpoints you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, trust_remote_code=True)
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def translate(self, src_texts, max_length=256):
        """Translate a batch of texts.

        Args:
            src_texts: A sequence of strings, or a single string (treated as
                a batch of one).
            max_length: Truncation length for the inputs and ``max_length``
                passed to ``generate``.

        Returns:
            A list with one decoded string per input; empty for empty input.
        """
        if isinstance(src_texts, str):
            src_texts = [src_texts]
        src_texts = list(src_texts)
        if not src_texts:
            # Nothing to tokenize or generate for an empty batch.
            return []
        inputs = self.tokenizer(
            src_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length
        ).to(self.device)
        outputs = self.model.generate(**inputs, max_length=max_length)
        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)

    def save(self, save_dir: str, use_safetensors: bool = False):
        """Save tokenizer and model into *save_dir*.

        Args:
            save_dir: Destination directory; created if missing. The
                caller's value is honoured rather than replaced by a fixed
                ``./models`` path.
            use_safetensors: When true, explicitly request safetensors
                weights; otherwise the library default format is used.
        """
        os.makedirs(save_dir, exist_ok=True)
        self.tokenizer.save_pretrained(save_dir)
        if use_safetensors:
            # Go through save_pretrained so the config is written next to the
            # weights and the directory can be loaded with from_pretrained.
            self.model.save_pretrained(save_dir, safe_serialization=True)
        else:
            self.model.save_pretrained(save_dir)

In [None]:
# --- src/evaluate.py ---
import argparse
import yaml
from src.data_loader import load_data
from src.model import MedMTModel
import sacrebleu
from tqdm import tqdm

def evaluate(config):
    """Translate the configured data with the saved model and report BLEU.

    Args:
        config: Mapping providing ``train_data`` (CSV with ``source`` and
            ``target`` columns) and ``model_save_path``.

    Returns:
        The corpus-level BLEU score, which is also printed.
    """
    # NOTE(review): the score is computed on config['train_data'], i.e. the
    # training file — confirm whether a held-out set should be used instead.
    df = load_data(config['train_data'], is_train=True)
    model = MedMTModel(config['model_save_path'])
    tgts = df['target'].tolist()
    preds = [
        model.translate([src])[0]
        for src in tqdm(df['source'].tolist(), desc="Evaluating")
    ]
    bleu = sacrebleu.corpus_bleu(preds, [tgts], smooth_method='exp')
    print(f"BLEU: {bleu.score:.2f}")
    # Returned as well as printed, so callers can use the value.
    return bleu.score

In [None]:
# --- src/generate_dataset.py ---
# หมายเหตุ: โค้ดนี้ยาวมากและมีฟังก์ชันย่อยจำนวนมาก สามารถนำเข้าไฟล์นี้หรือคัดลอกฟังก์ชันที่ต้องการใช้งานได้
# ตัวอย่างการเรียกใช้งาน (แนะนำให้รันใน Colab เฉพาะฟังก์ชันที่ต้องการ)
# from src.generate_dataset import generate_deepseek_medical_dialogue
# df = generate_deepseek_medical_dialogue(n_samples=10)
# df.head()
# สามารถดูรายละเอียดฟังก์ชันอื่น ๆ ได้ในไฟล์ generate_dataset.py


In [None]:
# --- src/train.py ---
import argparse
import yaml
from src.data_loader import load_data
from src.model import MedMTModel
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from tqdm import tqdm
import os
from huggingface_hub import HfApi, HfFolder, Repository, create_repo

class MedDataset(Dataset):
    """Expose a DataFrame's ``source``/``target`` columns as a torch dataset."""

    def __init__(self, df):
        # The frame is kept as-is and indexed by position on access.
        self.df = df

    def __len__(self):
        """Return how many rows the frame holds."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair stored at position *idx*."""
        record = self.df.iloc[idx]
        return record['source'], record['target']

def train(config):
    """Fine-tune the MedMT model and save it to ``config['model_save_path']``.

    Args:
        config: Mapping providing ``train_data``, ``pretrained_model``,
            ``learning_rate``, ``batch_size``, ``epochs``,
            ``max_seq_length`` and ``model_save_path``.
    """
    df = load_data(config['train_data'], is_train=True)
    model = MedMTModel(config['pretrained_model'])
    optimizer = AdamW(model.model.parameters(), lr=config['learning_rate'])
    loader = DataLoader(MedDataset(df), batch_size=config['batch_size'], shuffle=True)
    pad_token_id = model.tokenizer.pad_token_id
    model.model.train()
    for epoch in range(config['epochs']):
        loss_sum = 0.0
        n_batches = 0
        for src, tgt in tqdm(loader, desc=f"Epoch {epoch+1}"):
            inputs = model.tokenizer(
                list(src), return_tensors="pt", padding=True, truncation=True,
                max_length=config['max_seq_length'],
            ).to(model.device)
            # text_target makes the tokenizer apply its target-side settings
            # (for example a target language code) instead of the source-side ones.
            labels = model.tokenizer(
                text_target=list(tgt), return_tensors="pt", padding=True, truncation=True,
                max_length=config['max_seq_length'],
            ).input_ids.to(model.device)
            if pad_token_id is not None:
                # -100 is the index the loss ignores; without this the model is
                # also trained to predict padding.
                labels[labels == pad_token_id] = -100
            outputs = model.model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            loss_sum += loss.item()
            n_batches += 1
        if n_batches:
            # Report the mean over the epoch rather than only the last batch.
            print(f"Epoch {epoch+1} Loss: {loss_sum / n_batches:.4f}")
        else:
            print(f"Epoch {epoch+1}: no training batches")
    save_path = config['model_save_path']
    # Create the target directory itself; os.path.dirname() is '' for a bare
    # directory name and os.makedirs('') raises.
    os.makedirs(save_path, exist_ok=True)
    model.model.save_pretrained(save_path)
    model.tokenizer.save_pretrained(save_path)

In [None]:
# --- src/inference.py ---
import argparse
import yaml
import pandas as pd
from src.data_loader import load_data
from src.model import MedMTModel
from tqdm import tqdm
import os

def inference(config, input_path, output_path):
    """Translate every ``source`` row of a CSV file and write the results.

    Args:
        config: Mapping providing ``model_save_path`` (the trained model).
        input_path: CSV file passed to ``load_data(..., is_train=False)``.
        output_path: Destination CSV with columns ``id`` and ``translation``.
    """
    df = load_data(input_path, is_train=False)
    model = MedMTModel(config['model_save_path'])
    preds = []
    for src in tqdm(df['source'].tolist(), desc="Translating"):
        preds.append(model.translate([src])[0])
    # A bare file name has no directory part; os.makedirs('') would raise,
    # so the directory is only created when the path actually names one.
    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    pd.DataFrame({'id': range(len(preds)), 'translation': preds}).to_csv(output_path, index=False)

In [None]:
# --- translate_med_dialogue.py ---
import argparse
import os
import pandas as pd
from datasets import load_metric, Dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,
                          Seq2SeqTrainingArguments, Seq2SeqTrainer)
import torch

# Checkpoint that train_model() loads through AutoTokenizer and AutoModelForSeq2SeqLM.
# NOTE(review): verify that this checkpoint is an encoder-decoder model that
# AutoModelForSeq2SeqLM can load with the installed transformers version — TODO confirm.
MODEL_NAME = "Qwen/Qwen2.5-Omni-7B"

def load_data(train_path, test_path, context_col="Context", source_col="Source", target_col="Target"):
    """Read the train and test CSV files.

    The column-name parameters are accepted but not used by this function.

    Returns:
        A ``(train_df, test_df)`` tuple of ``pandas.DataFrame`` objects.
    """
    # Read in the same order as the return value: train first, then test.
    return pd.read_csv(train_path), pd.read_csv(test_path)

def preprocess_function(examples, tokenizer, context_col, source_col, target_col=None, max_length=256):
    """Tokenize a batch of examples for seq2seq training or inference.

    The model input is the context (when present) followed by a newline and
    the source text. Labels are added when *target_col* is given and present
    in *examples*.

    Args:
        examples: Batch mapping column names to lists of values.
        tokenizer: Callable tokenizer accepting ``text_target=``.
        context_col: Column holding the optional context.
        source_col: Column holding the text to translate.
        target_col: Column holding the reference translation, if any.
        max_length: Truncation length for inputs and labels.

    Returns:
        The tokenizer output for the inputs, with ``"labels"`` set when
        targets were tokenized.
    """
    # str() keeps non-string cells (e.g. numbers read from CSV) from raising on
    # concatenation, matching how evaluate_model builds its inputs.
    src_texts = [
        (str(c) + "\n" if c and not pd.isna(c) else "") + str(s)
        for c, s in zip(examples[context_col], examples[source_col])
    ]
    model_inputs = tokenizer(src_texts, max_length=max_length, truncation=True)
    if target_col and target_col in examples:
        # text_target replaces the deprecated as_target_tokenizer() context manager.
        labels = tokenizer(text_target=examples[target_col], max_length=max_length, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def train_model(train_file, output_dir, context_col="Context", source_col="Source", target_col="Target"):
    """Fine-tune ``MODEL_NAME`` on a CSV file and save the result.

    Args:
        train_file: CSV file with the context, source and target columns.
        output_dir: Directory receiving checkpoints, the final model and
            the tokenizer.
        context_col: Name of the context column.
        source_col: Name of the source-text column.
        target_col: Name of the target-text column.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, trust_remote_code=True)

    def tokenize_batch(batch):
        # Bind the tokenizer and column names for Dataset.map.
        return preprocess_function(batch, tokenizer, context_col, source_col, target_col)

    raw_train = Dataset.from_pandas(pd.read_csv(train_file))
    tokenized_train = raw_train.map(tokenize_batch, batched=True)

    training_options = {
        "output_dir": output_dir,
        "evaluation_strategy": "no",
        "per_device_train_batch_size": 8,
        "num_train_epochs": 3,
        "save_total_limit": 1,
        # Mixed precision is enabled only when CUDA is available.
        "fp16": torch.cuda.is_available(),
        "predict_with_generate": True,
        "logging_steps": 50,
        "save_steps": 500,
        "learning_rate": 5e-5,
        "report_to": [],
    }
    training_args = Seq2SeqTrainingArguments(**training_options)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
        tokenizer=tokenizer,
    )
    trainer.train()

    # Persist both the model and the tokenizer into the same directory.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

def evaluate_model(model_dir, test_file, context_col="Context", source_col="Source", target_col="Target"):
    """Translate a test CSV with a saved model and report corpus BLEU.

    Args:
        model_dir: Directory holding the saved model and tokenizer.
        test_file: CSV file with source and target columns and, optionally,
            a context column.
        context_col: Name of the optional context column.
        source_col: Name of the source-text column.
        target_col: Name of the reference-translation column.

    Returns:
        The corpus-level BLEU score, which is also printed.
    """
    # sacrebleu is called directly: datasets.load_metric is not available in
    # newer datasets releases, and the install line puts no upper bound on datasets.
    import sacrebleu

    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, trust_remote_code=True)
    test_df = pd.read_csv(test_file)
    preds = []
    refs = []
    for _, row in test_df.iterrows():
        # Prefix the context (when the column exists and is not NaN) on its own line.
        src = (str(row[context_col]) + "\n" if context_col in row and pd.notna(row[context_col]) else "") + str(row[source_col])
        input_ids = tokenizer(src, return_tensors="pt", truncation=True, max_length=256).input_ids
        with torch.no_grad():
            output = model.generate(input_ids, max_length=256, num_beams=4)
        preds.append(tokenizer.decode(output[0], skip_special_tokens=True))
        refs.append(str(row[target_col]))
    # One reference per prediction, passed as a single reference stream.
    bleu = sacrebleu.corpus_bleu(preds, [refs]).score
    print(f"BLEU score: {bleu:.2f}")
    return bleu

def translate_file(model_dir, input_file, output_file, context_col="Context", source_col="Source"):
    """Translate each row of a CSV file and save it with a ``Prediction`` column.

    Args:
        model_dir: Directory holding the saved model and tokenizer.
        input_file: CSV file to translate.
        output_file: Destination CSV; the input columns plus ``Prediction``.
        context_col: Name of the optional context column.
        source_col: Name of the source-text column.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, trust_remote_code=True)
    frame = pd.read_csv(input_file)

    predictions = []
    for _, record in frame.iterrows():
        # The context is used only when its column exists and the cell is not NaN.
        has_context = context_col in record and pd.notna(record[context_col])
        prefix = str(record[context_col]) + "\n" if has_context else ""
        prompt = prefix + str(record[source_col])

        encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
        with torch.no_grad():
            generated = model.generate(encoded.input_ids, max_length=256, num_beams=4)
        predictions.append(tokenizer.decode(generated[0], skip_special_tokens=True))

    frame["Prediction"] = predictions
    frame.to_csv(output_file, index=False)
    print(f"Saved translations to {output_file}")