In [None]:
!pip install -q transformers peft datasets accelerate torch tqdm

In [None]:
import sys

# Give this site-packages directory top priority on the module search path.
# NOTE(review): the path is specific to one machine/user/Python version; confirm
# it is still needed wherever this notebook is run.
sys.path[:0] = ["/home/admin/.local/lib/python3.10/site-packages"]

In [None]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training

In [None]:
# Hub identifier of the base checkpoint that is fine-tuned below.
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"

# Token budget used for tokenizer truncation and as the generation cap.
MAX_LENGTH = 256

# Source language: "en" selects English -> Vietnamese; any other value selects
# Vietnamese -> English.
SRC = "en"

# Line-aligned parallel text files (one sentence per line). Fill these in before
# running the data cells.
TRAIN_EN_PATH = ""
TRAIN_VI_PATH = ""
TEST_EN_PATH = ""
TEST_VI_PATH = ""

In [None]:
# Tokenizer for the base checkpoint; the end-of-sequence token doubles as padding.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Base causal LM in half precision; device_map="auto" delegates device placement.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16
)

# NOTE(review): the model above is loaded in fp16, not int8, and later PEFT
# releases deprecate this helper in favour of `prepare_model_for_kbit_training`;
# confirm this call is still wanted for the installed PEFT version.
model = prepare_model_for_int8_training(model)

# The training cell enables gradient checkpointing. With every base weight frozen,
# the embedding output must require grad or no gradient reaches the LoRA layers.
model.enable_input_require_grads()

# LoRA adapters on the attention projections only.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    # The LoraConfig field is `lora_dropout`; `dropout=` raises TypeError.
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [None]:
import random

# Read the parallel corpus. Iterating over the file handle splits on newlines only;
# str.splitlines() would also break on characters such as U+2028 or form feed,
# which can silently shift one side of the corpus against the other.
with open(TRAIN_EN_PATH, 'r', encoding='utf-8') as f:
    ens = [line.rstrip('\n') for line in f]
with open(TRAIN_VI_PATH, 'r', encoding='utf-8') as f:
    vis = [line.rstrip('\n') for line in f]

# The two files must be line-aligned; fail loudly instead of pairing wrong sentences.
if len(ens) != len(vis):
    raise ValueError(
        f"Parallel training files differ in length: {len(ens)} English lines "
        f"vs {len(vis)} Vietnamese lines"
    )

# Deterministic shuffle, then hold out the first 10% of the shuffled indices
# for validation.
random.seed(42)
indices = list(range(len(ens)))
random.shuffle(indices)

split = int(len(ens) * 0.1)
valid_indices = indices[:split]
train_indices = indices[split:]

train_ens = [ens[i] for i in train_indices]
train_vis = [vis[i] for i in train_indices]

valid_ens = [ens[i] for i in valid_indices]
valid_vis = [vis[i] for i in valid_indices]

# Pick the translation direction: which side is the source, which is the target,
# and which instruction is prepended to every source sentence.
if SRC == 'en':
    prompt_prefix = "Translate the following English medical text into Vietnamese exactly, preserving all medical terms, numbers, units, and formatting: "
    train_sources, train_tgt_texts = train_ens, train_vis
    valid_sources, valid_tgt_texts = valid_ens, valid_vis
else:
    prompt_prefix = "Dịch chính xác đoạn văn y tế sau từ Vietnamese sang English, giữ nguyên tất cả thuật ngữ, con số, đơn vị và định dạng: "
    train_sources, train_tgt_texts = train_vis, train_ens
    valid_sources, valid_tgt_texts = valid_vis, valid_ens

train_src_texts = [f"{prompt_prefix}{s}" for s in train_sources]
valid_src_texts = [f"{prompt_prefix}{s}" for s in valid_sources]

In [None]:
train_data = Dataset.from_dict({"input": train_src_texts, "output": train_tgt_texts})
valid_data = Dataset.from_dict({"input": valid_src_texts, "output": valid_tgt_texts})


def tokenize_fn(batch):
    """Build causal-LM training features from a batch of prompt/target pairs.

    A decoder-only model learns by predicting the next token of one sequence, so
    each example is the concatenation ``prompt + target + EOS``. ``labels`` has the
    same length as ``input_ids``; prompt positions are set to -100 so the loss is
    computed on the translation (and its closing EOS) only.

    Args:
        batch: mapping with parallel lists under ``"input"`` (prompts) and
            ``"output"`` (reference translations).

    Returns:
        dict with unpadded ``input_ids``, ``attention_mask`` and ``labels`` lists;
        padding is left to the data collator.
    """
    # The prompt is tokenized with the same defaults the inference loop uses, so
    # the model sees identical prompt ids at train and test time.
    prompt_ids_batch = tokenizer(
        batch["input"],
        truncation=True,
        max_length=MAX_LENGTH
    )["input_ids"]
    # One slot of the target budget is reserved for the EOS appended below.
    target_ids_batch = tokenizer(
        batch["output"],
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_LENGTH - 1
    )["input_ids"]

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_ids, target_ids in zip(prompt_ids_batch, target_ids_batch):
        # EOS teaches the model where the translation ends.
        target_ids = target_ids + [tokenizer.eos_token_id]
        input_ids = prompt_ids + target_ids
        features["input_ids"].append(input_ids)
        features["attention_mask"].append([1] * len(input_ids))
        features["labels"].append([-100] * len(prompt_ids) + target_ids)
    return features


train_ds = train_data.map(tokenize_fn, batched=True, remove_columns=["input","output"])
valid_ds = valid_data.map(tokenize_fn, batched=True, remove_columns=["input","output"])

# Pads input_ids/attention_mask with the tokenizer and labels with -100, per batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, padding="longest", return_tensors="pt")

In [None]:
# Fine-tuning configuration. Checkpoints are written under ./qwen-lora-ft-<SRC>.
training_args = TrainingArguments(
    output_dir=f"./qwen-lora-ft-{SRC}",

    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=1,

    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=500,
    max_grad_norm=1.0,

    num_train_epochs=10,

    logging_strategy="steps",
    logging_steps=500,
    log_level="info",

    # load_best_model_at_end requires the save and evaluation strategies to match
    # (and save_steps to be a multiple of eval_steps), so checkpoints follow the
    # same 500-step cadence as evaluation instead of being written once per epoch.
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,

    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy`; keep whichever spelling the installed version accepts.
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,

    fp16=True,
    dataloader_drop_last=False,
    gradient_checkpointing=True,

    seed=42,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    data_collator=data_collator
)
trainer.train()
# Persist the adapter weights next to the checkpoints.
model.save_pretrained(f"./qwen-lora-ft-{SRC}")

In [None]:
import re
def normalize(text: str) -> str:
    """Canonicalise punctuation, math symbols and number spacing in a sentence.

    Steps, in order: trim; map typographic dashes, fullwidth symbols and curly
    quotes to plain equivalents; remove spaces before ``%`` and around ``/`` and
    ``-``; put exactly one space on each side of comparison/arithmetic symbols;
    delete spaces inside digit runs; add a space after punctuation that is
    directly followed by a letter; collapse whitespace; re-join decimal and
    thousands separators to their digits.

    Args:
        text: raw sentence.

    Returns:
        The normalised sentence (an empty string stays empty).
    """
    text = text.strip()
    replace_map = {
        '–':'-', '—':'-', '−':'-', '﹣':'-',
        # Spelled as escapes because these keys must never be empty strings:
        # str.replace('', x) would insert x between every character.
        # NOTE(review): assumed to be the Symbol-font private-use glyphs for
        # >= (U+F0B3) and <= (U+F0A3) -- confirm against the data.
        '\uf0b3':'≥', '\uf0a3':'≤',
        '∕':'/', '／':'/',
        '＝':'=',
        '＋':'+',
        '％':'%',
        '±':'±',  # identity mapping as written
        '‘':"'", '’':"'",
        '“':'"', '”':'"',
    }
    for k, v in replace_map.items():
        text = text.replace(k, v)
    # Tighten percent signs, slashes and hyphens.
    text = re.sub(r'\s+%', '%', text)
    text = re.sub(r'\s*/\s*', '/', text)
    text = re.sub(r'\s*-\s*', '-', text)
    # Exactly one space on both sides of each operator.
    for symbol in ['<', '>', '=', '±', '+', '≥', '≤']:
        text = re.sub(r'\s*{}\s*'.format(re.escape(symbol)), f' {symbol} ', text)

    def merge_numbers_only(match):
        """Drop spaces inside a matched digit run."""
        s = match.group(0)
        return s.replace(' ', '')
    # NOTE(review): this also fuses two separate numbers that are divided only by
    # a space (e.g. "5 10" -> "510"); confirm that is intended.
    text = re.sub(r'\b\d[\d\s/.,%]*\d\b', merge_numbers_only, text)

    # Space after sentence punctuation that is glued to the following letter.
    text = re.sub(r'([.,:?!])(?=[A-Za-zÀ-ÖØ-öø-ÿ])', r'\1 ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Keep decimal/thousands separators attached to their digits.
    text = re.sub(r'(\d),\s*(\d)', r'\1,\2', text)
    text = re.sub(r'(\d)([.,])\s*(\d)', r'\1\2\3', text)
    return text

# Read the parallel test set. Iterating over the file handle splits on newlines
# only; str.splitlines() would also break on characters such as U+2028 or form
# feed and could shift one file against the other.
with open(TEST_EN_PATH, 'r', encoding='utf-8') as f:
    test_ens = [line.rstrip('\n') for line in f]
with open(TEST_VI_PATH, 'r', encoding='utf-8') as f:
    test_vis = [line.rstrip('\n') for line in f]

# Predictions are written one per line, so the references must be line-aligned.
if len(test_ens) != len(test_vis):
    raise ValueError(
        f"Parallel test files differ in length: {len(test_ens)} English lines "
        f"vs {len(test_vis)} Vietnamese lines"
    )

# Build the prompts from the *normalized* source sentences. Previously the
# normalized list was immediately overwritten by a comprehension over the raw
# lines, so normalization never took effect.
# NOTE(review): the training prompts are built from un-normalized text; confirm
# that normalizing only at test time is intended.
if SRC == 'en':
    test_tgt_texts = test_vis
    test_src_texts = [
        f"Translate the following English medical text into Vietnamese exactly, preserving all medical terms, numbers, units, and formatting: {s}"
        for s in (normalize(text) for text in test_ens)
    ]

else:
    test_tgt_texts = test_ens
    test_src_texts = [
        f"Dịch chính xác đoạn văn y tế sau từ Vietnamese sang English, giữ nguyên tất cả thuật ngữ, con số, đơn vị và định dạng: {s}"
        for s in (normalize(text) for text in test_vis)
    ]

In [None]:
from tqdm import tqdm

# Greedy decoding over the test prompts, one sentence at a time.
model.eval()
preds = []
for src_text in tqdm(test_src_texts):
    # A single sequence needs no padding; padding it to max_length would make a
    # decoder-only model continue from pad tokens rather than from the prompt.
    inputs = tokenizer(
        src_text,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    ).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_LENGTH,
            do_sample=False
        )
    # generate() returns prompt + continuation for causal LMs; keep only the
    # newly generated tokens so the prompt is not written out as the translation.
    pred = tokenizer.decode(generated_ids[0][prompt_len:], skip_special_tokens=True)
    # One prediction per output line: fold any generated line breaks into spaces
    # so the file stays aligned with the references.
    preds.append(" ".join(pred.splitlines()).strip())

with open(f'predictions_{SRC}.txt', 'w', encoding='utf-8') as f:
    for pred in preds:
        f.write(pred + '\n')