In [1]:
!pip install -q transformers peft datasets accelerate torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from transformers import EarlyStoppingCallback

In [3]:
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
MAX_LENGTH = 512
SRC = 'en'

TRAIN_EN_PATH = 'clean_train.en.txt'
TRAIN_VI_PATH = 'clean_train.vi.txt'

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="auto",
    dtype=torch.float16
)

# # Enable gradient checkpointing before applying PEFT
# model.gradient_checkpointing_enable()

# Enable input gradients for the model
if hasattr(model, 'enable_input_require_grads'):
    model.enable_input_require_grads()
else:
    def make_inputs_require_grad(module, input, output):
        output.requires_grad_(True)
    model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


In [5]:
with open(TRAIN_EN_PATH, 'r', encoding='utf-8') as f:
    ens = f.read().splitlines()
with open(TRAIN_VI_PATH, 'r', encoding='utf-8') as f:
    vis = f.read().splitlines()

import random
random.seed(42)
indices = list(range(len(ens)))
random.shuffle(indices)

split = int(len(ens) * 0.1)
valid_indices = indices[:split]
train_indices = indices[split:]

train_ens = [ens[i] for i in train_indices]
train_vis = [vis[i] for i in train_indices]

valid_ens = [ens[i] for i in valid_indices]
valid_vis = [vis[i] for i in valid_indices]

def make_prompt(src_text, src_lang="en"):
    if src_lang == "en":
        prompt = f"""
You are an expert in translating English medical text into Vietnamese.
Translate the text exactly, preserving all medical terms, numbers, units, and formatting.
Return only the Vietnamese translation, without explanations, notes, comments, or suggestions.

Example:
English text: He's so tired.
Vietnamese translation: Anh ấy rất mệt.

Now translate the following:
English text: {src_text}
Vietnamese translation:
""".strip()
    else:
        prompt = f"""
Bạn là một chuyên gia trong việc dịch các đoạn văn y tế từ Vietnamese sang English.
Dịch chính xác đoạn văn sau, giữ nguyên tất cả thuật ngữ, số liệu, đơn vị và định dạng.
Chỉ trả về bản dịch tiếng Anh, không giải thích, ghi chú, nhận xét hay gợi ý nào khác.

Ví dụ:
Tiếng Việt: Anh ấy rất mệt.
Tiếng Anh: He's so tired.

Now translate the following:
Tiếng Việt: {src_text}
Tiếng Anh:
""".strip()
    return prompt

if SRC == "en":
    train_src_texts = [make_prompt(s, "en") for s in train_ens]
    valid_src_texts = [make_prompt(s, "en") for s in valid_ens]
    train_tgt_texts = train_vis
    valid_tgt_texts = valid_vis
else:
    train_src_texts = [make_prompt(s, "vi") for s in train_vis]
    valid_src_texts = [make_prompt(s, "vi") for s in valid_vis]
    train_tgt_texts = train_ens
    valid_tgt_texts = valid_ens

In [6]:
train_data = Dataset.from_dict({"input": train_src_texts, "output": train_tgt_texts})
valid_data = Dataset.from_dict({"input": valid_src_texts, "output": valid_tgt_texts})
def tokenize_fn(batch):
    input_texts = [f"{src} {tgt} {tokenizer.eos_token}" for src, tgt in zip(batch["input"], batch["output"])]
    encodings = tokenizer(
        input_texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH
    )

    labels = []
    for src, tgt, ids in zip(batch["input"], batch["output"], encodings["input_ids"]):
        prompt_ids = tokenizer(src, truncation=True, padding=False)["input_ids"]
        label = [-100] * len(prompt_ids) + ids[len(prompt_ids):]
        label = label[:MAX_LENGTH]
        labels.append(label)
    
    encodings["labels"] = labels
    return encodings

train_ds = train_data.map(tokenize_fn, batched=True, remove_columns=["input","output"])
valid_ds = valid_data.map(tokenize_fn, batched=True, remove_columns=["input","output"])

Map:   0%|          | 0/303838 [00:00<?, ? examples/s]

Map:   0%|          | 0/33759 [00:00<?, ? examples/s]

In [7]:
training_args = TrainingArguments(
    output_dir=f"./qwen-lora-ft-{SRC}",

    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=1,
    
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=2000,
    max_grad_norm=1.0,

    num_train_epochs=5,

    logging_steps=1000,
    log_level="info",

    save_strategy="steps",
    save_steps=1000,
    save_total_limit=3,

    eval_strategy="steps",
    eval_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,

    fp16=True,
    dataloader_drop_last=False,

    seed=42,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()
model.save_pretrained(f"./qwen-lora-ft-{SRC}")

The model is already on multiple devices. Skipping the move to device specified in `args`.
Using auto half precision backend
***** Running training *****
  Num examples = 303,838
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 23,740
  Number of trainable parameters = 8,798,208


Step,Training Loss,Validation Loss
1000,0.394,0.143869
2000,0.1303,0.121327
3000,0.115,0.110658
4000,0.1081,0.105074
5000,0.1027,0.101785
6000,0.0965,0.09973
7000,0.0956,0.097721
8000,0.0939,0.095868
9000,0.0925,0.094688
10000,0.0877,0.094377



***** Running Evaluation *****
  Num examples = 33759
  Batch size = 64
Saving model checkpoint to ./qwen-lora-ft-en/checkpoint-1000
loading configuration file config.json from cache at /home/admin/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/7ae557604adf67be50417f59c2c2f167def9a775/config.json
Model config Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
   

In [8]:
TEST_EN_PATH = 'public_test.en.txt'
TEST_VI_PATH = 'public_test.vi.txt'

In [9]:
import re
def normalize(text: str) -> str:
    text = text.strip()
    replace_map = {
        '–':'-', '—':'-', '−':'-', '﹣':'-',
        '':'≥', '':'≤',
        '∕':'/', '／':'/',
        '＝':'=',
        '＋':'+',
        '％':'%',
        '±':'±',
        '‘':"'", '’':"'",
        '“':'"', '”':'"',
    }
    for k, v in replace_map.items():
        text = text.replace(k, v)
    text = re.sub(r'\s+%', '%', text)
    text = re.sub(r'\s*/\s*', '/', text)
    text = re.sub(r'\s*-\s*', '-', text)
    for symbol in ['<', '>', '=', '±', '+', '≥', '≤']:
        text = re.sub(r'\s*{}\s*'.format(re.escape(symbol)), f' {symbol} ', text)

    def merge_numbers_only(match):
        s = match.group(0)
        return s.replace(' ', '')
    text = re.sub(r'\b\d[\d\s/.,%]*\d\b', merge_numbers_only, text)

    text = re.sub(r'([.,:?!])(?=[A-Za-zÀ-ÖØ-öø-ÿ])', r'\1 ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'(\d),\s*(\d)', r'\1,\2', text)
    text = re.sub(r'(\d)([.,])\s*(\d)', r'\1\2\3', text)
    return text

with open(TEST_EN_PATH, 'r', encoding='utf-8') as f:
    test_ens = f.read().splitlines()
with open(TEST_VI_PATH, 'r', encoding='utf-8') as f:
    test_vis = f.read().splitlines()

if SRC == 'en':
    test_src_texts = test_ens
    test_tgt_texts = test_vis
    test_src_texts = [normalize(text) for text in test_src_texts]
    test_src_texts = [make_prompt(s, "en") for s in test_ens]

else:
    test_src_texts = test_vis
    test_tgt_texts = test_ens
    test_src_texts = [normalize(text) for text in test_src_texts]
    test_src_texts = [make_prompt(s, "vi") for s in test_vis]

In [11]:
from tqdm import tqdm
model.eval()
preds = []

for src_text in tqdm(test_src_texts):
    inputs = tokenizer(
        src_text,
        truncation=True,
        return_tensors='pt'
    ).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_LENGTH,
            do_sample=False
        )

    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    if SRC == "en":
        if "Vietnamese translation:" in pred:
            pred = pred.split("Vietnamese translation:")[-1].strip()
    else:
        if "Tiếng Anh:" in pred_text:
            pred = pred.split("Tiếng Anh:")[-1].strip()

    preds.append(pred)

with open(f'predictions_{SRC}.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(preds))

  0%|          | 0/3000 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k'].
- `temperature`: `do_sample` is set to `False`. However, `temperature` is set to `0.7` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
- `top_p`: `do_sample` is set to `False`. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.
- `top_k`: `do_sample` is set to `False`. However, `top_k` is set to `20` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.
If you're using a pretrained model, note that some of these attributes may be set through the model's `generation_config.json` file.
100%|██████████| 3000/3000 [58:52<00:00,  1.18s/it]  


In [12]:
for i in range(5):
    print(f'{test_src_texts[i]} {preds[i]}')
    print('-' * 50)

You are an expert in translating English medical text into Vietnamese.
Translate the text exactly, preserving all medical terms, numbers, units, and formatting.
Return only the Vietnamese translation, without explanations, notes, comments, or suggestions.

Example:
English text: He's so tired.
Vietnamese translation: Anh ấy rất mệt.

Now translate the following:
English text: Knowledge, practices in public health service utilization among health insurance card’s holders and influencing factors in Vientiane, Lao
Vietnamese translation: Kiến thức, thực hành sử dụng dịch vụ y tế công cộng của người có thẻ bảo hiểm y tế và các yếu tố liên quan tại Viêng Chăn, Lào
--------------------------------------------------
You are an expert in translating English medical text into Vietnamese.
Translate the text exactly, preserving all medical terms, numbers, units, and formatting.
Return only the Vietnamese translation, without explanations, notes, comments, or suggestions.

Example:
English text: H

In [13]:
!pip install -q sacrebleu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [18]:
from sacrebleu.metrics import BLEU

bleu = BLEU(
    lowercase=False,
    effective_order=True,
    smooth_method='exp'
)
score = bleu.corpus_score(preds, [test_tgt_texts])

print("BLEU score:", score.score)

BLEU score: 25.035836616354402
