**Vietnamese Question Answering**

In [None]:
!pip install -i https://pypi.org/simple --default-timeout=100 transformers datasets evaluate accelerate gradio

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import os, torch, numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForQuestionAnswering,
    TrainingArguments, Trainer, default_data_collator
)
import evaluate

print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

Torch: 2.9.0.dev20250902+cu128
CUDA available: True


In [2]:
dataset = load_dataset("taidng/UIT-ViQuAD2.0")
print(dataset)
print(dataset["train"][0])

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/5.20M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/735k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/28454 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3814 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7301 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'uit_id', 'title', 'context', 'question', 'answers', 'is_impossible', 'plausible_answers'],
        num_rows: 28454
    })
    validation: Dataset({
        features: ['id', 'uit_id', 'title', 'context', 'question', 'answers', 'is_impossible', 'plausible_answers'],
        num_rows: 3814
    })
    test: Dataset({
        features: ['id', 'uit_id', 'title', 'context', 'question', 'answers', 'is_impossible', 'plausible_answers'],
        num_rows: 7301
    })
})
{'id': '0001-0001-0001', 'uit_id': 'uit_000001', 'title': 'Phạm Văn Đồng', 'context': 'Phạm Văn Đồng (1 tháng 3 năm 1906 – 29 tháng 4 năm 2000) là Thủ tướng đầu tiên của nước Cộng hòa Xã hội chủ nghĩa Việt Nam từ năm 1976 (từ năm 1981 gọi là Chủ tịch Hội đồng Bộ trưởng) cho đến khi nghỉ hưu năm 1987. Trước đó ông từng giữ chức vụ Thủ tướng Chính phủ Việt Nam Dân chủ Cộng hòa từ năm 1955 đến năm 1976. Ông là vị Thủ tướng Việt Nam tại vị lâu nhất (1955–1987). Ông là học t

In [3]:
model_name = "xlm-roberta-large"  # dùng bản large (có fast tokenizer)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Thiết lập độ dài
max_length = 384        # chiều dài tối đa (context + question)
doc_stride  = 128       # trượt cửa sổ trên context dài
n_best_size = 20        # số câu trả lời tốt nhất để xét
max_answer_length = 30  # giới hạn độ dài câu trả lời sinh ra

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Chuẩn bị features cho TRAIN
def prepare_train_features(examples):
    # Tokenize question + context; chỉ cắt ở phía context ("only_second")
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,   # FAST tokenizer mới có
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples["offset_mapping"]

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # index của example gốc
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        # sequence_ids: 0=question, 1=context, None=special
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Nếu không có câu trả lời (is_impossible=True)
        if len(answers["answer_start"]) == 0:
            cls_index = tokenized_examples["input_ids"][i].index(tokenizer.cls_token_id)
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            # mask offset ở phần question
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == 1 else None) for k, o in enumerate(offsets)
            ]
            continue

        # Có câu trả lời
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # Tìm vùng context trong tokens
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1
        token_end_index = len(tokenized_examples["input_ids"][i]) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # Nếu answer nằm ngoài đoạn context đã cắt → trả về CLS
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            cls_index = tokenized_examples["input_ids"][i].index(tokenizer.cls_token_id)
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            # Tiến tới token bắt đầu
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            start_positions.append(token_start_index - 1)
            # Lùi tới token kết thúc
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            end_positions.append(token_end_index + 1)

        # Chỉ giữ offset ở vùng context (giúp post-process gọn hơn)
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == 1 else None) for k, o in enumerate(offsets)
        ]

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples


# Chuẩn bị features cho VALIDATION/TEST (giữ offset & example_id để post-process)
def prepare_validation_features(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Chỉ giữ offset ở vùng context
        sequence_ids = tokenized_examples.sequence_ids(i)
        offsets = tokenized_examples["offset_mapping"][i]
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == 1 else None) for k, o in enumerate(offsets)
        ]

    return tokenized_examples


In [5]:
# Map
tokenized_train = dataset["train"].map(
    prepare_train_features,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

tokenized_valid = dataset["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)

print(tokenized_train)
print(tokenized_valid)

Map:   0%|          | 0/28454 [00:00<?, ? examples/s]

Map:   0%|          | 0/3814 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions'],
    num_rows: 30399
})
Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 3937
})


In [6]:
metric = evaluate.load("squad_v2")  # vì có is_impossible

# gom predictions (start/end logits) thành text
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=30):
    import collections
    all_start_logits, all_end_logits = raw_predictions
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, f in enumerate(features):
        features_per_example[example_id_to_index[f["example_id"]]].append(i)

    predictions = {}

    for example_index, example in enumerate(examples):
        feature_indices = features_per_example[example_index]
        min_null_score = None
        valid_answers = []

        context = example["context"]

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits   = all_end_logits[feature_index]
            offsets      = features[feature_index]["offset_mapping"]

            # điểm null (chọn CLS) cho SQuAD v2
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # top n start/end
            start_indexes = np.argsort(start_logits)[-1:-n_best_size-1:-1].tolist()
            end_indexes   = np.argsort(end_logits)[-1:-n_best_size-1:-1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if start_index >= len(offsets) or end_index >= len(offsets):
                        continue
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue

                    start_char = offsets[start_index][0]
                    end_char   = offsets[end_index][1]
                    text = context[start_char:end_char]
                    score = start_logits[start_index] + end_logits[end_index]
                    valid_answers.append({"score": float(score), "text": text})

        if len(valid_answers) > 0:
            best_non_null = max(valid_answers, key=lambda x: x["score"])
            # chọn giữa null vs non-null
            if min_null_score is not None and min_null_score > best_non_null["score"]:
                predictions[example["id"]] = ""
            else:
                predictions[example["id"]] = best_non_null["text"]
        else:
            predictions[example["id"]] = ""

    return predictions


def compute_metrics(eval_preds):
    preds = postprocess_qa_predictions(
        examples=dataset["validation"],
        features=tokenized_valid,
        raw_predictions=eval_preds,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
    )
    refs = [{"id": ex["id"], "answers": ex["answers"]} for ex in dataset["validation"]]

    return metric.compute(
        predictions=[{"id": k, "prediction_text": v} for k, v in preds.items()],
        references=refs,
    )


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [8]:
from transformers import TrainingArguments, Trainer, default_data_collator

bsz = 16

args = TrainingArguments(
    output_dir="xlmr-large-viquad",
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=bsz,
    per_device_eval_batch_size=bsz,
    gradient_accumulation_steps=1,
    weight_decay=0.01,
    fp16=True,
    bf16=False,
    report_to="none",
    dataloader_num_workers=4,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avo

Step,Training Loss,Validation Loss
500,1.5738,No log
1000,1.4011,No log
1500,1.1987,No log
2000,0.8775,No log
2500,0.8873,No log
3000,0.9021,No log
3500,0.8926,No log
4000,0.5768,No log
4500,0.6062,No log
5000,0.6072,No log


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=9500, training_loss=0.7336859492251747, metrics={'train_runtime': 1420.5294, 'train_samples_per_second': 106.999, 'train_steps_per_second': 6.688, 'total_flos': 1.0586911445137152e+17, 'train_loss': 0.7336859492251747, 'epoch': 5.0})

In [9]:
metrics = trainer.evaluate()
print(metrics)

trainer.save_model("xlmr-large-viquad-final")
tokenizer.save_pretrained("xlmr-large-viquad-final")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'eval_runtime': 7.7147, 'eval_samples_per_second': 510.322, 'eval_steps_per_second': 32.017, 'epoch': 5.0}


('xlmr-large-viquad-final/tokenizer_config.json',
 'xlmr-large-viquad-final/special_tokens_map.json',
 'xlmr-large-viquad-final/tokenizer.json')

In [26]:
model_name = "xlmr-large-viquad-final"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer
)

context = """
Khải là bạn cùng phòng của Nam và Thiên ở căn trọ nằm tại 81 Phó Đức Chính thuộc quận Bình Thạnh. Căn trọ này bị bẩn do Khải cứ ngồi dựa vào như con heo cọ lưng vào tường, tạo ra 1 vết ố vàng trông rất hãi hùng. Nam và Thiên cảm thấy rất sợ hãi vết ố đó.
"""
question = "Khải đã làm gì?"

# test
result = qa_pipeline(question=question, context=context)
print("Answer:", result['answer'])


Device set to use cuda:0


Answer: cứ ngồi dựa vào như con heo cọ lưng vào tường,
