# HUẤN LUYỆN MÔ HÌNH NER V2 - HUGGINGFACE TRANSFORMERS

## I. Import

In [2]:
import json
import os
import evaluate
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
W0510 23:12:50.518000 15392 torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


## II. Data Preprocess

### 1. Load data

In [2]:
with open("./data/ner_data.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)["dataset"]

### 2. Flatten and convert to token classification format

In [3]:
def preprocess_data(data, tokenizer):
    tokenized_inputs = []
    label_all_tokens = True
    label2id = {"O": 0, "B-Major": 1, "I-Major": 2, "B-Programme": 3, "I-Programme": 4, "B-Group": 5, "I-Group": 6}
    id2label = {v: k for k, v in label2id.items()}

    texts = [item["text"] for item in data]
    annotations = [item["entities"] for item in data]

    for text, entities in zip(texts, annotations):
        tokens = tokenizer(text, truncation=True, is_split_into_words=False)
        word_ids = tokens.word_ids()

        labels = ["O"] * len(tokens.input_ids)
        for start, end, label in entities:
            for idx, word_id in enumerate(word_ids):
                if word_id is None:
                    continue
                token_start = tokens.token_to_chars(idx).start
                token_end = tokens.token_to_chars(idx).end
                if token_start >= start and token_end <= end:
                    prefix = "B-" if token_start == start else "I-"
                    labels[idx] = prefix + label

        tokens["labels"] = [label2id.get(l, 0) for l in labels]
        tokenized_inputs.append(tokens)

    return Dataset.from_dict({
        "input_ids": [x["input_ids"] for x in tokenized_inputs],
        "attention_mask": [x["attention_mask"] for x in tokenized_inputs],
        "labels": [x["labels"] for x in tokenized_inputs],
    }), label2id, id2label

### 3. Initialize tokenizer and model

In [4]:
model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

### 4. Preprocess data

In [5]:
dataset, label2id, id2label = preprocess_data(raw_data, tokenizer)
dataset = dataset.train_test_split(test_size=0.05)

### 5. Load model

In [6]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label2id), id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 6. Metrics

In [7]:
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## III. Training

In [8]:
# Training arguments
args = TrainingArguments(
    "ner-bert-multilingual",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [9]:
# Train
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0012,5.9e-05,0.999823,0.999823,0.999823,0.999989
2,0.0005,8e-06,1.0,1.0,1.0,1.0
3,0.0001,1.6e-05,1.0,1.0,1.0,1.0


TrainOutput(global_step=17814, training_loss=0.001700691110901497, metrics={'train_runtime': 5174.713, 'train_samples_per_second': 55.076, 'train_steps_per_second': 3.443, 'total_flos': 5062257405987936.0, 'train_loss': 0.001700691110901497, 'epoch': 3.0})

In [10]:
# Save final model
trainer.save_model("ner-bert-multilingual")

In [11]:
# Test
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 7.822525731171481e-06, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 15.994, 'eval_samples_per_second': 312.618, 'eval_steps_per_second': 19.57, 'epoch': 3.0}


In [18]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import numpy as np

model = AutoModelForTokenClassification.from_pretrained("ner-bert-multilingual")
tokenizer = AutoTokenizer.from_pretrained("ner-bert-multilingual")
id2label = model.config.id2label

def predict_entities(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    entities = []

    current = None
    for token, pred_id in zip(tokens, predictions):
        label = id2label[pred_id]
        if label.startswith("B-"):
            if current:
                entities.append(current)
            current = {"label": label[2:], "tokens": [token]}
        elif label.startswith("I-") and current:
            current["tokens"].append(token)
        else:
            if current:
                entities.append(current)
                current = None
    if current:
        entities.append(current)

    return entities

# Ví dụ test
text = "Liên kết quốc tế"
print(predict_entities(text))

[{'label': 'Programme', 'tokens': ['Liên', 'kết', 'quốc', 'tế']}]


## IV. Export to ONNX

In [3]:
os.system("pip install optimum[onnxruntime] -q")
from optimum.onnxruntime import ORTModelForTokenClassification
from transformers import AutoConfig

In [2]:
from optimum.exporters.onnx import main_export
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm
W0511 06:57:35.238000 7064 torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [4]:
main_export(
    model_name_or_path="ner-bert-multilingual",    # Đường dẫn mô hình đã fine-tune
    task="token-classification",
    output=Path("onnx/ner_model_opset14"),         # Thư mục xuất
    opset=16,                                      # ✅ bắt buộc dùng >= 14
    device="cpu"                                   # hoặc "cuda" nếu dùng GPU
)

Mô hình đang overfitting hoặc đánh giá sai lệch