In [4]:
import os
# Bật cờ để vô hiệu hóa việc tải Metal Plugin
os.environ['TF_ENABLE_METAL_PLUGINS'] = '0'

import tensorflow as tf
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [5]:
import pandas as pd

In [6]:
df = pd.read_json('ner.json')
df.head()

Unnamed: 0,id,tokens,tags
0,1,"[Dầu, tẩy, trang, Cocoon, hoa, hồng, cho, da, ...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, B-BRAND, I-P..."
1,2,"[Tinh, chất, Some, By, Mi, AHA, BHA, giảm, mụn]","[B-PRODUCT, I-PRODUCT, B-BRAND, I-BRAND, I-BRA..."
2,3,"[Serum, Balance, Niacinamide, 10%, cho, da, dầ...","[B-PRODUCT, B-BRAND, B-INGREDIENT, I-INGREDIEN..."
3,4,"[Kem, dưỡng, ẩm, CeraVe, chứa, Ceramide, và, H...","[B-PRODUCT, I-PRODUCT, I-PRODUCT, B-BRAND, O, ..."
4,5,"[Toner, Hada, Labo, cấp, ẩm, cho, da, khô, nhạ...","[B-PRODUCT, B-BRAND, I-BRAND, B-BENEFIT, I-BEN..."


In [7]:
sentences = list(df['tokens'])
tags_lists = list(df['tags'])
unique_tags = sorted(df['tags'].explode().unique().tolist())

In [8]:
label2id = {label: i for i, label in enumerate(unique_tags)}
id2label = {i: label for label, i in label2id.items()}
num_labels = len(unique_tags)

In [9]:
from torch.utils.data import Dataset
MODEL_NAME = "bert-base-multilingual-cased" 

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [10]:
import torch


class NERDataset(Dataset):
    def __init__(self, sentences, tags_lists, tokenizer, label2id, max_len=64):
        self.sentences = sentences
        self.tags_lists = tags_lists
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = 64
        
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, index):
        words = self.sentences[index]
        tags = self.tags_lists[index]
        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            return_offsets_mapping=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_len
        )
        
        word_ids = encoding.word_ids()
        encoding.pop("offset_mapping")
        
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                if word_idx != previous_word_idx:
                    label_ids.append(self.label2id[tags[word_idx]])
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx

        encoding["labels"] = label_ids

        # convert sang tensor
        return {k: torch.tensor(v) for k, v in encoding.items()}

In [11]:
from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(
    sentences, tags_lists, test_size=0.2, random_state=42
)

In [12]:
train_dataset = NERDataset(X_train, y_train, tokenizer, label2id)
val_dataset   = NERDataset(X_val,   y_val,   tokenizer, label2id)

In [13]:
from transformers import DataCollatorForTokenClassification


data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [14]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_tags = []
    pred_tags = []

    for pred_seq, lab_seq in zip(predictions, labels):
        cur_true = []
        cur_pred = []
        for p_i, l_i in zip(pred_seq, lab_seq):
            if l_i == -100:
                continue
            cur_true.append(id2label[l_i])
            cur_pred.append(id2label[p_i])
        if len(cur_true) > 0:
            true_tags.append(cur_true)
            pred_tags.append(cur_pred)

    return {
        "precision": precision_score(true_tags, pred_tags),
        "recall":    recall_score(true_tags, pred_tags),
        "f1":        f1_score(true_tags, pred_tags),
    }

In [15]:
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from transformers import TrainingArguments


training_args = TrainingArguments(
    output_dir="./ner_product_demo",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./ner_logs",
    logging_steps=10,
    report_to="all"
)


In [19]:
from transformers import Trainer


trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [20]:
trainer.train()




Step,Training Loss
10,2.0723
20,1.6182
30,1.0697
40,0.8532
50,0.8227
60,0.6055
70,0.49
80,0.4696
90,0.4189




TrainOutput(global_step=90, training_loss=0.9355677392747667, metrics={'train_runtime': 278.373, 'train_samples_per_second': 1.293, 'train_steps_per_second': 0.323, 'total_flos': 11759523425280.0, 'train_loss': 0.9355677392747667, 'epoch': 3.0})

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def ner_predict(text):
    # tạm thời split đơn giản theo khoảng trắng (sau này bạn có thể dùng tokenizer tiếng Việt tốt hơn)
    words = text.split()

    encoding = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=64
    )

    encoding = {k: v.to(device) for k, v in encoding.items()}
    with torch.no_grad():
        outputs = model(**encoding)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)[0].cpu().numpy()

    word_ids = encoding["input_ids"].new_tensor(tokenizer(text.split(), is_split_into_words=True).word_ids())
    # Cách khác: dùng encoding.word_ids(batch_index=0) nếu dùng tokenizer fast
    word_ids = encoding.word_ids(batch_index=0)

    entities = []
    current_tokens = []
    current_label = None

    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue

        # chỉ lấy subword đầu tiên
        if idx > 0 and word_idx == word_ids[idx-1]:
            continue

        label_id = predictions[idx]
        label_name = id2label[label_id]
        word = words[word_idx]

        if label_name.startswith("B-"):
            # kết thúc entity cũ
            if current_tokens:
                entities.append({
                    "text": " ".join(current_tokens),
                    "label": current_label
                })
            current_tokens = [word]
            current_label = label_name[2:]  # bỏ "B-"
        elif label_name.startswith("I-") and current_label == label_name[2:]:
            current_tokens.append(word)
        else:
            if current_tokens:
                entities.append({
                    "text": " ".join(current_tokens),
                    "label": current_label
                })
                current_tokens = []
                current_label = None

    # entity cuối
    if current_tokens:
        entities.append({
            "text": " ".join(current_tokens),
            "label": current_label
        })

    return entities


In [22]:
test_text = "Dầu tẩy trang Cocoon hoa hồng cho da dầu mụn"
ents = ner_predict(test_text)
for e in ents:
    print(e)


TypeError: 'NoneType' object cannot be interpreted as an integer