In [1]:
# pip install spacy

In [None]:
import json

input_file = "annotation.json"
output_file = "spacy_train.json"


def _extract_entities(annotation):
    """Return the ``[start, end, label]`` triples found in one annotation.

    Only entries of ``annotation["result"]`` whose ``type`` is ``"labels"``
    are considered. For each of them the character offsets come from
    ``value["start"]`` / ``value["end"]`` and the label is the first element
    of ``value["labels"]``. Entries with an empty or missing label list are
    skipped instead of raising ``IndexError``.
    """
    entities = []
    for res in annotation.get("result", []):
        if res.get("type") != "labels":
            continue
        value = res["value"]
        labels = value.get("labels") or []
        if not labels:
            # A span without a label cannot be used as an entity.
            continue
        entities.append([value["start"], value["end"], labels[0]])
    return entities


# The input schema (data.text / annotations[].result[].value) looks like a
# Label Studio JSON export -- NOTE(review): confirm against the exporter used.
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

spacy_data = []
skipped_tasks = 0

for item in data:
    annotations = item.get("annotations") or []
    if not annotations:
        # No annotation at all: emitting the text with zero entities would
        # present it as a negative example, so leave it out instead.
        skipped_tasks += 1
        continue

    # Only the first annotation of each task is used.
    spacy_data.append({
        "text": item["data"]["text"],
        "entities": _extract_entities(annotations[0])
    })

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(spacy_data, f, ensure_ascii=False, indent=2)

if skipped_tasks:
    print("Skipped tasks without annotations:", skipped_tasks)
print("✅ Done! Saved:", output_file)

✅ Done! Saved: spacy_train.json


In [3]:
import json

# Read back the records written by the conversion step.
with open("spacy_train.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Turn every record into a (text, {"entities": [...]}) pair, with each span
# stored as a (start, end, label) tuple and spans labelled "O" left out.
spacy_data = [
    (
        record["text"],
        {
            "entities": [
                (span_start, span_end, span_label)
                for span_start, span_end, span_label in record["entities"]
                if span_label != "O"
            ]
        },
    )
    for record in data
]

print(spacy_data)

[('Tôi đi học ở trường Đại học A. Trường A nằm ở thành phố B, nơi có con sông C đi qua.', {'entities': [(20, 29, 'ORG'), (31, 39, 'ORG'), (46, 57, 'LOC'), (66, 76, 'MISC'), (0, 3, 'PER'), (13, 19, 'MISC')]})]


In [None]:
import random

import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding

SEED = 0        # fixed so the shuffle order and weight initialisation repeat
N_EPOCHS = 20
DROPOUT = 0.2

random.seed(SEED)
spacy.util.fix_random_seed(SEED)

nlp = spacy.blank("vi")

# Add the NER pipe (or reuse it if the pipeline already has one)
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Register every label that occurs in the training data
for _, annotations in spacy_data:
    for _, _, label in annotations["entities"]:
        ner.add_label(label)

# Build the Example objects once; they are reshuffled and reused every epoch.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in spacy_data
]

# Start training: initialize() replaces the deprecated begin_training() and
# returns the optimizer; the callback supplies sample data for initialisation.
optimizer = nlp.initialize(lambda: train_examples)

# Training loop
for epoch in range(N_EPOCHS):
    losses = {}
    # Shuffle so batches differ between epochs (matters once there is more
    # than one example); spacy_data itself is left untouched.
    random.shuffle(train_examples)
    batches = minibatch(train_examples, size=compounding(1.0, 4.0, 1.5))
    for batch in batches:
        # One update per batch, so the batch size schedule actually applies.
        nlp.update(batch, sgd=optimizer, drop=DROPOUT, losses=losses)
    print(f"Epoch {epoch+1}, Losses: {losses}")

# Save the model
nlp.to_disk("vi_ner_model")



Epoch 1, Losses: {'ner': np.float32(19.000002)}
Epoch 2, Losses: {'ner': np.float32(18.731369)}
Epoch 3, Losses: {'ner': np.float32(18.410828)}
Epoch 4, Losses: {'ner': np.float32(18.001324)}
Epoch 5, Losses: {'ner': np.float32(17.555805)}
Epoch 6, Losses: {'ner': np.float32(16.620216)}
Epoch 7, Losses: {'ner': np.float32(16.065145)}
Epoch 8, Losses: {'ner': np.float32(15.074325)}
Epoch 9, Losses: {'ner': np.float32(12.83108)}
Epoch 10, Losses: {'ner': np.float32(11.291933)}
Epoch 11, Losses: {'ner': np.float32(9.327143)}
Epoch 12, Losses: {'ner': np.float32(7.36459)}
Epoch 13, Losses: {'ner': np.float32(6.5358105)}
Epoch 14, Losses: {'ner': np.float32(6.3410835)}
Epoch 15, Losses: {'ner': np.float32(5.9351707)}
Epoch 16, Losses: {'ner': np.float32(6.494541)}
Epoch 17, Losses: {'ner': np.float32(5.461256)}
Epoch 18, Losses: {'ner': np.float32(4.588262)}
Epoch 19, Losses: {'ner': np.float32(4.465509)}
Epoch 20, Losses: {'ner': np.float32(12.295354)}


In [7]:
# Smoke test: run the trained pipeline on a short sentence and list the
# predicted entities, first as a tuple and then one per line with its label.
doc = nlp("Tôi đi học ở trường Đại học A.")
predicted_spans = doc.ents
print(predicted_spans)
for span in predicted_spans:
    print(span.text, span.label_)

(Tôi, trường)
Tôi PER
trường MISC


In [10]:
# Smoke test on a second sentence: show the predicted entity tuple, then
# each entity's text together with its label.
doc = nlp("Gần Trường A, có con sông C uốn lượn quanh.")
predicted_spans = doc.ents
print(predicted_spans)
for span in predicted_spans:
    print(span.text, span.label_)

(Trường A, có con sông C uốn lượn quanh.,)
Trường A, có con sông C uốn lượn quanh. ORG
