### Import Libraries

In [None]:
import json
import spacy
import random
from spacy.training.example import Example
from spacy.util import minibatch

### Convert JSON to spaCy Format

In [None]:
def convert_to_spacy_format(json_path):
    """Read an annotated JSON file and return spaCy-style training tuples.

    The file must contain a list of records, each with a ``"content"``
    string and an ``"entities"`` list whose items carry ``"start"``,
    ``"end"`` and ``"label"`` keys.

    Args:
        json_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})``
        tuples, one per record, in file order.

    Raises:
        KeyError: If a record or entity lacks one of the expected keys.
    """
    with open(json_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    return [
        (
            record["content"],
            {
                "entities": [
                    (span["start"], span["end"], span["label"])
                    for span in record["entities"]
                ]
            },
        )
        for record in records
    ]
# Relative path of the annotated NER dataset (JSON).
filepath = "../../data/all_intents_ner.json"
# Load the annotations as (text, {"entities": [...]}) training tuples.
spacy_data = convert_to_spacy_format(filepath)

### Remove Overlapping Entities

In [None]:
def remove_overlapping_entities(entities):
    """Return the entity spans with duplicates and overlaps removed.

    Two spans collide when they cover the same ``(start, end)`` range or
    share at least one character (``end`` is treated as exclusive, so
    spans that merely touch are kept). Among colliding spans the longest
    one wins; ties go to the span that appears first in ``entities``.

    Args:
        entities: Iterable of ``(start, end, label)`` triples.

    Returns:
        A list of ``(start, end, label)`` tuples that do not overlap each
        other, in the same relative order as the input.
    """
    spans = [(start, end, label) for start, end, label in entities]

    # Rank candidates longest-first; the original index breaks ties so the
    # earliest annotation wins among equally long spans.
    ranked = sorted(
        enumerate(spans),
        key=lambda pair: (pair[1][0] - pair[1][1], pair[0]),
    )

    kept = {}
    for index, (start, end, label) in ranked:
        clashes = any(
            (start, end) == (kept_start, kept_end)
            or (start < kept_end and kept_start < end)
            for kept_start, kept_end, _ in kept.values()
        )
        if not clashes:
            kept[index] = (start, end, label)

    # Restore input order so callers see the survivors where they were.
    return [kept[index] for index in sorted(kept)]

# Run the entity clean-up over every training sample, keeping the text as is.
cleaned_data = [
    (sample_text, {"entities": remove_overlapping_entities(sample_annots["entities"])})
    for sample_text, sample_annots in spacy_data
]

In [None]:
# Display the training tuples as loaded from the JSON file.
spacy_data

In [None]:
# Display the training tuples after the entity clean-up step.
cleaned_data

### Model Training

In [None]:
# Start from a blank English pipeline and add an NER component to it.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Register every entity label that occurs in the training annotations.
for _, annotations in cleaned_data:
    for ent in annotations.get("entities", []):
        ner.add_label(ent[2])

# Initialize the pipeline and keep the optimizer it returns.
# `nlp.initialize()` is the spaCy v3 replacement for the deprecated
# `nlp.begin_training()`.
optimizer = nlp.initialize()
for itn in range(30):  # number of iterations
    # Reshuffle in place so each pass sees the samples in a new order.
    random.shuffle(cleaned_data)
    losses = {}
    batches = minibatch(cleaned_data, size=2)
    for batch in batches:
        examples = []
        for text, annots in batch:
            # Pair the tokenized text with its gold-standard annotations.
            examples.append(Example.from_dict(nlp.make_doc(text), annots))
        nlp.update(examples, sgd=optimizer, losses=losses)
    print("Losses", losses)

### Save Model

In [None]:
# Write the trained pipeline to the "ner_model" path so it can be reloaded later.
nlp.to_disk("ner_model")

### Load Model and Run a Sample Prediction

In [None]:
# Reload the pipeline from the "ner_model" path (rebinds the global `nlp`).
nlp = spacy.load("ner_model")

# Run the pipeline on a sample sentence and print each predicted entity
# as "<entity text> <label>".
doc = nlp("training topic: machine learning. number of participants: Three.")
for ent in doc.ents:
    print(ent.text, ent.label_)
