### Import Libraries

In [1]:
import json
import spacy
import random

from typing import List, Tuple
from spacy.util import minibatch
from spacy.training.example import Example

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DistilBertTokenizerFast, DataCollatorForTokenClassification, DistilBertForTokenClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


### Data Preparation

In [2]:
def convert_to_spacy_format(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    training_data = []
    for item in data:
        text = item["content"]
        entities = [(ent["start"], ent["end"], ent["label"]) for ent in item["entities"]]
        training_data.append((text, {"entities": entities}))
    
    return training_data

In [3]:
def convert_to_bio(data):
    dataset = []
    for entry in data:
        text = entry["content"]
        labels = ["O"] * len(text)
        for entity in entry["entities"]:
            start, end, label = entity["start"], entity["end"], entity["label"]
            labels[start] = f"B-{label}"
            for i in range(start + 1, end):
                labels[i] = f"I-{label}"

        # Tokenize by whitespace and align labels (this assumes no tokenization mismatch)
        tokens, tags = [], []
        word = ''
        idx = 0
        while idx < len(text):
            if text[idx].isspace():
                if word:
                    tokens.append(word)
                    tags.append(labels[idx - len(word)])
                    word = ''
                idx += 1
                continue
            word += text[idx]
            idx += 1
            # If it's end of word
            if idx == len(text) or text[idx].isspace():
                tokens.append(word)
                tags.append(labels[idx - len(word)])
                word = ''
        dataset.append((tokens, tags))
    return dataset

In [4]:
filepath = "../../data/all_intents_ner.json"
spacy_data = convert_to_spacy_format(filepath)

In [5]:
with open(filepath, "r", encoding="utf-8") as f:
     data = json.load(f)
     dataset = convert_to_bio(data)

In [6]:
# First build a label list and mapping
all_labels = sorted(set(label for _, labels in dataset for label in labels))
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = {v: k for k, v in label2id.items()}

# Then convert
huggingface_format = []
for tokens, labels in dataset:
    ner_tags = [label2id[label] for label in labels]
    huggingface_format.append({
        "tokens": tokens,
        "ner_tags": ner_tags
    })

In [7]:
train_data, temp_data = train_test_split(huggingface_format, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

In [8]:
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data),
    "test": Dataset.from_list(test_data)
})

In [9]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_and_align_labels(examples):
    # Tokenize text (split into words for NER)
    tokenized_inputs = tokenizer(
        examples["tokens"],  # Replace "tokens" with your dataset's text column
        truncation=True,
        padding="max_length",
        max_length=128,  # Adjust as needed
        is_split_into_words=True,  # Required for token classification
    )

    # Align labels with tokens (adjust for your dataset)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):  # Replace "ner_tags" with your label column
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore special tokens
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)  # Handle subword tokens
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [14]:
# Tokenize and align labels for both datasets
train_data = dataset['train'].map(tokenize_and_align_labels, batched=True)
val_data = dataset['validation'].map(tokenize_and_align_labels, batched=True)
test_data = dataset['test'].map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/385 [00:00<?, ? examples/s]

Map: 100%|██████████| 385/385 [00:00<00:00, 802.58 examples/s]
Map: 100%|██████████| 82/82 [00:00<00:00, 836.85 examples/s]
Map: 100%|██████████| 83/83 [00:00<00:00, 678.52 examples/s]


In [23]:
train_data = dataset['train'].filter(lambda x: len(x["tokens"]) > 0)
val_data = dataset['validation'].filter(lambda x: len(x["tokens"]) > 0)
test_data = dataset['test'].filter(lambda x: len(x["tokens"]) > 0)

Filter: 100%|██████████| 385/385 [00:00<00:00, 3893.54 examples/s]
Filter: 100%|██████████| 82/82 [00:00<00:00, 3645.09 examples/s]
Filter: 100%|██████████| 83/83 [00:00<00:00, 3335.70 examples/s]


In [25]:
# Tokenize and remove unused columns
train_data = dataset['train'].map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "ner_tags"]  # Remove original columns
)
val_data = dataset['validation'].map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "ner_tags"]
)

test_data = dataset['test'].map(
     tokenize_and_align_labels,
     batched=True,
     remove_columns=["tokens", "ner_tags"]
)

Map:   0%|          | 0/385 [00:00<?, ? examples/s]

Map: 100%|██████████| 385/385 [00:00<00:00, 2182.95 examples/s]
Map: 100%|██████████| 82/82 [00:00<00:00, 1880.31 examples/s]
Map: 100%|██████████| 83/83 [00:00<00:00, 1911.82 examples/s]


In [26]:
print("Train dataset features:", train_data.features)
# Output should show: ['input_ids', 'attention_mask', 'labels']

Train dataset features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


### Remove Overlap Entities

In [None]:
def remove_overlapping_entities(entities):
    seen = set()
    result = []
    for start, end, label in entities:
        key = (start, end)
        if key not in seen:
            seen.add(key)
            result.append((start, end, label))
    return result

# Apply to your data
cleaned_data = []
for text, annots in spacy_data:
    cleaned_ents = remove_overlapping_entities(annots["entities"])
    cleaned_data.append((text, {"entities": cleaned_ents}))

### Model Training

In [None]:
nlp = spacy.blank("en")  # create blank English model
ner = nlp.add_pipe("ner")

# Add labels
for _, annotations in cleaned_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Train the model
nlp.begin_training()
for itn in range(30):  # number of iterations
    random.shuffle(cleaned_data)
    losses = {}
    batches = minibatch(cleaned_data, size=2)
    for batch in batches:
        examples = []
        for text, annots in batch:
            examples.append(Example.from_dict(nlp.make_doc(text), annots))
        nlp.update(examples, losses=losses)
    print("Losses", losses)

In [16]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

with open(filepath) as f:
    raw_data = json.load(f)

# Define label list
labels = set()
for item in raw_data:
    for ent in item["entities"]:
        labels.add(ent["label"])
labels = sorted(list(labels))
label2id = {label: idx for idx, label in enumerate(labels)}
label2id["O"] = len(label2id)
id2label = {v: k for k, v in label2id.items()}

In [27]:
model = DistilBertForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label2id), id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir="./distilbert-ner",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    remove_unused_columns=False,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
def filter_invalid_labels(example):
    return all(tag < len(label2id) for tag in example["ner_tags"])

# Filter out invalid samples
train_data = dataset['train'].filter(filter_invalid_labels)
val_data = dataset['validation'].filter(filter_invalid_labels)
test_data = dataset['test'].filter(filter_invalid_labels)

Filter: 100%|██████████| 385/385 [00:00<00:00, 4888.76 examples/s]
Filter: 100%|██████████| 82/82 [00:00<00:00, 5151.94 examples/s]
Filter: 100%|██████████| 83/83 [00:00<00:00, 4268.41 examples/s]


In [18]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [None]:
trainer = Trainer(
    model=model,
    args=training_args, 
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided []

### Save Model

In [None]:
nlp.to_disk("ner_model")

### Load Model

In [None]:
nlp = spacy.load("ner_model")

doc = nlp("training topic: machine learning. number of participants: Three.")
for ent in doc.ents:
    print(ent.text, ent.label_)
