In [1]:
import json
import os
import numpy as np
import random
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig, Trainer, TrainingArguments
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import torch
import nltk
from nltk.corpus import wordnet
from itertools import chain

In [2]:
# The project root directory is one level above the notebooks folder
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
dataset_path = os.path.join(project_root, "dataset", "ner_data.json")

# Uploading data
with open(dataset_path, "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"✅ Uploaded {len(data)} notes.")

✅ Uploaded 10000 notes.


In [3]:
model_name = "distilbert-base-cased"

config = AutoConfig.from_pretrained(
    model_name, 
    num_labels=2,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3
)


tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)

texts = [item["sentence"] for item in data]
entities = [[(ent["start"], ent["end"], ent["label"]) for ent in item["entities"]] for item in data]

tokenized_inputs = tokenizer(texts, padding=True, truncation=True, return_offsets_mapping=True)

labels = []
for i, offset_mapping in enumerate(tokenized_inputs.offset_mapping):
    label = np.zeros(len(offset_mapping), dtype=int)
    for start, end, _ in entities[i]:
        for idx, (token_start, token_end) in enumerate(offset_mapping):
            if token_start >= start and token_end <= end:
                label[idx] = 1
    labels.append(label)

tokenized_inputs["labels"] = labels

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Rearranging words in a sentence
def shuffle_words(sentence):
    words = sentence.split()
    random.shuffle(words)
    return ' '.join(words)
    
def synonym_replacement(sentence, num_synonyms=1):
    words = sentence.split()
    new_sentence = words.copy()

    for _ in range(num_synonyms):
        word_idx = random.randint(0, len(words)-1)
        synonyms = wordnet.synsets(words[word_idx])
        lemmas = set(chain.from_iterable([word.lemma_names() for word in synonyms]))
        
        if lemmas:
            new_word = random.choice(list(lemmas))
            new_sentence[word_idx] = new_word.replace("_", " ")

    return " ".join(new_sentence)

In [5]:
example_sentence = "There is a cat in the picture."
print("Original:", example_sentence)
print("Shuffled:", shuffle_words(example_sentence))
print("Synonyms:", synonym_replacement(example_sentence))

Original: There is a cat in the picture.
Shuffled: the in a cat picture. There is
Synonyms: There is a cat IN the picture.


In [6]:
# Adding augmentation to the training data
augmented_texts = []
augmented_labels = []

for text, label in zip(texts, labels):
    # Adding original data
    augmented_texts.append(text)
    augmented_labels.append(label)
    
    # Let's add data with word shuffling
    shuffled_text = shuffle_words(text)
    augmented_texts.append(shuffled_text)
    augmented_labels.append(label)

tokenized_augmented = tokenizer(augmented_texts, padding=True, truncation=True, return_tensors="pt")

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_data, labels):
        self.input_ids = tokenized_data["input_ids"]
        self.attention_mask = tokenized_data["attention_mask"]
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Taking 30% of data
_, texts_30, _, labels_30 = train_test_split(
    augmented_texts, augmented_labels, test_size=0.3, random_state=42
)

# Dividing this 30% into training and test samples
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts_30, labels_30, test_size=0.2, random_state=42
)

# Tokenize separately for training and test data
train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=50)
test_encodings = tokenizer(test_texts, padding=True, truncation=True, max_length=50)

# Function that aligns the length of labels to tokens
def align_labels(labels, encodings):
    aligned_labels = []
    for label, input_ids in zip(labels, encodings['input_ids']):
        padded_label = np.zeros(len(input_ids), dtype=int)
        length = min(len(label), len(input_ids))
        padded_label[:length] = label[:length]
        aligned_labels.append(padded_label)
    return aligned_labels

# Aligning the labels
train_labels_aligned = align_labels(train_labels, train_encodings)
test_labels_aligned = align_labels(test_labels, test_encodings)

# Creating dataset
train_dataset = NERDataset(train_encodings, train_labels_aligned)
test_dataset = NERDataset(test_encodings, test_labels_aligned)

In [7]:
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=2)

project_root = os.path.abspath(os.getcwd())
model_dir = os.path.join(project_root, "NER", "model")
log_dir = os.path.join(model_dir, "logs")

# Creating directories
os.makedirs(r"C:\NER\model\logs", exist_ok=True)

training_args = TrainingArguments(
    output_dir=r"C:\NER\model",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir=r"C:\NER\model\logs",
    logging_steps=50,
    save_steps=500,
    evaluation_strategy="no",
    save_total_limit=2,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.1756
100,0.0934
150,0.0783
200,0.0605
250,0.0497
300,0.0436
350,0.0317
400,0.0288
450,0.0242
500,0.0259


TrainOutput(global_step=900, training_loss=0.03901118947399987, metrics={'train_runtime': 503.6698, 'train_samples_per_second': 28.59, 'train_steps_per_second': 1.787, 'total_flos': 73492368768000.0, 'train_loss': 0.03901118947399987, 'epoch': 3.0})

In [8]:
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

true_labels_flat = np.concatenate([item["labels"].numpy().flatten() for item in test_dataset])
pred_labels = preds.flatten()

# Checking that the number of labels matches
min_len = min(len(true_labels_flat), len(pred_labels))
true_labels_flat = true_labels_flat[:min_len]
pred_labels = pred_labels[:min_len]

# Calculating metrics
precision = precision_score(true_labels_flat, pred_labels, average="binary")
recall = recall_score(true_labels_flat, pred_labels, average="binary")
f1 = f1_score(true_labels_flat, pred_labels, average="binary")

print(f"📊 Evaluation Results:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

📊 Evaluation Results:
Precision: 0.9839
Recall: 0.9899
F1 Score: 0.9869


In [9]:
trainer.save_model("./NER/model")
tokenizer.save_pretrained("./NER/model")
print("✅ Model and tokenizer successfully saved.")

✅ Model and tokenizer successfully saved.


In [10]:
# Uploading model for NER
ner_pipeline = pipeline(
    "token-classification",
    model="./NER/model",
    tokenizer="./NER/model",
    aggregation_strategy="simple"
)

# New examples for testing
examples = [
    "Is there a horse in this picture?",
    "There might be a cow here.",
    "Do you see a penguin in the image?",
    "I think there is a giraffe in the photo.",
    "Can you spot a dolphin in this image?",
    "This picture definitely contains an elephant.",
    "Maybe there's a lion hidden here.",
    "Is there a squirrel in the scene?",
    "Does this image feature a kangaroo?",
    "Can we see a bear in this photo?",
    "There is a cat and a dog playing together in the picture.",
    "Can you spot both a lion and a tiger here?",
    "I think there's a fox chasing a rabbit in this image.",
    "This is a beautiful landscape.",
    "I see some buildings and trees.",
    "This image shows a car on a road."
]

for example in examples:
    results = ner_pipeline(example)
    print(f"Sentence: {example}\nResults: {results}\n")

Device set to use cpu


Sentence: Is there a horse in this picture?
Results: [{'entity_group': 'LABEL_0', 'score': 0.9999943, 'word': 'Is there a', 'start': 0, 'end': 10}, {'entity_group': 'LABEL_1', 'score': 0.9999182, 'word': 'horse', 'start': 11, 'end': 16}, {'entity_group': 'LABEL_0', 'score': 0.9999894, 'word': 'in this picture?', 'start': 17, 'end': 33}]

Sentence: There might be a cow here.
Results: [{'entity_group': 'LABEL_0', 'score': 0.9999956, 'word': 'There might be a', 'start': 0, 'end': 16}, {'entity_group': 'LABEL_1', 'score': 0.99994636, 'word': 'cow', 'start': 17, 'end': 20}, {'entity_group': 'LABEL_0', 'score': 0.9999954, 'word': 'here.', 'start': 21, 'end': 26}]

Sentence: Do you see a penguin in the image?
Results: [{'entity_group': 'LABEL_0', 'score': 0.9999951, 'word': 'Do you see a', 'start': 0, 'end': 12}, {'entity_group': 'LABEL_1', 'score': 0.99997604, 'word': 'penguin', 'start': 13, 'end': 20}, {'entity_group': 'LABEL_0', 'score': 0.99999577, 'word': 'in the image?', 'start': 21, 'e