In [7]:
# pip install spacy
# python -m spacy download en_core_web_sm
import spacy
import random
from spacy.training.example import Example
from spacy.util import minibatch

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Sample dataset (replace this with your extended dataset)
dataset = [
    "Mrs. May visited Leeds General Infirmary hospital which is located near Burley Road.",
    "She was not feeling well and had an appointment with Dr. Ray Johnson."
    # ... (add more sentences)
]

# Assuming 80% for training and 20% for testing
split = int(0.8 * len(dataset))
train_data = dataset[:split]
test_data = dataset[split:]

# Prepare training data
training_data = []
for sentence in train_data:
    doc = nlp(sentence)
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    example = Example.from_dict({"text": sentence , "entities": entities})
    training_data.append(example)
# Define the NER model
ner = nlp.get_pipe("ner")

# Add labels for medical entities
labels = ["Disease", "Medication", "Procedure", "Symptom", "Location", "Doctor", "Date", "Pharmacy"]

for label in labels:
    ner.add_label(label)

# Disable other pipelines during training to speed up training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

with nlp.disable_pipes(*other_pipes):
    # Training the NER model
    for epoch in range(10):  # You may need to adjust the number of epochs
        random.shuffle(training_data)
        losses = {}

        for batch in minibatch(training_data, size=8):
            examples, _ = zip(*batch)
            nlp.update(examples, drop=0.5, losses=losses)

        print(losses)
# Evaluate the model
test_loss = 0.0

for sentence in test_data:
    doc = nlp(sentence)
    gold_entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    gold_example = Example.from_dict({"text": sentence, "entities": gold_entities})
    loss = nlp.evaluate([gold_example])
    test_loss += loss

avg_test_loss = test_loss / len(test_data)
print(f"Average test loss: {avg_test_loss}")


TypeError: ignored