In [None]:
# Install the third-party packages for this notebook: spaCy, transformers and
# spacy-transformers.
# NOTE(review): the config generated in the next cell reports "Transformer: None",
# so the two transformer packages look unused by this pipeline — confirm before
# dropping them.
!pip install spacy
!pip install transformers
!pip install spacy-transformers


Collecting spacy-transformers
  Downloading spacy_transformers-1.3.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py

In [None]:
# Generate a spaCy training config (config.cfg) for an English pipeline with a
# single `ner` component, using the "efficiency" optimisation preset.
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

ℹ Generated config template specific for your use case
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
✔ Auto-filled config with all values
✔ Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
import json
import spacy
from spacy.tokens import DocBin, Span

def load_data(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its contents."""
    with open(file_path, encoding="utf-8") as handle:
        return json.load(handle)

def convert_to_spacy(data, nlp):
    """Convert a JSON-style NER dataset into a spaCy ``DocBin``.

    Each entry of ``data`` is a dict with a ``"text"`` string and an ``"entity"``
    list whose items carry ``"start"``/``"end"`` character offsets and a
    ``"label"``. Entities that cannot be used are reported with ``print`` and
    skipped, so a single bad annotation never aborts the whole conversion.

    An entity is skipped when:
      * its offsets do not describe a non-empty slice inside the text
        (negative start, end past the text, or start >= end);
      * ``doc.char_span`` returns ``None`` for its offsets;
      * it shares a token with an entity accepted earlier in the same entry
        (the first one wins), since ``doc.ents`` cannot hold overlapping spans.

    Args:
        data: Iterable of dataset entries as described above.
        nlp: Object providing ``make_doc(text)``; only that method is called.

    Returns:
        A ``DocBin`` holding one ``Doc`` per entry, with ``doc.ents`` set to the
        entities that survived validation.
    """
    doc_bin = DocBin()

    for entry in data:
        text = entry["text"]
        doc = nlp.make_doc(text)

        entities = []
        claimed_tokens = set()  # token indices already covered by an accepted entity
        for ent in entry["entity"]:
            start_char = ent["start"]
            end_char = ent["end"]
            label = ent["label"]

            # Valid offsets satisfy 0 <= start < end <= len(text). The explicit lower
            # bound matters: a negative start would otherwise slip through.
            if not (0 <= start_char < end_char <= len(text)):
                print(f"Skipping invalid entity: '{label}' in text: '{text}' (start={start_char}, end={end_char})")
                continue

            char_span = doc.char_span(start_char, end_char, label=label)
            if char_span is None:
                entity_text = text[start_char:end_char]
                print(f"Skipping entity '{entity_text}' in text '{text}' due to tokenization issues.")
                continue

            # Assigning overlapping spans to doc.ents raises, so drop any span that
            # reuses a token of an entity we already accepted.
            span_tokens = range(char_span.start, char_span.end)
            if not claimed_tokens.isdisjoint(span_tokens):
                entity_text = text[start_char:end_char]
                print(f"Skipping entity '{entity_text}' in text '{text}' because it overlaps an earlier entity.")
                continue

            claimed_tokens.update(span_tokens)
            entities.append(char_span)

        doc.ents = entities
        doc_bin.add(doc)

    return doc_bin

# Building the DocBins only needs tokenization (convert_to_spacy calls just
# nlp.make_doc), so a blank English pipeline is sufficient. This also avoids
# depending on the en_core_web_sm package, which none of the cells visible in
# this notebook installs.
nlp = spacy.blank("en")

# Single place to change where the project data lives on Drive.
PROJECT_DIR = "/content/drive/MyDrive/End to End NER project"

for split in ["train", "test", "validation"]:
    data = load_data(f"{PROJECT_DIR}/NER_{split}_data.json")
    doc_bin = convert_to_spacy(data, nlp)
    # Local copy; train.spacy and validation.spacy are passed to `spacy train` below.
    doc_bin.to_disk(f"{split}.spacy")
    # Second copy of the same converted data, stored next to the JSON on Drive.
    doc_bin.to_disk(f"{PROJECT_DIR}/NER_{split}_spacy_data.spacy")


# Train the pipeline described by config.cfg on ./train.spacy, using
# ./validation.spacy as the dev set; results are written under ./output.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./validation.spacy


✔ Created output directory: output
ℹ Saving to output directory: output
ℹ Using CPU

✔ Initialized pipeline

ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0         77.21     78.91    1.80    1.11    4.82    0.02
  0     200       2330.46   3402.05   50.18   57.50   44.51    0.50
  0     400       1061.75   2201.30   65.04   67.18   63.03    0.65
  0     600       3355.01   2239.02   68.76   70.86   66.78    0.69
  0     800       1063.72   2200.95   72.34   74.62   70.18    0.72
  0    1000       1617.47   2424.69   74.54   74.96   74.13    0.75
  1    1200       2646.95   2401.54   76.70   77.56   75.87    0.77
  1    1400       1878.79   1883.33   77.85   77.97   77.74    0.78
  1    1600       1371.82   2246.31   79.07   79.92   78.24    0.7

In [None]:
import spacy
import json
from spacy.training import Example
from spacy.scorer import Scorer

# Rebind `nlp` to the pipeline stored in the `model-best` directory under the
# training output folder; the rest of this cell evaluates that pipeline.
nlp = spacy.load("./output/model-best")

def load_data(file_path):
    """Return the parsed contents of the UTF-8 JSON file at ``file_path``."""
    with open(file_path, encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed

test_data = load_data("/content/drive/MyDrive/End to End NER project/NER_test_data.json")

# Build one Example per test entry: the pipeline's prediction paired with a
# reference doc that carries the gold entities.
examples = []
skipped_gold = 0  # gold annotations that could not be placed on the reference doc
for entry in test_data:
    text = entry["text"]
    entities = [(ent["start"], ent["end"], ent["label"]) for ent in entry["entity"]]

    # Reference doc: tokenized only, then annotated with the gold spans.
    gold_doc = nlp.make_doc(text)
    gold_ents = []
    claimed_tokens = set()  # token indices already covered by an accepted gold span

    for start, end, label in entities:
        span = gold_doc.char_span(start, end, label=label)
        if span is None:
            # Offsets could not be turned into a span; count it rather than drop silently.
            skipped_gold += 1
            continue

        # doc.ents cannot hold overlapping spans; keep the first and count the rest.
        span_tokens = range(span.start, span.end)
        if not claimed_tokens.isdisjoint(span_tokens):
            skipped_gold += 1
            continue

        claimed_tokens.update(span_tokens)
        gold_ents.append(span)

    gold_doc.ents = gold_ents
    pred_doc = nlp(text)

    # Example takes (predicted, reference) in that order.
    example = Example(pred_doc, gold_doc)
    examples.append(example)

if skipped_gold:
    # Make incomplete gold data visible: the metrics below ignore these annotations.
    print(f"Warning: {skipped_gold} gold entities were skipped (misaligned or overlapping); "
          "the scores below are computed without them.")

scorer = Scorer()
scores = scorer.score(examples)

print("Precision:", scores["ents_p"])
print("Recall:", scores["ents_r"])
print("F1-score:", scores["ents_f"])
print("Entity Type Breakdown:", scores["ents_per_type"])


In [None]:
# Persist the pipeline currently bound to `nlp` to Drive. When the cells are run
# in order, this is the model loaded from ./output/model-best in the cell above.
nlp.to_disk("/content/drive/MyDrive/End to End NER project/spacy_model1")