In [1]:
import os
import random
import spacy
import torch
from spacy.training import Example
from spacy.tokens import DocBin
from sklearn.metrics import classification_report
from pathlib import Path
import subprocess
import srsly

In [2]:
# === CONFIG ===
DATA_DIR = "AnchorNER_all"
TRAIN_RATIO = 0.8
MODEL_OUTPUT = "output_ner_model"
TRAIN_SPACY_PATH = "train.spacy"
CONFIG_FILE = "config.cfg"

In [3]:
# === Load all IOB-formatted files (limit to 5 files) ===
def load_conll_data(directory):
    data = []
    files_processed = 0
    for file_name in os.listdir(directory):
        if file_name.endswith(".txt"):
            with open(os.path.join(directory, file_name), encoding="utf8") as f:
                words, labels = [], []
                for line in f:
                    line = line.strip()
                    if not line:
                        if words:
                            data.append((words, labels))
                            words, labels = [], []
                    else:
                        parts = line.split()
                        if len(parts) == 2:
                            words.append(parts[0])
                            labels.append(parts[1])
                if words:
                    data.append((words, labels))
            files_processed += 1
            if files_processed >= 1:
                break
    return data

In [4]:
# === Convert to spaCy format ===
def convert_to_spacy_format(data, nlp):
    db = DocBin()
    for words, labels in data:
        doc = spacy.tokens.Doc(nlp.vocab, words=words)
        ents = []
        start = 0
        for word, label in zip(words, labels):
            end = start + len(word)
            if label.startswith("B-"):
                ent_start = start
                ent_label = label[2:]
            elif label.startswith("I-") and ents and ents[-1][2] == label[2:]:
                ent_end = end
                ents[-1] = (ents[-1][0], ent_end, ents[-1][2])
                start = end + 1
                continue
            else:
                ent_start = None
                ent_label = None

            if ent_start is not None:
                ents.append((ent_start, end, ent_label))
            start = end + 1

        spans = [doc.char_span(start, end, label=label) for start, end, label in ents if doc.char_span(start, end, label=label)]
        doc.ents = spans
        db.add(doc)
    return db

In [5]:
# === Evaluation helper ===
def evaluate(nlp, data):
    y_true, y_pred = [], []
    for words, labels in data:
        doc = nlp(" ".join(words))
        pred_labels = ["O"] * len(words)
        for ent in doc.ents:
            ent_tokens = ent.text.split()
            for i, token in enumerate(ent_tokens):
                try:
                    idx = words.index(token)
                    pred_labels[idx] = "B-" + ent.label_ if i == 0 else "I-" + ent.label_
                except ValueError:
                    # Token not found due to mismatch or encoding issue
                    continue

        y_true.extend(labels)
        y_pred.extend(pred_labels)

    print("\nClassification Report:\n")
    print(classification_report(y_true, y_pred))

In [6]:
# === MAIN WORKFLOW ===
if __name__ == "__main__":
    random.seed(42)
    print("Loading data...")
    raw_data = load_conll_data(DATA_DIR)
    random.shuffle(raw_data)
    split = int(len(raw_data) * TRAIN_RATIO)
    train_data, eval_data = raw_data[:split], raw_data[split:]

    print("Creating blank English model...")
    nlp = spacy.blank("en")
    ner = nlp.add_pipe("ner")

    # Add all unique labels
    for _, labels in train_data:
        for label in labels:
            if label != "O":
                ner.add_label(label[2:])

    print("Initializing pipeline...")
    nlp.initialize()

    print("Converting data to spaCy binary format...")
    train_bin = convert_to_spacy_format(train_data, nlp)
    train_bin.to_disk(TRAIN_SPACY_PATH)

    print("Debugging training data...")
    subprocess.run([
        "python", "-m", "spacy", "debug", "data",
        CONFIG_FILE,
        "--paths.train", TRAIN_SPACY_PATH,
        "--paths.dev", TRAIN_SPACY_PATH
    ], check=True)

    print("Training the model...")
    result = subprocess.run([
        "python", "-m", "spacy", "train",
        CONFIG_FILE,
        "--output", MODEL_OUTPUT,
        "--paths.train", TRAIN_SPACY_PATH,
        "--paths.dev", TRAIN_SPACY_PATH
    ], capture_output=True, text=True)

    if result.returncode != 0:
        print("❌ Training failed!")
        print("STDOUT:\n", result.stdout)
        print("STDERR:\n", result.stderr)
        exit(1)
    else:
        print("✅ Training succeeded!")

    # Test the saved model
    print("Loading trained model...")
    trained_nlp = spacy.load(os.path.join(MODEL_OUTPUT, "model-best"))
    evaluate(trained_nlp, eval_data)

📄 Loading data...
📦 Creating blank English model...
🧠 Initializing pipeline (important!)...
💾 Converting data to spaCy binary format...
🧪 Debugging training data...
🚀 Training the model...
✅ Training succeeded!
🧪 Loading trained model...

Classification Report:

              precision    recall  f1-score   support

       B-LOC       0.80      0.77      0.78      2157
      B-MISC       0.68      0.60      0.63      2292
       B-ORG       0.71      0.58      0.64       816
       B-PER       0.81      0.67      0.73       942
       I-LOC       0.75      0.69      0.72       885
      I-MISC       0.68      0.53      0.60      1756
       I-ORG       0.69      0.60      0.64       819
       I-PER       0.81      0.70      0.75       919
           O       0.97      0.99      0.98     67336

    accuracy                           0.94     77922
   macro avg       0.76      0.68      0.72     77922
weighted avg       0.94      0.94      0.94     77922



In [7]:
#save model
nlp.to_disk(MODEL_OUTPUT)