In [16]:
import os
import random
import subprocess
import sys
from pathlib import Path

import spacy
from sklearn.metrics import classification_report
from spacy.tokens import DocBin
from spacy.training import Example

In [17]:
# === CONFIG ===
# Directory that load_conll_data() scans for ".txt" files with IOB annotations.
DATA_DIR = "AnchorNER_all"
# Fraction of the shuffled sentences used for training; the rest is held out.
TRAIN_RATIO = 0.8
# Directory passed to `spacy train --output`; the evaluated model is loaded
# from its "model-best" sub-directory.
MODEL_OUTPUT = "output_ner_model"

In [18]:
# === Load all IOB-formatted files ===
def load_conll_data(directory):
    """Read every ``.txt`` file in ``directory`` as two-column IOB data.

    Each non-blank line is expected to hold exactly two whitespace-separated
    columns: the token and its IOB label. A blank line ends the current
    sentence. Lines with any other number of columns are skipped.

    Files are read in sorted name order so the returned sentence order (and
    therefore any seeded shuffle/split applied by the caller) is reproducible
    across machines; ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list of ``(words, labels)`` tuples, one per sentence, where both
        elements are lists of strings of equal length.
    """
    data = []
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".txt"):
            continue
        # "utf-8-sig" decodes plain UTF-8 unchanged but strips a leading BOM,
        # which would otherwise be glued to the first token of the file.
        with open(os.path.join(directory, file_name), encoding="utf-8-sig") as f:
            words, labels = [], []
            for line in f:
                parts = line.split()
                if not parts:
                    # Blank line: sentence boundary.
                    if words:
                        data.append((words, labels))
                        words, labels = [], []
                elif len(parts) == 2:
                    words.append(parts[0])
                    labels.append(parts[1])
            # Flush the last sentence when the file has no trailing blank line.
            if words:
                data.append((words, labels))
    return data


In [None]:

# === Convert to spaCy format ===
def _iob_to_token_spans(labels):
    """Decode a list of IOB tags into ``(start, end, label)`` token spans.

    ``end`` is exclusive. An entity starts at a ``B-X`` tag and continues over
    the directly following ``I-X`` tags. Any other tag (``O``, a new ``B-``,
    or an ``I-`` with a different label) closes the open entity. An ``I-`` tag
    that does not continue an open entity of the same label is ignored.
    """
    spans = []
    open_start, open_label = None, None
    for index, tag in enumerate(labels):
        if tag.startswith("I-") and open_label == tag[2:]:
            # Continuation of the entity that is currently open.
            continue
        if open_start is not None:
            spans.append((open_start, index, open_label))
            open_start, open_label = None, None
        if tag.startswith("B-"):
            open_start, open_label = index, tag[2:]
    if open_start is not None:
        spans.append((open_start, len(labels), open_label))
    return spans


def convert_to_spacy_format(data, nlp):
    """Convert ``(words, labels)`` sentences into a spaCy ``DocBin``.

    Each sentence becomes a pre-tokenized ``Doc`` (one token per word) whose
    entities are built directly from token indices, so no character-offset
    arithmetic is needed. Entities only span contiguous ``B-``/``I-`` runs; an
    ``O`` token between two tags of the same label ends the entity instead of
    being absorbed into it.

    Args:
        data: Iterable of ``(words, labels)`` pairs with IOB labels.
        nlp: spaCy pipeline; only ``nlp.vocab`` is used.

    Returns:
        A ``DocBin`` with one annotated ``Doc`` per sentence.
    """
    db = DocBin()
    for words, labels in data:
        doc = spacy.tokens.Doc(nlp.vocab, words=words)
        # Surplus labels (if any) are ignored so span ends stay inside the doc.
        token_spans = _iob_to_token_spans(labels[:len(words)])
        doc.ents = [
            spacy.tokens.Span(doc, tok_start, tok_end, label=ent_label)
            for tok_start, tok_end, ent_label in token_spans
        ]
        db.add(doc)
    return db

In [None]:

# === Evaluation helper ===
def evaluate(nlp, data):
    """Print a token-level classification report for ``nlp`` on ``data``.

    The gold tokenization is reused: each sentence is turned into a
    pre-tokenized ``Doc`` and passed through the pipeline components, so every
    predicted tag lines up with exactly one gold tag. This avoids both
    re-tokenizing the joined text and looking tokens up by their text, which
    mislabels repeated words and fails when the tokenizer splits differently.

    Args:
        nlp: Trained spaCy pipeline containing an entity recognizer.
        data: Iterable of ``(words, labels)`` pairs with gold IOB labels.

    Returns:
        None. The report is printed.
    """
    y_true, y_pred = [], []
    for words, labels in data:
        doc = spacy.tokens.Doc(nlp.vocab, words=words)
        # Run the components directly; calling nlp on a string would invoke
        # the tokenizer and discard the gold token boundaries.
        for _, component in nlp.pipeline:
            doc = component(doc)

        pred_labels = [
            f"{token.ent_iob_}-{token.ent_type_}" if token.ent_type_ else "O"
            for token in doc
        ]

        y_true.extend(labels)
        y_pred.extend(pred_labels)

    if not y_true:
        # classification_report cannot be computed on empty inputs.
        print("\nNo evaluation data; skipping classification report.")
        return

    print("\nClassification Report:\n")
    print(classification_report(y_true, y_pred))

In [None]:
# === MAIN WORKFLOW ===
if __name__ == "__main__":
    random.seed(42)

    # Artefacts of the training step, kept together so paths stay consistent.
    train_path = "train.spacy"
    dev_path = "dev.spacy"
    config_path = "config.cfg"
    max_steps = 1000  # lower this for faster experiments

    print("Loading data...")
    raw_data = load_conll_data(DATA_DIR)
    random.shuffle(raw_data)
    split = int(len(raw_data) * TRAIN_RATIO)
    train_data, eval_data = raw_data[:split], raw_data[split:]
    if not train_data or not eval_data:
        raise ValueError(
            f"Not enough sentences in {DATA_DIR!r} to build a train/eval split "
            f"(found {len(raw_data)})."
        )

    # === Create blank spaCy model ===
    print("📦 Creating blank English model...")
    nlp = spacy.blank("en")
    ner = nlp.add_pipe("ner")

    # Collect labels for init (each distinct entity type is registered once)
    entity_types = {
        label[2:] for _, labels in train_data for label in labels if label != "O"
    }
    for entity_type in sorted(entity_types):
        ner.add_label(entity_type)

    # === Convert and save training / dev data ===
    print("🧼 Converting to spaCy format...")
    convert_to_spacy_format(train_data, nlp).to_disk(train_path)
    # The held-out split is the dev corpus, so "model-best" is no longer
    # selected by its score on the training data itself.
    convert_to_spacy_format(eval_data, nlp).to_disk(dev_path)

    # === Training ===
    print("🚀 Training model...")
    # `spacy train` needs a config file on disk; generate a default NER config
    # when none exists. sys.executable keeps the subprocess in the same
    # environment as this kernel.
    if not os.path.exists(config_path):
        subprocess.run(
            [
                sys.executable, "-m", "spacy", "init", "config", config_path,
                "--lang", "en",
                "--pipeline", "ner",
            ],
            check=True,
        )

    # check=True stops the cell on a failed run instead of falling through to
    # spacy.load() on a missing or stale model directory.
    subprocess.run(
        [
            sys.executable, "-m", "spacy", "train",
            config_path,
            "--output", MODEL_OUTPUT,
            "--paths.train", train_path,
            "--paths.dev", dev_path,
            "--training.max_steps", str(max_steps),
        ],
        check=True,
    )

    # === Load trained model and evaluate ===
    print("📊 Evaluating trained model...")
    trained_nlp = spacy.load(os.path.join(MODEL_OUTPUT, "model-best"))
    evaluate(trained_nlp, eval_data)

📥 Loading data...
