# Load Dataset

In [1]:
import kagglehub

path = kagglehub.dataset_download("alaakhaled/conll003-englishversion")
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\AIJimmy\.cache\kagglehub\datasets\alaakhaled\conll003-englishversion\versions\1


In [2]:
# Function to Load CoNLL Files
def load_conll_file(filepath):
    sentences, labels = [], []
    sentence, label = [], []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
                continue
            parts = line.strip().split()
            if len(parts) < 2:
                continue
            token, ner_tag = parts[0], parts[-1]
            if token == '-DOCSTART-':
                continue
            sentence.append(token)
            label.append(ner_tag)
    return sentences, labels

In [3]:
train_sentences, train_labels = load_conll_file(f'{path}/train.txt')
valid_sentences, valid_labels = load_conll_file(f'{path}/valid.txt')
test_sentences, test_labels = load_conll_file(f'{path}/test.txt')

In [4]:
import spacy
from spacy.util import minibatch, compounding
from spacy.training.example import Example
nlp_sm = spacy.load("en_core_web_sm")
nlp_md = spacy.load("en_core_web_md")

# Preprocess

In [5]:
# Prepare Data for Training
def to_spacy_format(sentences, labels):
    data = []
    for words, tags in zip(sentences, labels):
        entities = []
        start = 0
        for word, tag in zip(words, tags):
            end = start + len(word)
            if tag != 'O':
                label = tag.split('-')[-1]
                entities.append((start, end, label))
            start = end + 1
        text = " ".join(words)
        data.append((text, {"entities": entities}))
    return data

train_data = to_spacy_format(train_sentences, train_labels)
val_data = to_spacy_format(valid_sentences, valid_labels)
test_data = to_spacy_format(test_sentences, test_labels)

In [6]:
train_data

[('EU rejects German call to boycott British lamb .',
  {'entities': [(0, 2, 'ORG'), (11, 17, 'MISC'), (34, 41, 'MISC')]}),
 ('Peter Blackburn', {'entities': [(0, 5, 'PER'), (6, 15, 'PER')]}),
 ('BRUSSELS 1996-08-22', {'entities': [(0, 8, 'LOC')]}),
 ('The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep .',
  {'entities': [(4, 12, 'ORG'),
    (13, 23, 'ORG'),
    (59, 65, 'MISC'),
    (94, 101, 'MISC')]}),
 ("Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .",
  {'entities': [(0, 7, 'LOC'),
    (33, 41, 'ORG'),
    (42, 47, 'ORG'),
    (72, 78, 'PER'),
    (79, 88, 'PER'),
    (164, 171, 'LOC')]}),
 ('" We do n\'t support any such recommendation because we do n\'t see any grounds for it , " th

# model training

In [7]:
import random
def train_ner_model(nlp , train_data, val_data, n_iter):
    if nlp == "sm":
        nlp = nlp_sm
    elif nlp == "md":
        nlp = nlp_md
        
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)
    else:
        ner = nlp.get_pipe('ner')

    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            if ent[2] not in ner.labels:
                ner.add_label(ent[2])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itr in range(n_iter):
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
                for batch in batches:
                    examples = []
                    for text, annotations in batch:
                        doc = nlp.make_doc(text)
                        example = Example.from_dict(doc, annotations)
                        examples.append(example)
                    nlp.update(examples, drop=0.5, losses=losses , sgd=optimizer)
                print(f"Iteration {itr + 1}, Losses: {losses}")
        correct, total = 0, 0
        for text, annotations in val_data:
            doc = nlp(text)
            true_ents = set([(ent[0], ent[1], ent[2]) for ent in annotations["entities"]])
            pred_ents = set([(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents])
            total += len(true_ents)
            correct += len(true_ents.intersection(pred_ents))
        val_acc = correct / total if total else 0
        print(f"\nValidation Accuracy: {val_acc:.4f}")
        nlp.to_disk(nlp.meta["name"] + '_custom_ner_model')
        return nlp

In [9]:
trained_nlp_sm = train_ner_model("sm", train_data, val_data, n_iter=10)

Iteration 1, Losses: {'ner': np.float32(23838.941)}
Iteration 2, Losses: {'ner': np.float32(12532.51)}
Iteration 3, Losses: {'ner': np.float32(9252.801)}
Iteration 4, Losses: {'ner': np.float32(7646.863)}
Iteration 5, Losses: {'ner': np.float32(6523.1274)}
Iteration 6, Losses: {'ner': np.float32(5936.908)}
Iteration 7, Losses: {'ner': np.float32(5297.29)}
Iteration 8, Losses: {'ner': np.float32(4795.0044)}
Iteration 9, Losses: {'ner': np.float32(4591.543)}
Iteration 10, Losses: {'ner': np.float32(4216.637)}

Validation Accuracy: 0.8643


In [10]:
trained_nlp_md = train_ner_model("md", train_data, val_data, n_iter=10)

Iteration 1, Losses: {'ner': np.float32(20153.81)}
Iteration 2, Losses: {'ner': np.float32(11275.471)}
Iteration 3, Losses: {'ner': np.float32(8347.302)}
Iteration 4, Losses: {'ner': np.float32(6931.89)}
Iteration 5, Losses: {'ner': np.float32(6107.333)}
Iteration 6, Losses: {'ner': np.float32(5530.217)}
Iteration 7, Losses: {'ner': np.float32(4978.127)}
Iteration 8, Losses: {'ner': np.float32(4630.1855)}
Iteration 9, Losses: {'ner': np.float32(4425.78)}
Iteration 10, Losses: {'ner': np.float32(4047.749)}

Validation Accuracy: 0.8864


# Evaluation

In [12]:
md_model = spacy.load('core_web_md_custom_ner_model')
sm_model = spacy.load('core_web_sm_custom_ner_model')

def evaluate_accuracy(nlp, test_data):
    correct, total = 0, 0
    for text, annotations in test_data:
        doc = nlp(text)
        true_ents = set([(ent[0], ent[1], ent[2]) for ent in annotations["entities"]])
        pred_ents = set([(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents])
        total += len(true_ents)
        correct += len(true_ents.intersection(pred_ents))
    return correct / total if total else 0

md_test_acc = evaluate_accuracy(md_model, test_data)
sm_test_acc = evaluate_accuracy(sm_model, test_data)
print(f"\nMD Test Accuracy: {md_test_acc:.4f}")
print(f"SM Test Accuracy: {sm_test_acc:.4f}")


MD Test Accuracy: 0.8481
SM Test Accuracy: 0.8063


# Visualization

In [15]:
from spacy import displacy

# Test on a few examples from the test set
test_samples = test_data[:5]  # First 5 test examples

print("=" * 80)
print("ENTITY EXTRACTION VISUALIZATION - SM Model")
print("=" * 80)

for text, annotations in test_samples:
    doc_sm = nlp_sm(text)
    print(f"\nText: {text}\n")
    displacy.render(doc_sm, style="ent", manual=False, page=True, minify=True)
    print("\n" + "-" * 80)

print("\n" + "=" * 80)
print("ENTITY EXTRACTION VISUALIZATION - MD Model")
print("=" * 80)

for text, annotations in test_samples:
    doc_md = nlp_md(text)
    print(f"\nText: {text}\n")
    displacy.render(doc_md, style="ent", manual=False, page=True, minify=True)
    print("\n" + "-" * 80)

ENTITY EXTRACTION VISUALIZATION - SM Model

Text: SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .




--------------------------------------------------------------------------------

Text: Nadim Ladki




--------------------------------------------------------------------------------

Text: AL-AIN , United Arab Emirates 1996-12-06




--------------------------------------------------------------------------------

Text: Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday .




--------------------------------------------------------------------------------

Text: But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan .




--------------------------------------------------------------------------------

ENTITY EXTRACTION VISUALIZATION - MD Model

Text: SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .




--------------------------------------------------------------------------------

Text: Nadim Ladki




--------------------------------------------------------------------------------

Text: AL-AIN , United Arab Emirates 1996-12-06




--------------------------------------------------------------------------------

Text: Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday .




--------------------------------------------------------------------------------

Text: But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan .




--------------------------------------------------------------------------------


# Notebook Summary: Named Entity Recognition (NER) from News Articles

## Objective
Train and evaluate custom spaCy NER models (small and medium) on the CoNLL-2003 style English dataset and visualize extracted entities.

## Dataset
- Source: `alaakhaled/conll003-englishversion` (downloaded via `kagglehub`).
- Files: `train.txt`, `valid.txt`, `test.txt`.
- Format: Token per line with NER tag; sentences separated by blank lines; `-DOCSTART-` ignored.

## Data Loading
- `load_conll_file()` parses tokens and tags into sentence lists.
- Output: `train_sentences`, `valid_sentences`, `test_sentences` and corresponding label lists.

## Preprocessing
- `to_spacy_format()` converts sentences + BIO tags to spaCy training tuples: `(text, {"entities": [(start_char, end_char, label), ...]})`.
- Simple char offset calculation; joins tokens with spaces.

## Models
- Base pipelines loaded: `en_core_web_sm`, `en_core_web_md`.
- Adds/initializes `ner` pipe if missing.
- Dynamically registers entity labels from training data.

## Training
- Function: `train_ner_model(model_key, train_data, val_data, n_iter)`.
- Disables other pipes during updates.
- Uses `minibatch()` with compounding sizes.
- Performs `n_iter` iterations; applies dropout `0.5`.
- Saves trained model to disk: `<base_model_name>_custom_ner_model`.

## Validation
- After training: computes a simple accuracy = correctly matched (start, end, label) / total gold entities on validation set.

## Evaluation (Test Set)
- Loads saved custom models.
- Applies same exact-match metric to test data.
- Prints MD and SM test accuracies.

## Visualization
- Uses `displacy.render()` to display first 5 test samples for both models.
- Inline entity highlighting with labels.

## Limitations
- Offset calculation assumes single space join; may misalign if original tokenization differs.
- Uses plain accuracy (no precision/recall/F1).
- BIO tags collapsed; no handling of multi-token entity boundary consistency.
- No error handling for missing models or dependency conflicts.

## Possible Improvements
- Replace accuracy with precision/recall/F1 (e.g., per label).
- Filter overlapping / duplicate spans before training.
- Use spaCy `DocBin` for faster serialization.
- Add transformer-based model (`en_core_web_trf`) if environment supports.
- Improve entity offset logic by tracking cumulative lengths while inserting spaces.
- Parameterize dropout, batch sizes, iterations.

## Usage Flow
1. Download dataset.
2. Load and parse files.
3. Convert to spaCy format.
4. Train small and medium models.
5. Validate during training.
6. Save models.
7. Evaluate on test set.
8. Visualize sample outputs with displaCy.

## Key Functions
- `load_conll_file(filepath)`
- `to_spacy_format(sentences, labels)`
- `train_ner_model(nlp_key, train_data, val_data, n_iter)`
- `evaluate_accuracy(nlp, test_data)`
