# NER Training from Manually Annotated Job Ads

Train a French NER model from a corpus annotated with IOB/BIO tags (general job ads, not only tech).
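- Accepts either CoNLL-like files (`token\tPOS?\tIOB`, one token per line, sentences separated by blank lines; picked up as `data/*.conll` or `data/*.txt`) or CSV files with columns `doc_id, sent_id, token, iob` (optional `pos`; picked up as `data/*training*.csv` or `data/*annotated*.csv`).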
- Builds spaCy docs and trains a small NER head.
- Saves model to `artifacts/ner_custom`.

In [1]:
import os
import re
import json
import random
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict, Counter
import datetime as dt

# SpaCy imports
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy import displacy

print("Imports completed successfully")
print(f"SpaCy version: {spacy.__version__}")

Imports completed successfully
SpaCy version: 3.5.4


In [2]:
# Configuration
# ---------------------------------------------------------------------
# Paths: everything the notebook writes goes under ARTIFACTS_DIR, and the
# annotated corpus is read from DATA_DIR.
ARTIFACTS_DIR = Path('artifacts')
MODEL_OUTPUT_DIR = ARTIFACTS_DIR / 'ner_custom'
DATA_DIR = Path('data')  # Assume training data is here

# Make sure the artifacts folder is present before any cell writes to it.
ARTIFACTS_DIR.mkdir(exist_ok=True)

# Training hyper-parameters (tune these before re-running the notebook)
TRAINING_ITERATIONS = 30
DROPOUT_RATE = 0.2
BATCH_SIZE = 8
LEARNING_RATE = 0.001

print(f"Model will be saved to: {MODEL_OUTPUT_DIR}")
print(f"Training data directory: {DATA_DIR}")

Model will be saved to: artifacts\ner_custom
Training data directory: data


## 1. Data Loading Functions

In [3]:
def load_conll_file(file_path):
    """Load sentences from a tab-separated CoNLL-style file.

    Each non-blank line describes one token as ``token<TAB>IOB`` or
    ``token<TAB>POS<TAB>IOB``; the last column is always read as the IOB tag
    and the second column as the POS tag when at least three columns exist.
    Blank lines separate sentences. Lines with fewer than two tab-separated
    columns are ignored.

    Args:
        file_path: Path of the file to read (UTF-8, with or without a BOM).

    Returns:
        A list of sentences, each a list of ``(token, pos, iob)`` tuples where
        ``pos`` is ``None`` when the line has only two columns.
    """
    sentences = []
    current_sentence = []

    # 'utf-8-sig' drops a leading byte-order mark if one is present (and reads
    # plain UTF-8 unchanged); with plain 'utf-8' the BOM would be glued to the
    # first token of the file.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()

            if not line:  # Empty line = sentence boundary
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
                continue

            parts = line.split('\t')
            if len(parts) < 2:
                # Not a "token <TAB> tag" line: skip it
                continue

            token = parts[0]
            iob = parts[-1]  # Last column is IOB tag
            pos = parts[1] if len(parts) >= 3 else None
            current_sentence.append((token, pos, iob))

    # Add last sentence if file doesn't end with empty line
    if current_sentence:
        sentences.append(current_sentence)

    return sentences

def load_csv_file(file_path):
    """Load sentences from a CSV file (doc_id, sent_id, token, iob, pos?).

    Rows are grouped by ``(doc_id, sent_id)``; each group becomes one sentence
    and keeps the row order of the file.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of sentences, each a list of ``(token, pos, iob)`` tuples.
        ``pos`` is ``None`` when the file has no ``pos`` column or the cell is
        empty.

    Raises:
        ValueError: If one of the required columns is missing.
    """
    # Read the text columns as strings and treat only the empty string as
    # missing. With pandas' default NA parsing, tokens such as "NA", "null" or
    # "nan" would be turned into float NaN and break later string handling.
    df = pd.read_csv(
        file_path,
        dtype={'token': str, 'iob': str, 'pos': str},
        keep_default_na=False,
        na_values=[''],
    )

    required_cols = ['doc_id', 'sent_id', 'token', 'iob']
    if not all(col in df.columns for col in required_cols):
        raise ValueError(f"CSV must contain columns: {required_cols}")

    # A row without a token or without a tag cannot be used for training.
    df = df.dropna(subset=['token', 'iob'])
    has_pos = 'pos' in df.columns

    sentences = []

    # Group by document and sentence
    for _key, group in df.groupby(['doc_id', 'sent_id']):
        tokens = group['token'].tolist()
        tags = group['iob'].tolist()
        if has_pos:
            pos_tags = [None if pd.isna(value) else value for value in group['pos']]
        else:
            pos_tags = [None] * len(tokens)

        sentences.append(list(zip(tokens, pos_tags, tags)))

    return sentences

def convert_iob_to_spacy(sentences):
    """Convert IOB-tagged sentences to spaCy's character-offset format.

    The sentence text is rebuilt by joining the tokens with single spaces, so
    all offsets refer to that reconstructed text.

    Tag handling:
      * ``B-X`` always opens a new entity of type ``X``.
      * ``I-X`` extends the open entity when it has the same type ``X``;
        otherwise (no open entity, or an open entity of another type) it
        opens a new entity, so corpora that start entities with ``I-`` are
        not silently dropped.
      * Any other tag closes the open entity.

    Args:
        sentences: List of sentences, each a list of ``(token, pos, iob)``
            tuples with string tokens and tags.

    Returns:
        A list of ``(text, {'entities': [(start, end, label), ...]})`` tuples,
        one per input sentence.
    """
    training_data = []

    for sentence in sentences:
        tokens = [token for token, _pos, _iob in sentence]
        iob_tags = [iob for _token, _pos, iob in sentence]

        # Reconstruct text
        text = ' '.join(tokens)

        entities = []
        current = None  # open entity as [start, end, label]
        char_offset = 0

        for token, tag in zip(tokens, iob_tags):
            token_end = char_offset + len(token)
            # Split on the first dash only so labels such as "JOB-TITLE" survive
            prefix, _, label = tag.partition('-')

            if prefix in ('B', 'I') and label:
                extends_open = (
                    prefix == 'I'
                    and current is not None
                    and current[2] == label
                )
                if extends_open:
                    current[1] = token_end
                else:
                    if current is not None:
                        entities.append(tuple(current))
                    current = [char_offset, token_end, label]
            elif current is not None:  # O tag or unknown tag ends the entity
                entities.append(tuple(current))
                current = None

            char_offset = token_end + 1  # +1 for the joining space

        # Add final entity if the sentence ends inside one
        if current is not None:
            entities.append(tuple(current))

        training_data.append((text, {'entities': entities}))

    return training_data

## 2. Load and Prepare Training Data

In [4]:
# Look for training data files.
# The patterns can overlap (e.g. "training_annotated.csv" matches both CSV
# patterns), so each list is de-duplicated to avoid loading a file twice, and
# sorted so the load order does not depend on the filesystem.
conll_files = sorted(set(DATA_DIR.glob('*.conll')) | set(DATA_DIR.glob('*.txt')))
csv_files = sorted(set(DATA_DIR.glob('*training*.csv')) | set(DATA_DIR.glob('*annotated*.csv')))

print(f"Found CoNLL files: {[f.name for f in conll_files]}")
print(f"Found CSV files: {[f.name for f in csv_files]}")

# Load data from available files (CoNLL files first, then CSV files)
all_sentences = []

for format_name, loader, file_paths in (
    ("CoNLL", load_conll_file, conll_files),
    ("CSV", load_csv_file, csv_files),
):
    for file_path in file_paths:
        print(f"Loading {format_name} file: {file_path}")
        sentences = loader(file_path)
        all_sentences.extend(sentences)
        print(f"  Loaded {len(sentences)} sentences")

if not all_sentences:
    print("WARNING: No training data found!")
    print("Please add annotated data files to the data/ directory")
    print("Supported formats:")
    print("  - CoNLL: token\tpos\tiob (one token per line, empty lines separate sentences)")
    print("  - CSV: columns doc_id, sent_id, token, iob (optional pos column)")
else:
    print(f"\nTotal sentences loaded: {len(all_sentences)}")

    # Show some statistics
    total_tokens = sum(len(sent) for sent in all_sentences)
    print(f"Total tokens: {total_tokens}")

    # Count tagged tokens per entity type (every non-"O" token is counted,
    # so a three-token entity contributes 3 to its type).
    entity_counts = Counter()
    for sentence in all_sentences:
        for _token, _pos, iob in sentence:
            if iob != 'O':
                # Strip only the "B-"/"I-" prefix; labels may contain dashes
                entity_type = iob.split('-', 1)[-1]
                entity_counts[entity_type] += 1

    print(f"\nEntity types found: {dict(entity_counts)}")

Found CoNLL files: []
Found CSV files: []
Please add annotated data files to the data/ directory
Supported formats:
  - CoNLL: token	pos	iob (one token per line, empty lines separate sentences)
  - CSV: columns doc_id, sent_id, token, iob (optional pos column)


In [5]:
# Convert to spaCy format, then split into training and validation sets
if all_sentences:
    print("Converting to spaCy format...")
    training_data = convert_iob_to_spacy(all_sentences)

    print(f"Converted {len(training_data)} examples")

    # Show first example
    if training_data:
        text, annotations = training_data[0]
        print(f"\nFirst example:")
        print(f"Text: {text[:100]}...")
        print(f"Entities: {annotations['entities'][:5]}...")

    # Split train/validation
    SPLIT_SEED = 42        # fixed seed so the split is the same on every run
    TRAIN_FRACTION = 0.8   # share of examples used for training

    # Seeds the global `random` module, which is also what the later
    # shuffles of the training examples draw from.
    random.seed(SPLIT_SEED)
    random.shuffle(training_data)

    # Keep at least one training example: with a single sentence the plain
    # 80% cut-off rounds down to 0 and training would be skipped entirely.
    split_point = max(1, int(len(training_data) * TRAIN_FRACTION))
    train_data = training_data[:split_point]
    val_data = training_data[split_point:]

    print(f"\nTraining examples: {len(train_data)}")
    print(f"Validation examples: {len(val_data)}")
else:
    print("No training data available - skipping training")
    train_data = []
    val_data = []

No training data available - skipping training


## 3. Create and Configure Model

In [6]:
if train_data:
    # Use the installed French pipeline when available, otherwise start
    # from a blank French pipeline.
    try:
        nlp = spacy.load('fr_core_news_sm')
        print("Loaded existing French model")
    except OSError:
        print("French model not found, creating blank model")
        nlp = spacy.blank('fr')

    # Add NER component if not present
    ner = nlp.get_pipe('ner') if 'ner' in nlp.pipe_names else nlp.add_pipe('ner')

    # Collect every entity label that occurs in the training split
    entity_labels = {
        label
        for _text, annotations in train_data
        for _start, _end, label in annotations['entities']
    }

    # Register labels in sorted order: iterating the set directly would use
    # an order that can differ from one interpreter run to the next.
    for label in sorted(entity_labels):
        ner.add_label(label)

    print(f"Entity labels to train: {sorted(entity_labels)}")

    # Every pipe except 'ner' is disabled during training
    disabled_pipes = [pipe_name for pipe_name in nlp.pipe_names if pipe_name != 'ner']

    print(f"Disabled pipes during training: {disabled_pipes}")

## 4. Training Loop

In [7]:
if train_data:
    print("Starting training...")

    def create_examples(data):
        """Turn (text, annotations) pairs into spaCy Example objects.

        The text is tokenized with the pipeline's own tokenizer and the
        character-offset annotations are attached as the gold standard.
        """
        return [
            Example.from_dict(nlp.make_doc(text), annotations)
            for text, annotations in data
        ]

    train_examples = create_examples(train_data)
    val_examples = create_examples(val_data) if val_data else []

    # Average NER loss of each iteration; the last value is stored in the
    # training metadata when the model is saved.
    losses = []

    # Only 'ner' is active inside this block. Initialization happens here as
    # well so that the other components of a loaded pipeline are left alone.
    with nlp.select_pipes(disable=disabled_pipes):
        # initialize() returns the optimizer used for the updates below.
        # NOTE(review): LEARNING_RATE from the configuration cell is not
        # applied; the optimizer keeps spaCy's default settings.
        optimizer = nlp.initialize(lambda: train_examples)

        for iteration in range(TRAINING_ITERATIONS):
            print(f"\nIteration {iteration + 1}/{TRAINING_ITERATIONS}")

            # Shuffle training data
            random.shuffle(train_examples)

            batch_losses = []
            for batch in minibatch(train_examples, size=BATCH_SIZE):
                # A fresh dict per batch, so the value read back is the loss
                # of this batch only (spaCy adds to an existing entry).
                batch_loss = {}
                nlp.update(batch, sgd=optimizer, drop=DROPOUT_RATE, losses=batch_loss)
                batch_losses.append(batch_loss.get('ner', 0.0))

            # Plain float so the value can be written to JSON later
            avg_loss = float(np.mean(batch_losses)) if batch_losses else 0.0
            losses.append(avg_loss)

            print(f"  Average loss: {avg_loss:.4f}")

            # Evaluate on validation set every 5 iterations
            if val_examples and (iteration + 1) % 5 == 0:
                print("  Evaluating on validation set...")
                scores = nlp.evaluate(val_examples)
                for caption, key in (
                    ("Precision", 'ents_p'),
                    ("Recall", 'ents_r'),
                    ("F1", 'ents_f'),
                ):
                    # A score may be missing/None; show it as 0 in that case
                    value = scores.get(key)
                    value = 0.0 if value is None else value
                    print(f"  NER {caption}: {value:.3f}")

    print("\nTraining completed!")
else:
    print("Skipping training - no data available")

Skipping training - no data available


## 5. Save Model

In [8]:
if train_data:
    # Save the trained model (create missing parent folders too, in case the
    # artifacts directory was removed since the configuration cell ran)
    MODEL_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(MODEL_OUTPUT_DIR)

    print(f"Model saved to: {MODEL_OUTPUT_DIR}")

    # Save training metadata next to the model
    metadata = {
        'timestamp': dt.datetime.now().isoformat(),
        'training_examples': len(train_data),
        'validation_examples': len(val_data),
        'iterations': TRAINING_ITERATIONS,
        'entity_labels': sorted(entity_labels),
        # Cast to a built-in float so json can serialize it whatever numeric
        # type the training loop stored
        'final_loss': float(losses[-1]) if losses else None,
        'dropout_rate': DROPOUT_RATE,
        'batch_size': BATCH_SIZE
    }

    metadata_file = MODEL_OUTPUT_DIR / 'training_metadata.json'
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    print(f"Training metadata saved to: {metadata_file}")
else:
    print("No model to save - no training data was available")

No model to save - no training data was available


## 6. Test the Model

In [9]:
if train_data:
    # Run the in-memory pipeline on a few sample job-ad sentences
    test_texts = [
        "Nous recherchons un développeur Python avec une expérience en Django et React.",
        "Poste d'ingénieur logiciel spécialisé en Java et Spring Boot.",
        "Analyste de données maîtrisant SQL, R et Tableau.",
        "Chef de projet agile avec certification Scrum Master."
    ]

    print("Testing the trained model:")
    print("=" * 50)

    for i, text in enumerate(test_texts, 1):
        doc = nlp(text)

        print(f"\nTest {i}: {text}")
        print("Entities found:")

        if doc.ents:
            for ent in doc.ents:
                print(f"  - {ent.text:<15} ({ent.label_})")
        else:
            print("  No entities found")

    print("\n" + "=" * 50)
    print("Model testing completed!")
    print(f"\nTo use this model in other scripts:")
    # Print the path with forward slashes: on Windows the native form is
    # "artifacts\ner_custom", and "\n" inside a pasted string literal would
    # be read as a newline escape.
    print(f"  nlp = spacy.load('{MODEL_OUTPUT_DIR.as_posix()}')")
else:
    print("No model to test - training was skipped")

No model to test - training was skipped
