# HF NER Training (French)

Train a token classification model (NER) with Hugging Face Transformers on IOB/BIO annotated data.
- Supported input: CoNLL (`token\tPOS?\tIOB`, blank line between sentences) or CSV with columns: `doc_id, sent_id, token, iob`.
- Base model: `camembert-base` (can be swapped for `xlm-roberta-base`).
- Saves model + tokenizer to `artifacts/hf_ner_model`.

In [None]:
!python -V
import os, json, random, re
from pathlib import Path
import pandas as pd
from datasets import Dataset, DatasetDict
import numpy as np
from collections import Counter

import evaluate
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification,
    TrainingArguments, Trainer
)

# Reproducibility: seed both Python's and NumPy's global RNGs with one value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Filesystem layout: input data directory and output artifact directories.
DATA_DIR = Path('data')
ARTIFACTS = Path('artifacts')
MODEL_DIR = ARTIFACTS / 'hf_ner_model'
# Create the artifact root up front (no error if it already exists).
ARTIFACTS.mkdir(parents=True, exist_ok=True)

# Checkpoint name passed to the `from_pretrained` calls below.
BASE_MODEL = 'camembert-base'  # or 'xlm-roberta-base'
print('Artifacts:', MODEL_DIR)

## 1) Load IOB data (CoNLL or CSV)

In [None]:
def read_conll(path: Path):
    """Parse a tab-separated CoNLL-style file into sentences.

    Each non-blank line is one token; its first tab-separated field is the
    token text and its last field is the tag. A line with a single field gets
    the tag 'O'. Blank lines (after stripping whitespace) end the current
    sentence; consecutive blank lines do not produce empty sentences.

    Args:
        path: File to read (decoded as UTF-8).

    Returns:
        A list of sentences, each a list of ``(token, tag)`` tuples.
    """
    sentences = []
    pending = []
    for raw in path.read_text(encoding='utf-8').splitlines():
        stripped = raw.strip()
        if stripped:
            fields = stripped.split('\t')
            # Only one column present -> no tag given, treat as outside.
            tag = 'O' if len(fields) < 2 else fields[-1]
            pending.append((fields[0], tag))
        elif pending:
            # Sentence boundary: flush what has been collected so far.
            sentences.append(pending)
            pending = []
    # The file may not end with a blank line; flush the last sentence.
    if pending:
        sentences.append(pending)
    return sentences

def read_csv_token_iob(path: Path):
    """Read a token-per-row CSV into sentences of ``(token, tag)`` tuples.

    The CSV must have the columns ``doc_id``, ``sent_id``, ``token`` and
    ``iob``. Rows are grouped by ``(doc_id, sent_id)``; each group becomes one
    sentence, with rows kept in file order inside the group.

    Args:
        path: CSV file to read (decoded as UTF-8).

    Returns:
        A list of sentences, each a list of ``(token, tag)`` string tuples.

    Raises:
        ValueError: If any of the required columns is missing.
    """
    # Route `token` and `iob` through `str` converters so the raw cell text is
    # kept verbatim. With default parsing pandas treats cells such as "NA",
    # "null" or "nan" as missing values, which would turn real tokens into the
    # string 'nan' and invent a 'nan' label.
    df = pd.read_csv(path, encoding='utf-8', converters={'token': str, 'iob': str})
    req = {'doc_id','sent_id','token','iob'}
    if not req.issubset(df.columns):
        raise ValueError(f'Missing columns in {path}: need {req}')
    sents = []
    for _, grp in df.groupby(['doc_id','sent_id']):
        # A blank tag cell means "no entity": default to 'O', matching how
        # read_conll treats a line that has no tag column.
        sents.append([(tok, tag.strip() or 'O') for tok, tag in zip(grp['token'], grp['iob'])])
    return sents

def load_all(data_dir: Path):
    """Load every supported annotation file found directly in ``data_dir``.

    ``*.conll`` and ``*.txt`` files are parsed with ``read_conll``; CSV files
    whose name contains ``training`` or ``annotated`` are parsed with
    ``read_csv_token_iob``.

    Args:
        data_dir: Directory to scan (non-recursive).

    Returns:
        One flat list of sentences (lists of ``(token, tag)`` tuples), CoNLL
        files first, then CSV files, each group in sorted path order.
    """
    # Collect into sets so a file matched by two patterns (for example
    # "training_annotated.csv") is read only once; duplicated sentences could
    # otherwise end up in both the train and the validation split.
    # Sort so the sentence order, and therefore the seeded shuffle/split done
    # later, does not depend on the order the filesystem lists files in.
    conll_paths = sorted({*data_dir.glob('*.conll'), *data_dir.glob('*.txt')})
    csv_paths = sorted({*data_dir.glob('*training*.csv'), *data_dir.glob('*annotated*.csv')})
    sents = []
    for p in conll_paths:
        sents.extend(read_conll(p))
    for p in csv_paths:
        sents.extend(read_csv_token_iob(p))
    return sents

# Read every annotation file under DATA_DIR into one list of sentences.
sents = load_all(DATA_DIR)
print('Loaded sentences:', len(sents))

# Label distribution: count each tag except the outside tag 'O'.
cnt = Counter()
for sent in sents:
    cnt.update(tag for _tok, tag in sent if tag != 'O')
print('Entity tag counts:', dict(cnt))

## 2) Build HF Datasets (tokens + ner_tags)

In [None]:
# Nothing to train on: stop here instead of failing later with an obscure error.
if not sents:
    raise SystemExit('No training data found in data/.')

# Label vocabulary: 'O' always gets id 0, the remaining tags follow in
# sorted order.
seen_tags = set()
for sent in sents:
    for _tok, tag in sent:
        seen_tags.add(tag)
labels = ['O'] + sorted(seen_tags - {'O'})

# Both lookup directions, as passed to the model config further down.
label2id = {name: pos for pos, name in enumerate(labels)}
id2label = dict(enumerate(labels))
print('Labels:', labels)

# 90/10 train/validation split over shuffled sentence indices.
n_sents = len(sents)
idx = np.arange(n_sents)
# In-place shuffle driven by NumPy's global RNG (seeded at the top of the file).
np.random.shuffle(idx)
split = int(0.9 * n_sents)
train_idx = idx[:split]
val_idx = idx[split:]

def sents_to_dict(sent_list):
    """Convert sentences into the column layout given to ``Dataset.from_dict``.

    Args:
        sent_list: Sentences, each a sequence of ``(token, tag)`` pairs.

    Returns:
        A dict with two parallel lists: ``'tokens'`` (token strings per
        sentence) and ``'ner_tags'`` (the matching ``label2id`` ids).
    """
    token_rows = []
    tag_rows = []
    for sentence in sent_list:
        words = []
        tag_ids = []
        for tok, tag in sentence:
            words.append(tok)
            tag_ids.append(label2id[tag])
        token_rows.append(words)
        tag_rows.append(tag_ids)
    return {'tokens': token_rows, 'ner_tags': tag_rows}

# Materialise each split from its index array, then bundle both splits.
train_ds, val_ds = (
    Dataset.from_dict(sents_to_dict([sents[i] for i in part]))
    for part in (train_idx, val_idx)
)
datasets = DatasetDict({
    'train': train_ds,
    'validation': val_ds,
})
# Bare expression: shown as the notebook cell's output.
datasets

## 3) Tokenize and align labels

In [None]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tag ids to sub-tokens.

    Only the first sub-token of each word carries that word's tag id. Special
    tokens (word id ``None``) and continuation sub-tokens are set to -100,
    the value ``compute_metrics`` filters out before scoring.

    Copying the word tag onto every piece would turn one ``B-X`` word split
    into k pieces into k consecutive ``B-X`` tags, i.e. k separate entities,
    which corrupts both the training targets and the span-level metrics.

    Args:
        examples: A batch with ``'tokens'`` (list of word lists) and
            ``'ner_tags'`` (list of tag-id lists, parallel to the words).

    Returns:
        The tokenizer output with an added ``'labels'`` entry holding one
        aligned id list per example.
    """
    tokenized = tokenizer(examples['tokens'], is_split_into_words=True, truncation=True)
    new_labels = []
    for i, word_labels in enumerate(examples['ner_tags']):
        aligned = []
        prev = None
        for wid in tokenized.word_ids(batch_index=i):
            if wid is None or wid == prev:
                # Special token, or a later piece of a word already labelled.
                aligned.append(-100)
            else:
                # First piece of a new word: it carries the word's tag.
                aligned.append(word_labels[wid])
            prev = wid
        new_labels.append(aligned)
    tokenized['labels'] = new_labels
    return tokenized

tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)
tokenized_datasets

## 4) Train with Trainer

In [None]:
# Scorer used by compute_metrics.
metric = evaluate.load('seqeval')

# Collator built from the tokenizer; handed to the Trainer for batching.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Token-classification model sized to the label vocabulary; the id<->label
# maps are passed along so they are stored with the model.
model = AutoModelForTokenClassification.from_pretrained(
    BASE_MODEL,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# Training hyper-parameters. Evaluation and checkpointing both run once per
# epoch, and checkpoints are written under MODEL_DIR.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_kwargs = dict(
    output_dir=str(MODEL_DIR),
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=50,
    seed=SEED,
    load_best_model_at_end=True,
)
args = TrainingArguments(**training_kwargs)

def compute_metrics(p):
    """Turn raw predictions into overall precision/recall/F1/accuracy.

    Args:
        p: A pair ``(predictions, labels)``; the predicted id is the argmax
            of ``predictions`` over the last axis.

    Returns:
        A dict with keys ``'precision'``, ``'recall'``, ``'f1'`` and
        ``'accuracy'``, read from the metric's ``overall_*`` entries
        (0 when an entry is absent).
    """
    logits, gold = p
    pred_ids = np.argmax(logits, axis=-1)

    true_preds = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold):
        # Keep only positions whose gold id is not the -100 mask value.
        kept = [(p_i, l_i) for p_i, l_i in zip(pred_row, gold_row) if l_i != -100]
        true_preds.append([id2label[int(p_i)] for p_i, _ in kept])
        true_labels.append([id2label[int(l_i)] for _, l_i in kept])

    res = metric.compute(predictions=true_preds, references=true_labels)
    wanted = (
        ('precision', 'overall_precision'),
        ('recall', 'overall_recall'),
        ('f1', 'overall_f1'),
        ('accuracy', 'overall_accuracy'),
    )
    return {short: res.get(full, 0) for short, full in wanted}

# Wire model, data, collator and metric function into the Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

# Keep and print the final validation metrics: a bare `trainer.evaluate()`
# in the middle of a cell computes them and then throws the result away.
eval_metrics = trainer.evaluate()
print('Validation metrics:', eval_metrics)

# Save model and tokenizer side by side in MODEL_DIR.
trainer.save_model(str(MODEL_DIR))
tokenizer.save_pretrained(str(MODEL_DIR))
print('Saved model to', MODEL_DIR)