## Medicine Dataset

In [None]:
!pip install -U huggingface_hub evaluate datasets transformers accelerate

In [22]:
!pip install sentencepiece protobuf

python(78385) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.




In [1]:
import os

import numpy as np
import pandas as pd
import torch
from torch import nn

import evaluate
from datasets import Dataset
from seqeval.metrics import accuracy_score, classification_report, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)

In [2]:
# Checkpoint to fine-tune (the tokenizer cell may replace it with a fallback checkpoint).
MODEL_NAME = "Dr-BERT/DrBERT-4GB"

# MAX_LEN caps the tokenized sequence length and is also the chunk size of the
# loader's chunking fallback.
MAX_LEN = 128

# NOTE: the TrainingArguments cell further down passes its own literal values and
# does not read these three constants.
BATCH_SIZE = 64
EPOCHS = 15
LEARNING_RATE = 5e-5

# Only CUDA or CPU are considered here.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Input files per split: MEDLINE first, then EMEA.
TRAIN_FILES, VALID_FILES, TEST_FILES = (
    [
        "TP_ISD2020/QUAERO_FrenchMed/MEDLINE/MEDLINEtrain_layer1_ID.csv",
        "TP_ISD2020/QUAERO_FrenchMed/EMEA/EMEAtrain_layer1_ID.csv",
    ],
    [
        "TP_ISD2020/QUAERO_FrenchMed/MEDLINE/MEDLINEdev_layer1_ID.csv",
        "TP_ISD2020/QUAERO_FrenchMed/EMEA/EMEAdev_layer1_ID.csv",
    ],
    [
        "TP_ISD2020/QUAERO_FrenchMed/MEDLINE/MEDLINEtest_layer1_ID.csv",
        "TP_ISD2020/QUAERO_FrenchMed/EMEA/EMEAtest_layer1_ID.csv",
    ],
)

In [3]:
# --- 2. ROBUST DATA LOADER ---
def load_data_from_csv(file_paths):
    """Read token/tag CSV files and group their rows into sentences.

    Each file is parsed with pandas' python engine and a sniffed delimiter.
    Words come from the "Mot" column and tags from the "Tag" column when both
    exist; otherwise the first column is used for words and the last one for
    tags. A row whose word is empty or whitespace-only closes the current
    sentence.

    If a file yields fewer than 10 sentences although it has more than 500
    rows, its tokens are re-cut into consecutive chunks of MAX_LEN words.

    Loading is best-effort: a missing or unreadable file is reported on stdout
    and skipped, it never aborts the whole load.

    Args:
        file_paths: iterable of CSV paths.

    Returns:
        (sentences, tags): two parallel lists; each item is the list of words
        (resp. tags, as strings) of one sentence, over all files in order.
    """
    all_sents, all_tags = [], []
    for fpath in file_paths:
        if not os.path.exists(fpath):
            # Say so explicitly: a silently skipped file shrinks the dataset unnoticed.
            print(f"WARNING: {fpath} not found, skipping.")
            continue
        print(f"Loading {os.path.basename(fpath)}...", end=" ")
        try:
            df = pd.read_csv(
                fpath,
                sep=None,
                engine="python",
                keep_default_na=False,
                skip_blank_lines=False,
            )
        except Exception as exc:
            # Still best-effort, but report the cause (and let KeyboardInterrupt through).
            print(f"FAILED ({type(exc).__name__}: {exc}), skipping.")
            continue

        # Named columns are used only when both are present; otherwise positional.
        if "Mot" in df.columns and "Tag" in df.columns:
            words, tags = df["Mot"].astype(str).values, df["Tag"].astype(str).values
        else:
            words, tags = (
                df.iloc[:, 0].astype(str).values,
                df.iloc[:, -1].astype(str).values,
            )

        curr_s, curr_t = [], []
        file_s, file_t = [], []

        for w, t in zip(words, tags):
            if not w.strip():
                # Blank word: sentence boundary (consecutive blanks add nothing).
                if curr_s:
                    file_s.append(curr_s)
                    file_t.append(curr_t)
                    curr_s, curr_t = [], []
            else:
                curr_s.append(w)
                curr_t.append(t)
        if curr_s:
            # Flush the last sentence when the file does not end with a blank row.
            file_s.append(curr_s)
            file_t.append(curr_t)

        # Chunking Fallback: almost no boundaries were found in a long file.
        if len(file_s) < 10 and len(words) > 500:
            flat_w = [w for s in file_s for w in s]
            flat_t = [t for s in file_t for t in s]
            file_s = [flat_w[i : i + MAX_LEN] for i in range(0, len(flat_w), MAX_LEN)]
            file_t = [flat_t[i : i + MAX_LEN] for i in range(0, len(flat_t), MAX_LEN)]

        print(f"-> {len(file_s)} sentences.")
        all_sents.extend(file_s)
        all_tags.extend(file_t)
    return all_sents, all_tags


print("--- LOADING DATA ---")
# Load the three splits in train / valid / test order; each call returns (sentences, tags).
(train_sents, train_tags), (valid_sents, valid_tags), (test_sents, test_tags) = (
    load_data_from_csv(paths) for paths in (TRAIN_FILES, VALID_FILES, TEST_FILES)
)

--- LOADING DATA ---
Loading MEDLINEtrain_layer1_ID.csv... -> 91 sentences.
Loading EMEAtrain_layer1_ID.csv... -> 120 sentences.
Loading MEDLINEdev_layer1_ID.csv... -> 90 sentences.
Loading EMEAdev_layer1_ID.csv... -> 106 sentences.
Loading MEDLINEtest_layer1_ID.csv... -> 94 sentences.
Loading EMEAtest_layer1_ID.csv... -> 97 sentences.


In [4]:
# --- 3. TOKENIZATION ---
# Label vocabulary over ALL splits: the alignment function looks every tag up in
# label2id, so a tag that only occurs in the validation split must be present too.
unique_tags = sorted({t for s in train_tags + valid_tags + test_tags for t in s})
label2id = {tag: i for i, tag in enumerate(unique_tags)}
id2label = {i: tag for tag, i in label2id.items()}

train_ds = Dataset.from_dict({"tokens": train_sents, "ner_tags": train_tags})
valid_ds = Dataset.from_dict({"tokens": valid_sents, "ner_tags": valid_tags})
test_ds = Dataset.from_dict({"tokens": test_sents, "ner_tags": test_tags})

# Try loading Fast Tokenizer
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
except Exception as exc:
    # The fallback changes which model is fine-tuned below, so make it visible.
    print(
        f"WARNING: could not load tokenizer for {MODEL_NAME} "
        f"({type(exc).__name__}: {exc}); falling back to almanach/camembert-base."
    )
    MODEL_NAME = "almanach/camembert-base"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)


def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and build one label id per produced token.

    Args:
        examples: batch with "tokens" (list of word lists) and "ner_tags"
            (list of tag lists, parallel to "tokens").

    Returns:
        The tokenizer output with an added "labels" entry. Within each
        sentence, the first token of every word carries label2id[tag]; tokens
        without a word (word id None) and continuation tokens carry -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=MAX_LEN,
    )

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # Only the first token of a word is labelled.
            starts_new_word = word is not None and word != last_word
            row_labels.append(label2id[word_tags[word]] if starts_new_word else -100)
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded


print("Tokenizing...")
# Apply the alignment function batch-wise to the three splits, in train / valid / test order.
tokenized_train, tokenized_valid, tokenized_test = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_ds, valid_ds, test_ds)
)

Tokenizing...


Map:   0%|          | 0/211 [00:00<?, ? examples/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/191 [00:00<?, ? examples/s]

In [5]:
# --- 5. TOKENIZATION & ALIGNMENT ---
# NOTE(review): this cell repeats the previous cell's tokenizer load, alignment
# function and dataset mapping — consider keeping only one of the two cells.
# The tokenizer may split one word into several pieces (the "Hepa" / "##titis"
# spelling is BERT WordPiece notation; presumably the CamemBERT fallback marks
# its pieces differently — confirm).
# Tags are aligned so that only the first piece of a word gets the label and the
# remaining pieces are ignored (-100).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)


def tokenize_and_align_labels(examples):
    """Tokenize a batch of word-split sentences and attach token-level labels.

    Reads examples["tokens"] and examples["ner_tags"]; returns the tokenizer
    output extended with "labels", one list of ids per sentence.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=MAX_LEN,
    )

    def _labels_for(word_ids, tags):
        """Label ids for one sentence: tag id on a word's first piece, -100 elsewhere."""
        ids, prev = [], None
        for idx in word_ids:
            # idx is None for special tokens (CLS, SEP); a repeated idx is a later sub-token.
            is_first_piece = idx is not None and idx != prev
            ids.append(label2id[tags[idx]] if is_first_piece else -100)
            prev = idx
        return ids

    tokenized_inputs["labels"] = [
        _labels_for(tokenized_inputs.word_ids(batch_index=i), tags)
        for i, tags in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs


print("Tokenizing...")
# Same mapping for each split, in train / valid / test order.
tokenized_train, tokenized_valid, tokenized_test = [
    ds.map(tokenize_and_align_labels, batched=True)
    for ds in (train_ds, valid_ds, test_ds)
]

Tokenizing...


Map:   0%|          | 0/211 [00:00<?, ? examples/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/191 [00:00<?, ? examples/s]

In [6]:
# --- 4. MODEL INITIALIZATION ---
print(f"Initializing BERT with pre-trained weights from: {MODEL_NAME}")
# Loads the pre-trained weights of whichever checkpoint MODEL_NAME currently names
# (the tokenizer cell may have switched it to the fallback checkpoint) and sizes the
# token-classification head to the label vocabulary.
_head_kwargs = {
    "num_labels": len(unique_tags),
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, **_head_kwargs)

Initializing BERT with pre-trained weights from: almanach/camembert-base


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

CamembertForTokenClassification LOAD REPORT from: almanach/camembert-base
Key                         | Status     | 
----------------------------+------------+-
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.bias             | MISSING    | 
classifier.weight           | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [9]:
import torch
from torch import nn
from transformers import Trainer

# 1. Calculate Weights
# 'O' is assumed to be the most frequent tag, so its loss contribution is scaled down:
# every entity tag keeps weight 1.0, 'O' gets 0.05 (20x less importance).
class_weights = torch.ones(len(unique_tags), device=DEVICE)

# Id of the 'O' (Outside) tag, or -1 when the label set has none.
o_tag_id = label2id.get('O', -1)
# Label value used for positions that must not contribute to the loss.
pad_tag_id = -100

if o_tag_id != -1:
    class_weights[o_tag_id] = 0.05  # errors on 'O' tokens count 20x less than errors on entities
    print(f"Class Weights set. 'O' (id {o_tag_id}) weight: {class_weights[o_tag_id]}")
else:
    # Without this branch the report would index class_weights[-1], i.e. another class.
    print("Class Weights set. No 'O' tag in the label set; every class keeps weight 1.0.")

# 2. Create a Custom Trainer to use these weights
class WeightedTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the labelled tokens."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted token-classification loss for one batch.

        Args:
            model: the model to run.
            inputs: batch dict; its "labels" entry uses -100 for ignored positions.
            return_outputs: when true, return (loss, outputs) instead of loss.
            num_items_in_batch: accepted for Trainer compatibility; not used here.

        Returns:
            The loss tensor, or (loss, model outputs) when return_outputs is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # The module-level weights were created on DEVICE, which only distinguishes
        # cuda/cpu; move them to wherever the logits actually live to avoid a
        # device-mismatch error inside the loss.
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))

        # Flatten to (batch * seq, num_classes) / (batch * seq) and drop the -100 positions.
        active_loss = labels.view(-1) != -100
        active_logits = logits.view(-1, self.model.config.num_labels)[active_loss]
        active_labels = labels.view(-1)[active_loss]

        loss = loss_fct(active_logits, active_labels)

        return (loss, outputs) if return_outputs else loss

Class Weights set. 'O' (id 20) weight: 0.05000000074505806


In [13]:
def compute_metrics(eval_pred):
    """Entity-level F1 and token accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: (logits, label_ids) pair as passed by the Trainer; label
            positions equal to -100 are ignored.

    Returns:
        {"f1": ..., "accuracy": ...} computed with seqeval on the tag strings
        obtained through the current id2label mapping.
    """
    logits, label_ids = eval_pred
    pred_ids = np.argmax(logits, axis=2)

    gold_tags, pred_tags = [], []
    for pred_row, gold_row in zip(pred_ids, label_ids):
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        pred_tags.append([id2label[p] for p, _ in kept])
        gold_tags.append([id2label[g] for _, g in kept])

    return {
        "f1": f1_score(gold_tags, pred_tags),
        "accuracy": accuracy_score(gold_tags, pred_tags),
    }


training_args = TrainingArguments(
    output_dir="./drbert_ner_results",
    learning_rate=5e-5,              # Increased LR slightly to help jump out of the 'O' trap
    per_device_train_batch_size=4,   # Keep small batch for memory
    gradient_accumulation_steps=4,   # Accumulate gradients
    num_train_epochs=50,             # Train long enough to see improvements
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",           # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    logging_steps=5,
    push_to_hub=False,
)

trainer = WeightedTrainer(           # <--- Use the Custom Class here
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics,
)

print("\n--- RESTARTING TRAINING WITH CLASS WEIGHTS ---")
trainer.train()


--- RESTARTING TRAINING WITH CLASS WEIGHTS ---




Epoch,Training Loss,Validation Loss,F1,Accuracy
1,6.488594,1.626957,0.510339,0.837264
2,5.974086,1.579548,0.52121,0.840338
3,5.92836,1.543056,0.54721,0.852988
4,5.42786,1.515436,0.546338,0.852279
5,5.185176,1.479744,0.552988,0.85352
6,5.242513,1.460309,0.573215,0.865756
7,4.832901,1.429548,0.568547,0.859136
8,4.926265,1.400923,0.573199,0.85949
9,4.495228,1.397683,0.587793,0.867471
10,4.330398,1.380841,0.581144,0.862446


  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

TrainOutput(global_step=350, training_loss=3.514325338091169, metrics={'train_runtime': 759.9241, 'train_samples_per_second': 13.883, 'train_steps_per_second': 0.461, 'total_flos': 689288580057600.0, 'train_loss': 3.514325338091169, 'epoch': 50.0})

In [12]:
print("\n--- FINAL TEST EVALUATION ---")
# NOTE(review): the output recorded for this cell reports 'epoch': 15.0 while the
# training cell is configured for 50 epochs — presumably it predates the last
# training run; re-run after training to refresh it.
# Aggregate metrics on the held-out test split.
test_results = trainer.evaluate(tokenized_test)
print(test_results)

print("\n--- DETAILED REPORT ---")
predictions, labels, _ = trainer.predict(tokenized_test)
predictions = np.argmax(predictions, axis=2)

# Map ids back to tag strings, skipping every position whose gold label is -100.
true_preds, true_labels = [], []
for pred_row, gold_row in zip(predictions, labels):
    sent_preds, sent_labels = [], []
    for p, l in zip(pred_row, gold_row):
        if l == -100:
            continue
        sent_preds.append(id2label[p])
        sent_labels.append(id2label[l])
    true_preds.append(sent_preds)
    true_labels.append(sent_labels)

print(classification_report(true_labels, true_preds))


--- FINAL TEST EVALUATION ---




{'eval_loss': 1.7567861080169678, 'eval_f1': 0.46935632891900214, 'eval_accuracy': 0.8197710205589068, 'eval_runtime': 1.0455, 'eval_samples_per_second': 182.694, 'eval_steps_per_second': 11.478, 'epoch': 15.0}

--- DETAILED REPORT ---


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        ANAT       0.28      0.28      0.28       237
        CHEM       0.58      0.76      0.66       693
        DEVI       0.00      0.00      0.00        77
        DISO       0.37      0.57      0.45       651
        GEOG       0.00      0.00      0.00        44
        LIVB       0.47      0.76      0.58       332
        OBJC       0.00      0.00      0.00        50
        PHEN       0.00      0.00      0.00        46
        PHYS       0.00      0.00      0.00       114
        PROC       0.28      0.65      0.39       468

   micro avg       0.40      0.56      0.47      2712
   macro avg       0.20      0.30      0.24      2712
weighted avg       0.37      0.56      0.44      2712



## Media Dataset

In [15]:
# --- 2. CONFIGURATION ---
MODEL_NAME = "almanach/camembert-base"

# Update path to your Press dataset
PRESS_DIR = "TP_ISD2020/QUAERO_FrenchPress"
TRAIN_FILES = [f"{PRESS_DIR}/fra4_ID_train.csv"]
VALID_FILES = [f"{PRESS_DIR}/fra4_ID_dev.csv"]
TEST_FILES  = [f"{PRESS_DIR}/fra4_ID_test.csv"]

print("--- STARTING PRESS TASK ---")
print(f"Model: {MODEL_NAME}")

# --- 3. LOAD DATA ---
# (Using the same load_data_from_csv function defined earlier)
train_sents, train_tags = load_data_from_csv(TRAIN_FILES)
valid_sents, valid_tags = load_data_from_csv(VALID_FILES)
test_sents, test_tags   = load_data_from_csv(TEST_FILES)

# Create NEW mappings: these replace the Medicine-section label maps.
# The vocabulary spans all three splits so that a tag occurring only in the
# validation split cannot raise a KeyError during label alignment.
unique_tags = sorted({t for s in train_tags + valid_tags + test_tags for t in s})
label2id = {tag: i for i, tag in enumerate(unique_tags)}
id2label = {i: tag for tag, i in label2id.items()}
print(f"Press Tags: {label2id}")

--- STARTING PRESS TASK ---
Model: almanach/camembert-base
Loading fra4_ID_train.csv... -> 9034 sentences.
Loading fra4_ID_dev.csv... -> 744 sentences.
Loading fra4_ID_test.csv... -> 749 sentences.
Press Tags: {'O': 0, 'b-func': 1, 'b-loc': 2, 'b-org': 3, 'b-pers': 4, 'b-prod': 5, 'i-func': 6, 'i-loc': 7, 'i-org': 8, 'i-pers': 9, 'i-prod': 10}


In [16]:
# --- 4. PREPARE DATASETS ---
# One Dataset per split, each with a "tokens" and a "ner_tags" column.
train_ds, valid_ds, test_ds = (
    Dataset.from_dict({"tokens": sents, "ner_tags": tags})
    for sents, tags in (
        (train_sents, train_tags),
        (valid_sents, valid_tags),
        (test_sents, test_tags),
    )
)

# Fast tokenizer of the checkpoint selected for the press task.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize_and_align_labels(examples):
    """Tokenize word-split sentences and align one label id to every token.

    Args:
        examples: batch with "tokens" (list of word lists) and "ner_tags"
            (list of tag lists, parallel to "tokens").

    Returns:
        The tokenizer output plus "labels": label2id[tag] on the first token of
        each word, -100 on tokens without a word and on continuation tokens.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=MAX_LEN,  # shared constant (128) instead of a repeated literal
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Batch-wise tokenization and label alignment of the three press splits.
tokenized_train, tokenized_valid, tokenized_test = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_ds, valid_ds, test_ds)
)

Map:   0%|          | 0/9034 [00:00<?, ? examples/s]

Map:   0%|          | 0/744 [00:00<?, ? examples/s]

Map:   0%|          | 0/749 [00:00<?, ? examples/s]

In [17]:
# --- 5. INITIALIZE MODEL ---
# Loading the pre-trained checkpoint is the initialization; the classification
# head is sized to the press label vocabulary.
_press_head_kwargs = {
    "num_labels": len(unique_tags),
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, **_press_head_kwargs)

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

CamembertForTokenClassification LOAD REPORT from: almanach/camembert-base
Key                         | Status     | 
----------------------------+------------+-
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.bias             | MISSING    | 
classifier.weight           | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [21]:
from torch import nn
# Weight 1.0 for every tag, except 'O' (when the label set has one), which gets 0.1.
class_weights = torch.ones(len(unique_tags)).to(model.device)
_o_id = label2id.get('O')
if _o_id is not None:
    class_weights[_o_id] = 0.1

class WeightedTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the labelled tokens."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted token-classification loss for one batch.

        Args:
            model: the model to run.
            inputs: batch dict; its "labels" entry uses -100 for ignored positions.
            return_outputs: when true, return (loss, outputs) instead of loss.
            num_items_in_batch: accepted for Trainer compatibility; not used here.

        Returns:
            The loss tensor, or (loss, model outputs) when return_outputs is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Follow the logits rather than model.device: the logits are always a tensor
        # with a device, whereas a wrapped model may not expose `.device`.
        weights = class_weights.to(logits.device)

        loss_fct = nn.CrossEntropyLoss(weight=weights)

        # Keep only positions with a real label (-100 marks ignored tokens).
        active_loss = labels.view(-1) != -100
        active_logits = logits.view(-1, self.model.config.num_labels)[active_loss]
        active_labels = labels.view(-1)[active_loss]

        loss = loss_fct(active_logits, active_labels)
        return (loss, outputs) if return_outputs else loss


In [23]:

class WeightedTrainer(Trainer):
    """Trainer variant that weights the token-level cross-entropy per class."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the class-weighted loss (and the model outputs when requested).

        Positions labelled -100 are excluded before the loss is computed;
        num_items_in_batch is accepted but not used.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # One row per token position, then keep the positions that carry a label.
        flat_labels = labels.view(-1)
        flat_logits = logits.view(-1, self.model.config.num_labels)
        keep = flat_labels != -100

        # Weights follow the logits' device (works for DataParallel too).
        loss = nn.functional.cross_entropy(
            flat_logits[keep],
            flat_labels[keep],
            weight=class_weights.to(logits.device),
        )

        if return_outputs:
            return loss, outputs
        return loss

# Re-initialize the Trainer with the fixed class
# NOTE(review): `training_args` is not defined in this section. The only definition
# in the notebook is the Medicine one (output_dir "./drbert_ner_results", 50 epochs),
# yet the recorded run below stops at epoch 10 — presumably a since-removed cell
# redefined it; define press-specific arguments here and confirm their values.
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Starting Training (Fixed)...")
trainer.train()

Starting Training (Fixed)...


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,0.718025,0.688714,0.948375
2,3.892530,0.514889,0.763741,0.964358
3,3.892530,0.408598,0.766604,0.964975
4,1.869762,0.352631,0.785818,0.96781
5,1.869762,0.319625,0.784134,0.969004
6,1.287033,0.305243,0.801674,0.970854
7,1.287033,0.302606,0.800045,0.971878
8,1.011836,0.301388,0.802548,0.971917
9,0.865288,0.299112,0.8058,0.972652
10,0.865288,0.291763,0.798687,0.97109




Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

TrainOutput(global_step=2830, training_loss=1.6704919174787434, metrics={'train_runtime': 1828.8487, 'train_samples_per_second': 49.397, 'train_steps_per_second': 1.547, 'total_flos': 5901867437614080.0, 'train_loss': 1.6704919174787434, 'epoch': 10.0})

In [24]:
# --- 8. FINAL EVALUATION ---
print("\n--- FINAL PRESS EVALUATION ---")
predictions, labels, _ = trainer.predict(tokenized_test)
predictions = np.argmax(predictions, axis=2)

# Per sentence, the (predicted id, gold id) pairs at positions whose gold label is not -100.
_kept_pairs = [
    [(p, l) for p, l in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_preds = [[id2label[p] for p, _ in pairs] for pairs in _kept_pairs]
true_labels = [[id2label[l] for _, l in pairs] for pairs in _kept_pairs]

print(classification_report(true_labels, true_preds))


--- FINAL PRESS EVALUATION ---






              precision    recall  f1-score   support

        func       0.74      0.79      0.77       477
         loc       0.78      0.88      0.83       555
         org       0.68      0.74      0.71       392
        pers       0.87      0.90      0.88      1048
        prod       0.51      0.68      0.58       256

   micro avg       0.76      0.83      0.79      2728
   macro avg       0.72      0.80      0.75      2728
weighted avg       0.77      0.83      0.80      2728

