# Transfer learning notebook

Dans ce notebook, je vais prendre un réseau de neurones pré-entraînés, freeze des layers et entraîner la dernière couche pour prédire si un mot est un nom de personne ou non.

Il se décompose en 4 étapes:
- *Data:* Load data "MultiNERD" (données wikipedia labelisées)
- *Feature:* Créer un jeu de données X, y pour dire quels mots sont des noms de personne, formatté pour HuggingFace.<br/>
Je créé un jeu "train" (pour apprendre), "dev" (pour évaluer le modèle pendant l'apprentissage) et "test" pour mesurer l'accuracy (ou autre mesure) une fois toutes mes optimisations faites.
- *Model:* Prendre un réseau de neurones, freeze les layers
- *Train:* L'entraîner

In [2]:
import csv
import numpy as np
import torch
import transformers

In [25]:
model_name = "distilbert/distilbert-base-cased"

# Model FR

In [3]:
model_name = "camembert-base"

In [3]:
model_name

'camembert-base'

## Data
### MultiNERD data

Ce dataset est un text avec des catégories assez fines (dont nom de personne).<br>
Il est disponible [sur ce lien](https://github.com/Babelscape/multinerd)

In [4]:
with open("../data/raw/train_fr.tsv", encoding="utf-8") as f:
    rows = list(line.strip().split("\t") for line in f)

rows[:10]

[['0', 'Il', 'O'],
 ['1', 'est', 'O'],
 ['2', 'incarné', 'O'],
 ['3', 'par', 'O'],
 ['4',
  'Austin',
  'B-PER',
  'bn:02525192n',
  'Q4204710',
  '7345300',
  'Austin_Stowell',
  'Austin Stowell est un acteur américain né le 24 décembre 1984 à Kensington dans le Connecticut.',
  'https://upload.wikimedia.org/wikipedia/commons/9/95/Austin_Stowell-DolphinTale.jpg'],
 ['5', 'Stowell', 'I-PER'],
 ['6', '.', 'O'],
 [''],
 ['0', 'c’', 'O'],
 ['1', 'est', 'O']]

## Feature

### Créer le jeu X (mot), y (est-ce un nom de personne)

In [5]:
def make_labelled_sentences(tagged_words):
    # Joining words until we meet a dot
    # Word's label is 1 if 'PER' is in its tag
    X = []
    y = []

    this_word = []
    this_labels = []
    for tagged_word in tagged_words:
        if len(tagged_word) < 3:
            # not a tagged word
            continue
        word = tagged_word[1]
        tag = tagged_word[2]

        if word == '.':
            X.append(this_word)
            y.append(this_labels)

            this_word = []
            this_labels = []
        else:
            this_word.append(word)
            this_labels.append(1 * tag.endswith("PER"))

    return X, y

In [6]:
# Quand on essaie un modèle / framework
# PAS la peine de faire tourner sur tout le dataset
# On peut prendre un sous-ensemble et vérifier que le code tourne
sentences, labels = make_labelled_sentences(rows[:100_000])

In [7]:
sentences[2], labels[2]

(['À', 'Madras', ',', 'le'], [0, 0, 0, 0])

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
sentences_training, sentences_test, labels_training, labels_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [10]:
sentences_train, sentences_dev, labels_train, labels_dev = train_test_split(
    sentences_training,
    labels_training,
    test_size=0.2,
    random_state=42,
)

### Transformer ce jeu X, y en données pour un modèle HuggingFace

J'utilise le tokenizer pour transformer les mots en tokens.
Et j'applique les labels:
- 1 si le mot est un nom de personne ET ce token est le 1er token du mot.
- 0 si le mot n'est pas un nom de personne ET ce token est le 1er token du mot.
- -100 sinon (token pas à prédire, convention HuggingFace)

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [12]:
def tokenize_and_align_labels(sentences, ner_tags):
    tokenized_inputs = tokenizer(
        sentences,
        truncation=True,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(ner_tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [13]:
tokenized_train = tokenize_and_align_labels(sentences_train, labels_train)

In [14]:
tokenized_test = tokenize_and_align_labels(sentences_test, labels_test)

In [15]:
from datasets import Dataset

dataset_train = Dataset.from_dict(tokenized_train)
dataset_test = Dataset.from_dict(tokenized_test)

In [16]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Model
## V1: learning only last layer

In [17]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=2
)
# model = model.to("cuda")

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
for name, _ in model.base_model.named_parameters():
  print(name)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key

In [19]:
for name, param in model.base_model.named_parameters():
  param.requires_grad = False

for name, param in model.base_model.named_parameters():
    if (
        any(layer_name in name for layer_name in ["layer.11"])
        # and any(layer_type in name for layer_type in ["weight", "bias"])
        and "ffn.lin" in name
    ):
        param.requires_grad = True

In [20]:
# Geler tout le backbone
for n, p in model.base_model.named_parameters():
    p.requires_grad = False

# Dégeler les deux dernières couches
for n, p in model.base_model.named_parameters():
    if "layer.10" in n or "layer.11" in n:
        p.requires_grad = True

# S'assurer que la tête de classification apprend
for n, p in model.named_parameters():
    if "classifier" in n:
        p.requires_grad = True


# Train 

## Méthode d'évaluation du modèle

Calculant accuracy, log loss, etc

In [21]:
ex = dataset_train[0]["labels"]
print("tokens utiles:", sum(1 for x in ex if x != -100), "/ total:", len(ex))


tokens utiles: 32 / total: 43


In [22]:
import numpy as np
import evaluate

seqeval = evaluate.load("seqeval")

labels = [0, 1]
label_list = ["0", "1"]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

In [23]:
import torch
from transformers import Trainer

# pondération: ajuste w1 à 12–20 pour pousser la classe "1"
class_weights = torch.tensor([1.0, 12.0])  # [w_0, w_1]
class_weights = class_weights.to(model.device)

class WeightedTrainer(Trainer):
    def compute_loss(
        self,
        model,
        inputs,
        return_outputs: bool = False,
        num_items_in_batch: int | None = None,   # <-- AJOUT
        **kwargs,                                 # <-- compat future
    ):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=-100)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


## Train du modèle

In [24]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="ner_camembert_fr",
    learning_rate=3e-5,                 # un poil plus haut
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,      # ≈ batch 32
    num_train_epochs=5,                 # 4–6 suffisent
    weight_decay=0.01,
    warmup_ratio=0.1,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_steps=50,
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = WeightedTrainer(


In [24]:
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.6168,0.161953,0.0,0.0,0.0,0.986306
2,0.097,0.063716,0.0,0.0,0.0,0.994799
3,0.0686,0.05098,0.0,0.0,0.0,0.99448
4,0.0552,0.047294,0.0,0.0,0.0,0.995648


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


KeyboardInterrupt: 

In [21]:
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.629304,0.0,0.0,0.0,0.906587
2,No log,0.552346,0.0,0.0,0.0,0.951966
3,No log,0.492448,0.0,0.0,0.0,0.952763
4,0.595800,0.445922,0.0,0.0,0.0,0.952763
5,0.595800,0.410476,0.0,0.0,0.0,0.952763
6,0.595800,0.38406,0.0,0.0,0.0,0.952763
7,0.432300,0.364883,0.0,0.0,0.0,0.952763
8,0.432300,0.351981,0.0,0.0,0.0,0.952763
9,0.432300,0.344583,0.0,0.0,0.0,0.952763


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier,

SafetensorError: Error while serializing: I/O error: Espace insuffisant sur le disque. (os error 112)

# A vous de jouer

Créer la fonction prenant les prédictions niveau token, et me donnant prédictions niveau mot.
*Note:* Dans ce genre de cas, prenez des mots bizarres, qui font plusieurs tokens. <br/>
De cette façon, vous évitez des erreurs. <br/>
Si vous ne prenez que des mots simples ["bonjour", "monsieur", "jean"] (qui ne font qu'un token), vous pouvez croire que votre code marche, mais qu'il casse dès qu'un mot fait plusieurs tokens

In [27]:
words = ["Hellotapasci", "mister", "Bondaboliot", "goodbye"]
# wanted_labels = [0, 0, 1, 0]
# def predict_at_word_level(words, model, tokenizer):
#     # your magic here
#     # return ....

#     # I'm cheating here
#     return wanted_labels
    
import torch

def predict_at_word_level(words, model, tokenizer):
    # Tokenisation des mots
    inputs = tokenizer(words, return_tensors="pt", is_split_into_words=True, padding=True, truncation=True)
    
    # Passage dans le modèle
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits  # (batch_size, seq_len, num_labels)
    predictions = torch.argmax(logits, dim=-1).squeeze().tolist()
    
    # Alignement des prédictions avec les mots
    word_ids = inputs.word_ids()
    word_predictions = []
    seen = set()
    
    for idx, word_id in enumerate(word_ids):
        if word_id is None:  # ignorer les tokens spéciaux
            continue
        if word_id not in seen:
            word_predictions.append(predictions[idx])  # on prend le premier token du mot
            seen.add(word_id)
    
    return word_predictions


In [28]:
predict_at_word_level(words, model, tokenizer)

[0, 0, 1, 0]

# Prédire sur le dataset France Inter

In [31]:
import pandas as pd

df_franceinter = pd.read_csv("../data/raw/train_v3.csv", sep=",")
df_franceinter.head()


Unnamed: 0,video_name,is_name,tokens
0,Le Barbecue Disney - La chanson de Frédéric Fr...,"[0, 0, 0, 0, 0, 0, 0, 1, 1]","[""Le"", ""Barbecue"", ""Disney"", ""-"", ""La"", ""chans..."
1,Le Roi et l'Oiseau - La Chronique de Christine...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1]","[""Le"", ""Roi"", ""et"", ""l'Oiseau"", ""-"", ""La"", ""Ch..."
2,L'amour du lac - La chronique d'Hippolyte Gira...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1]","[""L'"", ""amour"", ""du"", ""lac"", ""-"", ""La"", ""chron..."
3,La fille de la piscine de Léa Tourret - La chr...,"[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1]","[""La"", ""fille"", ""de"", ""la"", ""piscine"", ""de"", ""..."
4,"""Le soleil va moins faire son malin quand Jean...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[""\""Le"", ""soleil"", ""va"", ""moins"", ""faire"", ""so..."


In [32]:
sentences = df_franceinter["tokens"].apply(eval).tolist()  # transforme la string en liste

In [33]:
results = []
for s in sentences[:10]:   # prends d’abord 10 exemples pour tester
    preds = predict_at_word_level(s, model, tokenizer)
    results.append(list(zip(s, preds)))


In [34]:
for r in results[:5]:
    print(r)
    

[('Le', 0), ('Barbecue', 0), ('Disney', 0), ('-', 0), ('La', 0), ('chanson', 0), ('de', 0), ('Frédéric', 1), ('Fromet', 1)]
[('Le', 0), ('Roi', 1), ('et', 0), ("l'Oiseau", 0), ('-', 0), ('La', 0), ('Chronique', 0), ('de', 0), ('Christine', 1), ('Gonzalez', 1)]
[("L'", 0), ('amour', 0), ('du', 0), ('lac', 0), ('-', 0), ('La', 0), ('chronique', 0), ("d'", 0), ('Hippolyte', 1), ('Girardot', 1)]
[('La', 0), ('fille', 0), ('de', 0), ('la', 0), ('piscine', 0), ('de', 0), ('Léa', 1), ('Tourret', 1), ('-', 0), ('La', 0), ('chronique', 0), ('de', 0), ('Juliette', 1), ('Arnaud', 1)]
[('"Le', 0), ('soleil', 0), ('va', 0), ('moins', 0), ('faire', 0), ('son', 0), ('malin', 0), ('quand', 0), ('Jean-Luc', 1), ('va', 0), ('aller', 0), ('lui', 0), ('hurler', 0), ('dessus', 0), ('"les', 0), ('températures', 0), ("c'e", 0)]


In [39]:
import ast
tokens_list = df_franceinter["tokens"].apply(ast.literal_eval).tolist()
y_true_list = df_franceinter["is_name"].apply(ast.literal_eval).tolist()

In [40]:
y_pred_list = [predict_at_word_level(x, model, tokenizer) for x in tokens_list]

In [41]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# aplatir
y_true_flat = [v for row in y_true_list for v in row]
y_pred_flat = [v for row in y_pred_list for v in row]

acc = accuracy_score(y_true_flat, y_pred_flat)
prec, rec, f1, _ = precision_recall_fscore_support(y_true_flat, y_pred_flat, average='binary', zero_division=0)

print(f"Word-level  ->  Acc: {acc:.4f} | P: {prec:.4f} | R: {rec:.4f} | F1: {f1:.4f}")


Word-level  ->  Acc: 0.9776 | P: 0.8633 | R: 0.9851 | F1: 0.9202


In [42]:
model.config.id2label = {0: "O", 1: "PER"}
model.config.label2id = {"O": 0, "PER": 1}
model.save_pretrained("ner_camembert_fr")
tokenizer.save_pretrained("ner_camembert_fr")


('ner_camembert_fr\\tokenizer_config.json',
 'ner_camembert_fr\\special_tokens_map.json',
 'ner_camembert_fr\\tokenizer.json')

In [25]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.78k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/NabilSarker/ner_camembert_fr/commit/a871cece031725b10b2edc278edcd779b90978b4', commit_message='End of training', commit_description='', oid='a871cece031725b10b2edc278edcd779b90978b4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/NabilSarker/ner_camembert_fr', endpoint='https://huggingface.co', repo_type='model', repo_id='NabilSarker/ner_camembert_fr'), pr_revision=None, pr_num=None)

In [28]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

HF_REPO = "NabilSarker/ner_camembert_fr"  

def predict(texts_split_into_words: list[list[str]]) -> list[list[int]]:
    model = AutoModelForTokenClassification.from_pretrained(HF_REPO)
    tokenizer = AutoTokenizer.from_pretrained(HF_REPO)

    all_preds = []
    for words in texts_split_into_words:
        enc = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True)
        with torch.no_grad():
            logits = model(**enc).logits.squeeze(0)
        token_preds = logits.argmax(-1).tolist()

        word_ids = enc.word_ids()
        seen, word_labels = set(), []
        for i, w in enumerate(word_ids):
            if w is None: 
                continue
            if w not in seen:                 # 1er sous-token du mot
                word_labels.append(int(token_preds[i]))  # 0/1
                seen.add(w)
        all_preds.append(word_labels)
    return all_preds


In [30]:
samples = [["La","chronique","de","Thomas","VDB"],
           ["Invité","Gaspard","Proust"]]
print(predict(samples))


[[0, 0, 0, 0, 0], [0, 0, 0]]
