# Transfer learning

Ce notebook télécharge le modèle RoBERTa-base et utilise le dataset MultiNERD English pour prédire les noms de personnes sur du texte anglais. <br/>
Il faut:
- L'adapter sur du français (modèle camembert, autre dataset)
- Essayer de freeze des layers, améliorer ses performances sur le jeu "dev" 

In [1]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import torch
import transformers

In [2]:
model_name = "roberta-base"

## MultiNERD data

Ce dataset est un text avec des catégories assez fines (dont nom de personne).<br>
Il est disponible [ici](https://github.com/Babelscape/multinerd)<br>
Prenez le dataset français<br>

In [3]:
with open("../src/data/raw/train_fr.tsv") as f:
    rows = list(line.strip().split("\t") for line in f)

rows[:10]

[['0', 'Il', 'O'],
 ['1', 'est', 'O'],
 ['2', 'incarné', 'O'],
 ['3', 'par', 'O'],
 ['4',
  'Austin',
  'B-PER',
  'bn:02525192n',
  'Q4204710',
  '7345300',
  'Austin_Stowell',
  'Austin Stowell est un acteur américain né le 24 décembre 1984 à Kensington dans le Connecticut.',
  'https://upload.wikimedia.org/wikipedia/commons/9/95/Austin_Stowell-DolphinTale.jpg'],
 ['5', 'Stowell', 'I-PER'],
 ['6', '.', 'O'],
 [''],
 ['0', 'c’', 'O'],
 ['1', 'est', 'O']]

In [4]:
def make_labelled_sentences(tagged_words):
    # Joining words until we meet a dot
    # Word's label is 1 if 'PER' is in its tag
    X = []
    y = []

    this_word = []
    this_labels = []
    for tagged_word in tagged_words:
        if len(tagged_word) < 3:
            # not a tagged word
            continue
        word = tagged_word[1]
        tag = tagged_word[2]

        if word == '.':
            X.append(this_word)
            y.append(this_labels)

            this_word = []
            this_labels = []
        else:
            this_word.append(word)
            this_labels.append(1 * tag.endswith("PER"))

    return X, y

In [5]:
sentences, labels = make_labelled_sentences(rows)

In [6]:
sentences[:10]

[['Il', 'est', 'incarné', 'par', 'Austin', 'Stowell'],
 ['c’',
  'est',
  'ainsi',
  'que',
  'des',
  'firmes',
  'comme',
  'DuPont',
  ',',
  'Dow',
  'Chemical',
  ',',
  'Monsanto',
  ',',
  'American',
  'Cyanamid',
  'lancèrent',
  'la',
  'production',
  'en',
  'masse',
  "d'",
  'engrais',
  'minéraux'],
 ['À', 'Madras', ',', 'le'],
 ['Les',
  'trois',
  'plus',
  'grandes',
  'villes',
  'sont',
  ',',
  'dans',
  'l',
  "'",
  'ordre',
  ',',
  'Vienne',
  ',',
  'Graz',
  'et',
  'Linz'],
 ['Mike',
  'Hough',
  'est',
  'le',
  'capitaine',
  'de',
  'l',
  "'",
  'équipe',
  'entraînée',
  'brièvement',
  'par',
  'Dave',
  'Chambers',
  'puis',
  'licencié',
  'par',
  'le',
  'président',
  'de',
  'l',
  "'",
  'équipe',
  ',',
  'Pierre',
  'Pagé',
  ',',
  'qui',
  'prend',
  'sa',
  'place',
  'derrière',
  'le',
  'banc'],
 ['Le',
  'réseau',
  'social',
  'inspire',
  'également',
  'quelques',
  'humoristes',
  'français',
  'tels',
  'que',
  'Gad',
  'Elmaleh',

In [7]:
labels[:10]

[[0, 0, 0, 0, 1, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
sentences_training, sentences_test, labels_training, labels_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [8]:
sentences_train, sentences_dev, labels_train, labels_dev = train_test_split(
    sentences_training,
    labels_training,
    test_size=0.2,
    random_state=42,
)

# Applying Hugging face V2

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [10]:
def tokenize_and_align_labels(sentences, ner_tags):
    tokenized_inputs = tokenizer(
        sentences,
        truncation=True,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(ner_tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [11]:
tokenized_train = tokenize_and_align_labels(sentences_train, labels_train)

In [12]:
tokenized_test = tokenize_and_align_labels(sentences_test, labels_test)

In [13]:
from datasets import Dataset

dataset_train = Dataset.from_dict(tokenized_train)
dataset_test = Dataset.from_dict(tokenized_test)

In [14]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [15]:
import numpy as np
import evaluate

seqeval = evaluate.load("seqeval")

labels = [0, 1]
label_list = ["0", "1"]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [16]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 4070 Ti'

In [17]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, TrainerCallback

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=2
)
model = model.to("cuda")

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
for name, _ in model.base_model.named_parameters():
  print(name)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key

In [19]:
for name, param in model.base_model.named_parameters():
  param.requires_grad = False

for name, param in model.base_model.named_parameters():
    if (
        any(layer_name in name for layer_name in ["layer.5"])
        and any(layer_type in name for layer_type in ["weight", "bias"])
        and "attention" not in name
    ):
        param.requires_grad = True

In [20]:
metrics_dict = {}

class StoreMetricsCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        if len(state.log_history) > 0:
            metrics_dict[f"Epoch {state.epoch}"] = state.log_history[-1]

** Validation loss **
- Only last layer learnt: 0.001911

In [21]:
# Shutting down warnings
import logging
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

In [22]:
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[StoreMetricsCallback()]
)

trainer.train()

  0%|          | 0/11236 [00:00<?, ?it/s]

{'loss': 0.0859, 'learning_rate': 1.911000355998576e-05, 'epoch': 0.09}
{'loss': 0.0174, 'learning_rate': 1.822000711997152e-05, 'epoch': 0.18}
{'loss': 0.0149, 'learning_rate': 1.733001067995728e-05, 'epoch': 0.27}
{'loss': 0.0119, 'learning_rate': 1.644001423994304e-05, 'epoch': 0.36}
{'loss': 0.0118, 'learning_rate': 1.55500177999288e-05, 'epoch': 0.44}
{'loss': 0.012, 'learning_rate': 1.4660021359914563e-05, 'epoch': 0.53}
{'loss': 0.0108, 'learning_rate': 1.3770024919900323e-05, 'epoch': 0.62}
{'loss': 0.0113, 'learning_rate': 1.2880028479886081e-05, 'epoch': 0.71}
{'loss': 0.0103, 'learning_rate': 1.1990032039871841e-05, 'epoch': 0.8}
{'loss': 0.0103, 'learning_rate': 1.1100035599857601e-05, 'epoch': 0.89}
{'loss': 0.0106, 'learning_rate': 1.0210039159843361e-05, 'epoch': 0.98}


  0%|          | 0/1756 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'eval_loss': 0.007907397113740444, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9975355585864244, 'eval_runtime': 49.6623, 'eval_samples_per_second': 565.58, 'eval_steps_per_second': 35.359, 'epoch': 1.0}
{'loss': 0.0099, 'learning_rate': 9.320042719829121e-06, 'epoch': 1.07}
{'loss': 0.0085, 'learning_rate': 8.430046279814883e-06, 'epoch': 1.16}
{'loss': 0.0098, 'learning_rate': 7.540049839800642e-06, 'epoch': 1.25}
{'loss': 0.0086, 'learning_rate': 6.650053399786402e-06, 'epoch': 1.33}
{'loss': 0.009, 'learning_rate': 5.7600569597721615e-06, 'epoch': 1.42}
{'loss': 0.0083, 'learning_rate': 4.8700605197579215e-06, 'epoch': 1.51}
{'loss': 0.009, 'learning_rate': 3.9800640797436814e-06, 'epoch': 1.6}
{'loss': 0.0089, 'learning_rate': 3.0900676397294414e-06, 'epoch': 1.69}
{'loss': 0.0094, 'learning_rate': 2.2000711997152013e-06, 'epoch': 1.78}
{'loss': 0.0083, 'learning_rate': 1.3100747597009615e-06, 'epoch': 1.87}
{'loss': 0.0088, 'learning_rate': 4.20

  0%|          | 0/1756 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'eval_loss': 0.007096001412719488, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.997643634769478, 'eval_runtime': 49.2841, 'eval_samples_per_second': 569.921, 'eval_steps_per_second': 35.63, 'epoch': 2.0}
{'train_runtime': 458.4717, 'train_samples_per_second': 392.077, 'train_steps_per_second': 24.508, 'train_loss': 0.013802536593165928, 'epoch': 2.0}


TrainOutput(global_step=11236, training_loss=0.013802536593165928, metrics={'train_runtime': 458.4717, 'train_samples_per_second': 392.077, 'train_steps_per_second': 24.508, 'train_loss': 0.013802536593165928, 'epoch': 2.0})

In [26]:
trainer.evaluate()

  0%|          | 0/1756 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'eval_loss': 0.007096001412719488,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_accuracy': 0.997643634769478,
 'eval_runtime': 49.3066,
 'eval_samples_per_second': 569.66,
 'eval_steps_per_second': 35.614,
 'epoch': 2.0}

# Applying Hugging face V2