In [85]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import torch
import transformers
import pandas as pd
import re
import ast
import warnings

warnings.filterwarnings("ignore", category=RuntimeWarning)

In [86]:
np.random.seed(42)

In [87]:
model_name = "Jean-Baptiste/camembert-ner"

In [88]:
df = pd.read_csv("../src/data/raw/train.csv", sep=';')

In [89]:
df = df.drop(columns=['is_comic', 'comic_name', 'tokens'])

In [90]:
df.head()

Unnamed: 0.1,Unnamed: 0,video_name,is_name
0,0,Le Barbecue Disney - La chanson de Frédéric Fr...,"[0, 0, 0, 0, 0, 0, 0, 1, 1]"
1,1,Le Roi et l'Oiseau - La Chronique de Christine...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]"
2,2,L'amour du lac - La chronique d'Hippolyte Gira...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1]"
3,3,La fille de la piscine de Léa Tourret - La chr...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]"
4,4,"""Le soleil va moins faire son malin quand Jean...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [91]:
def make_labelled_sentences(df):
    sentences = []
    labels = []
    df['is_name'] = df['is_name'].apply(lambda x: ast.literal_eval(x))
    for i, row in df.iterrows():
        sentence = row['video_name']
        label = row['is_name']
        sentence = [word for word in re.split(r"[ ']", sentence)]
        label = [label for label in label]
            
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

In [92]:
sentences, labels = make_labelled_sentences(df)

In [93]:
# Delete line with sentence and labels with desequal length
def delete_line_with_desequal_length(sentences, labels):
    for i in reversed(range(len(sentences))):
        if len(sentences[i]) != len(labels[i]):
            del sentences[i]
            del labels[i]
    return sentences, labels

In [94]:
sentences, labels = delete_line_with_desequal_length(sentences, labels)

In [95]:
from sklearn.model_selection import train_test_split

In [96]:
sentences_training, sentences_test, labels_training, labels_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [97]:
sentences_train, sentences_dev, labels_train, labels_dev = train_test_split(
    sentences_training,
    labels_training,
    test_size=0.2,
    random_state=42,
)

In [98]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
# model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [99]:
def tokenize_and_align_labels(sentences, ner_tags):
    tokenized_inputs = tokenizer(
        sentences,
        truncation=True,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(ner_tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [100]:
tokenized_train = tokenize_and_align_labels(sentences_train, labels_train)

In [101]:
tokenized_test = tokenize_and_align_labels(sentences_test, labels_test)

In [102]:
from datasets import Dataset

dataset_train = Dataset.from_dict(tokenized_train)
dataset_test = Dataset.from_dict(tokenized_test)

In [103]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [104]:
import numpy as np
import evaluate

seqeval = evaluate.load("seqeval")

labels = [0, 1]
label_list = ["0", "1"]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [105]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=2, ignore_mismatched_sizes=True
)

torch.cuda.get_device_name(0)

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at Jean-Baptiste/camembert-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'NVIDIA GeForce RTX 4070 Ti'

In [106]:
model = model.to("cuda")

for name, param in model.base_model.named_parameters():
  param.requires_grad = False

for name, param in model.base_model.named_parameters():
    if (
        any(layer_name in name for layer_name in ["layer.0", "layer.1", "layer.2", "layer.3", "layer.4"])
        and any(layer_type in name for layer_type in ["weight", "bias"])
        and "attention" not in name
    ):
        param.requires_grad = True

In [107]:
from transformers import TrainingArguments, Trainer, TrainerCallback

metrics_dict = {}

class StoreMetricsCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        if len(state.log_history) > 0:
            metrics_dict[f"Epoch {state.epoch}"] = state.log_history[-1]

In [108]:
# Shutting down warnings
import logging
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

In [109]:
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[StoreMetricsCallback()]
)

trainer.train()

  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.12139977514743805, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9620978404583517, 'eval_runtime': 0.25, 'eval_samples_per_second': 744.095, 'eval_steps_per_second': 48.006, 'epoch': 1.0}


  0%|          | 0/12 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.10785740613937378, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9620978404583517, 'eval_runtime': 0.2616, 'eval_samples_per_second': 711.045, 'eval_steps_per_second': 45.874, 'epoch': 2.0}
{'train_runtime': 7.421, 'train_samples_per_second': 159.817, 'train_steps_per_second': 10.241, 'train_loss': 0.1875760931717722, 'epoch': 2.0}


TrainOutput(global_step=76, training_loss=0.1875760931717722, metrics={'train_runtime': 7.421, 'train_samples_per_second': 159.817, 'train_steps_per_second': 10.241, 'train_loss': 0.1875760931717722, 'epoch': 2.0})

In [110]:
trainer.evaluate()

  0%|          | 0/12 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.10785740613937378,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_accuracy': 0.9620978404583517,
 'eval_runtime': 0.2301,
 'eval_samples_per_second': 808.352,
 'eval_steps_per_second': 52.152,
 'epoch': 2.0}

In [111]:
trainer.save_model("model_finetuned")