In [None]:
!pip install torch matplotlib numpy scipy scikit-learn transformers datasets evaluate codecarbon

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting codecarbon
  Downloading codecarbon-2.8.3-py3-none-any.whl.metadata (8.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.me

In [None]:
import torch
import pandas as pd
from datasets import load_dataset, DatasetDict
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import DatasetDict
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainerCallback
import time
import matplotlib.pyplot as plt
from codecarbon import EmissionsTracker
from transformers import AutoTokenizer, DistilBertForSequenceClassification

In [None]:
# Inspecte les étiquettes du dataset
from datasets import load_dataset

# Charger le dataset
dataset = load_dataset("hate_speech18")

# Vérifie les étiquettes dans le dataset d'entraînement
print(dataset['train'].features['label'])


ClassLabel(names=['noHate', 'hate', 'idk/skip', 'relation'], id=None)


In [None]:
# Séparer en train et test
split_datasets = dataset["train"].train_test_split(test_size=0.2)

datasets = DatasetDict({
    "train": split_datasets["train"],
    "test": split_datasets["test"],
})

# Préparer le tokenizer
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Appliquer la tokenization
tokenized_datasets = datasets.map(tokenize_function, batched=True)

# Charger le modèle pré-entraîné sans fine-tuning
base_model = DistilBertForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=4)

Map:   0%|          | 0/8755 [00:00<?, ? examples/s]

Map:   0%|          | 0/2189 [00:00<?, ? examples/s]

You are using a model of type modernbert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.word_embeddings.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distil

**FINETUNING AVEC LoRa**

In [None]:
# Récupérer les noms des classes directement à partir du dataset
class_names = dataset['train'].features['label'].names

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average=None, zero_division=1)

    # Calcul des métriques globales (moyenne des classes)
    precision_global = precision.mean()
    recall_global = recall.mean()
    f1_macro = f1.mean()  # Ajout du F1-score macro

    metrics = {
        "accuracy": accuracy,
        "precision_global": precision_global,
        "recall_global": recall_global,
        "f1_macro": f1_macro
    }

    # Ajouter les métriques par classe avec les noms explicites
    for i, (p, r, f) in enumerate(zip(precision, recall, f1)):
        metrics[f"precision_{class_names[i]}"] = p
        metrics[f"recall_{class_names[i]}"] = r
        metrics[f"f1_{class_names[i]}"] = f

    return metrics

# Configurer LoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Classification de séquence
    r=8,  # Rang de la décomposition
    lora_alpha=16,  # Facteur d'adaptation
    lora_dropout=0.1,  # Dropout pour LoRA
    target_modules=["q_lin", "v_lin"]  # Modules spécifiques à LoRA dans les transformers
)

# Appliquer LoRA au modèle
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# Configurer l'entraînement avec fine-tuning LoRA
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Évaluation à chaque époque
    save_strategy="epoch",  # Sauvegarde à chaque époque
    logging_strategy="steps",  # Log les métriques à chaque `logging_steps`
    logging_steps=10,  # Enregistre la loss toutes les 10 étapes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="wandb",
    overwrite_output_dir=True,
    disable_tqdm=False
)

small_train_dataset = tokenized_datasets["train"]
small_test_dataset = tokenized_datasets["test"]

# Initialisation du Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,  # Dataset d'entraînement
    eval_dataset=small_test_dataset,  # Dataset de test
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

tracker = EmissionsTracker()
tracker.start()

train_result = trainer.train()

emissions = tracker.stop()


  trainer = Trainer(


trainable params: 1,134,340 || all params: 202,636,808 || trainable%: 0.5598


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[codecarbon INFO @ 09:07:48] [setup] RAM Tracking...
[codecarbon INFO @ 09:07:48] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 09:07:49] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 09:07:49] [setup] GPU Tracking...
[codecarbon INFO @ 09:07:49] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 09:07:49] >>> Tracker's metadata:
[codecarbon INFO @ 09:07:49]   Platform system: Linux-6.1.85+-x86_64-with-glibc2.35
[codecarbon INFO @ 09:07:49]   Python version: 3.11.12
[codecarbon INFO @ 09:07:49]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 09:07:49]   Available RAM : 12.675 GB
[c

Epoch,Training Loss,Validation Loss,Accuracy,Precision Global,Recall Global,F1 Macro,Precision Nohate,Recall Nohate,F1 Nohate,Precision Hate,Recall Hate,F1 Hate,Precision Idk/skip,Recall Idk/skip,F1 Idk/skip,Precision Relation,Recall Relation,F1 Relation
1,0.6652,0.458668,0.859753,0.964938,0.25,0.231147,0.859753,1.0,0.924589,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.4998,0.449403,0.859753,0.798452,0.250844,0.233125,0.860476,0.999469,0.924779,0.333333,0.003906,0.007722,1.0,0.0,0.0,1.0,0.0,0.0
3,0.421,0.47837,0.859753,0.964938,0.25,0.231147,0.859753,1.0,0.924589,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


[codecarbon INFO @ 09:08:04] Energy consumed for RAM : 0.000020 kWh. RAM Power : 4.753036022186279 W
[codecarbon INFO @ 09:08:04] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 09:08:04] Energy consumed for all GPUs : 0.000257 kWh. Total GPU Power : 61.642852282613916 W
[codecarbon INFO @ 09:08:04] 0.000454 kWh of electricity used since the beginning.
[codecarbon INFO @ 09:08:19] Energy consumed for RAM : 0.000040 kWh. RAM Power : 4.753036022186279 W
[codecarbon INFO @ 09:08:19] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 09:08:19] Energy consumed for all GPUs : 0.000546 kWh. Total GPU Power : 69.4313000927635 W
[codecarbon INFO @ 09:08:19] 0.000940 kWh of electricity used since the beginning.
[codecarbon INFO @ 09:08:34] Energy consumed for RAM : 0.000059 kWh. RAM Power : 4.753036022186279 W
[codecarbon INFO @ 09:08:34] Energy consumed for all CPUs : 0.000531 kWh. Total CPU Power : 42.5 W
[codecarbo

**FINETUNING SANS LoRa**

In [None]:
#SANS LORA
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments
from codecarbon import EmissionsTracker
import torch

# Récupérer les noms des classes depuis le dataset
class_names = dataset['train'].features['label'].names

# Fonction pour calculer les métriques
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average=None, zero_division=1
    )

    precision_global = precision.mean()
    recall_global = recall.mean()
    f1_macro = f1.mean()

    metrics = {
        "accuracy": accuracy,
        "precision_global": precision_global,
        "recall_global": recall_global,
        "f1_macro": f1_macro
    }

    for i, (p, r, f) in enumerate(zip(precision, recall, f1)):
        metrics[f"precision_{class_names[i]}"] = p
        metrics[f"recall_{class_names[i]}"] = r
        metrics[f"f1_{class_names[i]}"] = f

    return metrics

# Configuration de l'entraînement (fine-tuning complet, sans LoRA)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="wandb",  # ou "none" si tu ne veux pas utiliser wandb
    overwrite_output_dir=True,
    disable_tqdm=False
)

# Datasets
small_train_dataset = tokenized_datasets["train"]
small_test_dataset = tokenized_datasets["test"]

# Création du Trainer avec le modèle de base (fine-tuning complet)
trainer = Trainer(
    model=base_model,  # Pas de modèle modifié par LoRA
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Suivi des émissions
tracker = EmissionsTracker()
tracker.start()

# Entraînement
train_result = trainer.train()

# Fin du suivi
emissions = tracker.stop()


In [None]:
!zip -r ModernBERT_hatespeech_lora.zip ./

  adding: .config/ (stored 0%)
  adding: .config/active_config (stored 0%)
  adding: .config/configurations/ (stored 0%)
  adding: .config/configurations/config_default (deflated 15%)
  adding: .config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db (deflated 97%)
  adding: .config/default_configs.db (deflated 98%)
  adding: .config/.last_survey_prompt.yaml (stored 0%)
  adding: .config/.last_opt_in_prompt.yaml (stored 0%)
  adding: .config/logs/ (stored 0%)
  adding: .config/logs/2025.04.09/ (stored 0%)
  adding: .config/logs/2025.04.09/13.38.09.438379.log (deflated 56%)
  adding: .config/logs/2025.04.09/13.37.59.170421.log (deflated 86%)
  adding: .config/logs/2025.04.09/13.38.00.354311.log (deflated 58%)
  adding: .config/logs/2025.04.09/13.37.30.549742.log (deflated 93%)
  adding: .config/logs/2025.04.09/13.38.08.751788.log (deflated 57%)
  adding: .config/logs/2025.04.09/13.37.51.026068.log (deflated 58%)
  adding: .config/config_sentinel (stored 0%)
  adding: .conf