In [1]:
!pip install torch
!pip install pandas
!pip install datasets
!pip install transformers
!pip install scikit-learn
!pip install peft
!pip install codecarbon
!pip install matplotlib

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [1]:
import torch
import pandas as pd
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import DatasetDict
from peft import get_peft_model, LoraConfig, TaskType
from transformers import TrainerCallback
import time
import matplotlib.pyplot as plt
from codecarbon import EmissionsTracker

In [2]:
# Inspecte les étiquettes du dataset
from datasets import load_dataset

# Charger le dataset
dataset = load_dataset("imdb")

# Vérifie les étiquettes dans le dataset d'entraînement
print(dataset['train'].features['label'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


ClassLabel(names=['neg', 'pos'], id=None)


In [3]:
# Séparer en train et test
split_datasets = dataset["train"].train_test_split(test_size=0.2)

datasets = DatasetDict({
    "train": split_datasets["train"],
    "test": split_datasets["test"],
})

# Préparer le tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Appliquer la tokenization
tokenized_datasets = datasets.map(tokenize_function, batched=True)

# Charger le modèle pré-entraîné sans fine-tuning
base_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Récupérer les noms des classes directement à partir du dataset
class_names = dataset['train'].features['label'].names

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average=None, zero_division=1)

    # Calcul des métriques globales (moyenne des classes)
    precision_global = precision.mean()
    recall_global = recall.mean()
    f1_macro = f1.mean()  # Ajout du F1-score macro

    metrics = {
        "accuracy": accuracy,
        "precision_global": precision_global,
        "recall_global": recall_global,
        "f1_macro": f1_macro
    }

    # Ajouter les métriques par classe avec les noms explicites
    for i, (p, r, f) in enumerate(zip(precision, recall, f1)):
        metrics[f"precision_{class_names[i]}"] = p
        metrics[f"recall_{class_names[i]}"] = r
        metrics[f"f1_{class_names[i]}"] = f

    return metrics

# Configurer LoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Classification de séquence
    r=8,  # Rang de la décomposition
    lora_alpha=16,  # Facteur d'adaptation
    lora_dropout=0.1,  # Dropout pour LoRA
    target_modules=["q_lin", "v_lin"]  # Modules spécifiques à LoRA dans les transformers
)

# Appliquer LoRA au modèle
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# Configurer l'entraînement avec fine-tuning LoRA
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results_lora",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.05,
    lr_scheduler_type="cosine",
    warmup_steps=1000,
    max_grad_norm=0.8,
    metric_for_best_model="f1_macro",
    seed=123,
    fp16=True,
    report_to="none",
    overwrite_output_dir=True,
    disable_tqdm=False,
    logging_first_step=True,
)


small_train_dataset = tokenized_datasets["train"]
small_test_dataset = tokenized_datasets["test"]

# Initialisation du Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,  # Dataset d'entraînement
    eval_dataset=small_test_dataset,  # Dataset de test
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



tracker = EmissionsTracker()
tracker.start()

train_result = trainer.train()

emissions = tracker.stop()


  trainer = Trainer(


trainable params: 739,586 || all params: 67,694,596 || trainable%: 1.0925


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[codecarbon INFO @ 10:44:13] [setup] RAM Tracking...
[codecarbon INFO @ 10:44:13] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 10:44:14] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.30GHz
[codecarbon INFO @ 10:44:14] [setup] GPU Tracking...
[codecarbon INFO @ 10:44:14] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 10:44:14] >>> Tracker's metadata:
[codecarbon INFO @ 10:44:14]   Platform system: Linux-6.1.85+-x86_64-with-glibc2.35
[codecarbon INFO @ 10:44:14]   Python version: 3.11.11
[codecarbon INFO @ 10:44:14]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 10:44:14]   Available RAM : 12.675 GB
[c

Epoch,Training Loss,Validation Loss,Accuracy,Precision Global,Recall Global,F1 Macro,Precision Neg,Recall Neg,F1 Neg,Precision Pos,Recall Pos,F1 Pos
1,0.6669,0.636586,0.798,0.798215,0.798256,0.797999,0.812348,0.784929,0.798403,0.784082,0.811582,0.797595
2,0.3174,0.298815,0.8728,0.872753,0.872753,0.872753,0.875196,0.875196,0.875196,0.87031,0.87031,0.87031
3,0.2786,0.264309,0.8926,0.89255,0.892672,0.892581,0.899166,0.888932,0.89402,0.885933,0.896411,0.891141
4,0.2587,0.243369,0.9026,0.902561,0.902699,0.902586,0.910068,0.897567,0.903774,0.895054,0.90783,0.901397


[codecarbon INFO @ 10:44:29] Energy consumed for RAM : 0.000020 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 10:44:29] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:44:29] Energy consumed for all GPUs : 0.000272 kWh. Total GPU Power : 65.21620481121546 W
[codecarbon INFO @ 10:44:29] 0.000469 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:44:44] Energy consumed for RAM : 0.000040 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 10:44:44] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 10:44:44] Energy consumed for all GPUs : 0.000558 kWh. Total GPU Power : 68.67571407656136 W
[codecarbon INFO @ 10:44:44] 0.000952 kWh of electricity used since the beginning.
[codecarbon INFO @ 10:44:59] Energy consumed for RAM : 0.000059 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 10:44:59] Energy consumed for all CPUs : 0.000531 kWh. Total CPU Power : 42.5 W
[codeca

In [4]:
import torch
import pandas as pd
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from codecarbon import EmissionsTracker

# Charger le modèle de base
base_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

# ✅ CHOIX : Sélectionner les paramètres à entraîner
freeze_mode = "last_layers"  # "head", "last_layers", "embeddings", "none"

if freeze_mode == "head":
    for param in base_model.distilbert.parameters():
        param.requires_grad = False  # Gèle tout sauf la tête

elif freeze_mode == "last_layers":
    for param in base_model.distilbert.parameters():
        param.requires_grad = False  # Gèle tout
    for layer in base_model.distilbert.transformer.layer[-2:]:  # Dégeler les 2 dernières couches
        for param in layer.parameters():
            param.requires_grad = True

elif freeze_mode == "embeddings":
    for param in base_model.distilbert.embeddings.parameters():
        param.requires_grad = False  # Gèle uniquement les embeddings

elif freeze_mode == "none":
    pass  # Fine-tune complet

# Vérifier les paramètres entraînables
for name, param in base_model.named_parameters():
    print(f"{name}: requires_grad = {param.requires_grad}")

# Fonction de calcul des métriques
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average=None, zero_division=1)

    return {
        "accuracy": accuracy,
        "precision_global": precision.mean(),
        "recall_global": recall.mean(),
        "f1_macro": f1.mean()
    }

# Arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./results_sans_lora",
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_strategy="steps",
    logging_steps=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.05,
    lr_scheduler_type="cosine",
    warmup_steps=1000,
    max_grad_norm=0.8,
    metric_for_best_model="f1_macro",
    seed=123,
    fp16=True,
    report_to="none",
    overwrite_output_dir=True,
    disable_tqdm=False,
    logging_first_step=True,
)

# Initialisation du Trainer
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Suivi de l'empreinte carbone
tracker = EmissionsTracker()
tracker.start()

# Entraînement
train_result = trainer.train()

# Fin du suivi carbone
emissions = tracker.stop()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


distilbert.embeddings.word_embeddings.weight: requires_grad = False
distilbert.embeddings.position_embeddings.weight: requires_grad = False
distilbert.embeddings.LayerNorm.weight: requires_grad = False
distilbert.embeddings.LayerNorm.bias: requires_grad = False
distilbert.transformer.layer.0.attention.q_lin.weight: requires_grad = False
distilbert.transformer.layer.0.attention.q_lin.bias: requires_grad = False
distilbert.transformer.layer.0.attention.k_lin.weight: requires_grad = False
distilbert.transformer.layer.0.attention.k_lin.bias: requires_grad = False
distilbert.transformer.layer.0.attention.v_lin.weight: requires_grad = False
distilbert.transformer.layer.0.attention.v_lin.bias: requires_grad = False
distilbert.transformer.layer.0.attention.out_lin.weight: requires_grad = False
distilbert.transformer.layer.0.attention.out_lin.bias: requires_grad = False
distilbert.transformer.layer.0.sa_layer_norm.weight: requires_grad = False
distilbert.transformer.layer.0.sa_layer_norm.bias: 

[codecarbon INFO @ 11:13:38] [setup] RAM Tracking...
[codecarbon INFO @ 11:13:38] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 11:13:39] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.30GHz
[codecarbon INFO @ 11:13:39] [setup] GPU Tracking...
[codecarbon INFO @ 11:13:39] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 11:13:39] >>> Tracker's metadata:
[codecarbon INFO @ 11:13:39]   Platform system: Linux-6.1.85+-x86_64-with-glibc2.35
[codecarbon INFO @ 11:13:39]   Python version: 3.11.11
[codecarbon INFO @ 11:13:39]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 11:13:39]   Available RAM : 12.675 GB
[codecarbon INFO @ 11:13:39]   CPU count: 2
[codecarbon INFO @ 11:13:39]   CPU model: Intel(R) Xeon(R) CPU @ 2.30GHz
[codecarbon INFO @ 11:13:39]   GPU count: 1
[codecarbon INFO @ 11:13:39]   GPU model: 1 x Tesla T4
[codecarbon INFO @ 11:13:39] Saving emissions data to file

Epoch,Training Loss,Validation Loss,Accuracy,Precision Global,Recall Global,F1 Macro
1,0.3711,0.278615,0.8912,0.892105,0.891492,0.891176
2,0.2467,0.226337,0.9112,0.911554,0.91104,0.911147
3,0.2183,0.224378,0.9112,0.913383,0.911638,0.911134
4,0.1777,0.199595,0.923,0.923029,0.922955,0.922984


[codecarbon INFO @ 11:13:54] Energy consumed for RAM : 0.000020 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 11:13:54] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 11:13:54] Energy consumed for all GPUs : 0.000272 kWh. Total GPU Power : 65.28084048730274 W
[codecarbon INFO @ 11:13:54] 0.000469 kWh of electricity used since the beginning.
[codecarbon INFO @ 11:14:09] Energy consumed for RAM : 0.000040 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 11:14:09] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 11:14:09] Energy consumed for all GPUs : 0.000556 kWh. Total GPU Power : 67.99930012372101 W
[codecarbon INFO @ 11:14:09] 0.000949 kWh of electricity used since the beginning.
[codecarbon INFO @ 11:14:24] Energy consumed for RAM : 0.000059 kWh. RAM Power : 4.7530388832092285 W
[codecarbon INFO @ 11:14:24] Energy consumed for all CPUs : 0.000531 kWh. Total CPU Power : 42.5 W
[codeca

In [6]:
import pandas as pd

def generate_metrics_report(trainer, final_metrics, training_args, emissions, tokenizer=None, lora_config=None):
    """
    Génère un rapport compact avec les métriques clés.
    Affiche automatiquement les infos liées à LoRA si `lora_config` est fourni.
    """
    def safe_format(value):
        return f"{value:.4f}" if isinstance(value, (int, float)) else str(value)

    history = pd.DataFrame(trainer.state.log_history)

    # Calcul du temps moyen par époque
    if 'epoch' in history.columns and 'train_runtime' in history.columns:
        epoch_times = history[history['epoch'].notna()].groupby('epoch')['train_runtime'].mean()
        mean_epoch_time = epoch_times.mean() if not epoch_times.empty else "N/A"
    else:
        mean_epoch_time = "N/A"

    # Section LoRA (optionnelle)
    lora_section = (
        f"- **LoRA Config** : r={lora_config.r}, alpha={lora_config.lora_alpha}, dropout={lora_config.lora_dropout}"
        if lora_config else "- **LoRA** : non utilisé"
    )

    # Choix du nom de fichier / courbe
    suffix = "_lora" if lora_config else "_sans_lora"
    report_path = f"./results{suffix}/training_report{suffix}.txt"
    learning_curve_path = f"./results{suffix}/learning_curves{suffix}.png"

    report = f"""
# 📊 Rapport d'entraînement et d'évaluation

## 🔍 **Résultats**
- **Accuracy** : {safe_format(final_metrics.get('eval_accuracy', 'N/A'))}
- **F1-score (macro)** : {safe_format(final_metrics.get('eval_f1_macro', 'N/A'))}
- **Precision globale** : {safe_format(final_metrics.get('eval_precision_global', 'N/A'))}
- **Recall global** : {safe_format(final_metrics.get('eval_recall_global', 'N/A'))}
- **Loss finale** : {safe_format(final_metrics.get('eval_loss', 'N/A'))}

## ⚙️ **Hyperparamètres**
- **Epochs** : {training_args.num_train_epochs}
- **Batch Size (train / eval)** : {training_args.per_device_train_batch_size} / {training_args.per_device_eval_batch_size}
- **Gradient Accumulation Steps** : {training_args.gradient_accumulation_steps}
- **Learning Rate** : {training_args.learning_rate}
- **Weight Decay** : {training_args.weight_decay}
- **Warmup Steps** : {training_args.warmup_steps}
- **Scheduler** : {training_args.lr_scheduler_type}
- **Max Grad Norm** : {training_args.max_grad_norm}
- **Seed** : {training_args.seed}
{lora_section}

## 🧠 **Tokenizer**
- **Tokenizer utilisé** : {getattr(tokenizer, 'name_or_path', 'Non spécifié') if tokenizer else 'Non fourni'}

## ⏱ **Temps d'entraînement**
- **Temps moyen par époque** : {safe_format(mean_epoch_time)} sec

## 🌱 **Empreinte carbone**
- **CO₂ estimé** : {safe_format(emissions)} kg

"""
    with open(report_path, "w") as f:
        f.write(report)
    print(report)


In [7]:
final_metrics = trainer.evaluate()
# generate_metrics_report(trainer, final_metrics, training_args, emissions, tokenizer, lora_config)
generate_metrics_report(trainer, final_metrics, training_args, emissions, tokenizer)
print(final_metrics)


# 📊 Rapport d'entraînement et d'évaluation

## 🔍 **Résultats**
- **Accuracy** : 0.9230
- **F1-score (macro)** : 0.9230
- **Precision globale** : 0.9230
- **Recall global** : 0.9230
- **Loss finale** : 0.1996

## ⚙️ **Hyperparamètres**
- **Epochs** : 5
- **Batch Size (train / eval)** : 32 / 32
- **Gradient Accumulation Steps** : 2
- **Learning Rate** : 3e-05
- **Weight Decay** : 0.05
- **Warmup Steps** : 1000
- **Scheduler** : SchedulerType.COSINE
- **Max Grad Norm** : 0.8
- **Seed** : 123
- **LoRA** : non utilisé

## 🧠 **Tokenizer**
- **Tokenizer utilisé** : distilbert-base-uncased

## ⏱ **Temps d'entraînement**
- **Temps moyen par époque** : 733.0113 sec

## 🌱 **Empreinte carbone**
- **CO₂ estimé** : 0.0067 kg


{'eval_loss': 0.19959479570388794, 'eval_accuracy': 0.923, 'eval_precision_global': 0.9230289574498597, 'eval_recall_global': 0.9229548328023018, 'eval_f1_macro': 0.9229835831805908, 'eval_runtime': 24.1738, 'eval_samples_per_second': 206.835, 'eval_steps_per_second': 6.495, 

In [10]:
!zip -r mon_dossier2.zip ./

  adding: .config/ (stored 0%)
  adding: .config/config_sentinel (stored 0%)
  adding: .config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db (deflated 97%)
  adding: .config/configurations/ (stored 0%)
  adding: .config/configurations/config_default (deflated 15%)
  adding: .config/gce (stored 0%)
  adding: .config/.last_survey_prompt.yaml (stored 0%)
  adding: .config/active_config (stored 0%)
  adding: .config/default_configs.db (deflated 98%)
  adding: .config/.last_update_check.json (deflated 23%)
  adding: .config/logs/ (stored 0%)
  adding: .config/logs/2025.04.07/ (stored 0%)
  adding: .config/logs/2025.04.07/13.42.43.503346.log (deflated 57%)
  adding: .config/logs/2025.04.07/13.42.02.070730.log (deflated 92%)
  adding: .config/logs/2025.04.07/13.42.32.824024.log (deflated 86%)
  adding: .config/logs/2025.04.07/13.42.24.146310.log (deflated 58%)
  adding: .config/logs/2025.04.07/13.42.34.302585.log (deflated 58%)
  adding: .config/logs/2025.04.07/13.42.44.25107

In [12]:
from google.colab import files
files.download("mon_dossier2.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>