In [None]:
!pip install torch
!pip install pandas
!pip install datasets
!pip install transformers
!pip install scikit-learn
!pip install peft
!pip install codecarbon
!pip install matplotlib

In [2]:
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import torch
from codecarbon import EmissionsTracker
from datasets import ClassLabel, Dataset, Features, Value, load_dataset
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

In [None]:
!wget https://perso.univ-lemans.fr/~ndugue/ohsumed-all.zip
!unzip ohsumed-all.zip

In [5]:
# Build a Hugging Face Dataset from the OHSUMED folder tree:
# one sub-directory per category, one plain-text file per document.
labels = []
ids = []
paths = []
fichiers = []

# Resolved against the working directory, i.e. the place where the archive was
# unzipped (this is /content/ohsumed-all on Colab).
ohsumed_dir = os.path.abspath("ohsumed-all")

# Directory listings are sorted so that the row order (and therefore the seeded
# split below) is the same on every run and every file system.
for category in sorted(os.listdir(ohsumed_dir)):
    cat_dir = os.path.join(ohsumed_dir, category)
    # Ignore stray files and notebook checkpoint folders at the top level.
    if not os.path.isdir(cat_dir) or ".ipynb" in category:
        continue
    for fichier in sorted(os.listdir(cat_dir)):
        if ".ipynb" in fichier:
            continue
        fichier_path = os.path.join(cat_dir, fichier)
        with open(fichier_path, encoding="utf-8") as c_file:
            fichiers.append(c_file.read())
        labels.append(category)
        paths.append(fichier_path)
        ids.append(fichier)

# Sorted names give a stable label-id mapping. Iterating a set of strings has a
# different order in every Python process, which would silently change which
# integer stands for which category between sessions.
label_names = sorted(set(labels))

features = Features({
    "text": Value(dtype="string"),
    # The number of classes is derived from the names instead of being hard-coded.
    "label": ClassLabel(names=label_names),
    "id": Value(dtype="string"),
    "path": Value(dtype="string"),
})
dico_data = {"text": fichiers, "label": labels, "id": ids, "path": paths}
ds = Dataset.from_dict(dico_data, features=features)

# Stratified 80/20 split; the seed makes the split reproducible.
ds_split = ds.train_test_split(test_size=0.2, stratify_by_column="label", seed=42)
train_for_ft = ds_split["train"]
test_for_ft = ds_split["test"]

In [6]:
# Sanity check: number of classes declared on the label feature of the training split.
print(train_for_ft.features["label"].num_classes)

23


In [8]:
# Prepare the tokenizer
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

# Load the classification model.
# FIXME(review): "answerdotai/ModernBERT-base" is a ModernBERT checkpoint, but it is read
# through the DistilBERT class. As the loading warnings in this cell's output report, none
# of the stored weights match, so every layer of `base_model` is randomly initialised:
# this object is NOT a pretrained model. Use AutoModelForSequenceClassification wherever
# pretrained weights are needed.
# The number of labels is read from the dataset instead of being hard-coded to 23.
base_model = DistilBertForSequenceClassification.from_pretrained(
    "answerdotai/ModernBERT-base",
    num_labels=train_for_ft.features["label"].num_classes,
)

def tokenize_and_chunk(doc):
    """Tokenize a batch of documents, truncating each one to at most 512 tokens.

    Despite its name, the function does not split documents into chunks: text beyond
    512 tokens is dropped. No padding is added here; the Trainer cells pass the
    tokenizer to `Trainer`, which then pads each batch to its longest sequence, so
    short documents are not inflated to 512 tokens.

    Args:
        doc: batch from `Dataset.map(batched=True)`; only its "text" column is read.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(doc["text"], truncation=True, max_length=512)

# Never start more worker processes than there are CPU cores (at most 8, as before).
n_workers = min(8, os.cpu_count() or 1)

tokenized_train_for_ft = train_for_ft.map(tokenize_and_chunk, batched=True, num_proc=n_workers)
tokenized_test_for_ft = test_for_ft.map(tokenize_and_chunk, batched=True, num_proc=n_workers)

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

You are using a model of type modernbert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.word_embeddings.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin2.bias', 'distilbert.transforme

Map (num_proc=8):   0%|          | 0/45587 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/11397 [00:00<?, ? examples/s]

## Fine-tuning avec LoRA

In [None]:
# Class names in label-id order, read directly from the dataset's ClassLabel feature
class_names = tokenized_train_for_ft.features['label'].names

def compute_metrics(eval_pred):
    """Compute global and per-class classification metrics for the Trainer.

    Args:
        eval_pred: pair ``(logits, labels)`` as handed over by `Trainer` during
            evaluation. The predicted class is the argmax over the last axis of
            ``logits``.

    Returns:
        dict with "accuracy", "precision_global", "recall_global" and "f1_macro"
        (unweighted means over the classes that occur in the labels or the
        predictions), plus "precision_<name>", "recall_<name>" and "f1_<name>"
        for each of those classes.
    """
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1).numpy()

    accuracy = accuracy_score(labels, predictions)
    # NOTE: zero_division=1 scores a class that is never predicted with precision 1.0.
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average=None, zero_division=1)

    metrics = {
        "accuracy": accuracy,
        # Global metrics: unweighted mean over classes
        "precision_global": precision.mean(),
        "recall_global": recall.mean(),
        "f1_macro": f1.mean(),
    }

    # With average=None scikit-learn returns one entry per class id that occurs in the
    # labels or the predictions, in ascending id order. Entries are therefore mapped to
    # names through those ids: using the array position would attach the wrong class
    # name as soon as one class id is missing from both.
    present_ids = sorted({int(i) for i in labels} | {int(i) for i in predictions})
    for class_id, p, r, f in zip(present_ids, precision, recall, f1):
        metrics[f"precision_{class_names[class_id]}"] = float(p)
        metrics[f"recall_{class_names[class_id]}"] = float(r)
        metrics[f"f1_{class_names[class_id]}"] = float(f)

    return metrics

# Give the LoRA run its own, properly pretrained model instance.
# - get_peft_model() modifies the model it receives, so wrapping a dedicated instance
#   leaves every other cell's model object untouched.
# - The Auto class selects the architecture from the checkpoint's own config, so the
#   pretrained weights are actually loaded. The checkpoint is the one the tokenizer came
#   from, which keeps vocabulary and model consistent.
lora_base_model = AutoModelForSequenceClassification.from_pretrained(
    tokenizer.name_or_path,
    num_labels=tokenized_train_for_ft.features["label"].num_classes,
)

def _lora_target_modules(model):
    """Return the attention projection module names that exist in `model`.

    Architectures name these layers differently (separate `q_lin`/`v_lin` or
    `query`/`value` projections, or a fused `Wqkv`), so the first candidate group that
    is fully present in the model is used.

    Raises:
        ValueError: if none of the known naming schemes is found.
    """
    leaf_names = {name.rsplit(".", 1)[-1] for name, _ in model.named_modules()}
    for candidates in (["q_lin", "v_lin"], ["Wqkv"], ["query", "value"]):
        if all(candidate in leaf_names for candidate in candidates):
            return candidates
    raise ValueError("No known attention projection modules found; set target_modules explicitly.")

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification
    r=8,  # Rank of the low-rank decomposition
    lora_alpha=16,  # Scaling factor of the adapter update
    lora_dropout=0.1,  # Dropout applied inside the LoRA layers
    target_modules=_lora_target_modules(lora_base_model),  # Layers that receive an adapter
)

# Apply LoRA to the model
model = get_peft_model(lora_base_model, lora_config)
model.print_trainable_parameters()

# Training configuration for the LoRA fine-tuning run
training_args = TrainingArguments(
    output_dir="./results_lora",
    eval_strategy="epoch",  # current name of the former `evaluation_strategy` argument
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    overwrite_output_dir=True,
    disable_tqdm=False,
    fp16=True,
    seed=42,
    report_to="wandb"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_for_ft,  # Training split
    eval_dataset=tokenized_test_for_ft,  # Held-out split, evaluated after every epoch
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    compute_metrics=compute_metrics,
)

# Track the carbon footprint of the training run. The tracker is stopped in `finally`
# so it does not keep running in the background when training raises or is interrupted.
tracker = EmissionsTracker()
tracker.start()
try:
    train_result = trainer.train()
finally:
    emissions = tracker.stop()


## Fine-tuning sans LoRA

In [None]:
import torch
import pandas as pd
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from codecarbon import EmissionsTracker

# ✅ CHOICE: which parameters to train
freeze_mode = "last_layers"  # "head", "last_layers", "embeddings", "none"

# Start from a freshly loaded pretrained checkpoint every time this cell runs, so that:
# - the run never inherits adapters, frozen flags or trained weights left on a model
#   object by an earlier cell;
# - changing `freeze_mode` and re-running starts from a clean, fully trainable model;
# - the architecture matches the checkpoint (the Auto class reads it from the
#   checkpoint's config) and the vocabulary matches the tokenizer.
base_model = AutoModelForSequenceClassification.from_pretrained(
    tokenizer.name_or_path,
    num_labels=tokenized_train_for_ft.features["label"].num_classes,
)

# Encoder without the classification head, independent of the attribute name the
# architecture uses for it.
backbone = base_model.base_model

def _encoder_layers(encoder):
    """Return the list of transformer blocks of `encoder`.

    Tries the attribute paths `layers`, `transformer.layer` and `encoder.layer`
    in that order.

    Raises:
        AttributeError: if none of these paths exists on `encoder`.
    """
    for path in ("layers", "transformer.layer", "encoder.layer"):
        module = encoder
        for attr in path.split("."):
            module = getattr(module, attr, None)
            if module is None:
                break
        else:
            return module
    raise AttributeError("Could not locate the transformer layers of the encoder.")

if freeze_mode == "head":
    backbone.requires_grad_(False)  # Freeze everything except the head

elif freeze_mode == "last_layers":
    backbone.requires_grad_(False)  # Freeze the whole encoder...
    for layer in _encoder_layers(backbone)[-2:]:  # ...then unfreeze its last 2 layers
        layer.requires_grad_(True)

elif freeze_mode == "embeddings":
    backbone.embeddings.requires_grad_(False)  # Freeze the embeddings only

elif freeze_mode == "none":
    pass  # Full fine-tuning

else:
    # A typo must not silently fall back to full fine-tuning.
    raise ValueError(f"Unknown freeze_mode: {freeze_mode!r}")

# Check which parameters will be trained (summary first, then the trainable names)
n_total = sum(param.numel() for param in base_model.parameters())
n_trainable = sum(param.numel() for param in base_model.parameters() if param.requires_grad)
print(f"freeze_mode = {freeze_mode!r}: {n_trainable:,} / {n_total:,} trainable parameters")
for name, param in base_model.named_parameters():
    if param.requires_grad:
        print(f"{name}: requires_grad = {param.requires_grad}")

# Metric computation
def compute_metrics(eval_pred):
    """Return accuracy and class-averaged precision, recall and F1 for the Trainer.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the argmax over
            the last axis of the logits.

    Returns:
        dict with the keys "accuracy", "precision_global", "recall_global" and
        "f1_macro"; the last three are unweighted means of the per-class scores.
    """
    raw_scores, gold = eval_pred
    predicted = torch.tensor(raw_scores).argmax(dim=-1)

    per_class_precision, per_class_recall, per_class_f1, _support = precision_recall_fscore_support(
        gold, predicted, average=None, zero_division=1
    )

    summary = {"accuracy": accuracy_score(gold, predicted)}
    summary["precision_global"] = per_class_precision.mean()
    summary["recall_global"] = per_class_recall.mean()
    summary["f1_macro"] = per_class_f1.mean()
    return summary

# Training arguments
training_args = TrainingArguments(
    output_dir="./results_sans_lora",
    eval_strategy="epoch",  # current name of the former `evaluation_strategy` argument
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    overwrite_output_dir=True,
    disable_tqdm=False,
    fp16=True,
    seed=42,
    report_to="wandb"
)

# Trainer setup
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_train_for_ft,
    eval_dataset=tokenized_test_for_ft,
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    compute_metrics=compute_metrics,
)

# Carbon footprint tracking. The tracker is stopped in `finally` so it does not keep
# running in the background when training raises or is interrupted.
tracker = EmissionsTracker()
tracker.start()
try:
    # Training
    train_result = trainer.train()
finally:
    # End of carbon tracking
    emissions = tracker.stop()


In [None]:
import pandas as pd

def generate_metrics_report(trainer, final_metrics, training_args, emissions, tokenizer=None, lora_config=None):
    """
    Write and print a compact training/evaluation report.

    The report goes to ./results_lora/training_report_lora.txt when `lora_config` is
    given and to ./results_sans_lora/training_report_sans_lora.txt otherwise; the
    directory is created when missing. The report only records the path where the
    learning curves are expected: this function does not draw them.

    Args:
        trainer: object whose `state.log_history` is a list of log dicts.
        final_metrics: dict of evaluation metrics; the keys eval_accuracy,
            eval_f1_macro, eval_precision_global, eval_recall_global and eval_loss
            are reported, "N/A" is shown for missing ones.
        training_args: object exposing num_train_epochs, per_device_train_batch_size,
            per_device_eval_batch_size, learning_rate and weight_decay.
        emissions: estimated CO2 emissions in kg (any non-numeric value is printed as is).
        tokenizer: optional tokenizer; its `name_or_path` is reported.
        lora_config: optional LoRA configuration; when given, r, lora_alpha and
            lora_dropout are reported.
    """
    def safe_format(value):
        return f"{value:.4f}" if isinstance(value, (int, float)) else str(value)

    # Mean time per epoch. `train_runtime` is the TOTAL training time and is logged in
    # the summary entry at the end of training, so it has to be divided by the number
    # of epochs (the epoch count logged with it, or the configured one as a fallback).
    mean_epoch_time = "N/A"
    runtime_logs = [entry for entry in trainer.state.log_history if "train_runtime" in entry]
    if runtime_logs:
        summary = runtime_logs[-1]
        epochs_done = summary.get("epoch") or training_args.num_train_epochs
        if epochs_done:
            mean_epoch_time = summary["train_runtime"] / epochs_done

    # LoRA section (optional)
    lora_section = (
        f"- **LoRA Config** : r={lora_config.r}, alpha={lora_config.lora_alpha}, dropout={lora_config.lora_dropout}"
        if lora_config else "- **LoRA** : non utilisé"
    )

    # File names depend on whether LoRA was used
    suffix = "_lora" if lora_config else "_sans_lora"
    report_path = f"./results{suffix}/training_report{suffix}.txt"
    learning_curve_path = f"./results{suffix}/learning_curves{suffix}.png"

    report = f"""
# 📊 Rapport d'entraînement et d'évaluation

## 🔍 **Résultats**
- **Accuracy** : {safe_format(final_metrics.get('eval_accuracy', 'N/A'))}
- **F1-score (macro)** : {safe_format(final_metrics.get('eval_f1_macro', 'N/A'))}
- **Precision globale** : {safe_format(final_metrics.get('eval_precision_global', 'N/A'))}
- **Recall global** : {safe_format(final_metrics.get('eval_recall_global', 'N/A'))}
- **Loss finale** : {safe_format(final_metrics.get('eval_loss', 'N/A'))}

## ⚙️ **Hyperparamètres**
- **Epochs** : {training_args.num_train_epochs}
- **Batch (train/eval)** : {training_args.per_device_train_batch_size} / {training_args.per_device_eval_batch_size}
- **Learning Rate** : {training_args.learning_rate}
- **Weight Decay** : {training_args.weight_decay}
{lora_section}

## 🧠 **Tokenizer**
- **Tokenizer utilisé** : {getattr(tokenizer, 'name_or_path', 'Non spécifié') if tokenizer else 'Non fourni'}

## ⏱ **Temps d'entraînement**
- **Temps moyen par époque** : {safe_format(mean_epoch_time)} sec

## 🌱 **Empreinte carbone**
- **CO₂ estimé** : {safe_format(emissions)} kg

## 📈 **Courbes d'apprentissage**
📌 `{learning_curve_path}`
"""
    # The output directory normally exists after training; create it anyway so the
    # report can also be produced from a fresh working directory.
    os.makedirs(os.path.dirname(report_path), exist_ok=True)
    # The report contains accents and emoji: force UTF-8 instead of the platform default.
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(report)

In [None]:
# Evaluate the most recently trained model and write its report.
final_metrics = trainer.evaluate()

# Label the report after the model `trainer` actually holds: only a PEFT-wrapped model
# gets the LoRA section and the "_lora" file names. `lora_config` is only looked up in
# that case, so this cell also works when the LoRA cell was never run.
used_lora_config = lora_config if isinstance(trainer.model, PeftModel) else None

# `trainer.args` are the arguments this trainer was really built with.
generate_metrics_report(trainer, final_metrics, trainer.args, emissions, tokenizer, lora_config=used_lora_config)
print(final_metrics)

In [None]:
!zip -r modernBERT_oshumed.zip ./

In [None]:
from google.colab import files

# Download the archive created by the `zip` cell above. The previous name,
# "mon_dossier.zip", is never created anywhere in this notebook.
files.download("modernBERT_oshumed.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>