Installs

In [19]:
!pip -q install librosa matplotlib pandas numpy pillow scikit-learn tqdm
# Pick the right CUDA build for your GPU. Example below uses CUDA 12.1.
!pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu129



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Imports + Seeds

In [20]:
import os, glob, random, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, models, transforms
from PIL import Image

import librosa
import librosa.display

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.benchmark = True
# Allow TF32 on Ampere+ to speed up conv/matmul (slight precision tradeoff).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_float32_matmul_precision("high")

USE_AMP = True
USE_CHANNELS_LAST = True
USE_COMPILE = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Torch:", torch.__version__)
print("Device:", device)


Torch: 2.8.0+cu129
Device: cuda


Chemins FoR + dossier spectrograms

In [21]:
# Racine FoR (structure: training|validation|testing / real|fake)
FOR_WAV_ROOT = "data/for-norm"   # ex: data/for/training/real/*.wav etc

# Où stocker les spectrogrammes générés (png)
FOR_SPEC_ROOT = "data/for_spectrograms"  # output

SPLITS = ["training", "validation", "testing"]
CLASSES = ["real", "fake"]

# vérif
for s in SPLITS:
    for c in CLASSES:
        p = os.path.join(FOR_WAV_ROOT, s, c)
        assert os.path.exists(p), f"Chemin introuvable: {p}"

os.makedirs(FOR_SPEC_ROOT, exist_ok=True)
print("OK paths.")


OK paths.


Conversion spectrogram

In [22]:
def create_spectrogram(audio_file, image_file, target_sr=16000, n_fft=2048, hop_length=512):
    y, sr = librosa.load(audio_file, sr=target_sr)
    if len(y) == 0:
        return False

    # Ajuste n_fft si signal trop court
    if len(y) < n_fft:
        n_fft = 2 ** int(np.floor(np.log2(len(y))))  # puissance de 2 <= len(y)
        n_fft = max(n_fft, 256)
        if n_fft > len(y):
            return False

    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length)
    S_db = librosa.power_to_db(S, ref=np.max)

    fig, ax = plt.subplots(figsize=(3, 3), dpi=80)
    ax.axis("off")
    librosa.display.specshow(S_db, sr=sr, x_axis=None, y_axis=None, ax=ax)
    fig.savefig(image_file, bbox_inches="tight", pad_inches=0)
    plt.close(fig)
    return True

def create_pngs_from_wavs(input_path, output_path):
    """
    Inspiré du notebook : convertit tous les wav d'un dossier vers png
    """
    os.makedirs(output_path, exist_ok=True)
    files = [f for f in os.listdir(input_path) if f.lower().endswith(".wav")]

    for file in files:
        in_file = os.path.join(input_path, file)
        out_file = os.path.join(output_path, file.replace(".wav", ".png"))
        if not os.path.exists(out_file):  # skip si déjà généré
            create_spectrogram(in_file, out_file)


Générer tous les spectrogrammes (training/validation/testing)

In [None]:
t0 = time.time()

for split in SPLITS:
    for cls in CLASSES:
        in_dir  = os.path.join(FOR_WAV_ROOT, split, cls)
        out_dir = os.path.join(FOR_SPEC_ROOT, split, cls)
        print("Converting:", in_dir, "->", out_dir)
        create_pngs_from_wavs(in_dir, out_dir)

print("DONE spectrograms. Time(s):", time.time() - t0)


Converting: data/for-norm\training\real -> data/for_spectrograms\training\real
Converting: data/for-norm\training\fake -> data/for_spectrograms\training\fake
Converting: data/for-norm\validation\real -> data/for_spectrograms\validation\real
Converting: data/for-norm\validation\fake -> data/for_spectrograms\validation\fake
Converting: data/for-norm\testing\real -> data/for_spectrograms\testing\real
Converting: data/for-norm\testing\fake -> data/for_spectrograms\testing\fake
DONE spectrograms. Time(s): 709.4892230033875


Datasets PyTorch (ImageFolder)


In [24]:
BATCH_SIZE = 32
IMG_SIZE = (224, 224)

train_tfms = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

val_tfms = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_ds = datasets.ImageFolder(os.path.join(FOR_SPEC_ROOT, "training"), transform=train_tfms)
val_ds = datasets.ImageFolder(os.path.join(FOR_SPEC_ROOT, "validation"), transform=val_tfms)
test_ds = datasets.ImageFolder(os.path.join(FOR_SPEC_ROOT, "testing"), transform=val_tfms)

train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    pin_memory=torch.cuda.is_available(),
)
val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=torch.cuda.is_available(),
)
test_loader = DataLoader(
    test_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=torch.cuda.is_available(),
)

class_names = train_ds.classes
print("Class names:", class_names)


Class names: ['fake', 'real']


Utilitaires d’entraînement (checkpoint + early stopping)

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

USE_PRETRAINED = True

def build_backbone(builder, weights_enum=None):
    if USE_PRETRAINED:
        if weights_enum is not None:
            try:
                return builder(weights=weights_enum)
            except Exception:
                pass
        try:
            return builder(pretrained=True)
        except Exception:
            return builder()
    try:
        return builder(weights=None)
    except Exception:
        try:
            return builder(pretrained=False)
        except Exception:
            return builder()

def _move_images(images):
    if USE_CHANNELS_LAST:
        return images.to(device, non_blocking=True).contiguous(memory_format=torch.channels_last)
    return images.to(device, non_blocking=True)

def evaluate_metrics(model, loader, threshold=0.5):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for images, labels in loader:
            images = _move_images(images)
            with torch.cuda.amp.autocast(enabled=USE_AMP):
                outputs = model(images)
            probs = torch.sigmoid(outputs).cpu().numpy().reshape(-1)
            preds = (probs >= threshold).astype(int)
            y_pred.extend(preds.tolist())
            y_true.extend(labels.numpy().reshape(-1).astype(int).tolist())

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    return acc, prec, rec, f1

def train_and_save(model, name, train_loader=train_loader, val_loader=val_loader, epochs=20, lr=1e-4, patience=4):
    if USE_COMPILE and hasattr(torch, "compile"):
        model = torch.compile(model)
    model = model.to(device)
    if USE_CHANNELS_LAST:
        model = model.to(memory_format=torch.channels_last)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)

    os.makedirs("weights", exist_ok=True)
    ckpt_path = f"weights/{name}.pt"

    history = []
    best_acc = 0.0
    best_epoch = 0

    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for images, labels in train_loader:
            images = _move_images(images)
            labels = labels.float().unsqueeze(1).to(device, non_blocking=True)

            optimizer.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast(enabled=USE_AMP):
                outputs = model(images)
                loss = criterion(outputs, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item() * images.size(0)
            preds = (torch.sigmoid(outputs) >= 0.5).long()
            correct += (preds.cpu().squeeze(1) == labels.cpu().long().squeeze(1)).sum().item()
            total += labels.size(0)

        train_loss = running_loss / max(1, total)
        train_acc = correct / max(1, total)
        val_acc, val_prec, val_rec, val_f1 = evaluate_metrics(model, val_loader)

        history.append({
            "epoch": epoch,
            "loss": train_loss,
            "acc": train_acc,
            "val_acc": val_acc,
            "val_precision": val_prec,
            "val_recall": val_rec,
            "val_f1": val_f1,
        })

        print(
            f"Epoch {epoch}/{epochs} - loss {train_loss:.4f} - acc {train_acc:.4f} "
            f"- val_acc {val_acc:.4f} - val_prec {val_prec:.4f} - val_rec {val_rec:.4f} - val_f1 {val_f1:.4f}"
        )

        if val_acc >= best_acc:
            best_acc = val_acc
            best_epoch = epoch
            torch.save(model.state_dict(), ckpt_path)

        if epoch - best_epoch >= patience:
            print("Early stopping triggered.")
            break

    print("Best saved:", ckpt_path)
    return history, ckpt_path

def eval_model(model, title="model"):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for images, labels in test_loader:
            images = _move_images(images)
            with torch.cuda.amp.autocast(enabled=USE_AMP):
                outputs = model(images)
            probs = torch.sigmoid(outputs).cpu().numpy().reshape(-1)
            y_pred.extend((probs >= 0.5).astype(int).tolist())
            y_true.extend(labels.numpy().reshape(-1).astype(int).tolist())

    print(f"\n=== TEST {title} ===")
    print("acc:", accuracy_score(y_true, y_pred))
    print("precision:", precision_score(y_true, y_pred, zero_division=0))
    print("recall:", recall_score(y_true, y_pred, zero_division=0))
    print("f1:", f1_score(y_true, y_pred, zero_division=0))
    print(classification_report(y_true, y_pred, zero_division=0))


In [27]:
import time

def _sync_cuda():
    if torch.cuda.is_available():
        torch.cuda.synchronize()

def profile_one_batch(model, loader):
    """Rough timing for one batch: data load, transfer, forward, backward."""
    model = model.to(device)
    model.train()

    it = iter(loader)
    t0 = time.perf_counter()
    images, labels = next(it)
    t1 = time.perf_counter()

    images = images.to(device, non_blocking=True)
    labels = labels.float().unsqueeze(1).to(device, non_blocking=True)
    _sync_cuda()
    t2 = time.perf_counter()

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    _sync_cuda()
    t3 = time.perf_counter()
    outputs = model(images)
    _sync_cuda()
    t4 = time.perf_counter()

    loss = criterion(outputs, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    _sync_cuda()
    t5 = time.perf_counter()

    print(f"data load:      {t1 - t0:.4f}s")
    print(f"host->device:  {t2 - t1:.4f}s")
    print(f"forward:        {t4 - t3:.4f}s")
    print(f"backward+step:  {t5 - t4:.4f}s")
    print(f"total:          {t5 - t0:.4f}s")

# Run this after a model is created, e.g.:
vgg_weights = getattr(models, "VGG16_Weights", None)
vgg16 = build_backbone(models.vgg16, vgg_weights.IMAGENET1K_V1 if vgg_weights else None)
for param in vgg16.features.parameters():
    param.requires_grad = False

vgg16.classifier[6] = nn.Linear(vgg16.classifier[6].in_features, 1)
model_vgg = vgg16
profile_one_batch(model_vgg, train_loader)


data load:      0.1317s
host->device:  0.0019s
forward:        0.2160s
backward+step:  0.0557s
total:          0.4055s


Modèle 1 : VGG16 (comme notebook VGG16)

In [28]:
vgg_weights = getattr(models, "VGG16_Weights", None)
vgg16 = build_backbone(models.vgg16, vgg_weights.IMAGENET1K_V1 if vgg_weights else None)
for param in vgg16.features.parameters():
    param.requires_grad = False

vgg16.classifier[6] = nn.Linear(vgg16.classifier[6].in_features, 1)
model_vgg = vgg16
print(model_vgg)

hist_vgg, vgg_path = train_and_save(model_vgg, "audio_vgg16", epochs=5, lr=1e-5)
eval_model(model_vgg, "audio_vgg16")


VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

  scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)
  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 1/5 - loss 0.0826 - acc 0.9699 - val_acc 0.9902 - val_prec 0.9987 - val_rec 0.9817 - val_f1 0.9901


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 2/5 - loss 0.0123 - acc 0.9963 - val_acc 0.9957 - val_prec 0.9985 - val_rec 0.9930 - val_f1 0.9957


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 3/5 - loss 0.0044 - acc 0.9988 - val_acc 0.9938 - val_prec 0.9993 - val_rec 0.9883 - val_f1 0.9938


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 4/5 - loss 0.0022 - acc 0.9996 - val_acc 0.9951 - val_prec 0.9991 - val_rec 0.9911 - val_f1 0.9951


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 5/5 - loss 0.0012 - acc 0.9997 - val_acc 0.9955 - val_prec 0.9991 - val_rec 0.9919 - val_f1 0.9954
Best saved: weights/audio_vgg16.pt


  with torch.cuda.amp.autocast(enabled=USE_AMP):



=== TEST audio_vgg16 ===
acc: 0.8120414328873543
precision: 0.7381196581196581
recall: 0.9536219081272085
f1: 0.8321449219502794
              precision    recall  f1-score   support

           0       0.94      0.68      0.79      2370
           1       0.74      0.95      0.83      2264

    accuracy                           0.81      4634
   macro avg       0.84      0.82      0.81      4634
weighted avg       0.84      0.81      0.81      4634



Modèle 2 : ResNet50

In [29]:
resnet_weights = getattr(models, "ResNet50_Weights", None)
resnet = build_backbone(models.resnet50, resnet_weights.IMAGENET1K_V1 if resnet_weights else None)
for param in resnet.parameters():
    param.requires_grad = False

resnet.fc = nn.Linear(resnet.fc.in_features, 1)
model_resnet = resnet
print(model_resnet)

hist_resnet, resnet_path = train_and_save(model_resnet, "audio_resnet50", epochs=5, lr=2e-5)
eval_model(model_resnet, "audio_resnet50")


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

  scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)
  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 1/5 - loss 0.4299 - acc 0.8531 - val_acc 0.8627 - val_prec 0.9418 - val_rec 0.7731 - val_f1 0.8492


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 2/5 - loss 0.3039 - acc 0.8840 - val_acc 0.8750 - val_prec 0.9492 - val_rec 0.7924 - val_f1 0.8637


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 3/5 - loss 0.2687 - acc 0.8961 - val_acc 0.8836 - val_prec 0.9534 - val_rec 0.8067 - val_f1 0.8739


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 4/5 - loss 0.2491 - acc 0.9036 - val_acc 0.8838 - val_prec 0.9619 - val_rec 0.7993 - val_f1 0.8731


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 5/5 - loss 0.2343 - acc 0.9088 - val_acc 0.8863 - val_prec 0.9646 - val_rec 0.8020 - val_f1 0.8758
Best saved: weights/audio_resnet50.pt


  with torch.cuda.amp.autocast(enabled=USE_AMP):



=== TEST audio_resnet50 ===
acc: 0.6389728096676737
precision: 0.7640750670241286
recall: 0.37765017667844525
f1: 0.5054685190659178
              precision    recall  f1-score   support

           0       0.60      0.89      0.72      2370
           1       0.76      0.38      0.51      2264

    accuracy                           0.64      4634
   macro avg       0.68      0.63      0.61      4634
weighted avg       0.68      0.64      0.61      4634



Modèle 3 : MobileNetV2 

In [30]:
mobile_weights = getattr(models, "MobileNet_V2_Weights", None)
mobile = build_backbone(models.mobilenet_v2, mobile_weights.IMAGENET1K_V1 if mobile_weights else None)
for param in mobile.features.parameters():
    param.requires_grad = False

mobile.classifier[1] = nn.Linear(mobile.classifier[1].in_features, 1)
model_mobile = mobile
print(model_mobile)

hist_mobile, mobile_path = train_and_save(model_mobile, "audio_mobilenetv2", epochs=5, lr=1e-5)
eval_model(model_mobile, "audio_mobilenetv2")


Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to C:\Users\akram/.cache\torch\hub\checkpoints\mobilenet_v2-b0353104.pth


100%|██████████| 13.6M/13.6M [00:00<00:00, 22.1MB/s]
  scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)
  with torch.cuda.amp.autocast(enabled=USE_AMP):


MobileNetV2(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=

  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 1/5 - loss 0.5115 - acc 0.7924 - val_acc 0.8465 - val_prec 0.9245 - val_rec 0.7546 - val_f1 0.8310


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 2/5 - loss 0.3698 - acc 0.8608 - val_acc 0.8501 - val_prec 0.9323 - val_rec 0.7550 - val_f1 0.8343


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 3/5 - loss 0.3304 - acc 0.8692 - val_acc 0.8534 - val_prec 0.9374 - val_rec 0.7574 - val_f1 0.8379


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 4/5 - loss 0.3095 - acc 0.8742 - val_acc 0.8524 - val_prec 0.9457 - val_rec 0.7478 - val_f1 0.8352


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 5/5 - loss 0.2952 - acc 0.8792 - val_acc 0.8558 - val_prec 0.9461 - val_rec 0.7546 - val_f1 0.8396
Best saved: weights/audio_mobilenetv2.pt


  with torch.cuda.amp.autocast(enabled=USE_AMP):



=== TEST audio_mobilenetv2 ===
acc: 0.6281829952524817
precision: 0.8022346368715084
recall: 0.31713780918727913
f1: 0.4545742323520101
              precision    recall  f1-score   support

           0       0.59      0.93      0.72      2370
           1       0.80      0.32      0.45      2264

    accuracy                           0.63      4634
   macro avg       0.69      0.62      0.59      4634
weighted avg       0.69      0.63      0.59      4634



PARTIE B : XRAY (CheXpert small) : AlexNet + DenseNet121 (sans augmentation)

In [39]:
CHEX_ROOT = "data/chexpert"
train_csv = os.path.join(CHEX_ROOT, "train.csv")
valid_csv = os.path.join(CHEX_ROOT, "valid.csv")

assert os.path.exists(train_csv), train_csv
assert os.path.exists(valid_csv), valid_csv

df_train = pd.read_csv(train_csv)
df_valid = pd.read_csv(valid_csv)

df_train.head()


Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXpert-v1.0-small/train/patient00001/study1/...,Female,68,Frontal,AP,1.0,,,,,,,,,0.0,,,,1.0
1,CheXpert-v1.0-small/train/patient00002/study2/...,Female,87,Frontal,AP,,,-1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,
2,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Frontal,AP,,,,1.0,,,-1.0,,,,,,1.0,
3,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Lateral,,,,,1.0,,,-1.0,,,,,,1.0,
4,CheXpert-v1.0-small/train/patient00003/study1/...,Male,41,Frontal,AP,,,,,,1.0,,,,0.0,,,,


Construire une cible binaire simple (proxy)

In [None]:
LABEL_COL = "Lung Opacity"   # ou "Mass" / "Nodule"
assert LABEL_COL in df_train.columns, df_train.columns

def clean_chexpert(df):
    df = df[["Path", LABEL_COL]].copy()
    df = df.dropna()
    df = df[df[LABEL_COL].isin([0, 1])]  # drop -1 (uncertain)
    df["label"] = df[LABEL_COL].astype(int)
    df["abs_path"] = df["Path"].apply(lambda p: os.path.join(CHEX_ROOT, p))
    df = df[df["abs_path"].apply(os.path.exists)]
    return df[["abs_path", "label"]]

train_bin = clean_chexpert(df_train)
valid_bin = clean_chexpert(df_valid)

print("train:", len(train_bin), "valid:", len(valid_bin))
train_bin.head()


train: 112180 valid: 234


Unnamed: 0,abs_path,label
1,data/chexpert\CheXpert-v1.0-small/train/patien...,1
2,data/chexpert\CheXpert-v1.0-small/train/patien...,1
3,data/chexpert\CheXpert-v1.0-small/train/patien...,1
12,data/chexpert\CheXpert-v1.0-small/train/patien...,1
13,data/chexpert\CheXpert-v1.0-small/train/patien...,1


PyTorch dataset images


In [45]:
IMG_SIZE = (224, 224)
BATCH_SIZE_X = 24

xray_tfms = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

class ChexpertDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        path = self.df.loc[idx, "abs_path"]
        label = int(self.df.loc[idx, "label"])
        img = Image.open(path).convert("L")
        if self.transform:
            img = self.transform(img)
        return img, label

xray_train_ds = ChexpertDataset(train_bin, xray_tfms)
xray_valid_ds = ChexpertDataset(valid_bin, xray_tfms)

xray_train_loader = DataLoader(
    xray_train_ds,
    batch_size=BATCH_SIZE_X,
    shuffle=True,
    num_workers=0,
    pin_memory=torch.cuda.is_available(),
)
xray_valid_loader = DataLoader(
    xray_valid_ds,
    batch_size=BATCH_SIZE_X,
    shuffle=False,
    num_workers=0,
    pin_memory=torch.cuda.is_available(),
)


Entraîner DenseNet121

In [47]:
densenet_weights = getattr(models, "DenseNet121_Weights", None)
base = build_backbone(models.densenet121, densenet_weights.IMAGENET1K_V1 if densenet_weights else None)
for param in base.features.parameters():
    param.requires_grad = False

base.classifier = nn.Linear(base.classifier.in_features, 1)
xray_densenet = base
print(xray_densenet)

hist_xdn, ckpt = train_and_save(
    xray_densenet,
    "xray_densenet121",
    train_loader=xray_train_loader,
    val_loader=xray_valid_loader,
    epochs=5,
    lr=1e-4,
    patience=3,
)
print("Saved:", ckpt)


DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

  scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)
  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 1/5 - loss 0.2075 - acc 0.9412 - val_acc 0.5385 - val_prec 0.5385 - val_rec 1.0000 - val_f1 0.7000


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 2/5 - loss 0.1934 - acc 0.9413 - val_acc 0.5385 - val_prec 0.5385 - val_rec 1.0000 - val_f1 0.7000


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 3/5 - loss 0.1906 - acc 0.9410 - val_acc 0.5513 - val_prec 0.5455 - val_rec 1.0000 - val_f1 0.7059


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 4/5 - loss 0.1889 - acc 0.9410 - val_acc 0.5513 - val_prec 0.5455 - val_rec 1.0000 - val_f1 0.7059


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 5/5 - loss 0.1876 - acc 0.9409 - val_acc 0.5513 - val_prec 0.5455 - val_rec 1.0000 - val_f1 0.7059
Best saved: weights/xray_densenet121.pt
Saved: weights/xray_densenet121.pt


Entraîner AlexNet

In [46]:
alex_weights = getattr(models, "AlexNet_Weights", None)
xray_alexnet = build_backbone(models.alexnet, alex_weights.IMAGENET1K_V1 if alex_weights else None)

xray_alexnet.classifier[6] = nn.Linear(xray_alexnet.classifier[6].in_features, 1)
print(xray_alexnet)

hist_xax, ckpt = train_and_save(
    xray_alexnet,
    "xray_alexnet",
    train_loader=xray_train_loader,
    val_loader=xray_valid_loader,
    epochs=5,
    lr=1e-4,
    patience=3,
)
print("Saved:", ckpt)


AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

  scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)
  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 1/5 - loss 0.1944 - acc 0.9407 - val_acc 0.5385 - val_prec 0.5385 - val_rec 1.0000 - val_f1 0.7000


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 2/5 - loss 0.1824 - acc 0.9412 - val_acc 0.5385 - val_prec 0.5385 - val_rec 1.0000 - val_f1 0.7000


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 3/5 - loss 0.1778 - acc 0.9415 - val_acc 0.5427 - val_prec 0.5408 - val_rec 1.0000 - val_f1 0.7019


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 4/5 - loss 0.1747 - acc 0.9412 - val_acc 0.5556 - val_prec 0.5478 - val_rec 1.0000 - val_f1 0.7079


  with torch.cuda.amp.autocast(enabled=USE_AMP):
  with torch.cuda.amp.autocast(enabled=USE_AMP):


Epoch 5/5 - loss 0.1698 - acc 0.9420 - val_acc 0.6239 - val_prec 0.5888 - val_rec 1.0000 - val_f1 0.7412
Best saved: weights/xray_alexnet.pt
Saved: weights/xray_alexnet.pt


Évaluation X-ray (valid) + report

In [48]:
def eval_xray(model, title, loader=xray_valid_loader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            outputs = model(images)
            probs = torch.sigmoid(outputs).cpu().numpy().reshape(-1)
            y_pred.extend((probs >= 0.5).astype(int).tolist())
            y_true.extend(labels.numpy().reshape(-1).astype(int).tolist())

    print(f"\n=== XRAY VALID {title} ===")
    print("acc:", accuracy_score(y_true, y_pred))
    print("precision:", precision_score(y_true, y_pred, zero_division=0))
    print("recall:", recall_score(y_true, y_pred, zero_division=0))
    print("f1:", f1_score(y_true, y_pred, zero_division=0))
    print(classification_report(y_true, y_pred, zero_division=0))

eval_xray(xray_densenet, "DenseNet121")
eval_xray(xray_alexnet, "AlexNet")



=== XRAY VALID DenseNet121 ===
acc: 0.5512820512820513
precision: 0.5454545454545454
recall: 1.0
f1: 0.7058823529411765
              precision    recall  f1-score   support

           0       1.00      0.03      0.05       108
           1       0.55      1.00      0.71       126

    accuracy                           0.55       234
   macro avg       0.77      0.51      0.38       234
weighted avg       0.76      0.55      0.41       234


=== XRAY VALID AlexNet ===
acc: 0.6239316239316239
precision: 0.5887850467289719
recall: 1.0
f1: 0.7411764705882353
              precision    recall  f1-score   support

           0       1.00      0.19      0.31       108
           1       0.59      1.00      0.74       126

    accuracy                           0.62       234
   macro avg       0.79      0.59      0.53       234
weighted avg       0.78      0.62      0.54       234



Résumé des 5 modèles (weights)

In [49]:
import os
print("Saved models in ./weights:")
for f in sorted(os.listdir("weights")):
    print(" -", f)


Saved models in ./weights:
 - audio_mobilenetv2.pt
 - audio_resnet50.pt
 - audio_vgg16.pt
 - xray_alexnet.pt
 - xray_densenet121.pt
