In [1]:
from torchvision.models import resnet50,mobilenet_v3_large,mobilenet_v3_small
import torch.nn as nn 

student=mobilenet_v3_small(weights="MobileNet_V3_Small_Weights.IMAGENET1K_V1")

student.classifier[3]=nn.Linear(1024,5)     

student.to("cuda")

MobileNetV3(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): Hardswish()
    )
    (1): InvertedResidual(
      (block): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=16, bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
        )
        (1): SqueezeExcitation(
          (avgpool): AdaptiveAvgPool2d(output_size=1)
          (fc1): Conv2d(16, 8, kernel_size=(1, 1), stride=(1, 1))
          (fc2): Conv2d(8, 16, kernel_size=(1, 1), stride=(1, 1))
          (activation): ReLU()
          (scale_activation): Hardsigmoid()
        )
        (2): Conv2dNormActivation(
          (0): Conv2d(16, 16, kernel_size=(1, 1), 

In [3]:
from scripts.utils import train_loader,val_loader,train_loader
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,classification_report,cohen_kappa_score
import torch
import torchvision.models as models
import time 

num_classes=5

teacher = models.resnet50(weights=None)
# Adapting the last layer to the 5-class classification task
num_features = teacher.fc.in_features
teacher.fc = torch.nn.Linear(num_features, num_classes)
#Load the weights
state_dict = torch.load("models/resnet50_aptos19_best.pth", map_location="cuda")
teacher.load_state_dict(state_dict)

<All keys matched successfully>

In [4]:
import torch, torch.nn.functional as F
import torch.nn as nn
from torchvision import models

# --- Teacher ---
teacher = models.resnet50(weights="IMAGENET1K_V1")
teacher.fc = nn.Linear(2048, 5)
teacher.load_state_dict(torch.load("models/resnet50_aptos19_best.pth"))
teacher = teacher.to("cuda")
teacher.eval()

# --- Student ---
student = models.mobilenet_v3_small(weights="IMAGENET1K_V1")
student.classifier[3] = nn.Linear(1024, 5)
student = student.to("cuda")


for name, param in student.features.named_parameters():
    if int(name.split('.')[0]) <= 6:
        param.requires_grad = False


In [5]:
from scripts.utils import FocalLoss,df_train

lr=1e-4

class_counts = df_train["label"].value_counts().sort_index()
frequences = class_counts / class_counts.sum()

alpha = (1.0 / frequences)
alpha = alpha / alpha.sum()  

criterion = FocalLoss(alpha=alpha.tolist(), gamma=2.0)

criterion_ce = FocalLoss(alpha=[0.25]*5, gamma=2.0)
criterion_kd = nn.KLDivLoss(reduction='batchmean')

optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, student.parameters()), lr=lr)

T = 3.0          # température pour adoucir les logits
alpha = 0.7      # pondération entre KD et Focal/CE
lr = 1e-4


In [8]:
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, confusion_matrix
from torch.optim.lr_scheduler import CosineAnnealingLR
from scripts.utils import set_seed

set_seed(1)
N_epochs = 30
patience = 5

best_kappa = -1
patience_counter = 0
best_state = None

# --- Scheduler ---
scheduler = CosineAnnealingLR(optimizer, T_max=N_epochs)

for epoch in range(N_epochs):
    print(f"\nEpoch {epoch+1}/{N_epochs}")
    student.train()
    total_loss = 0.0

    for x, y in train_loader:
        x, y = x.to("cuda"), y.to("cuda")
        optimizer.zero_grad()

        # --- forward teacher + student
        with torch.no_grad():
            teacher_logits = teacher(x)

        student_logits = student(x)

        # --- distillation loss
        loss_kd = criterion_kd(
            F.log_softmax(student_logits / T, dim=1),
            F.softmax(teacher_logits / T, dim=1)
        ) * (T ** 2)

        # --- hard-label loss (classique)
        loss_ce = criterion_ce(student_logits, y)

        # --- mix des deux
        loss = alpha * loss_kd + (1 - alpha) * loss_ce
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    scheduler.step()

    # --- Validation ---
    student.eval()
    val_labels, val_preds = [], []
    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val = x_val.to("cuda")
            logits_val = student(x_val)
            preds = torch.argmax(logits_val, dim=1).cpu().numpy()
            val_preds.extend(preds)
            val_labels.extend(y_val.numpy())

    # --- Metrics ---
    acc = accuracy_score(val_labels, val_preds)
    f1 = f1_score(val_labels, val_preds, average="macro")
    qwk = cohen_kappa_score(val_labels, val_preds, weights="quadratic")
    cm = confusion_matrix(val_labels, val_preds)

    print(f"Train Loss={avg_train_loss:.4f} | "
          f"Val Acc={acc:.3f} | F1={f1:.3f} | QWK={qwk:.3f}")
    print(cm)

    # --- Early stopping ---
    if qwk > best_kappa:
        best_kappa = qwk
        best_state = student.state_dict().copy()
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping déclenché à l’epoch {epoch+1}")
            break

# --- Charger le meilleur modèle ---
if best_state is not None:
    student.load_state_dict(best_state)
    print(f"\n Meilleur modèle sauvegardé (QWK max = {best_kappa:.3f})")




✅ Seed fixée à 1

Epoch 1/30
Train Loss=0.3526 | Val Acc=0.792 | F1=0.378 | QWK=0.692
[[658   9  37   0   1]
 [ 19  13  36   0   0]
 [ 32   2 158   0   1]
 [  8   0  28   0   1]
 [  5   0  39   0   1]]

Epoch 2/30
Train Loss=0.3312 | Val Acc=0.801 | F1=0.424 | QWK=0.732
[[656   7  40   0   2]
 [ 14  13  41   0   0]
 [ 24   2 162   0   5]
 [  6   0  26   0   5]
 [  4   0  33   0   8]]

Epoch 3/30
Train Loss=0.3553 | Val Acc=0.811 | F1=0.445 | QWK=0.727
[[658  10  35   0   2]
 [ 13  19  36   0   0]
 [ 23   3 166   0   1]
 [  6   0  25   0   6]
 [  7   0  31   0   7]]

Epoch 4/30
Train Loss=0.3351 | Val Acc=0.810 | F1=0.444 | QWK=0.715
[[657   8  38   0   2]
 [ 18  14  36   0   0]
 [ 23   0 169   0   1]
 [  8   0  24   0   5]
 [  8   0  28   0   9]]

Epoch 5/30
Train Loss=0.3134 | Val Acc=0.815 | F1=0.481 | QWK=0.728
[[660  10  32   1   2]
 [ 20  22  26   0   0]
 [ 28   3 160   0   2]
 [  9   0  22   0   6]
 [  6   0  26   1  12]]

Epoch 6/30
Train Loss=0.2991 | Val Acc=0.825 | F1=0.512 |

In [None]:
from scripts.evaluate import evaluate
from scripts.utils import test_loader

evaluate(student,test_loader,device="cuda")

Evaluation starting
[[835  15  61   2   4]
 [ 16  46  31   0   1]
 [ 36  13 203   0   1]
 [  7   0  27   5   4]
 [  8   3  44   2  34]]
Inference time on cuda : 24.72 seconds.
Performace : {'accuracy': np.float64(0.8), 'quadratic_kappa': np.float64(0.73)}


{'accuracy': np.float64(0.8), 'quadratic_kappa': np.float64(0.73)}

In [None]:
# --- Sauvegarde ---
torch.save(student.state_dict(), "mobilenetv3_distilled_best.pth")
print("Modèle sauvegardé ✅")

In [16]:
from evaluate import evaluate 
from utils import test_loader
from torchvision.models import resnet50,mobilenet_v3_large,mobilenet_v3_small
import torch.nn as nn 
import torch 

def get_student(state_dict_path=None):

    mobile_net=mobilenet_v3_small(weights="MobileNet_V3_Small_Weights.IMAGENET1K_V1")

    mobile_net.classifier[3]=nn.Linear(1024,5)     

    state_dict = torch.load("mobile_net_small.pth", map_location="cpu")

    if state_dict_path : 

        mobile_net.load_state_dict(state_dict)

    mobile_net.to("cuda")

    return(mobile_net)

student_path="mobilenetv3_distilled_best.pth"
mobile_net=get_student(student_path)

In [1]:
import os
import torchvision.models as models 
import torch.nn as nn 
import torch
from evaluate import evaluate
from utils import test_loader

# --- Student ---
student = models.mobilenet_v3_small(weights="IMAGENET1K_V1")

student.classifier[3] = nn.Linear(1024, 5)

student = student.to("cuda")

state_dict=torch.load("mobilenetv3_distilled_best.pth")

student.load_state_dict(state_dict)

student.to("cpu")

teacher_size=os.path.getsize(r"resnet50_aptos19_best.pth")/1e6
student_size=(os.path.getsize(r"mobilenetv3_distilled_best.pth")/1e6)

print(f"Teacher size : {teacher_size:.0f} MB")

print(f"Student size : {student_size:.0f} MB")

evaluate(student,test_loader,device="cpu")

Teacher size : 94 MB
Student size : 6 MB
[[828  15  66   1   7]
 [ 16  45  32   0   1]
 [ 36  12 203   0   2]
 [  8   0  26   3   6]
 [  9   2  45   2  33]]
Inference time on cpu : 28.174745798110962 seconds.
Performace : {'accuracy': 0.7954220314735336, 'f1_macro': 0.538418301194789, 'quadratic_kappa': 0.7138115071955878}
