This notebook is used to train the classifier on Google Colab once the embeddings have been extracted. At the end of the notebook, we also provide the option to retrain the classifier on the combined training and validation sets for final submission purposes.

In [None]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.7.0-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.14.2-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->torchmetrics)
  D

In [None]:
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import pandas as pd
import torchmetrics


In [None]:
# Mount Google Drive (Colab only) so the pickles under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### FUNCTIONS

In [None]:
class Classifier(nn.Module):
    """Fully connected classification head applied to precomputed embeddings.

    Architecture: three Linear -> BatchNorm1d -> ReLU stages of width 512, 256
    and 128, with dropout (p=0.5) after the first two stages, followed by a
    final Linear layer and a sigmoid.

    Args:
        input_size: Number of features in each input embedding.
        output_size: Number of units in the final layer.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Sub-modules are registered in the order fc1, bn1, fc2, bn2, fc3, bn3,
        # fc4, dropout, sigmoid so state_dict keys and parameter order are stable.
        self.fc1, self.bn1 = nn.Linear(input_size, 512), nn.BatchNorm1d(512)
        self.fc2, self.bn2 = nn.Linear(512, 256), nn.BatchNorm1d(256)
        self.fc3, self.bn3 = nn.Linear(256, 128), nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, output_size)
        self.dropout = nn.Dropout(p=0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs flattened to a 1-D tensor.

        With ``output_size=1`` the result has one value per input row; for any
        other ``output_size`` all outputs are flattened into a single dimension.
        """
        hidden = self.dropout(torch.relu(self.bn1(self.fc1(x))))
        hidden = self.dropout(torch.relu(self.bn2(self.fc2(hidden))))
        # The third stage is not followed by dropout.
        hidden = torch.relu(self.bn3(self.fc3(hidden)))
        probs = self.sigmoid(self.fc4(hidden))  # for the binary case
        return probs.view(-1)

In [None]:
def _run_epoch(model, dataloader, criterion, metric, device, optimizer=None):
    """Run one pass over ``dataloader`` and return batch-size-weighted means.

    Shared implementation behind ``train_one_epoch`` and ``validate``.

    Args:
        model: Network called as ``model(x)`` on each batch of inputs.
        dataloader: Iterable yielding ``(x, y)`` tensor pairs.
        criterion: Callable ``criterion(pred, y)`` returning a scalar loss tensor.
        metric: Callable ``metric(pred, y_int)`` returning a scalar tensor; it
            receives CPU tensors, with targets cast to int.
        device: Device the inputs and targets are moved to.
        optimizer: When given, the model is put in train mode and updated after
            every batch. When ``None``, the model is put in eval mode and no
            gradients are computed.

    Returns:
        ``(mean_loss, mean_metric)``, each batch value weighted by its batch
        size. Both are NaN if the dataloader yields no batch.
    """
    training = optimizer is not None
    model.train(training)

    # One entry per batch; the sample-weighted mean is computed once at the end.
    batch_losses, batch_scores, batch_sizes = [], [], []
    with torch.set_grad_enabled(training):
        for x, y in tqdm(dataloader, leave=False):
            if training:
                optimizer.zero_grad()
            pred = model(x.to(device))
            loss = criterion(pred, y.to(device))
            if training:
                loss.backward()
                optimizer.step()
            batch_losses.append(loss.item())
            # Detach so the metric never sees a tensor tied to the autograd graph.
            score = metric(pred.detach().cpu(), y.int().cpu())
            batch_scores.append(score.item())
            batch_sizes.append(len(y))

    if not batch_sizes:
        # Nothing was iterated: report NaN rather than dividing by zero.
        return np.float64(np.nan), np.float64(np.nan)

    mean_loss = np.average(batch_losses, weights=batch_sizes)
    mean_score = np.average(batch_scores, weights=batch_sizes)
    return mean_loss, mean_score


def train_one_epoch(model, dataloader, optimizer, criterion, metric, device):
    """Train ``model`` for one epoch.

    Returns:
        ``(mean_loss, mean_metric)`` over the epoch, weighted by batch size.
        Values are measured in train mode while the weights are being updated.
    """
    return _run_epoch(model, dataloader, criterion, metric, device, optimizer=optimizer)


def validate(model, dataloader, criterion, metric, device):
    """Evaluate ``model`` in eval mode without computing gradients.

    Returns:
        ``(mean_loss, mean_metric)`` over the dataloader, weighted by batch size.
    """
    return _run_epoch(model, dataloader, criterion, metric, device)

### DATASET

In [None]:
# Load the precomputed embedding tables; the "ID", "embedding" and "label" columns are read downstream.
# NOTE(review): pd.read_pickle unpickles arbitrary objects — only load files you produced yourself.
train_df = pd.read_pickle("/content/drive/MyDrive/data/embeddings_final.pkl")
val_df = pd.read_pickle("/content/drive/MyDrive/data/embeddings_final_val.pkl")

In [None]:
class EmbeddingDataset(Dataset):
    """Dataset over a DataFrame with "ID", "embedding" and "label" columns.

    Args:
        df: Source frame; each "embedding" cell holds one fixed-length vector.
        excluded_ids: Optional collection of IDs; rows whose "ID" is in it are
            dropped before the arrays are built.
    """

    def __init__(self, df, excluded_ids=None):
        if excluded_ids is not None:
            keep = ~df["ID"].isin(excluded_ids)
            df = df.loc[keep].reset_index(drop=True)

        self.embeddings = np.stack(df["embedding"].values)  # shape (N, D)
        self.labels = df["label"].values  # shape (N,)
        self.ids = df["ID"].values  # kept for optional tracking

    def __len__(self):
        """Number of samples left after any exclusion."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(embedding, label)`` for ``idx`` as float32 tensors."""
        features = torch.tensor(self.embeddings[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return features, target

In [None]:
# Mini-batch size used by both loaders.
batch_size = 64

# Datasets over the complete frames (no IDs excluded).
# These names are rebound by the later cell that rebuilds them with excluded IDs.
train_dataset = EmbeddingDataset(train_df)
val_dataset = EmbeddingDataset(val_df)

# Only the training data is shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)


In [None]:
# Number of training samples before any ID is excluded.
len(train_dataset)

100000

In [None]:
import pickle

# Load the saved results dictionary; only its two ID lists are used here.
# NOTE(review): pickle.load can execute arbitrary code — only open files you trust.
with open("/content/drive/MyDrive/data/results.pkl", "rb") as f:
    data = pickle.load(f)

# IDs to drop from each split (presumably samples flagged as outliers upstream — TODO confirm).
ids_abberant_train = data["abberant_train"]
ids_abberant_val = data["abberant_val"]
batch_size = 64

# Rebuild the datasets without the excluded IDs; this rebinds the names from the previous cell.
train_dataset = EmbeddingDataset(train_df,ids_abberant_train)
val_dataset = EmbeddingDataset(val_df,ids_abberant_val)

# Only the training data is shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)


In [None]:
# Sizes of both splits after dropping the excluded IDs.
len(train_dataset),len(val_dataset)

(99614, 34817)

### Training

In [None]:
#PARAMS
# OPTIMIZER, LOSS and METRIC are class names resolved with getattr in the training cell
# (from torch.optim, torch.nn and torchmetrics respectively).
OPTIMIZER = 'Adam'
lr= 0.001
LOSS = 'BCELoss'
METRIC = 'Accuracy'
NUM_EPOCHS = 100  # upper bound; early stopping may end training sooner
PATIENCE = 13  # epochs without a new best validation loss tolerated before stopping
checkpoint_path = "best_model.pth"  # file receiving the best weights (relative to the working directory)

# Input width is read from the first sample; one output unit for binary classification.
model = Classifier(input_size=train_dataset[0][0].shape[0], output_size=1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Classifier(
  (fc1): Linear(in_features=1024, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc4): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (sigmoid): Sigmoid()
)

In [None]:
# Build optimizer, loss and metric from the names configured in the PARAMS cell.
optimizer = getattr(torch.optim,OPTIMIZER)(model.parameters(), lr=lr)
criterion = getattr(torch.nn, LOSS)()
metric = getattr(torchmetrics, METRIC)('binary')

# Best validation loss seen so far and the (0-based) epoch at which it occurred.
min_loss, best_epoch = float('inf'), 0

for epoch in range(NUM_EPOCHS):
    train_loss, train_metric = train_one_epoch(model, train_loader, optimizer, criterion, metric, device)
    val_loss, val_metric = validate(model, val_loader, criterion, metric, device)

    print(f"[{epoch+1}/{NUM_EPOCHS}] Train Loss: {train_loss:.4f} | Acc: {train_metric:.4f}")
    print(f"[{epoch+1}/{NUM_EPOCHS}] Val   Loss: {val_loss:.4f} | Acc: {val_metric:.4f}")

    # Save the weights whenever the validation loss improves.
    if val_loss < min_loss:
        print(f"New best val loss {min_loss:.4f} → {val_loss:.4f}")
        min_loss = val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), checkpoint_path)

    # Stop once PATIENCE epochs have passed without a new best validation loss.
    # After stopping, `model` holds the last epoch's weights; the best ones are in checkpoint_path.
    if epoch - best_epoch >= PATIENCE:
        print("Early stopping.")
        break



[1/100] Train Loss: 0.0681 | Acc: 0.9784
[1/100] Val   Loss: 0.1713 | Acc: 0.9389
New best val loss inf → 0.1713




[2/100] Train Loss: 0.0574 | Acc: 0.9818
[2/100] Val   Loss: 0.1163 | Acc: 0.9564
New best val loss 0.1713 → 0.1163




[3/100] Train Loss: 0.0527 | Acc: 0.9835
[3/100] Val   Loss: 0.1689 | Acc: 0.9368




[4/100] Train Loss: 0.0492 | Acc: 0.9844
[4/100] Val   Loss: 0.1318 | Acc: 0.9531




[5/100] Train Loss: 0.0477 | Acc: 0.9852
[5/100] Val   Loss: 0.1434 | Acc: 0.9487




[6/100] Train Loss: 0.0465 | Acc: 0.9851
[6/100] Val   Loss: 0.1503 | Acc: 0.9426




[7/100] Train Loss: 0.0444 | Acc: 0.9857
[7/100] Val   Loss: 0.1262 | Acc: 0.9532




[8/100] Train Loss: 0.0433 | Acc: 0.9865
[8/100] Val   Loss: 0.1574 | Acc: 0.9452




[9/100] Train Loss: 0.0418 | Acc: 0.9867
[9/100] Val   Loss: 0.1655 | Acc: 0.9397




[10/100] Train Loss: 0.0410 | Acc: 0.9869
[10/100] Val   Loss: 0.1256 | Acc: 0.9517




[11/100] Train Loss: 0.0405 | Acc: 0.9873
[11/100] Val   Loss: 0.1325 | Acc: 0.9484




[12/100] Train Loss: 0.0389 | Acc: 0.9877
[12/100] Val   Loss: 0.1360 | Acc: 0.9482




[13/100] Train Loss: 0.0380 | Acc: 0.9878
[13/100] Val   Loss: 0.1308 | Acc: 0.9526




[14/100] Train Loss: 0.0370 | Acc: 0.9883
[14/100] Val   Loss: 0.1504 | Acc: 0.9492


                                                  

[15/100] Train Loss: 0.0367 | Acc: 0.9881
[15/100] Val   Loss: 0.1478 | Acc: 0.9438
Early stopping.




### Train + Val

In [None]:
# Merge both splits into a single frame with a fresh 0..N-1 index.
full_df = pd.concat([train_df, val_df], ignore_index=True)

# Dataset and DataLoader
# NOTE(review): no excluded_ids are passed here, unlike the filtered datasets used
# for the first training run — confirm that keeping those samples is intended.
full_dataset = EmbeddingDataset(full_df)
# Rebinds `train_loader`: from this cell on it iterates over train + val.
train_loader = DataLoader(dataset=full_dataset, batch_size=64, shuffle=True)

In [None]:
#PARAMS
OPTIMIZER = 'Adam'
lr= 0.001
LOSS = 'BCELoss'
METRIC = 'Accuracy'
NUM_EPOCHS = 30
PATIENCE = 10

model_full = Classifier(input_size=full_dataset[0][0].shape[0], output_size=1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_full.to(device)

Classifier(
  (fc1): Linear(in_features=1024, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc4): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (sigmoid): Sigmoid()
)

In [None]:
# Build optimizer, loss and metric for model_full from the names configured in the PARAMS cell.
optimizer = getattr(torch.optim,OPTIMIZER)(model_full.parameters(), lr=lr)
criterion = getattr(torch.nn, LOSS)()
metric = getattr(torchmetrics, METRIC)('binary')

# Reset for parity with the first training run; neither value is read in this loop.
min_loss, best_epoch = float('inf'), 0

# No validation, checkpointing or early stopping here: the validation rows are part
# of the training data (train_loader was rebound to the train + val dataset), so the
# loop always runs for the full NUM_EPOCHS and only training figures are reported.
for epoch in range(NUM_EPOCHS):
    train_loss, train_metric = train_one_epoch(model_full, train_loader, optimizer, criterion, metric, device)

    print(f"[{epoch+1}/{NUM_EPOCHS}] Train Loss: {train_loss:.4f} | Acc: {train_metric:.4f}")



[1/30] Train Loss: 0.0757 | Acc: 0.9752




[2/30] Train Loss: 0.0627 | Acc: 0.9798




[3/30] Train Loss: 0.0576 | Acc: 0.9812




[4/30] Train Loss: 0.0558 | Acc: 0.9818




[5/30] Train Loss: 0.0533 | Acc: 0.9827




[6/30] Train Loss: 0.0504 | Acc: 0.9836




[7/30] Train Loss: 0.0485 | Acc: 0.9841




[8/30] Train Loss: 0.0473 | Acc: 0.9844




[9/30] Train Loss: 0.0456 | Acc: 0.9851




[10/30] Train Loss: 0.0446 | Acc: 0.9854




[11/30] Train Loss: 0.0435 | Acc: 0.9859




[12/30] Train Loss: 0.0432 | Acc: 0.9859




[13/30] Train Loss: 0.0420 | Acc: 0.9863




[14/30] Train Loss: 0.0414 | Acc: 0.9864




[15/30] Train Loss: 0.0402 | Acc: 0.9871




[16/30] Train Loss: 0.0390 | Acc: 0.9874




[17/30] Train Loss: 0.0393 | Acc: 0.9872




[18/30] Train Loss: 0.0381 | Acc: 0.9876




[19/30] Train Loss: 0.0373 | Acc: 0.9879




[20/30] Train Loss: 0.0372 | Acc: 0.9878




[21/30] Train Loss: 0.0362 | Acc: 0.9883




[22/30] Train Loss: 0.0360 | Acc: 0.9883




[23/30] Train Loss: 0.0346 | Acc: 0.9885




[24/30] Train Loss: 0.0346 | Acc: 0.9887




[25/30] Train Loss: 0.0343 | Acc: 0.9890




[26/30] Train Loss: 0.0337 | Acc: 0.9891




[27/30] Train Loss: 0.0333 | Acc: 0.9890




[28/30] Train Loss: 0.0330 | Acc: 0.9895




[29/30] Train Loss: 0.0317 | Acc: 0.9896




[30/30] Train Loss: 0.0317 | Acc: 0.9898


### Test

In [None]:
# === Parameters ===
test_embeddings_path = "/content/drive/MyDrive/data/embeddings_final_test.pkl"
# The model weights are restored from `checkpoint_path`, defined in the training PARAMS cell.
output_csv_path = "submission_medimageins_wo_caille.csv"  # submission file written by the inference cell
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# === Load the test embeddings ===
df_test = pd.read_pickle(test_embeddings_path)

In [None]:
# === Load the model ===
# Width of one test embedding (only needed if the network were rebuilt from scratch).
input_size = len(df_test["embedding"].iloc[0])
# Reuse the `model` instance trained above and restore the best weights saved during training.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.eval()

# === Prediction ===
solutions_data = {"ID": [], "Pred": []}

# Samples are scored one at a time; gradients are disabled for the whole loop.
with torch.no_grad():
    for raw_id, raw_embedding in tqdm(zip(df_test["ID"], df_test["embedding"]), total=len(df_test)):
        embedding = torch.tensor(raw_embedding).float().unsqueeze(0).to(device)
        pred = model(embedding).item()

        solutions_data["ID"].append(int(raw_id))  # ID comes from the "ID" column
        solutions_data["Pred"].append(int(pred > 0.5))  # threshold the sigmoid output at 0.5

# === Export .csv ===
df_submission = pd.DataFrame(solutions_data).set_index("ID")
df_submission.to_csv(output_csv_path)

print(f"Submission saved to {output_csv_path}")


100%|██████████| 85054/85054 [01:05<00:00, 1306.81it/s]

✅ Submission saved to submission_medimageins_wo_caille.csv





In [None]:
# === Load the model ===
# Width of one test embedding (only needed if the network were rebuilt from scratch).
input_size = len(df_test["embedding"].iloc[0])
# `model_full` is the in-memory network trained on train + val; no checkpoint is loaded.
model_full.eval()

# Dedicated output file for this model, so the submission written from the
# best-checkpoint model (`output_csv_path`) is not overwritten.
output_csv_path_full = "submission_medimageins_full.csv"

# === Prediction ===
solutions_data = {"ID": [], "Pred": []}

# Samples are scored one at a time; gradients are disabled for the whole loop.
with torch.no_grad():
    for raw_id, raw_embedding in tqdm(zip(df_test["ID"], df_test["embedding"]), total=len(df_test)):
        embedding = torch.tensor(raw_embedding).float().unsqueeze(0).to(device)
        pred = model_full(embedding).item()

        solutions_data["ID"].append(int(raw_id))  # ID comes from the "ID" column
        solutions_data["Pred"].append(int(pred > 0.5))  # threshold the sigmoid output at 0.5

# === Export .csv ===
df_submission = pd.DataFrame(solutions_data).set_index("ID")
df_submission.to_csv(output_csv_path_full)

print(f"Submission saved to {output_csv_path_full}")


100%|██████████| 85054/85054 [01:05<00:00, 1300.48it/s]


✅ Submission saved to submission_medimageins_full.csv


In [None]:
!ls

drive  sample_data  submission_medimageins.csv


In [None]:
# Display the test frame for a visual sanity check.
df_test

Unnamed: 0,ID,label,embedding
0,0,,"[-0.060545433, -0.0066295075, 0.0133655695, 0...."
1,1,,"[-0.06779506, 0.0030385004, 0.027163219, -0.01..."
2,10,,"[-0.06873386, -8.134835e-05, 0.019373346, -0.0..."
3,100,,"[-0.047853183, 0.0013519925, 0.014813482, -0.0..."
4,1000,,"[-0.055740744, -0.00045497064, 0.011394967, -0..."
...,...,...,...
85049,9995,,"[-0.062102914, -0.0021868548, 0.017434511, -0...."
85050,9996,,"[-0.062059645, 0.007868373, 0.018032223, -0.02..."
85051,9997,,"[-0.061726414, 0.0077918186, 0.021779917, -0.0..."
85052,9998,,"[-0.060853273, 0.0030750649, 0.02285435, -0.01..."


In [None]:
# Display the most recently built submission frame.
df_submission

Unnamed: 0_level_0,Pred
ID,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,1
...,...
85049,1
85050,1
85051,1
85052,1


In [None]:
# First 40 rows of the submission frame.
df_submission.head(40)

Unnamed: 0_level_0,Pred
ID,Unnamed: 1_level_1
0,1
1,1
10,1
100,1
1000,1
10000,1
10001,1
10002,1
10003,1
10004,1
