In [None]:
# Backbone whose pre-extracted patch features are used; exactly one assignment is active.
feature_extractor = "resnet50"
#feature_extractor = "phykon"
#feature_extractor = "univ1"
#feature_extractor = "univ2"
# Input dimensionality handed to the VAE (the dataset-selection cell overrides it for "univ2").
IN_DIM_M = 1024
# Number of epochs of the VAE training loop.
N_epoch = 30
# Number of epochs passed to the diffusion training call.
N_epoch_dif = 30
# Only referenced by the disabled (string-wrapped) MIL training cell.
N_epoch_aug = 30

In [None]:
import os
import zipfile

# Dataset description per feature extractor:
#   (Zenodo record id, archive/folder tag, patch-grid folder, feature folder)
_EXTRACTOR_SOURCES = {
    "resnet50": ("15711642", "resnet", "20x_256px_0px_overlap", "features_resnet50"),
    "phykon":   ("15736995", "phikon", "20x_224px_0px_overlap", "features_phikon_v2"),
    "univ1":    ("15711374", "univ1",  "20x_256px_0px_overlap", "features_uni_v1"),
    "univ2":    ("15732622", "univ2",  "20x_256px_0px_overlap", "features_uni_v2"),
}

if feature_extractor not in _EXTRACTOR_SOURCES:
    raise ValueError("Feature extractor not supported")

_record, _tag, _grid, _feat_dir = _EXTRACTOR_SOURCES[feature_extractor]

# Download URL of the archive and the per-class feature folders it unpacks to.
url = f"https://zenodo.org/records/{_record}/files/datasetTrident_{_tag}.zip?download=1"
rootB = f"./trident_processed_{_tag}/B/{_grid}/{_feat_dir}/"
rootE = f"./trident_processed_{_tag}/E/{_grid}/{_feat_dir}/"
rootS = f"./trident_processed_{_tag}/S/{_grid}/{_feat_dir}/"

# Only "univ2" changes the feature dimensionality set in the config cell.
if feature_extractor == "univ2":
    IN_DIM_M = 1536


In [None]:

# Download the file using wget
!wget -O /content/Train.zip "$url"

# Define the extraction path
extract_path = './'

# Create the extraction directory if it doesn't exist
os.makedirs(extract_path, exist_ok=True)

# Extract the ZIP file
with zipfile.ZipFile('./Train.zip', 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# List the contents of the extracted folder
extracted_files = os.listdir(extract_path)
print("Extracted files:", extracted_files)

del url, extract_path, extracted_files

In [None]:
import torch
import h5py

In [None]:
# Provides the Unet / GaussianDiffusion classes imported by the diffusion cells.
# NOTE(review): the version is not pinned — consider pinning it for reproducibility.
!pip install denoising-diffusion-pytorch

In [None]:
# dataset.py
import os, h5py, torch
from torch.utils.data import Dataset

class WSIFeatureBag(Dataset):
    """Single-item dataset wrapping the HDF5 feature file of one slide.

    Whatever index is requested, the item is the complete ``'features'`` array
    of the file (converted to a float tensor) paired with the class label.
    """

    def __init__(self, h5_path, cls_label):
        self.path = h5_path
        self.label = torch.tensor(cls_label, dtype=torch.long)

    def __len__(self):
        # One bag corresponds to one slide.
        return 1

    def __getitem__(self, idx):
        # The file is opened on every access, so no handle is kept between calls.
        with h5py.File(self.path, 'r') as h5_file:
            patch_feats = torch.from_numpy(h5_file['features'][:]).float()
        return patch_feats, self.label


In [None]:
# models/vae.py
import torch, torch.nn as nn, torch.nn.functional as F

class FeatureVAE(nn.Module):
    """Fully connected VAE over feature vectors.

    Encoder: in_dim -> 512 -> 256 -> (mu, logv), both heads of size latent_dim.
    Decoder: latent_dim -> 256 -> 512 -> in_dim, with a linear output layer.
    """

    def __init__(self, in_dim=1024, latent_dim=128):
        super().__init__()
        self.latent_dim = latent_dim

        # Encoder trunk followed by the mean / log-variance heads
        self.fc1 = nn.Linear(in_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.mu = nn.Linear(256, latent_dim)
        self.logv = nn.Linear(256, latent_dim)

        # Decoder mirrors the encoder widths
        self.fc3 = nn.Linear(latent_dim, 256)
        self.fc4 = nn.Linear(256, 512)
        self.fc5 = nn.Linear(512, in_dim)

    def encode(self, x):
        """Return ``(mu, logv)`` of the approximate posterior for ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.mu(hidden), self.logv(hidden)

    def reparameterize(self, mu, logv):
        """Draw ``mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(logv / 2)``."""
        sigma = torch.exp(0.5 * logv)
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def decode(self, z):
        """Map a latent vector back to feature space."""
        hidden = F.relu(self.fc3(z))
        hidden = F.relu(self.fc4(hidden))
        return self.fc5(hidden)

    def forward(self, x):
        """Encode, sample a latent and decode; returns ``(reconstruction, mu, logv)``."""
        mu, logv = self.encode(x)
        latent = self.reparameterize(mu, logv)
        return self.decode(latent), mu, logv


In [None]:
# build_dataset.py
from glob import glob
from torch.utils.data import ConcatDataset
import numpy as np


# Per-class .h5 files, sorted so the slide order is deterministic.
filesB = sorted(glob(f"{rootB}/*.h5"))
filesE = sorted(glob(f"{rootE}/*.h5"))
filesS = sorted(glob(f"{rootS}/*.h5"))

# Integer class ids, one per file: B -> 0, E -> 1, S -> 2.
labelsB = np.full(len(filesB), 0, dtype=int)
labelsE = np.full(len(filesE), 1, dtype=int)
labelsS = np.full(len(filesS), 2, dtype=int)

# One flat file list with a label vector in the same order.
files = [*filesB, *filesE, *filesS]
labels = np.concatenate([labelsB, labelsE, labelsS])

# Each slide is a one-item dataset; the concatenation therefore has one item per slide.
bags = [WSIFeatureBag(path, lab) for path, lab in zip(files, labels)]
dataset = ConcatDataset(bags)
loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)

del bags, files, labels

In [None]:
# Train the feature VAE; every batch produced by `loader` is one slide (batch_size=1).
vae = FeatureVAE(in_dim = IN_DIM_M).cuda()
opt = torch.optim.Adam(vae.parameters(), lr=1e-4)

for epoch in range(N_epoch):
    for feats, _ in loader:
        feats = feats.cuda()
        xr, mu, logv = vae(feats)

        # Reconstruction error plus a KL term weighted by 1e-3.
        mse = F.mse_loss(xr, feats)
        kld = -0.5 * torch.mean(1 + logv - mu.pow(2) - logv.exp())
        loss = mse + 1e-3 * kld

        opt.zero_grad()
        loss.backward()
        opt.step()

        del feats, xr, mu, logv

    # The values reported are those of the last batch of the epoch, scaled by 1000.
    if epoch % 5 == 0:
        print(f"Epoch {epoch} --- MSE: {mse * 1000:.4f}, KLD: {kld * 1000:.4f}, Loss: {loss * 1000:.4f}")

print("Done!")
print(f"Epoch {N_epoch} MSE: {mse * 1000:.4f}, KLD: {kld * 1000:.4f}, Loss: {loss * 1000:.4f}")

In [None]:
from denoising_diffusion_pytorch import Unet, GaussianDiffusion

def create_diffusion_model(height=8, width=16):
    """Build a GaussianDiffusion wrapper around a single-channel Unet.

    Args:
        height: first component of the ``image_size`` given to the diffusion model.
        width: second component of the ``image_size`` given to the diffusion model.

    Returns:
        The freshly constructed GaussianDiffusion instance (250 timesteps,
        100 sampling timesteps).
    """
    # NOTE(review): all other GaussianDiffusion options are left at the library
    # defaults — confirm they suit VAE latents rather than images.
    denoiser = Unet(dim=64, dim_mults=(1, 2, 4), channels=1)
    return GaussianDiffusion(
        model=denoiser,
        image_size=(height, width),
        timesteps=250,
        sampling_timesteps=100,
    )


In [None]:
from torch.utils.data import TensorDataset, DataLoader

def train_diffusion_on_latents(vae, bag_loader, device='cuda', epochs=10, latent_hw=(8, 16)):
    """Train a diffusion model on the VAE latents of every patch of every slide.

    All slides are first encoded (posterior means only) into one flat latent
    dataset, which is then iterated with its own patch-level DataLoader so the
    bag-level and patch-level loaders are never mixed up.

    Args:
        vae: VAE exposing ``encode(x) -> (mu, logvar)``; it is switched to eval mode.
        bag_loader: iterable yielding ``(bag, label)`` pairs, one slide per item;
            ``bag`` is squeezed on dim 0 before encoding.
        device: device used for encoding and for training.
        epochs: number of passes over the latent dataset.
        latent_hw: ``(height, width)`` the flat latent vector is reshaped to; the
            product must equal the VAE latent size (default 8 * 16 = 128).

    Returns:
        The trained diffusion model built by ``create_diffusion_model``.

    Raises:
        ValueError: if the latent size does not match ``latent_hw`` or the
            loader yields no patches at all.
    """
    height, width = latent_hw
    chunk_size = 256  # encode in chunks to avoid running out of memory

    # 1. Encode every patch of every slide
    vae.eval()
    all_latents = []
    with torch.no_grad():
        for bag, _ in bag_loader:  # one bag (slide) per iteration
            feats = bag.squeeze(0).to(device)  # [N_patches, feature_dim]
            for start in range(0, feats.size(0), chunk_size):
                z_mu, _ = vae.encode(feats[start:start + chunk_size])
                if z_mu.shape[-1] != height * width:
                    raise ValueError(
                        f"latent size {z_mu.shape[-1]} cannot be reshaped to "
                        f"{height}x{width}"
                    )
                all_latents.append(z_mu.view(-1, 1, height, width).cpu())

    if not all_latents:
        raise ValueError("bag_loader yielded no patches to train the diffusion model on")

    # 2. Merge all latents into a single dataset
    latent_dataset = torch.cat(all_latents, dim=0)  # [N_tot_patch, 1, height, width]

    # Mixed precision and pinned memory only make sense on CUDA devices.
    use_amp = torch.device(device).type == 'cuda'

    # 3. Patch-level DataLoader over the latents
    patch_loader = DataLoader(
        TensorDataset(latent_dataset),
        batch_size=128,
        shuffle=True,
        pin_memory=use_amp
    )

    # 4. Diffusion model, optimizer and gradient scaler
    diffusion = create_diffusion_model(height, width).to(device)
    optimizer = torch.optim.Adam(diffusion.parameters(), lr=8e-5)
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    # 5. Training
    diffusion.train()
    for epoch in range(epochs):
        total_loss = 0
        for i, (latents,) in enumerate(patch_loader):
            latents = latents.to(device, non_blocking=True)

            with torch.cuda.amp.autocast(enabled=use_amp):
                loss = diffusion(latents)

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

            # Running mean of the loss, reported every 100 batches.
            if i % 100 == 0:
                avg_loss = total_loss / (i + 1)
                print(f"Epoca {epoch+1} | Batch {i}/{len(patch_loader)} | Loss: {avg_loss:.4f}")

        avg_epoch_loss = total_loss / len(patch_loader)
        print(f"Epoca {epoch+1} completata | Loss media: {avg_epoch_loss:.4f}")

    return diffusion


In [None]:
from torch.utils.data import DataLoader, TensorDataset

def train_diffusion_on_latents_old(vae, data_loader, device='cuda', epochs=50):
    """Earlier variant of the latent-diffusion training routine.

    Encodes each slide in one shot (no chunking), reshapes the posterior means
    to ``[N, 1, 8, 16]`` and trains a diffusion model on them in full precision.

    Args:
        vae: VAE exposing ``latent_dim`` and ``encode``; switched to eval mode.
        data_loader: iterable yielding ``(feats, label)`` pairs, one slide per item.
        device: device used for encoding and training.
        epochs: number of passes over the latent dataset.

    Returns:
        The trained diffusion model.
    """
    vae.eval()

    # Reshape target for the flat latent vector
    latent_dim = vae.latent_dim  # read here but not used below
    height, width = 8, 16        # 8 * 16 = 128

    # Encode every slide and reshape its latents
    all_latents = []
    with torch.no_grad():
        for feats, _ in data_loader:
            slide_feats = feats.squeeze(0).to(device)
            z_mu, _ = vae.encode(slide_feats)
            all_latents.append(z_mu.view(-1, 1, height, width).cpu())
            del feats, slide_feats, z_mu

    del data_loader
    print(f"Reshaped latents shape: {all_latents[0].shape}")
    print(f"Total number of latents: {len(all_latents)}")

    print("Training Diffusion Model...")
    latent_dataset = torch.cat(all_latents, dim=0)
    latent_loader = DataLoader(TensorDataset(latent_dataset), batch_size=128, shuffle=True)

    diffusion = create_diffusion_model(height, width).to(device)
    optimizer = torch.optim.Adam(diffusion.parameters(), lr=8e-5)

    diffusion.train()
    for epoch in range(epochs):
        i = 0
        for i, (latents,) in enumerate(latent_loader, start=1):
            latents = latents.to(device)
            loss = diffusion(latents)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if i % 100 == 0:
                print(f" Batch {i}/{len(latent_loader)}")
            del latents
        # Reports the loss of the last batch of the epoch.
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

    return diffusion


In [None]:
# Fit the latent diffusion model on the VAE-encoded patches of every slide in `loader`.
diffusion = train_diffusion_on_latents(vae, loader, epochs=N_epoch_dif)

In [None]:
def augment_features_kstep(feats, vae, diffusion_model, K=0.2):
    """Augment features by a partial (K-step) diffusion round trip in latent space.

    The features are encoded by the VAE, noised forward to timestep
    ``k = int(K * T)`` and then denoised step by step from that same noisy
    latent back to timestep 0, so the result stays tied to the input. Sampling
    with ``p_sample_loop`` instead would start from pure noise and ignore the
    input entirely.

    Args:
        feats: tensor of shape ``[N_patches, feature_dim]`` on the models' device.
        vae: VAE exposing ``encode`` / ``decode``; switched to eval mode.
        diffusion_model: diffusion wrapper exposing ``num_timesteps``,
            ``q_sample``, ``p_sample``, ``normalize`` and ``unnormalize``;
            switched to eval mode.
        K: fraction of the diffusion schedule used for noising, in [0, 1].

    Returns:
        Augmented features with the same leading size as ``feats``.
    """
    device = feats.device
    vae.eval()
    diffusion_model.eval()

    with torch.no_grad():
        # Encode the whole bag; only the posterior mean is used.
        z_mu, _ = vae.encode(feats)                      # [N_patches, latent_dim]
        z_input = z_mu.view(z_mu.shape[0], 1, 8, 16)     # [N_patches, 1, 8, 16]

        # Timestep to noise to, clamped to the valid index range [0, T - 1].
        T = diffusion_model.num_timesteps
        k = min(max(int(K * T), 0), T - 1)
        t = torch.full((z_input.shape[0],), k, device=device, dtype=torch.long)

        # Forward diffusion in the value range the model was trained on.
        # NOTE(review): relies on the model's forward() applying `normalize`
        # to its training inputs — confirm for the installed library version.
        z_t = diffusion_model.q_sample(diffusion_model.normalize(z_input), t)

        # Reverse diffusion from timestep k down to 0, starting at the noisy latent.
        x_start = None
        use_self_cond = getattr(diffusion_model, "self_condition", False)
        for step in reversed(range(k + 1)):
            self_cond = x_start if use_self_cond else None
            z_t, x_start = diffusion_model.p_sample(z_t, step, self_cond)

        z_denoised = diffusion_model.unnormalize(z_t)

        # Back to flat latents, then to feature space.
        augmented_feats = vae.decode(z_denoised.reshape(z_mu.shape))

    return augmented_feats


In [None]:
def augment_features_kstep_old(original_feats, vae, diffusion_model, K=0.2):
    """Legacy entry point for K-step latent augmentation.

    Kept so existing callers keep working. It puts both models in eval mode and
    delegates to ``augment_features_kstep`` instead of carrying a second,
    copy-pasted implementation of the same procedure.

    Args:
        original_feats: feature tensor to augment.
        vae: VAE used for encoding / decoding; switched to eval mode.
        diffusion_model: diffusion model; switched to eval mode.
        K: fraction of the diffusion schedule used for noising.

    Returns:
        The augmented features returned by ``augment_features_kstep``.
    """
    vae.eval()
    diffusion_model.eval()
    return augment_features_kstep(original_feats, vae, diffusion_model, K=K)


In [None]:
# Disabled experiment: the ABMIL classifier and its AugDiff training loop are wrapped in a
# string literal, so none of the code below is executed.
# NOTE(review): as the only expression of the cell the string is presumably echoed as cell
# output — consider deleting the cell or moving the code into a module.
'''
class ABMIL(nn.Module):
    def __init__(self, in_dim=1024, n_classes=3):
        super().__init__()
        self.embed = nn.Sequential(
            nn.Linear(in_dim, 512), nn.ReLU(),
            nn.Linear(512, 256), nn.Tanh()
        )
        self.att  = nn.Sequential(
            nn.Linear(256,128), nn.Tanh(),
            nn.Linear(128,1)
        )
        self.cls  = nn.Linear(256, n_classes)

    def forward(self, bag):
        H = self.embed(bag)              # (n_inst,256)
        A = torch.softmax(self.att(H),0) # (n_inst,1)
        M = torch.sum(A*H, 0, keepdim=True)
        return self.cls(M)               # (1,3)

def train_mil_with_augdiff(vae, diffusion_model, data_loader, device='cuda',epochs = 4):
    mil_model = ABMIL(in_dim=1024, n_classes=3).to(device)
    optimizer = torch.optim.Adam(mil_model.parameters(), lr=1e-4)

    for epoch in range(epochs):
        for bag, label in data_loader:
            original_feats = bag.squeeze(0).to(device)  # [n_patches, 1024]

            # ✓ FEATURE AUGMENTATION con AugDiff
            augmented_feats = augment_features_kstep(
                original_feats, vae, diffusion_model, K=0.2
            )

            # MIL forward pass
            logits = mil_model(augmented_feats)
            loss = F.cross_entropy(logits, label.to(device))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            del bag, label, original_feats, augmented_feats, logits

    return mil_model
device = 'cuda'
mil_model = train_mil_with_augdiff(vae, diffusion, loader,device,N_epoch_aug)
'''

In [None]:
!mkdir datasetResnetExtrapolation

In [None]:
import gc
# Collect unreachable Python objects first so tensors they hold can be released,
gc.collect()
# then hand PyTorch's cached, unused GPU memory back to the driver.
torch.cuda.empty_cache()

In [None]:
def augment_features_kstep_batched(feats, vae, diffusion, K=0.2, batch_size=256, device='cuda'):
    """Run ``augment_features_kstep`` over a bag in fixed-size chunks.

    Each chunk is moved to ``device``, augmented, and moved back to the CPU so
    only one chunk at a time occupies accelerator memory.

    Args:
        feats: tensor of shape ``[N_patches, feature_dim]``.
        vae: VAE passed through to ``augment_features_kstep``.
        diffusion: diffusion model passed through to ``augment_features_kstep``.
        K: fraction of the diffusion schedule used for noising.
        batch_size: number of patches processed per chunk.
        device: device the chunks are processed on (defaults to the previously
            hard-coded CUDA device).

    Returns:
        CPU tensor with the augmented features of all chunks concatenated on
        dim 0; an empty CPU tensor of the input's shape when the bag is empty.
    """
    # An empty bag would otherwise fail in torch.cat on an empty list.
    if feats.shape[0] == 0:
        return torch.empty_like(feats, device='cpu')

    augmented = []
    for start in range(0, feats.shape[0], batch_size):
        chunk = feats[start:start + batch_size].to(device)
        aug_chunk = augment_features_kstep(chunk, vae, diffusion, K=K)
        augmented.append(aug_chunk.cpu())
        # Release the chunk before the next one is moved to the device.
        del chunk, aug_chunk
        torch.cuda.empty_cache()
    return torch.cat(augmented, dim=0)


In [None]:
# Per-class collections of (original and augmented) feature bags.
B_f, E_f, S_f = [], [], []

# Deterministic, one-slide-per-batch loader used for the export.
bag_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)

# Sanity check: shapes of the first dataset item.
first_item = dataset[0]
if not isinstance(first_item, tuple):
    print(f"Shape del primo elemento del dataset: {first_item.shape}")
else:
    features, label = first_item
    print(f"Shape delle features nel dataset: {features.shape}")
    print(f"Shape della label nel dataset: {label.shape if hasattr(label, 'shape') else type(label)}")

# Sanity check: shapes of the first collated batch (tuple or list, both are indexable).
for batch in bag_loader:
    batch_features, batch_label = batch[0], batch[1]
    print(f"Shape delle features nel primo batch: {batch_features.shape}")
    print(f"Shape delle label nel primo batch: {batch_label.shape if hasattr(batch_label, 'shape') else type(batch_label)}")
    break  # first batch only
del first_item

In [None]:
# Augment every slide once; the original bag and its augmented copy are both
# stored in the list of the slide's class (0 -> B_f, 1 -> E_f, anything else -> S_f).
for bag, y in bag_loader:
    feats = bag[0]  # drop the batch dimension added by the loader
    aug_feats = augment_features_kstep_batched(feats, vae, diffusion, K=0.2, batch_size=256)
    print(f"Shape delle features nel primo batch: {feats.shape}")
    print(f"Shape delle features generate nel primo batch: {aug_feats.shape}")

    class_id = int(y)
    target = B_f if class_id == 0 else E_f if class_id == 1 else S_f
    target.extend((feats, aug_feats))


In [None]:
# Second pass restricted to the S slides: `dataset` and `bag_loader` are rebuilt
# from filesS / labelsS only, and each slide plus a fresh augmentation is appended
# again (presumably to oversample class S — confirm the intent).
dataset = ConcatDataset(
    [WSIFeatureBag(path, lab) for path, lab in zip(filesS, np.concatenate([labelsS]))]
)
bag_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)

for bag, y in bag_loader:
    feats = bag[0]  # drop the batch dimension added by the loader
    aug_feats = augment_features_kstep_batched(feats, vae, diffusion, K=0.2, batch_size=256)

    class_id = int(y)
    target = B_f if class_id == 0 else E_f if class_id == 1 else S_f
    target.extend((feats, aug_feats))

In [None]:
# Rows of the composition CSV: [case_id, slide_id, label].
dataset_csv = []
# Running index that makes every exported bag name unique across classes.
j = 0

folder_dest = "./datasetResnetExtrapolation/"
# Make sure the destination exists even if the mkdir cell was skipped.
os.makedirs(folder_dest, exist_ok=True)

# Export order is B, then S, then E; each bag becomes "<folder_dest>M-<j>.pt".
for class_name, class_bags in (('B', B_f), ('S', S_f), ('E', E_f)):
    for bag_feats in class_bags:
        # Format the index into the name: concatenating str and int raises TypeError.
        v = f"M-{j}"
        j += 1
        torch.save(bag_feats, folder_dest + v + ".pt")
        dataset_csv.append([v, v, class_name])

In [None]:
import csv
# Path of the CSV describing the exported dataset
file_csv = f"{folder_dest}datasetComposition.csv"

# Write the header row followed by one row per exported bag.
with open(file_csv, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows([['case_id', 'slide_id', 'label'], *dataset_csv])

print(f"File CSV salvato: {file_csv}")

In [None]:
# Create a ZIP archive of a folder
def create_zip_archive(folder_path, zip_name=None):
    """Create a ZIP archive of a folder, keeping its internal structure.

    Entries are stored relative to the folder's parent, so every entry starts
    with the folder's own name. The path is made absolute first, which makes
    the result independent of a trailing separator (``"data/"`` and ``"data"``
    produce the same archive and the same default name).

    Args:
        folder_path: folder to archive.
        zip_name: path of the archive to write; defaults to
            ``"<folder name>.zip"`` in the current directory.

    Returns:
        The archive path, as passed in or as derived from the folder name.
    """
    # A trailing separator would make basename() empty and dirname() the folder itself.
    folder_path = os.path.abspath(folder_path)

    if zip_name is None:
        zip_name = f"{os.path.basename(folder_path)}.zip"

    print(f"🗜️ Creando archivio ZIP: {zip_name}")

    parent_dir = os.path.dirname(folder_path)
    zip_abs = os.path.abspath(zip_name)

    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                # Never add the archive to itself when it lives inside the folder.
                if os.path.abspath(file_path) == zip_abs:
                    continue
                # Keep the folder structure inside the archive
                arc_name = os.path.relpath(file_path, parent_dir)
                zipf.write(file_path, arc_name)
                print(f"  📁 Aggiunto: {arc_name}")

    # Report the archive size
    zip_size = os.path.getsize(zip_name)
    print(f"✅ Archivio creato: {zip_name} ({zip_size/1024/1024:.2f} MB)")
    return zip_name

In [None]:
# Package the export folder for upload
zip_filename = 'datasetCompleted.zip'  # Name of the archive
# zip_path receives the value returned by create_zip_archive for folder_dest.
zip_path = create_zip_archive(folder_dest, zip_filename)


In [None]:
import torch
import requests
import json
import os

# Zenodo access token, read from the environment so the secret never lives in the notebook.
# SECURITY: a literal token used to be committed on this line; treat it as leaked and
# revoke it in the Zenodo account settings.
ACCESS_TOKEN = os.environ.get("ZENODO_ACCESS_TOKEN", "")
if not ACCESS_TOKEN:
    print("WARNING: ZENODO_ACCESS_TOKEN is not set; Zenodo API requests will be rejected.")

# Create a new deposition
def create_deposition(title):
    """Create a new Zenodo deposition with the given title.

    Args:
        title: title stored in the deposition metadata.

    Returns:
        The parsed JSON body of the API response.

    Raises:
        requests.HTTPError: if Zenodo answers with an error status, instead of
            handing the error payload back to the caller as if it were a deposition.
    """
    url = 'https://zenodo.org/api/deposit/depositions'
    headers = {"Content-Type": "application/json"}
    params = {'access_token': ACCESS_TOKEN}

    data = {
        'metadata': {
            'title': title,
            'upload_type': 'dataset',
            'description': 'Dataset WSI project MLiA',
            'creators': [{'name': 'Raf-Tony-Luca'}]
        }
    }

    r = requests.post(url, params=params, data=json.dumps(data), headers=headers)
    # Fail here with the HTTP status rather than later with a KeyError on 'id'.
    r.raise_for_status()
    return r.json()

# Upload the file
def upload_file(deposition_id, file_path):
    """Upload a local file into the bucket of an existing deposition.

    Args:
        deposition_id: id of the deposition to upload into.
        file_path: path of the local file; its base name is used as the remote name.

    Returns:
        The parsed JSON body of the upload response.

    Raises:
        requests.HTTPError: if fetching the deposition or uploading the file
            returns an error status.
    """
    # Look up the bucket URL of the deposition
    url = f'https://zenodo.org/api/deposit/depositions/{deposition_id}'
    params = {'access_token': ACCESS_TOKEN}
    r = requests.get(url, params=params)
    # Fail with the HTTP status rather than with a KeyError on 'links'.
    r.raise_for_status()
    bucket_url = r.json()["links"]["bucket"]

    # Stream the file into the bucket
    filename = os.path.basename(file_path)
    with open(file_path, "rb") as fp:
        r = requests.put(f"{bucket_url}/{filename}",
                        data=fp,
                        params=params)
    r.raise_for_status()
    return r.json()

# Publish the dataset
def publish_deposition(deposition_id):
    """Publish a deposition through Zenodo's publish action.

    Args:
        deposition_id: id of the deposition to publish.

    Returns:
        The parsed JSON body of the API response.

    Raises:
        requests.HTTPError: if Zenodo answers with an error status.
    """
    url = f'https://zenodo.org/api/deposit/depositions/{deposition_id}/actions/publish'
    params = {'access_token': ACCESS_TOKEN}
    r = requests.post(url, params=params)
    # Fail here with the HTTP status rather than later with a KeyError on 'doi'.
    r.raise_for_status()
    return r.json()


In [None]:
# Run the upload: create the deposition, attach the archive, then publish it.
# NOTE(review): publishing is presumably irreversible on Zenodo — confirm before running.
print("Creando deposizione...")
deposition = create_deposition(f"dataset_trident_{feature_extractor}_AugDiff")
deposition_id = deposition['id']

print(f"Caricando file... (ID: {deposition_id})")
upload_result = upload_file(deposition_id, zip_filename)

print("Pubblicando dataset...")
publication = publish_deposition(deposition_id)

print(f"Dataset pubblicato! DOI: {publication['doi']}")
print(f"URL: {publication['links']['record_html']}")