In [None]:
import torch
import h5py

In [None]:
import os
import zipfile

# URL for the dataset
url = "https://zenodo.org/records/15711642/files/datasetTrident_resnet.zip?download=1"

# Download the file using wget
!wget -O /content/Train.zip "$url"

# Define the extraction path
extract_path = './'

# Create the extraction directory if it doesn't exist
os.makedirs(extract_path, exist_ok=True)

# Extract the ZIP file
with zipfile.ZipFile('./Train.zip', 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# List the contents of the extracted folder
extracted_files = os.listdir(extract_path)
print("Extracted files:", extracted_files)


--2025-06-26 13:45:02--  https://zenodo.org/records/15711642/files/datasetTrident_resnet.zip?download=1
Resolving zenodo.org (zenodo.org)... 188.185.43.25, 188.185.45.92, 188.185.48.194, ...
Connecting to zenodo.org (zenodo.org)|188.185.43.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1044466718 (996M) [application/octet-stream]
Saving to: ‘/content/Train.zip’


2025-06-26 13:45:52 (20.0 MB/s) - ‘/content/Train.zip’ saved [1044466718/1044466718]

Extracted files: ['.config', 'trident_processed_resnet', '.ipynb_checkpoints', 'results_features', 'Train.zip', 'sample_data']


In [None]:
# dataset.py
import os, h5py, torch
from torch.utils.data import Dataset

class WSIFeatureBag(Dataset):
    """Single-item dataset wrapping one HDF5 feature file and its class label.

    The file is opened lazily on every access, so constructing a bag does not
    touch the disk.

    Args:
        h5_path: path to an HDF5 file containing a ``'features'`` dataset.
        cls_label: integer class index, stored as a ``torch.long`` tensor.
    """

    def __init__(self, h5_path, cls_label):
        self.path = h5_path
        self.label = torch.tensor(cls_label, dtype=torch.long)

    def __len__(self):          # one bag = one slide
        return 1

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the only item of the bag.

        Raises:
            IndexError: if ``idx`` is not 0 (or -1). Without this, Python's
                sequence-iteration protocol (``for x in bag``) never stops,
                because it relies on ``IndexError`` to detect the end.
        """
        if idx not in (0, -1):
            raise IndexError(f"WSIFeatureBag holds a single item; got index {idx}")
        with h5py.File(self.path, 'r') as f:
            # Assumed to be (N, 1024) per the original annotation — TODO confirm.
            feats = torch.from_numpy(f['features'][:]).float()
        return feats, self.label


In [None]:
# models/vae.py
import torch, torch.nn as nn, torch.nn.functional as F

class FeatureVAE(nn.Module):
    """Fully connected variational auto-encoder over feature vectors.

    Encoder: in_dim -> 512 -> 256 -> (mu, log-variance), each of size latent_dim.
    Decoder: latent_dim -> 256 -> 512 -> in_dim, with a linear output layer.
    """

    def __init__(self, in_dim=1024, latent_dim=128):
        super().__init__()
        # Encoder trunk and the two Gaussian parameter heads.
        self.fc1 = nn.Linear(in_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.mu = nn.Linear(256, latent_dim)
        self.logv = nn.Linear(256, latent_dim)

        # Decoder mirrors the encoder widths.
        self.fc3 = nn.Linear(latent_dim, 256)
        self.fc4 = nn.Linear(256, 512)
        self.fc5 = nn.Linear(512, in_dim)

    def encode(self, x):
        """Return the posterior mean and log-variance for ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.mu(hidden), self.logv(hidden)

    def reparameterize(self, mu, logv):
        """Draw ``mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(logv / 2)``."""
        sigma = (0.5 * logv).exp()
        return mu + torch.randn_like(sigma) * sigma

    def decode(self, z):
        """Map latent vectors back to feature space (no output activation)."""
        hidden = F.relu(self.fc3(z))
        hidden = F.relu(self.fc4(hidden))
        return self.fc5(hidden)

    def forward(self, x):
        """Return ``(reconstruction, mu, logv)`` for ``x``."""
        mu, logv = self.encode(x)
        recon = self.decode(self.reparameterize(mu, logv))
        return recon, mu, logv


In [None]:
# build_dataset.py
from glob import glob
from torch.utils.data import ConcatDataset

# Per-class feature folders. The class index used as target is B=0, E=1, S=2.
rootB = "./trident_processed_resnet/B/20x_256px_0px_overlap/features_resnet50/"
rootE = "./trident_processed_resnet/E/20x_256px_0px_overlap/features_resnet50/"
rootS = "./trident_processed_resnet/S/20x_256px_0px_overlap/features_resnet50/"

filesB = sorted(glob(f"{rootB}/*.h5"))
filesE = sorted(glob(f"{rootE}/*.h5"))
filesS = sorted(glob(f"{rootS}/*.h5"))

# Plain Python lists: `+` concatenates them, whereas `+` on numpy arrays would
# add element-wise. This also avoids needing numpy, which this notebook never
# imports.
labelsB = [0] * len(filesB)
labelsE = [1] * len(filesE)
labelsS = [2] * len(filesS)

files = filesB + filesE + filesS
labels = labelsB + labelsE + labelsS

if not files:
    # ConcatDataset rejects an empty list with an opaque assertion; fail clearly.
    raise FileNotFoundError("No .h5 feature files found under ./trident_processed_resnet")

bags = [WSIFeatureBag(p, l) for p, l in zip(files, labels)]
dataset = ConcatDataset(bags)       # one item per slide
loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)

In [None]:
# Fit the feature VAE: reconstruction MSE plus a KL term weighted by 1e-3.
vae = FeatureVAE().cuda()
opt = torch.optim.Adam(vae.parameters(), lr=1e-4)
for epoch in range(30):
    # `loader` yields one (features, label) pair per step; labels are unused here.
    for feats, _ in loader:
        feats = feats.cuda()
        xr, mu, logv = vae(feats)
        mse = F.mse_loss(input=xr, target=feats)
        kld = -0.5 * torch.mean(1 + logv - mu.pow(2) - logv.exp())
        loss = mse + 1e-3 * kld
        opt.zero_grad()
        loss.backward()
        opt.step()


In [None]:
# models/latent_unet.py
from denoising_diffusion_pytorch import Unet, GaussianDiffusion

# Latent denoiser: every latent vector is handled as a 1x1 "image" whose
# channel count equals the latent dimensionality.
# NOTE(review): confirm that the installed denoising_diffusion_pytorch version
# accepts `loss_type` and that this Unet (three dim_mults stages after the
# first) can process a 1x1 spatial size.
latent_dim = 128
unet = Unet(dim=64, dim_mults=(1, 2, 4, 8), channels=latent_dim)

diffusion = GaussianDiffusion(
    model=unet,
    image_size=1,     # 1x1xD vectors treated as "images"
    timesteps=1000,   # T
    loss_type='l2',
)


In [None]:
# Train the latent diffusion model on the VAE posterior means.
#
# A dedicated optimizer is required: the `opt` in scope at this point was built
# from vae.parameters(), so stepping it would never update the denoiser and
# would instead keep modifying the already-trained VAE.
# The learning rate mirrors the 1e-4 used by the other optimizers in this notebook.
diffusion.cuda()   # the model was created on the CPU; inputs below are on the GPU
diff_opt = torch.optim.Adam(diffusion.parameters(), lr=1e-4)
for epoch in range(10):
    for feats, _ in loader:
        with torch.no_grad():   # the VAE is frozen during this stage
            z = vae.encode(feats.cuda())[0]
        # (..., D) -> (n, D, 1, 1): matches channels=latent_dim, image_size=1
        # declared for `diffusion`, and folds the DataLoader batch dimension
        # into the instance dimension.
        z = z.reshape(-1, z.shape[-1], 1, 1)
        loss = diffusion(z)
        diff_opt.zero_grad()
        loss.backward()
        diff_opt.step()


In [None]:
def augment_features(feats, K_ratio=0.2):
    """Build an augmented copy of a bag by partially noising and denoising its latents.

    Steps (all under ``torch.no_grad()``): encode ``feats`` with the global
    ``vae`` and keep the posterior mean, diffuse it forward to step
    ``t = int(K_ratio * diffusion.num_timesteps)`` with ``q_sample``, run the
    reverse process from that step, and decode the result with the VAE.

    Args:
        feats: feature tensor whose last dimension is the VAE input size;
            expected on the same device as ``vae`` and ``diffusion``.
        K_ratio: fraction of the diffusion schedule used as noising depth.

    Returns:
        The augmented features only. The original ``feats`` are NOT
        concatenated to the result; callers that want both must keep them.
    """
    with torch.no_grad():
        z0 = vae.encode(feats)[0]                  # posterior mean, (..., latent_dim)
        latent_shape = z0.shape
        # Clamp so that K_ratio=1.0 cannot index past the last timestep.
        t = min(int(K_ratio * diffusion.num_timesteps), diffusion.num_timesteps - 1)
        # (..., D) -> (n, D, 1, 1): latents are modelled as 1x1 images with D channels.
        z0 = z0.reshape(-1, latent_shape[-1], 1, 1)
        noise = torch.randn_like(z0)
        # Build the timestep tensor directly on the latents' device instead of
        # allocating on the CPU and calling .cuda().
        steps = torch.full((z0.shape[0],), t, dtype=torch.long, device=z0.device)
        zt = diffusion.q_sample(x_start=z0, t=steps, noise=noise)
        # NOTE(review): upstream denoising_diffusion_pytorch's p_sample_loop is
        # believed to take a shape, not z_t/t_start — confirm that the installed
        # version (or fork) supports starting the reverse process from `zt`.
        z_hat = diffusion.p_sample_loop(z_t=zt, t_start=t)
        feats_aug = vae.decode(z_hat.reshape(latent_shape))
    return feats_aug


In [None]:
# models/abmil.py
class ABMIL(nn.Module):
    """Attention-based multiple-instance classifier.

    Instances are embedded to 256 dimensions, scored by a small attention
    network, softmax-normalised along dim 0, and the attention-weighted sum of
    the embeddings is fed to a linear classifier.
    """

    def __init__(self, in_dim=1024, n_classes=3):
        super().__init__()
        self.embed = nn.Sequential(
            nn.Linear(in_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.Tanh(),
        )
        self.att = nn.Sequential(
            nn.Linear(256, 128),
            nn.Tanh(),
            nn.Linear(128, 1),
        )
        self.cls = nn.Linear(256, n_classes)

    def forward(self, bag):
        """Return class logits for one bag; for a (n_inst, in_dim) bag the result is (1, n_classes)."""
        inst_emb = self.embed(bag)                    # (n_inst, 256)
        att_w = self.att(inst_emb).softmax(dim=0)     # (n_inst, 1), sums to 1 over instances
        pooled = (att_w * inst_emb).sum(dim=0, keepdim=True)
        return self.cls(pooled)


In [None]:
# Attention-MIL classifier on the GPU with its own Adam optimizer.
# Note: this rebinds the name `opt`.
mil = ABMIL()
mil = mil.cuda()
opt = torch.optim.Adam(params=mil.parameters(), lr=1e-4)

In [None]:
# One pass of MIL training over the slides, using diffusion-augmented bags.
bag_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)
for bag, y in bag_loader:          # bag_loader yields one bag per step
    feats = bag[0]                 # drop the batch dimension added by the DataLoader
    # augment_features returns the augmented features only (no originals).
    feats = augment_features(feats.cuda())
    y = y.cuda()

    logits = mil(feats)
    loss = F.cross_entropy(input=logits, target=y)
    opt.zero_grad()
    loss.backward()
    opt.step()


In [None]:
!mkdir datasetResnetExtrapolation

In [None]:
# Per-class collections holding, for every slide, the original bag followed by
# one diffusion-augmented bag.
B_f = []
E_f = []
S_f = []
# Class index -> destination list; any other label falls back to S_f.
class_buckets = {0: B_f, 1: E_f}
bag_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)
for bag, y in bag_loader:          # bag_loader yields one bag per step
    feats = bag[0]                 # drop the batch dimension added by the DataLoader
    # Bring the augmented bag back to the CPU: these tensors are kept for
    # export, and CUDA tensors would hold GPU memory and be saved device-bound.
    aug_feats = augment_features(feats.cuda()).cpu()
    bucket = class_buckets.get(int(y), S_f)
    bucket.append(feats)
    bucket.append(aug_feats)


In [None]:
# Extra augmentation pass restricted to the class-S slides (label 2): every S
# slide is appended again together with a fresh augmented copy.
files = filesS
# Built inline: the name `labelsS` used here before is never defined in this
# notebook (the dataset-building cell spells it `labelS`).
labels = [2] * len(files)

bags = [WSIFeatureBag(p, l) for p, l in zip(files, labels)]
dataset = ConcatDataset(bags)       # S slides only
bag_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)
for bag, y in bag_loader:          # bag_loader yields one bag per step
    feats = bag[0]                 # drop the batch dimension added by the DataLoader
    # Bring the augmented bag back to the CPU so exported tensors are not
    # tied to a CUDA device.
    aug_feats = augment_features(feats.cuda()).cpu()
    if y == 0:
        target = B_f
    elif y == 1:
        target = E_f
    else:
        target = S_f
    target.append(feats)
    target.append(aug_feats)

In [None]:
# Save every collected bag as M-<index>.pt and record one
# [case_id, slide_id, label] row per saved file.
dataset_csv = []
j = 0

folder_dest = "./datasetResnetExtrapolation/"
os.makedirs(folder_dest, exist_ok=True)   # do not depend on a separate mkdir cell

# Same class order as before (B, then S, then E) so the numbering is unchanged.
for class_name, class_bags in (("B", B_f), ("S", S_f), ("E", E_f)):
    for bag_feats in class_bags:
        v = f"M-{j}"   # was `"M-" + j`, which raises TypeError (str + int)
        j += 1
        torch.save(bag_feats, folder_dest + v + ".pt")
        dataset_csv.append([v, v, class_name])

In [None]:
import csv
# Path of the CSV file describing the exported dataset
file_csv = folder_dest + 'datasetComposition.csv'

# Write the header row followed by one row per saved bag
csv_header = ['case_id', 'slide_id', 'label']
with open(file_csv, mode='w', newline='') as csv_handle:
    csv.writer(csv_handle).writerows([csv_header, *dataset_csv])

print(f"File CSV salvato: {file_csv}")

In [None]:
# Create a ZIP archive of a folder
def create_zip_archive(folder_path, zip_name=None):
    """Create a ZIP archive of a folder, keeping the folder itself as top-level entry.

    Args:
        folder_path: folder to archive. A trailing path separator is accepted.
        zip_name: path of the archive to create. Defaults to
            ``<folder name>.zip`` in the current working directory.

    Returns:
        ``zip_name`` (the path of the archive that was written).
    """
    # Normalise first: with a trailing separator, basename() is '' and dirname()
    # is the folder itself, which used to yield a default name of '.zip' and
    # drop the top-level folder from the archive member names.
    folder_path = os.path.abspath(folder_path)
    parent_dir = os.path.dirname(folder_path)

    if zip_name is None:
        zip_name = f"{os.path.basename(folder_path)}.zip"
    zip_abs = os.path.abspath(zip_name)

    print(f"🗜️ Creando archivio ZIP: {zip_name}")

    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                # Never add the archive to itself when it lives inside the folder.
                if os.path.abspath(file_path) == zip_abs:
                    continue
                # Keep the folder structure inside the archive
                arc_name = os.path.relpath(file_path, parent_dir)
                zipf.write(file_path, arc_name)
                print(f"  📁 Aggiunto: {arc_name}")

    # Report the archive size
    zip_size = os.path.getsize(zip_name)
    print(f"✅ Archivio creato: {zip_name} ({zip_size/1024/1024:.2f} MB)")
    return zip_name

In [None]:
# Archive the export folder under this file name
zip_filename = 'datasetCompleted.zip'
zip_path = create_zip_archive(folder_path=folder_dest, zip_name=zip_filename)


In [None]:
import torch
import requests
import json
import os

# Zenodo access token, read from the environment so the secret is never stored
# in the notebook. The token that used to be hard-coded here has been exposed
# and should be revoked.
ACCESS_TOKEN = os.environ.get("ZENODO_ACCESS_TOKEN", "")
if not ACCESS_TOKEN:
    print("ZENODO_ACCESS_TOKEN is not set; export it before calling the Zenodo helpers.")

# Create a new deposition
def create_deposition(title):
    """Create a Zenodo deposition with this project's fixed dataset metadata.

    Args:
        title: title stored in the deposition metadata.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if Zenodo answers with an error status. Previously
            the error body was returned as if it were a deposition.
    """
    url = 'https://zenodo.org/api/deposit/depositions'
    headers = {"Content-Type": "application/json"}
    params = {'access_token': ACCESS_TOKEN}

    data = {
        'metadata': {
            'title': title,
            'upload_type': 'dataset',
            'description': 'Dataset WSI project MLiA',
            'creators': [{'name': 'Raf-Tony-Luca'}]
        }
    }

    r = requests.post(url, params=params, data=json.dumps(data), headers=headers)
    r.raise_for_status()
    return r.json()

# Upload the file
def upload_file(deposition_id, file_path):
    """Upload one local file into the bucket of an existing deposition.

    Args:
        deposition_id: id of the target deposition.
        file_path: local path of the file; its base name is used as remote name.

    Returns:
        The decoded JSON body of the upload response.

    Raises:
        requests.HTTPError: if either the deposition lookup or the upload
            returns an error status.
    """
    # Get bucket URL
    url = f'https://zenodo.org/api/deposit/depositions/{deposition_id}'
    params = {'access_token': ACCESS_TOKEN}
    r = requests.get(url, params=params)
    r.raise_for_status()   # otherwise a failed lookup surfaces as a KeyError below
    bucket_url = r.json()["links"]["bucket"]

    # Upload file (the open handle is passed as the request body)
    filename = os.path.basename(file_path)
    with open(file_path, "rb") as fp:
        r = requests.put(f"{bucket_url}/{filename}",
                        data=fp,
                        params=params)
    r.raise_for_status()
    return r.json()

# Publish the dataset
def publish_deposition(deposition_id):
    """Call the publish action of a deposition.

    Args:
        deposition_id: id of the deposition to publish.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if Zenodo answers with an error status.
    """
    url = f'https://zenodo.org/api/deposit/depositions/{deposition_id}/actions/publish'
    params = {'access_token': ACCESS_TOKEN}
    r = requests.post(url, params=params)
    r.raise_for_status()
    return r.json()


In [None]:
# Run the upload: create the deposition, attach the archive, then publish it.
print("Creando deposizione...")
deposition = create_deposition(title="dataset_trident_resnet50_AugDiff")
deposition_id = deposition['id']

print(f"Caricando file... (ID: {deposition_id})")
upload_result = upload_file(deposition_id=deposition_id, file_path=zip_filename)

print("Pubblicando dataset...")
publication = publish_deposition(deposition_id=deposition_id)

print(f"Dataset pubblicato! DOI: {publication['doi']}")
print(f"URL: {publication['links']['record_html']}")