<a href="https://colab.research.google.com/github/Kazi-Rakib-Hasan-Jawwad/Histo-FSL/blob/master/ViT_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Steps to connect Colab with local runtime:
1. Run this command in the terminal of your virtual environment:

> jupyter notebook --NotebookApp.allow_origin='https://colab.research.google.com' --port=8888 --NotebookApp.port_retries=0

2. Copy the notebook server URL and paste it into Colab.


Check availability of GPU.

In [None]:
import torch

# Ask PyTorch whether a CUDA device is visible; the report below is only
# printed when one is.
use_cuda = torch.cuda.is_available()
if use_cuda:
    for caption, value in (
        ('__CUDNN VERSION:', torch.backends.cudnn.version()),
        ('__Number CUDA Devices:', torch.cuda.device_count()),
        ('__CUDA Device Name:', torch.cuda.get_device_name(0)),
        ('__CUDA Device Total Memory [GB]:', torch.cuda.get_device_properties(0).total_memory / 1e9),
    ):
        print(caption, value)

__CUDNN VERSION: 8500
__Number CUDA Devices: 1
__CUDA Device Name: NVIDIA GeForce RTX 3080 Ti
__CUDA Device Total Memory [GB]: 12.636192768


In [None]:
from torch.utils.data import DataLoader
import pandas as pd
import torchvision
import tqdm
from torchvision import transforms
from torch import Tensor, nn
from abc import abstractmethod
from typing import Optional
from transformers import ViTModel
from easyfsl.datasets import FeaturesDataset
from easyfsl.samplers import TaskSampler
from easyfsl.methods import PrototypicalNetworks, RelationNetworks, SimpleShot, BDCSPN, TIM, PTMAP
from easyfsl.utils import evaluate
from tqdm import tqdm

In [None]:
from pathlib import Path

# Project root and the dataset cache directory below it. These definitions must
# be live (not wrapped in a string literal): later cells call `Path(...)` and
# pass `cache_dir` to `load_dataset`.
working_directory = Path("/home/rakib/jupyter_notebooks/iBOT_project")
cache_dir = working_directory / "cache"

In [None]:
def print_trainable_parameters(model: torch.nn.Module) -> None:
    """Print how many of the model's parameters are trainable.

    Args:
        model: module whose parameters are counted. A parameter counts as
            trainable when its ``requires_grad`` flag is set.

    The printed line contains the trainable count, the total count and the
    trainable percentage. A model without any parameters reports 0.00 %
    instead of failing on a division by zero.
    """
    trainable_params = 0
    all_param = 0
    for param in model.parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: parameter-free modules (e.g. nn.Identity) have all_param == 0.
    trainable_share = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param}"
        f" || trainable%: {trainable_share:.2f}"
    )

In [None]:
def predict_embeddings_vit(
    dataloader: DataLoader,
    model: nn.Module,
    device: Optional[str] = None,
) -> pd.DataFrame:
    """
    Predict embeddings for a dataloader.

    The embedding of an image is the model's ``last_hidden_state`` at sequence
    position 0 (the [CLS] token in Hugging Face ViT models).

    Args:
        dataloader: dataloader to predict embeddings for. Must deliver tuples (images, class_names)
        model: model to use for prediction. Its output must expose ``last_hidden_state``.
        device: device to cast the images to. If none, no casting is performed. Must be the same as
            the device the model is on.
    Returns:
        dataframe with columns embedding and class_name, one row per image. The dataframe is
        empty (but has both columns) when the dataloader yields no batch.
    """
    all_embeddings = []
    all_class_names = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for images, class_names in tqdm(
            dataloader, unit="batch", desc="Predicting embeddings"
        ):
            if device is not None:
                images = images.to(device)
            cls_embeddings = model(images).last_hidden_state[:, 0, :]
            # Move results to the CPU so they do not accumulate on the accelerator.
            all_embeddings.append(cls_embeddings.detach().cpu())
            # Labels arrive either as a tensor or as a plain sequence, depending on the dataset.
            if isinstance(class_names, torch.Tensor):
                all_class_names.extend(class_names.tolist())
            else:
                all_class_names.extend(class_names)

    # torch.cat rejects an empty list, so an empty dataloader is handled explicitly.
    if not all_embeddings:
        return pd.DataFrame({"embedding": [], "class_name": []})

    concatenated_embeddings = torch.cat(all_embeddings)

    return pd.DataFrame(
        {"embedding": list(concatenated_embeddings), "class_name": all_class_names}
    )

In [None]:
def compute_prototypes(support_features: Tensor, support_labels: Tensor) -> Tensor:
    """
    Average the support features class by class.

    Labels are expected to be the integers 0 .. n_way - 1, where n_way is the
    number of distinct values found in ``support_labels``.

    Args:
        support_features: for each instance in the support set, its feature vector
        support_labels: for each instance in the support set, its label

    Returns:
        one row per label: the mean feature vector of the instances carrying that label
    """
    n_way = len(torch.unique(support_labels))
    class_means = []
    for class_index in range(n_way):
        # Boolean mask selecting the support instances of this class.
        members = support_features[support_labels == class_index]
        # keepdim keeps a leading axis of size 1 so the means can be concatenated.
        class_means.append(members.mean(0, keepdim=True))
    return torch.cat(class_means)

class FewShotClassifier(nn.Module):
    """
    Base class for few-shot classifiers with a small linear classification head.

    Inputs go through ``backbone``, are optionally centered and normalized, and
    subclasses turn the resulting features into scores in ``forward``.
    """

    def __init__(
        self,
        feature_dim: int,
        num_classes: int,
        use_softmax: bool = False,
        feature_centering: Optional[Tensor] = None,
        feature_normalization: Optional[float] = None,
        backbone: Optional[nn.Module] = None,
    ):
        """
        Args:
            feature_dim: size of the feature vectors fed to the linear head
            num_classes: number of outputs of the linear head
            use_softmax: whether ``softmax_if_specified`` applies a softmax
            feature_centering: value subtracted from the features; nothing is
                subtracted (a zero tensor is used) when None
            feature_normalization: order p of the norm used to normalize the
                features along dim 1; no normalization when None
            backbone: module mapping inputs to features. Defaults to an identity
                mapping, for inputs that already are feature vectors.
        """
        super().__init__()

        # compute_features() calls self.backbone, so it must always exist. The identity
        # default has no parameters and therefore does not change the state dict.
        self.backbone = backbone if backbone is not None else nn.Identity()

        # NOTE(review): there is no activation between the two Linear layers, so their
        # composition is still an affine map of the features.
        self.linear_layer = nn.Sequential(nn.Linear(feature_dim, 128), nn.Linear(128, num_classes))
        self.use_softmax = use_softmax

        # Filled in by compute_prototypes_and_store_support_set().
        self.prototypes = torch.tensor(())

        self.feature_centering = (
            feature_centering if feature_centering is not None else torch.tensor(0)
        )
        self.feature_normalization = feature_normalization

    @abstractmethod
    def forward(
        self,
        query_images: Tensor,
    ) -> Tensor:
        """Return classification scores for the query inputs (implemented by subclasses)."""
        raise NotImplementedError("All few-shot algorithms must implement a forward method.")

    def process_support_set(
        self,
        support_images: Tensor,
        support_labels: Tensor,
    ):
        """Store the support set and compute one prototype per class from it."""
        self.compute_prototypes_and_store_support_set(support_images, support_labels)

    @staticmethod
    def is_transductive() -> bool:
        """Tell whether the method is transductive (implemented by subclasses)."""
        raise NotImplementedError("All few-shot algorithms must implement an is_transductive method.")

    def compute_features(self, images: Tensor) -> Tensor:
        """Run the backbone, subtract the centering value and optionally normalize along dim 1."""
        original_features = self.backbone(images)
        centered_features = original_features - self.feature_centering
        if self.feature_normalization is not None:
            return nn.functional.normalize(centered_features, p=self.feature_normalization, dim=1)
        return centered_features

    def softmax_if_specified(self, output: Tensor, temperature: float = 1.0) -> Tensor:
        """Return ``softmax(temperature * output)`` over the last dim if ``use_softmax``, else ``output``."""
        return (temperature * output).softmax(-1) if self.use_softmax else output

    def compute_prototypes_and_store_support_set(
        self,
        support_images: Tensor,
        support_labels: Tensor,
    ):
        """
        Extract support features, keep them with their labels, and compute the prototypes.

        Raises:
            ValueError: if the extracted features are not 2-dimensional
        """
        self.support_labels = support_labels
        self.support_features = self.compute_features(support_images)
        self._raise_error_if_features_are_multi_dimensional(self.support_features)
        self.prototypes = compute_prototypes(self.support_features, support_labels)

    @staticmethod
    def _raise_error_if_features_are_multi_dimensional(features: Tensor):
        """Raise ValueError unless ``features`` has exactly two dimensions."""
        if len(features.shape) != 2:
            raise ValueError(
                "Illegal backbone or feature shape. "
                "Expected output for an image is a 1-dim tensor."
            )

class ModifiedPrototypicalNetworks(FewShotClassifier):
    """
    Few-shot classifier that scores query features with the linear head of the
    base class. The stored prototypes are not used in ``forward``.
    """

    def __init__(
        self,
        feature_dim: int,
        num_classes: int,
        use_softmax: bool = False,
        feature_centering: Optional[Tensor] = None,
        feature_normalization: Optional[float] = None,
    ):
        # Every argument is forwarded unchanged to the base class.
        super().__init__(
            feature_dim,
            num_classes,
            use_softmax=use_softmax,
            feature_centering=feature_centering,
            feature_normalization=feature_normalization,
        )

    def forward(
        self,
        query_images: Tensor,
    ) -> Tensor:
        """
        Score the query inputs.

        Returns:
            output of the linear head, passed through a softmax when ``use_softmax`` is set
        Raises:
            ValueError: if the computed features are not 2-dimensional
        """
        features = self.compute_features(query_images)
        self._raise_error_if_features_are_multi_dimensional(features)
        return self.softmax_if_specified(self.linear_layer(features))


In [None]:
# prompt: I want to see what functions or models are available in module class. I can call it by import module

import module
print(dir(module))




In [None]:
from module import Chowder

chowder = Chowder(
    # output dimension of Phikon
    in_features=768,
    # dimension of predictions (a probability for class "1")
    out_features=1,
    # number of top scores in Chowder (in the image, N is 2)
    n_top=5,
    # number of bottom scores in Chowder
    n_bottom=5,
    # MLP hidden layers after the max-min layer
    mlp_hidden=[200, 100],
    # MLP activation
    mlp_activation=torch.nn.Sigmoid(),
    # bias for first 1D convolution which computes scores
    bias=True,
)

# Chowder has 23,170 parameters: it's a very small model !
print_trainable_parameters(chowder)


trainable params: 23170 || all params: 23170 || trainable%: 100.00


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Phikon (checkpoint "owkin/phikon") without the pooling layer; the log
# printed below reports the unused pooler weights accordingly.

model = ViTModel.from_pretrained("owkin/phikon", add_pooling_layer=False)
# Last expression of the cell: moves the model to `device` and makes the
# notebook display the module structure.
model.to(device)
# NOTE(review): eval mode is not set explicitly here; Hugging Face documents that
# from_pretrained returns the model in evaluation mode - confirm for the installed version.
#model.eval()

Some weights of the model checkpoint at owkin/phikon were not used when initializing ViTModel: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0): ViTLayer(
        (attention): ViTAttention(
          (attention): ViTSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
      

In [None]:
# `Path` is imported in this cell because no earlier live cell imports it.
from pathlib import Path

output_directory_k = Path("/home/rakib/models/paper_benchmarking_ssl_diverse_pathology/inference/kather/ibot/2")

# Folder with one sub-directory per tissue class (ImageFolder layout).
data_to_test = Path("/home/rakib/data/kather_texture/")
# NOTE(review): images are only resized and converted to tensors; there is no
# transforms.Normalize step - confirm this matches the preprocessing Phikon expects.
transform = transforms.Compose([transforms.Resize((224, 224)),
                                transforms.ToTensor()])
testset = torchvision.datasets.ImageFolder(data_to_test, transform=transform)

classes = testset.classes
print(classes)
# No shuffling and no dropped batch: every image gets exactly one embedding, in dataset order.
dataloader = DataLoader(testset, batch_size=128, shuffle=False, drop_last=False)
feature_df = predict_embeddings_vit(dataloader, model, device=device)

['adipose', 'complex', 'debris', 'empty', 'lympho', 'mucosa', 'stroma', 'tumor']


Predicting embeddings: 100%|█████████████████| 40/40 [00:24<00:00,  1.64batch/s]


In [None]:
# Phikon embeddings are 768-dimensional (the ViT layers above are Linear(768, ...)),
# so the head must take 768 inputs; 728 would fail on the first forward pass.
# With feature_dim=768 and num_classes=1 the head has 98,561 parameters.
model2 = ModifiedPrototypicalNetworks(feature_dim=768, num_classes=1)
print_trainable_parameters(model2)

trainable params: 93441 || all params: 93441 || trainable%: 100.00


In [None]:

# TaskSampler calls `get_labels()` on its dataset, which a raw DataFrame does not
# provide. Wrap the embeddings in easyfsl's FeaturesDataset, built from the
# "embedding" and "class_name" columns of the dataframe.
features_dataset = FeaturesDataset.from_dataframe(feature_df)

# TaskSampler for FSL: 100 episodes of 3 classes, with 5 support and 10 query items per class
task_sampler = TaskSampler(
    features_dataset,
    n_way=3,
    n_shot=5,
    n_query=10,
    n_tasks=100,
)

# DataLoader using TaskSampler: each batch is one episode assembled by the sampler's collate function
features_loader = DataLoader(
    features_dataset,
    batch_sampler=task_sampler,
    num_workers=1,
    pin_memory=True,
    collate_fn=task_sampler.episodic_collate_fn,
)

AttributeError: 'DataFrame' object has no attribute 'get_labels'

In [None]:
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from statistics import mean
from pathlib import Path
import copy


# Define your loss function and other parameters
LOSS_FUNCTION = nn.CrossEntropyLoss()
n_epochs = 200
# Epochs at which the learning rate is multiplied by scheduler_gamma.
scheduler_milestones = [120, 160]
scheduler_gamma = 0.1
learning_rate = 1e-5
# TensorBoard event files are written to the current working directory.
tb_logs_dir = Path(".")
# NOTE(review): `few_shot_classifier` is not defined in any cell shown above this
# one; it must exist before this cell runs - confirm which model is meant.
train_optimizer = SGD(
    few_shot_classifier.parameters(), lr=learning_rate, momentum=0.9, weight_decay=5e-5
)
train_scheduler = MultiStepLR(
    train_optimizer,
    milestones=scheduler_milestones,
    gamma=scheduler_gamma,
)
tb_writer = SummaryWriter(log_dir=str(tb_logs_dir))

# Define your custom training epoch function
def training_epoch(
    model: FewShotClassifier, data_loader: DataLoader, optimizer: torch.optim.Optimizer
) -> float:
    """
    Train the model for one pass over the episodes of ``data_loader``.

    Args:
        model: few-shot classifier to train
        data_loader: loader yielding episodes as 5-tuples (support_images, support_labels,
            query_images, query_labels, class_ids); the last item is ignored
        optimizer: optimizer updating the model's parameters

    Returns:
        mean loss over the episodes

    Raises:
        statistics.StatisticsError: if ``data_loader`` yields no episode
    """
    all_loss = []
    model.train()
    # Run on the device the model lives on instead of relying on a global constant.
    first_parameter = next(model.parameters(), None)
    device = first_parameter.device if first_parameter is not None else torch.device("cpu")
    # Iterate the loader that was passed in (not a notebook-level global).
    with tqdm(
        enumerate(data_loader), total=len(data_loader), desc="Training"
    ) as tqdm_train:
        for episode_index, (
            support_images,
            support_labels,
            query_images,
            query_labels,
            _,
        ) in tqdm_train:
            optimizer.zero_grad()
            model.process_support_set(
                support_images.to(device), support_labels.to(device)
            )
            classification_scores = model(query_images.to(device))
            loss = LOSS_FUNCTION(classification_scores, query_labels.to(device))
            loss.backward()
            optimizer.step()
            all_loss.append(loss.item())
            # Show the running mean loss in the progress bar.
            tqdm_train.set_postfix(loss=mean(all_loss))
    return mean(all_loss)

# Initialize variables for best state and best validation accuracy
# NOTE(review): `few_shot_classifier`, `train_loader`, `val_loader` and `DEVICE` are
# not defined in any cell shown above this one - confirm where they come from.
best_state = few_shot_classifier.state_dict()
best_validation_accuracy = 0.0

# Training loop
for epoch in range(n_epochs):
    print(f"Epoch {epoch}")
    average_loss = training_epoch(few_shot_classifier, train_loader, train_optimizer)
    validation_accuracy = evaluate(
        few_shot_classifier, val_loader, device=DEVICE, tqdm_prefix="Validation"
    )

    # Update best state if validation accuracy improves
    if validation_accuracy > best_validation_accuracy:
        best_validation_accuracy = validation_accuracy
        # Deep copy: state_dict() returns references to the live tensors, which
        # later epochs keep updating.
        best_state = copy.deepcopy(few_shot_classifier.state_dict())
        print("Ding ding ding! We found a new best model!")

    # Log metrics to tensorboard
    tb_writer.add_scalar("Train/loss", average_loss, epoch)
    tb_writer.add_scalar("Val/acc", validation_accuracy, epoch)

    # Step the learning rate scheduler
    train_scheduler.step()

# Load the best state
few_shot_classifier.load_state_dict(best_state)

NameError: name 'features_dataset' is not defined

In [None]:
from utils import auc, pad_collate_fn
import utils
print(utils.__file__)

# We define the loss function, optimizer and metrics for the training
criterion = torch.nn.BCEWithLogitsLoss()  # Binary cross-entropy loss applied to raw logits
optimizer = torch.optim.Adam              # Adam optimizer class (not instantiated here)
metrics = {"auc": auc}                    # AUC will be the tracking metric

# ``collator`` is a function that applies a deterministic
# transformation to a batch of samples before it is processed
# by the GPU. Here, this function is ``pad_collate_fn``. Its
# goal is to align the feature matrices (the inputs)
# in terms of shape. Indeed, some WSIs may have 200 features (very
# small pieces of tissue) and others 1,000 (the maximum we set). In that case,
# all matrices are padded to the shape of the biggest matrix in the
# batch. Our (200, 768) input matrix becomes a (1000, 768) matrix
# with 800 rows of ``inf`` values. A boolean mask is stored to tell
# torch not to process these 800 rows but to focus only on the 200 real ones.
# NOTE(review): the padding behaviour is described from the original comment;
# ``pad_collate_fn`` itself lives in utils.py and is not shown here.

collator = pad_collate_fn


/home/rakib/jupyter_notebooks/iBOT_project/cache/datasets--owkin--camelyon16-features/snapshots/932e3f46255585b9a83cd3f0d74bf1c806fea5a0/scripts/utils.py


In [None]:
import warnings
from copy import deepcopy
import multiprocessing
from datetime import datetime

from IPython.display import clear_output

from sklearn.model_selection import StratifiedKFold
from trainer import TorchTrainer, slide_level_train_step, slide_level_val_step

# We run a 5-fold cross-validation with 1 repeat (you can tweak these parameters)
n_repeats = 1
n_folds = 5
# One entry per fold is appended to each of these lists.
train_metrics, val_metrics = [], []
test_logits = []

cv_start_time = datetime.now()

for repeat in range(n_repeats):
    print(f"Running cross-validation #{repeat+1}")
    # We stratify with respect to the training labels
    # The repeat index is the random state, so each repeat gets different folds.
    cv_skfold = StratifiedKFold(
        n_splits=n_folds,
        shuffle=True,
        random_state=repeat,
    )
    # NOTE(review): the cam16_* names are not defined in any cell shown above - confirm their origin.
    cv_splits = cv_skfold.split(cam16_design_indices, y=cam16_design_labels)

    # 1 training fold approximately takes 25 seconds
    for i, (train_indices, val_indices) in enumerate(cv_splits):
        fold_start_time = datetime.now()
        # Model and optimizer are deep-copied so that every fold starts from the same objects.
        trainer = TorchTrainer(
            model=deepcopy(chowder),
            criterion=criterion,
            metrics=metrics,
            batch_size=16,                           # you can tweak this
            num_epochs=15,                           # you can tweak this
            learning_rate=1e-3,                      # you can tweak this
            weight_decay=0.0,                        # you can tweak this
            device="cuda:0",                         # hard-coded GPU; the notebook-level `device` is not used here
            num_workers=multiprocessing.cpu_count(), # you can tweak this
            optimizer=deepcopy(optimizer),
            train_step=slide_level_train_step,
            val_step=slide_level_val_step,
            collator=pad_collate_fn,
        )

        print(f"Running cross-validation on split #{i+1}")
        cam16_train_dataset = torch.utils.data.Subset(
            cam16_design_dataset, indices=train_indices
        )
        cam16_val_dataset = torch.utils.data.Subset(
            cam16_design_dataset, indices=val_indices
        )

        # UserWarnings raised during training and prediction are silenced for this block only.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            # Training step for the given number of epochs
            local_train_metrics, local_val_metrics = trainer.train(
                cam16_train_dataset, cam16_val_dataset
            )
            # Predictions on test (logits, sigmoid(logits) = probability)
            local_test_logits = trainer.predict(cam16_test_dataset)[1]

        train_metrics.append(local_train_metrics)
        val_metrics.append(local_val_metrics)
        test_logits.append(local_test_logits)
        fold_end_time = datetime.now()
        fold_running_time = fold_end_time - fold_start_time
        print("\n-----------------------------Finished in {}---------------------------------------\n".format(fold_running_time))
    #clear_output()
cv_end_time = datetime.now()
cv_running_time = cv_end_time - cv_start_time
print("\nFinished cross-validation in {}".format(cv_running_time))

Running cross-validation #1
Running cross-validation on split #1
Epoch 1: train_loss=0.69553, train_auc=0.5032, val_loss=0.68943, val_auc=0.4687
Epoch 2: train_loss=0.67532, train_auc=0.5332, val_loss=0.64351, val_auc=0.4901
Epoch 3: train_loss=0.69388, train_auc=0.4789, val_loss=0.67824, val_auc=0.4957
Epoch 4: train_loss=0.67432, train_auc=0.5521, val_loss=0.64426, val_auc=0.4872
Epoch 5: train_loss=0.68850, train_auc=0.4389, val_loss=0.65221, val_auc=0.4943
Epoch 6: train_loss=0.68018, train_auc=0.5465, val_loss=0.65567, val_auc=0.6037
Epoch 7: train_loss=0.67314, train_auc=0.6061, val_loss=0.61848, val_auc=0.8196
Epoch 8: train_loss=0.61496, train_auc=0.8317, val_loss=0.56281, val_auc=0.9219
Epoch 9: train_loss=0.50494, train_auc=0.9342, val_loss=0.42341, val_auc=0.9602
Epoch 10: train_loss=0.39363, train_auc=0.9394, val_loss=0.31809, val_auc=0.9673
Epoch 11: train_loss=0.30270, train_auc=0.9530, val_loss=0.28298, val_auc=0.9716
Epoch 12: train_loss=0.25982, train_auc=0.9657, val_l

In [None]:
from easyfsl.datasets import FeaturesDataset
from easyfsl.samplers import TaskSampler
from torch.utils.data import DataLoader
import torch
from datetime import datetime

# Define your loss function and other training parameters.
# The optimizer is created inside the fold loop, once the fold's model exists.
criterion = nn.CrossEntropyLoss()

# Set the number of repeats and folds for cross-validation
n_repeats = 1
n_folds = 5
train_metrics, val_metrics = [], []

cv_start_time = datetime.now()

# Loop over repeats and folds
for repeat in range(n_repeats):
    print(f"Running cross-validation #{repeat+1}")
    cv_skfold = StratifiedKFold(
        n_splits=n_folds,
        shuffle=True,
        random_state=repeat,
    )
    # NOTE(review): the fold indices are computed but never applied to
    # `features_dataset` below, so every fold trains on the same data - confirm intent.
    cv_splits = cv_skfold.split(cam16_design_indices, y=cam16_design_labels)

    for i, (train_indices, val_indices) in enumerate(cv_splits):
        fold_start_time = datetime.now()

        # Instantiate the TaskSampler for the current fold
        # NOTE(review): `n_way`, `n_shots`, `in_channels`, `num_classes` and `num_epochs`
        # are not defined in any cell shown above - they must be set before running this.
        task_sampler = TaskSampler(
            features_dataset,
            n_way=n_way,
            n_shot=n_shots,
            n_query=100,
            n_tasks=10,
        )

        # Create data loaders using the TaskSampler
        train_loader = DataLoader(
            features_dataset,
            batch_sampler=task_sampler,
            num_workers=1,
            pin_memory=True,
            collate_fn=task_sampler.episodic_collate_fn,
        )

        # Instantiate the model
        model = ModifiedPrototypicalNetworks(in_channels, num_classes)

        # The optimizer must be built from THIS model's parameters. Creating it before
        # the model (from whatever `model` was bound to earlier) would leave the new
        # model's weights untouched by optimizer.step().
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

        # Training loop
        for epoch in range(num_epochs):
            model.train()
            for batch in train_loader:
                # Extract support and query sets from the batch. The episodic collate
                # function yields five items; the last one is not needed here.
                support_images, support_labels, query_images, query_labels, _ = batch

                # Process support set to update prototypes
                model.process_support_set(support_images, support_labels)

                # Forward pass
                logits = model(query_images)

                # Compute loss
                loss = criterion(logits, query_labels)

                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Validate the model
            model.eval()
            # Add validation code here if needed

        fold_end_time = datetime.now()
        fold_running_time = fold_end_time - fold_start_time
        print("\n-----------------------------Finished in {}---------------------------------------\n".format(fold_running_time))

    # Clear output if needed
    # clear_output()

cv_end_time = datetime.now()
cv_running_time = cv_end_time - cv_start_time
print("\nFinished cross-validation in {}".format(cv_running_time))


NameError: name 'FeaturesDataset' is not defined

In [None]:
# We run a 5-fold cross-validation with 1 repeat (you can tweak these parameters)
n_repeats = 1
n_folds = 5
# One entry per fold is appended to each of these lists.
train_metrics, val_metrics = [], []
test_logits = []

cv_start_time = datetime.now()

for repeat in range(n_repeats):
    print(f"Running cross-validation #{repeat+1}")
    # We stratify with respect to the training labels
    cv_skfold = StratifiedKFold(
        n_splits=n_folds,
        shuffle=True,
        random_state=repeat,
    )
    cv_splits = cv_skfold.split(cam16_design_indices, y=cam16_design_labels)

    # 1 training fold approximately takes 25 seconds
    for i, (train_indices, val_indices) in enumerate(cv_splits):
        fold_start_time = datetime.now()
        # Same trainer setup as for Chowder, but training a copy of `model2`.
        # NOTE(review): `criterion` and `optimizer` are whatever the most recently
        # executed cell bound them to - confirm they are the ones TorchTrainer expects.
        trainer = TorchTrainer(
            model=deepcopy(model2),
            criterion=criterion,
            metrics=metrics,
            batch_size=16,                           # you can tweak this
            num_epochs=15,                           # you can tweak this
            learning_rate=1e-3,                      # you can tweak this
            weight_decay=0.0,                        # you can tweak this
            device="cuda:0",
            num_workers=multiprocessing.cpu_count(), # you can tweak this
            optimizer=deepcopy(optimizer),
            train_step=slide_level_train_step,
            val_step=slide_level_val_step,
            collator=pad_collate_fn,
        )

        print(f"Running cross-validation on split #{i+1}")
        cam16_train_dataset = torch.utils.data.Subset(
            cam16_design_dataset, indices=train_indices
        )
        cam16_val_dataset = torch.utils.data.Subset(
            cam16_design_dataset, indices=val_indices
        )
        # NOTE(review): the sampler and loader below are created but never passed to
        # the trainer, which trains on the cam16 subsets. `n_way` and `n_shots` are not
        # defined in any cell shown above - confirm whether this code is still needed.
        task_sampler = TaskSampler(
                features_dataset,
                n_way=n_way,
                n_shot=n_shots,
                n_query=100,
                n_tasks=10,
            )
        features_loader = DataLoader(
                features_dataset,
                batch_sampler=task_sampler,
                num_workers=1,
                pin_memory=True,
                collate_fn=task_sampler.episodic_collate_fn,
            )

        # UserWarnings raised during training and prediction are silenced for this block only.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            # Training step for the given number of epochs
            local_train_metrics, local_val_metrics = trainer.train(
                cam16_train_dataset, cam16_val_dataset
            )
            # Predictions on test (logits, sigmoid(logits) = probability)
            local_test_logits = trainer.predict(cam16_test_dataset)[1]

        train_metrics.append(local_train_metrics)
        val_metrics.append(local_val_metrics)
        test_logits.append(local_test_logits)
        fold_end_time = datetime.now()
        fold_running_time = fold_end_time - fold_start_time
        print("\n-----------------------------Finished in {}---------------------------------------\n".format(fold_running_time))
    #clear_output()
cv_end_time = datetime.now()
cv_running_time = cv_end_time - cv_start_time
print("\nFinished cross-validation in {}".format(cv_running_time))

In [None]:
# numpy is imported here because `np` is used below and no cell shown above imports it.
import numpy as np

from utils import get_cv_metrics, roc_auc_score

# Aggregate the per-fold metrics collected during cross-validation.
cv_train_metrics = get_cv_metrics(train_metrics)
cv_val_metrics = get_cv_metrics(val_metrics)
# `trainer` is the trainer of the last fold that ran.
test_metrics = trainer.evaluate(cam16_test_dataset)

print("Cross-validation results:")
for k, v in cv_train_metrics.items():
    print(f"mean_train_{k}: {v}")

for k, v in cv_val_metrics.items():
    print(f"mean_val_{k}: {v}")

print("\nEnsembling results on test set:")
# Ensemble the folds by averaging their test logits before computing the AUC.
test_auc = roc_auc_score(
    cam16_test_dataset.labels,
    np.mean(test_logits, axis=0)
)
print(f"test_auc: {test_auc:.4f}")


Cross-validation results:
mean_train_auc: 0.9866 ± 0.0035
mean_val_auc: 0.9564 ± 0.0318

Ensembling results on test set:
test_auc: 0.9235


In [None]:
import os
from typing import Optional
import random

from datasets import load_dataset
from transformers import set_seed as set_seed_hf
from transformers import AutoImageProcessor

dataset_name = "/home/rakib/data/NCT-CRC-HE-100K-NONORM"
# You can change the dataset name above if you wish to finetune the model on your own dataset.


# We set a seed globally for data loading and training
SEED = 123

def set_seed(seed: Optional[int] = None):
    """Set all seeds to make results reproducible (deterministic mode).

    When seed is None, nothing is changed: no generator is seeded and the
    cuDNN flags keep their current values.
    Credits @BramVanroy
    """
    if seed is None:
        return
    # Seeding itself is delegated to transformers.set_seed.
    set_seed_hf(seed)
    # Ask cuDNN for deterministic algorithms and switch off its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE(review): presumably only affects Python processes started after this
    # point (hash randomization is fixed at interpreter start-up) - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(SEED)
# Load from `dataset_name` so that changing it above really changes the dataset
# (the path used to be hard-coded a second time here).
dataset = load_dataset("imagefolder", data_dir=dataset_name, cache_dir=cache_dir)


Resolving data files:   0%|          | 0/100000 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Debug dataset properties
print(dataset.keys())
print(dataset.items())
# `unique` is a method: call it on the label column to list the distinct labels
# of each split, instead of printing the bound method object.
print(dataset.unique("label"))

dict_keys(['train'])
dict_items([('train', Dataset({
    features: ['image', 'label'],
    num_rows: 100000
}))])
<bound method DatasetDict.unique of DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 100000
    })
})>


In [None]:
# The image folder is exposed as a single 'train' split (see the keys printed above).
nct_data = dataset['train']

# Get labels and images
# NOTE(review): indexing by column name presumably materialises the whole column,
# i.e. all 100,000 images for 'image' - confirm memory use, or sample indices first.
labels = nct_data['label']
images = nct_data['image']

In [None]:
# NOT NECESSARY IF THE PREVIOUS CELLS WERE EXECUTED IN ORDER

import warnings
from copy import deepcopy
import multiprocessing
from datetime import datetime

from IPython.display import clear_output

from sklearn.model_selection import StratifiedKFold
from trainer import TorchTrainer, slide_level_train_step, slide_level_val_step

In [None]:
# This strategy does not take the class imbalance issue into account

# Define the number of samples you want to randomly select
num_samples = 1000  # Change this number to your desired value

# Randomly sample `num_samples` distinct indices from the dataset
# (random.sample draws without replacement)
random_indices = random.sample(range(len(labels)), num_samples)

# Extract sampled labels and images
sampled_labels = [labels[i] for i in random_indices]
sampled_images = [images[i] for i in random_indices]

# Strategy to solve the class imbalance issue
'''
from collections import defaultdict

# Define the number of samples you want to randomly select
num_samples_per_class = 10  # Change this number to your desired value per class

# Initialize a dictionary to store sampled indices for each class
class_indices = defaultdict(list)

# Map class names to class labels
class_name_to_label = {v: k for k, v in label2id.items()}

# Collect indices for each class
for i, label in enumerate(labels):
    class_name = label2id[label]
    class_label = class_name_to_label[class_name]
    class_indices[class_label].append(i)

# Randomly sample from each class
sampled_indices = []
for class_label, indices in class_indices.items():
    sampled_indices.extend(random.sample(indices, num_samples_per_class))

# Extract sampled labels and images
sampled_labels = [labels[i] for i in sampled_indices]
sampled_images = [images[i] for i in sampled_indices]
'''


'\nfrom collections import defaultdict\n\n# Define the number of samples you want to randomly select\nnum_samples_per_class = 10  # Change this number to your desired value per class\n\n# Initialize a dictionary to store sampled indices for each class\nclass_indices = defaultdict(list)\n\n# Map class names to class labels\nclass_name_to_label = {v: k for k, v in label2id.items()}\n\n# Collect indices for each class\nfor i, label in enumerate(labels):\n    class_name = label2id[label]\n    class_label = class_name_to_label[class_name]\n    class_indices[class_label].append(i)\n\n# Randomly sample from each class\nsampled_indices = []\nfor class_label, indices in class_indices.items():\n    sampled_indices.extend(random.sample(indices, num_samples_per_class))\n\n# Extract sampled labels and images\nsampled_labels = [labels[i] for i in sampled_indices]\nsampled_images = [images[i] for i in sampled_indices]\n'

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the sampled data held out for validation (0.5 -> an even split).
split_percentage = 0.5

# Split the sampled data into train and validation sets.
# train_test_split returns a (train, test) pair per input, in argument order,
# hence labels first and images second.  Stratifying on the labels keeps the
# class proportions of the randomly drawn sample the same in both halves
# (it raises ValueError if some class has fewer than two samples).
train_labels, val_labels, train_images, val_images = train_test_split(
    sampled_labels,
    sampled_images,
    test_size=split_percentage,
    stratify=sampled_labels,
)

# Strategy to solve the class imbalance issue
'''
# Split the sampled data into train and validation sets
train_indices, val_indices = train_test_split(sampled_indices, test_size=split_percentage, stratify=sampled_labels)

# Because it's a list function, this step is necessary:

# Extract labels and images for train and validation sets
train_labels = [labels[i] for i in train_indices]
train_images = [images[i] for i in train_indices]

val_labels = [labels[i] for i in val_indices]
val_images = [images[i] for i in val_indices]
'''

"\n# Split the sampled data into train and validation sets\ntrain_indices, val_indices = train_test_split(sampled_indices, test_size=split_percentage, stratify=sampled_labels)\n\n# Because it's a list function, this step is necessary:\n\n# Extract labels and images for train and validation sets\ntrain_labels = [labels[i] for i in train_indices]\ntrain_images = [images[i] for i in train_indices]\n\nval_labels = [labels[i] for i in val_indices]\nval_images = [images[i] for i in val_indices]\n"

In [None]:
# Load the CRC-VAL-HE-7K folder (7,180 images). The "imagefolder" builder
# exposes everything under a single "train" split.
test_dataset = load_dataset(
    "imagefolder",
    data_dir="/home/rakib/data/CRC-VAL-HE-7K",
    cache_dir=cache_dir,
)
t_data = test_dataset['train']

# Pull out the two columns once so they can be indexed by position below.
t_labels = t_data['label']
t_images = t_data['image']

# Draw `num_samples` distinct row positions without replacement ...
random_t_indices = random.sample(range(len(t_labels)), num_samples)

# ... and gather the matching labels / images in the same order.
sampled_t_labels = list(map(t_labels.__getitem__, random_t_indices))
sampled_t_images = list(map(t_images.__getitem__, random_t_indices))

Resolving data files:   0%|          | 0/7180 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Create train and validation datasets as plain column dicts
train_dataset = {'image': train_images, 'label': train_labels}
val_dataset = {'image': val_images, 'label': val_labels}

# Print the number of samples in each set
print(f"Number of samples in the train set: {len(train_labels)}")
print(f"Number of samples in the validation set: {len(val_labels)}")

subset_dataset = {'image': sampled_t_images, 'label': sampled_t_labels}

# len() of a column dict is its number of keys (2) and len() of the
# DatasetDict is its number of splits (1), so report row counts taken from
# the label columns instead.
print(
    f"Training dataset size: {len(train_dataset['label'])}\n"
    f"Validation dataset size: {len(val_dataset['label'])}\n"
    f"Test dataset size: {len(t_labels)}\n"
)

Number of samples in the train set: 500
Number of samples in the validation set: 500
Training dataset size: 2
Validation dataset size: 2
Test dataset size: 1



In [None]:
test_dataset.unique

<bound method DatasetDict.unique of DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 7180
    })
})>

In [None]:
from datasets import Dataset

In [None]:
# Convert the plain {'image', 'label'} column dict into a `datasets.Dataset`.
# NOTE: this rebinds `train_dataset`; re-running the cell would hand a Dataset
# (not a dict) to `from_dict`.
train_dataset = Dataset.from_dict(train_dataset)

In [None]:

# Convert the validation column dict into a `datasets.Dataset`
# (the training set is converted in its own cell).
#train_dataset = Dataset.from_dict({'image': train_images, 'label': train_labels})
val_dataset = Dataset.from_dict(val_dataset)

In [None]:
# Rebuild the full test set as a flat `Dataset`. From here on `test_dataset`
# is no longer the DatasetDict returned by `load_dataset`, so it must not be
# indexed with ['train'] any more.
test_dataset = Dataset.from_dict({'image': t_data['image'], 'label': t_data['label']})
#subset_dataset = Dataset.from_dict("image": [data["image"] for data in balanced_test_dataset], "label": [data["label"] for data in balanced_test_dataset])
# Row counts of the three datasets.
print(f"Training dataset size: {len(train_dataset)}\n" f"Validation dataset size: {len(val_dataset)}\n" f"Test dataset size: {len(test_dataset)}\n")

Training dataset size: 500
Validation dataset size: 500
Test dataset size: 7180



In [None]:
# Convert the randomly sampled test-subset column dict into a `datasets.Dataset`.
subset_dataset = Dataset.from_dict(subset_dataset)

In [None]:
# prompt: print subset dataset properties

print(subset_dataset.unique)


<bound method Dataset.unique of Dataset({
    features: ['image', 'label'],
    num_rows: 1000
})>


In [None]:
# prompt: I want to know number of elements for each class in the subset_dataset

from collections import Counter
class_counts = Counter(subset_dataset['label'])
print(class_counts)


Counter({0: 179, 8: 170, 4: 144, 1: 132, 6: 118, 3: 81, 5: 76, 7: 59, 2: 41})


In [None]:
image_processor = AutoImageProcessor.from_pretrained("owkin/phikon")
print(image_processor)

ViTImageProcessor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}



In [None]:
from typing import Dict, Any
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

# Normalisation statistics and target side length both come from the
# pretrained image processor.
_input_side = image_processor.size["height"]
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

# Training pipeline: random resized crop -> random horizontal flip -> tensor -> normalise
train_transforms = Compose([
    RandomResizedCrop(_input_side),
    RandomHorizontalFlip(),
    ToTensor(),
    normalize,
])

# Validation pipeline: resize -> centre crop -> tensor -> normalise (no augmentation)
val_transforms = Compose([
    Resize(_input_side),
    CenterCrop(_input_side),
    ToTensor(),
    normalize,
])


In [None]:

'''
def preprocess_train(example_batch: dict[str, Any]) -> dict[str, Any]:
    """Apply ``train_transforms`` across a batch."""
    example_batch["pixel_values"] = [
        train_transforms(image) for image in example_batch["image"]
    ]
    return example_batch


def preprocess_val(example_batch: dict[str, Any]) -> dict[str, Any]:
    """Apply ``val_transforms`` across a batch."""
    example_batch["pixel_values"] = [
        val_transforms(image) for image in example_batch["image"]
    ]
    return example_batch
'''

# Modified to avoid type error due to python3.8
def preprocess_train(example_batch: Dict[str, Any]) -> Dict[str, Any]:
    """Add a ``pixel_values`` entry holding the train-transformed images.

    Every item of ``example_batch["image"]`` is passed through
    ``train_transforms``; the batch is modified in place and returned.
    """
    augmented = list(map(train_transforms, example_batch["image"]))
    example_batch["pixel_values"] = augmented
    return example_batch


def preprocess_val(example_batch: Dict[str, Any]) -> Dict[str, Any]:
    """Add a ``pixel_values`` entry holding the val-transformed images.

    Every item of ``example_batch["image"]`` is passed through
    ``val_transforms``; the batch is modified in place and returned.
    """
    processed = list(map(val_transforms, example_batch["image"]))
    example_batch["pixel_values"] = processed
    return example_batch

# Register the per-batch transforms on each dataset.
# Only the training set gets the random augmentations; validation and test
# share the deterministic resize/centre-crop pipeline.
train_dataset.set_transform(preprocess_train)
val_dataset.set_transform(preprocess_val)
test_dataset.set_transform(preprocess_val)

In [None]:
test_dataset.info

DatasetInfo(description='', citation='', homepage='', license='', features={'image': Image(decode=True, id=None), 'label': Value(dtype='int64', id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name=None, dataset_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)

In [None]:
from transformers import AutoModelForImageClassification

# Labels from our dataset.
# NOTE: despite its name, ``label2id`` maps the class id (as a string) to the
# tissue name, and ``id2label`` is its inverse (tissue name -> id string).
# Both globals are kept unchanged because other cells rely on them (the t-SNE
# helper uses ``label2id`` to turn id strings into tissue names).
label2id = {
    '0': "ADI",
    '1': "BACK",
    '2': "DEB",
    '3': "LYM",
    '4': "MUC",
    '5': "MUS",
    '6': "NORM",
    '7': "STR",
    '8': "TUM"
}
id2label = {v: k for (k, v) in label2id.items()}

# Load the model. The model config expects ``id2label`` as {int id: name} and
# ``label2id`` as {name: int id}, so build correctly oriented mappings here
# instead of passing the inverted globals straight through.
model = AutoModelForImageClassification.from_pretrained(
    "owkin/phikon",
    id2label={int(idx): name for idx, name in label2id.items()},
    label2id={name: int(idx) for idx, name in label2id.items()},
    ignore_mismatched_sizes=False,
    cache_dir=cache_dir,
)
print_trainable_parameters(model)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at owkin/phikon and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 85805577 || all params: 85805577 || trainable%: 100.00


In [None]:
# We also create a version of Phikon where the model is kept frozen and only the classifier head is trained (0.01% of the training parameters).
from copy import deepcopy

# Independent copy of the model whose backbone is frozen: every parameter
# outside the ``classifier.`` head stops receiving gradients.
frozen_model = deepcopy(model)

for name, param in frozen_model.named_parameters():
    if name.startswith("classifier."):
        continue  # the classification head stays trainable
    param.requires_grad = False

print_trainable_parameters(frozen_model)

trainable params: 6921 || all params: 85805577 || trainable%: 0.01


In [None]:
# LoRA fine-tuning only requires 0.70% of the original trainable parameters!
from peft import LoraConfig, get_peft_model


# load and configure LoRA from Hugging Face peft library
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["query", "value"],  # module names the adapters are attached to
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],  # presumably keeps the classification head trainable next to the adapters — confirm in the peft docs
)
# Wrap the full model with the LoRA configuration and report how many
# parameters remain trainable.
lora_model = get_peft_model(model, config)
print_trainable_parameters(lora_model)

trainable params: 596745 || all params: 86402322 || trainable%: 0.69


In [None]:
# Training Config.

import numpy as np
import torch

# NOTE(review): this rebinds the name ``evaluate`` to the Hugging Face module;
# the notebook's first import cell bound it to ``easyfsl.utils.evaluate``.
# Confirm that no later cell still expects the easyfsl function.
import evaluate
from transformers import TrainingArguments, Trainer

# Training arguments (shared by every Trainer that is built from ``args``)

batch_size = 24
args = TrainingArguments(
    "phikon-finetuned-nct-1k",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-3,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    seed=SEED,
    num_train_epochs=10,
    logging_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # dataset is roughly balanced
    push_to_hub=False,
    label_names=["labels"],
)

# Metric configuration

# Accuracy metric object used by ``compute_metrics``.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred: "EvalPrediction") -> Dict[str, float]:
    """Compute accuracy from the model outputs collected during evaluation.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per sample) and ``label_ids`` (the reference labels), e.g. a
            ``transformers.EvalPrediction``.

    Returns:
        Whatever ``metric.compute`` returns; for the accuracy metric this is
        expected to be a dict of the form ``{"accuracy": <float>}``.
    """
    # The predicted class is the index of the highest score in each row.
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(
        predictions=predicted_classes, references=eval_pred.label_ids
    )

# Inputs generation for training

# Modified to avoid type error (def collate_fn(examples) -> dict[str, torch.Tensor]:)
def collate_fn(examples) -> Dict[str, torch.Tensor]:
    """Collate dataset rows into one model-ready batch.

    Each row must provide a ``"pixel_values"`` tensor and a ``"label"``.
    The tensors are stacked along a new leading dimension and the labels are
    gathered into a single tensor, returned under the key ``"labels"``.
    """
    images = []
    targets = []
    for sample in examples:
        images.append(sample["pixel_values"])
        targets.append(sample["label"])
    return {"pixel_values": torch.stack(images), "labels": torch.tensor(targets)}

# Here is the final trainer: the LoRA-wrapped model is trained on the train
# split and evaluated on the validation split once per epoch
# (``evaluation_strategy="epoch"`` in ``args``).
trainer_lora = Trainer(
    model=lora_model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Train

import warnings

from transformers.utils import logging


# Fine-tune the LoRA model, then evaluate it once on the test dataset and
# print the resulting metrics. UserWarnings are silenced for both calls.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UserWarning)
    train_results_lora = trainer_lora.train()
    metrics_lora = trainer_lora.evaluate(test_dataset)
    # The first argument appears in the header of the printed metrics table.
    trainer_lora.log_metrics("Fine-tuned model: VAL-CRC-7K", metrics_lora)

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2498,0.232411,0.954
2,0.139,0.197188,0.972
3,0.0001,0.169068,0.974
4,0.044,0.237637,0.97
5,0.311,0.219136,0.968
6,0.3377,0.211805,0.972
7,0.3209,0.227102,0.966
8,0.0106,0.25781,0.964
9,0.0,0.218674,0.972
10,0.0015,0.213999,0.974


Checkpoint destination directory phikon-finetuned-nct-1k/checkpoint-21 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory phikon-finetuned-nct-1k/checkpoint-42 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory phikon-finetuned-nct-1k/checkpoint-63 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory phikon-finetuned-nct-1k/checkpoint-84 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory phikon-finetuned-nct-1k/checkpoint-105 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory phikon-finetuned-nct-1k/checkpoint-126 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory phikon-finetuned-nct-1k

***** Fine-tuned model: VAL-CRC-7K metrics *****
  epoch                   =       10.0
  eval_accuracy           =     0.8118
  eval_loss               =     0.9193
  eval_runtime            = 0:00:15.10
  eval_samples_per_second =    475.193
  eval_steps_per_second   =     19.855



We now repeat the same training with the fully frozen Phikon model.

We observe up to a +2 increase in multi-class accuracy using LoRA fine-tuning, for only 30 seconds of extra training cost.


In [None]:

# Same recipe as the LoRA run, but with the frozen backbone: only the
# classifier head receives gradient updates.
# NOTE(review): ``args`` is reused unchanged, so this run writes its
# checkpoints into the same output directory as the LoRA run — confirm that
# overwriting those checkpoints is acceptable.
trainer_frozen = Trainer(
    model=frozen_model,
    args=args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=image_processor,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train, then score once on test_dataset, with UserWarnings muted throughout
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UserWarning)
    train_results_frozen = trainer_frozen.train()
    metrics_frozen = trainer_frozen.evaluate(test_dataset)
    trainer_frozen.log_metrics("Frozen model: VAL-CRC-7K", metrics_frozen)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1248,0.173862,0.966
2,0.0106,0.193067,0.968
3,0.0008,0.224724,0.96
4,0.1204,0.244578,0.95
5,0.0045,0.229637,0.964
6,0.6087,0.191229,0.974
7,0.3553,0.200367,0.974
8,0.0018,0.222482,0.968
9,0.0001,0.21444,0.972
10,0.001,0.211076,0.972


Checkpoint destination directory phikon-finetuned-nct-1k/checkpoint-21 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory phikon-finetuned-nct-1k/checkpoint-42 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory phikon-finetuned-nct-1k/checkpoint-63 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory phikon-finetuned-nct-1k/checkpoint-84 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory phikon-finetuned-nct-1k/checkpoint-105 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory phikon-finetuned-nct-1k/checkpoint-126 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory phikon-finetuned-nct-1k

***** Frozen model: VAL-CRC-7K metrics *****
  epoch                   =       10.0
  eval_accuracy           =     0.8507
  eval_loss               =     0.5789
  eval_runtime            = 0:00:21.96
  eval_samples_per_second =    326.948
  eval_steps_per_second   =     13.661


# **Visualizing features**

Code to visualize the features. This part helps differentiate between a frozen model and LoRA in order to examine the differences in the embeddings.

In [None]:
from tqdm.notebook import tqdm
import pandas as pd

from matplotlib.axes._axes import Axes
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE

# First we define a set of functions to
# 1) get the embeddings from the models
# 2) compute the 2D projections using the t-SNE algorithm
# 3) visualize these projections using ``seaborn``

def get_raw_embeddings(model, dataset, use_fp16: bool = True):
    """Retrieve tile embeddings from a model equipped with a classifier head.

    Every tensor in ``dataset["pixel_values"]`` is run through ``model`` on
    its own (batch of one). The embedding kept for a tile is the first token
    of the last hidden state (presumably the [CLS] token — confirm for the
    model in use).

    Inference runs under ``torch.no_grad()`` and with the model temporarily
    switched to eval mode, so no autograd graph is built and dropout cannot
    perturb the embeddings. The model's previous train/eval mode is restored
    afterwards.

    Args:
        model: Module accepting ``model(image, output_hidden_states=True)``
            and returning an object with a ``hidden_states`` sequence.
        dataset: Mapping whose ``"pixel_values"`` entry is an iterable of
            image tensors without a batch dimension.
        use_fp16: Cast the inputs to ``float16`` when True, ``float32``
            otherwise. NOTE(review): only the inputs are cast here — this
            assumes the model accepts that dtype; confirm.

    Returns:
        A numpy array with one embedding row per input tensor.

    Raises:
        ValueError: If ``dataset["pixel_values"]`` is empty (nothing to
            concatenate).
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if use_fp16 else torch.float32

    was_training = model.training
    model.eval()
    embeddings = []
    try:
        with torch.no_grad():
            for pixel_values in tqdm(dataset["pixel_values"]):
                # Add the batch dimension and move to the target device/dtype.
                image = pixel_values.unsqueeze(0).to(device, dtype)
                output = model(image, output_hidden_states=True)
                first_token = output.hidden_states[-1][:, 0, :]
                embeddings.append(first_token.detach().cpu().numpy())
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    return np.concatenate(embeddings, axis=0)


def get_tsne_embeddings(raw_embeddings: np.ndarray, *, labels=None, **kwargs):
    """Project raw embeddings with t-SNE and attach the tissue type of each row.

    Args:
        raw_embeddings: Array with one embedding per row.
        labels: Optional class ids, one per row of ``raw_embeddings``. When
            omitted, the notebook-level ``test_subset_labels`` is used, which
            is the behaviour existing callers rely on.
        **kwargs: Forwarded unchanged to ``sklearn.manifold.TSNE`` (e.g.
            ``n_components``, ``random_state``).

    Returns:
        A DataFrame with one ``tsne-<k>`` column per projected dimension
        (``tsne-1``, ``tsne-2`` for a 2-D projection) and a ``Tissue type``
        column in which id strings are replaced via ``label2id``.
    """
    if labels is None:
        labels = test_subset_labels

    projections = TSNE(**kwargs).fit_transform(raw_embeddings)
    # Name the columns after the actual number of output dimensions rather
    # than assuming exactly two.
    column_names = [f"tsne-{axis}" for axis in range(1, projections.shape[1] + 1)]
    tsne_embeddings = pd.DataFrame(projections, columns=column_names)

    # np.asarray avoids index alignment surprises if a pandas Series is passed.
    tsne_embeddings["Tissue type"] = np.asarray(labels)
    tsne_embeddings["Tissue type"] = tsne_embeddings["Tissue type"].astype(str).replace(label2id)
    return tsne_embeddings

def plot_tsne_embeddings(tsne_embeddings: pd.DataFrame, title: str, ax: Axes):
    """Scatter-plot 2-D t-SNE projections, coloured by tissue type.

    Args:
        tsne_embeddings: DataFrame with ``tsne-1``, ``tsne-2`` and
            ``Tissue type`` columns (the columns are looked up by name, so a
            bare numpy array will not work).
        title: Title set on the axes.
        ax: Matplotlib axes to draw on.

    Returns:
        The same ``ax``, after drawing.
    """
    sns.scatterplot(
        x="tsne-1", y="tsne-2",
        hue="Tissue type",
        palette=sns.color_palette("hls", 9),  # nine colours requested for the hue levels
        data=tsne_embeddings,
        legend="full",
        alpha=0.3,
        ax=ax
    )
    ax.set_title(title)
    return ax

A subset of 1,000 images from the original test set is considered for the inference.

In [None]:
subset_size = 1000

# ``test_dataset[:subset_size]`` would take the leading rows only, which is a
# representative sample only if the rows happen to be shuffled (image-folder
# datasets are commonly stored class by class). Draw a seeded random sample
# of row positions instead so every tissue type can appear in the plots.
_n_subset = min(subset_size, len(test_dataset))
_subset_rng = np.random.default_rng(0)
_subset_indices = sorted(
    _subset_rng.choice(len(test_dataset), size=_n_subset, replace=False).tolist()
)
test_subset = test_dataset[_subset_indices]
test_subset_labels = np.array(test_subset["label"])

print(f"Computing LORA and frozen models embeddings on {subset_size} test images...")
test_subset_embeddings_lora = get_raw_embeddings(
    model=lora_model, dataset=test_subset
)
test_subset_embeddings_frozen = get_raw_embeddings(
    model=frozen_model, dataset=test_subset
)

print("Computing tsne projections...")
tsne_embeddings_lora = get_tsne_embeddings(
    test_subset_embeddings_lora, n_components=2
)
tsne_embeddings_frozen = get_tsne_embeddings(
    test_subset_embeddings_frozen, n_components=2
)

NameError: name 'test_dataset' is not defined

The differences between the LoRA fine-tuned and frozen models are small due to the highly separable nature of the NCT-CRC prediction task (different tissues can be distinguished easily by the naked eye). However, we notice that LoRA fine-tuning better disentangles clusters such as Lymphocytes (yellow) and Tumor (red), which can play a significant role in cancer diagnosis.

In [None]:
print("Plotting in 2 dimensions.")
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# One panel per model: LoRA on the left, frozen on the right.
_panels = (
    (tsne_embeddings_lora, "Lora embeddings"),
    (tsne_embeddings_frozen, "Frozen embeddings"),
)
for _column, (_projections, _panel_title) in enumerate(_panels):
    axes[_column] = plot_tsne_embeddings(
        _projections, title=_panel_title, ax=axes[_column]
    )
plt.show()

In [None]:
# Tally how many training samples fall under each distinct label value.
_train_label_column = train_dataset['label']
unique_labels, counts = np.unique(_train_label_column, return_counts=True)

# One report line per label, in sorted label order.
for label, count in zip(unique_labels, counts):
    print(f"Label: {label}, Count: {count}")


Label: 0, Count: 62
Label: 1, Count: 60
Label: 2, Count: 54
Label: 3, Count: 51
Label: 4, Count: 52
Label: 5, Count: 60
Label: 6, Count: 52
Label: 7, Count: 43
Label: 8, Count: 66


In [None]:
import random

# Get the unique labels and their counts.
# NOTE: ``test_dataset`` is indexed by split here, so it must be the
# DatasetDict returned by ``load_dataset`` — not the flat Dataset that another
# cell rebinds the same name to.
unique_labels, counts = np.unique(test_dataset['train']['label'], return_counts=True)

In [None]:
# Group row positions by class label: {label: [row index, ...]}
class_indices = {}
for i, label in enumerate(test_dataset['train']['label']):
    class_indices.setdefault(label, []).append(i)

In [None]:
# Create a balanced subset of the test dataset
balanced_test_dataset = []

In [None]:
# Number of rows drawn from every class
samples_per_class = 10

# Initialize a list to store the indices
all_sampled_indices = []

for label in unique_labels:
    # Randomly select `samples_per_class` distinct rows of this class
    # (random.sample raises ValueError if the class has fewer rows than that)
    sampled_indices = random.sample(class_indices[label], samples_per_class)
    # Add the sampled indices to the overall list
    all_sampled_indices.extend(sampled_indices)


In [None]:
print(len(all_sampled_indices))
print(np.unique(all_sampled_indices))

90
[ 199  208  215  413  471  886  970 1101 1104 1170 1351 1379 1507 1533
 1786 1855 1880 1893 1951 2081 2192 2218 2238 2307 2377 2409 2411 2439
 2459 2480 2542 2624 2654 2702 2792 2825 2869 2886 2912 3151 3182 3327
 3408 3412 3728 3831 3892 4126 4162 4172 4295 4365 4368 4381 4408 4515
 4630 4647 4701 4703 4902 4945 5005 5028 5032 5086 5100 5216 5247 5424
 5557 5607 5637 5656 5657 5659 5765 5805 5817 5867 6221 6245 6271 6513
 6573 6627 6802 6863 6895 6983]


In [None]:
# Add the sampled images and labels to the balanced subset.
# Index the row first: ``split[index]`` reads a single row, whereas
# ``split['image'][index]`` reads the whole image column on every iteration
# (presumably why the original loop had to be interrupted).
_test_split = test_dataset['train']
for index in all_sampled_indices:
    _row = _test_split[int(index)]
    balanced_test_dataset.append({'image': _row['image'], 'label': _row['label']})

KeyboardInterrupt: 

In [None]:
from datasets import Dataset
#subset_dataset = Dataset.from_dict("image": [data["image"] for data in balanced_test_dataset], "label": [data["label"] for data in balanced_test_dataset])
#subset_dataset = Dataset.from_dict({'image': balanced_test_dataset["image"], "label": balanced_test_dataset["label"]})

# Pick the sampled rows directly from the split; `select` returns a Dataset
# that keeps both the 'image' and 'label' columns. (The original dict literal
# used unparenthesised generator expressions and did not parse.)
subset_dataset = test_dataset['train'].select(all_sampled_indices)

SyntaxError: invalid syntax (3909063921.py, line 5)

In [None]:
# prompt: The above cell is taking too long to run. Help me Parallelize the processing: The cell is currently processing the data sequentially. You can try parallelizing the processing to see if it improves the performance.

import multiprocessing as mp

def process_data(data):
    """Placeholder per-item transformation.

    The real processing has not been written yet: whatever ``data`` is, the
    function currently returns the ``...`` (Ellipsis) object.
    """
    # Perform data processing on each data point
    return ...

# Create a pool of worker processes. The context manager makes sure the
# workers are released even if the mapping raises.
with mp.Pool(processes=mp.cpu_count()) as pool:
    # Use the pool to process the data in parallel; `map` blocks until every
    # result is available, so leaving the `with` block afterwards is safe.
    processed_data = pool.map(process_data, dataset)

# Combine the processed data into a new dataset
# NOTE(review): `pool.map` returns a list, whereas `Dataset.from_dict` is
# given column dicts everywhere else in this notebook — confirm the intended
# constructor. `dataset` is also not defined in this cell.
processed_dataset = Dataset.from_dict(processed_data)


In [None]:
# prompt: There are 9 classes in test dataset. I want to make a balanced subset of test_dataset, containing 100 random samples from each class.

import random

# Read the split and its label column once; both are reused below.
_test_split = test_dataset['train']
_test_labels = _test_split['label']

# Get the unique labels and their counts
unique_labels, counts = np.unique(_test_labels, return_counts=True)

# Initialize a dictionary to store the indices of each class
class_indices = {}
for i, label in enumerate(_test_labels):
    class_indices.setdefault(label, []).append(i)

# Create a balanced subset of the test dataset
balanced_test_dataset = []
for label in unique_labels:
    # Randomly select 100 samples from each class
    # (random.sample raises ValueError if the class has fewer than 100 rows)
    sampled_indices = random.sample(class_indices[label], 100)
    # Add the sampled images and labels to the balanced subset.
    # Row-first indexing reads one row at a time; ``split['image'][index]``
    # would read the whole image column on every iteration.
    for index in sampled_indices:
        _row = _test_split[index]
        balanced_test_dataset.append({'image': _row['image'], 'label': _row['label']})

# Print the size of the balanced test dataset
print(f"Balanced test dataset size: {len(balanced_test_dataset)}")
'''
from datasets import Dataset
balanced_test_dict = {
    "image": [data["image"] for data in balanced_test_dataset],
    "label": [data["label"] for data in balanced_test_dataset],
}
balanced_test_dataset = Dataset.from_dict(balanced_test_dict)
'''

KeyboardInterrupt: 

In [None]:
# prompt: I want to know number of elements for each class in the balanced_test_dataset

# Collect the label of every record in the balanced subset, then tally them.
_balanced_labels = [record['label'] for record in balanced_test_dataset]
unique_labels, counts = np.unique(_balanced_labels, return_counts=True)

# One report line per class, in sorted label order.
for label, count in zip(unique_labels, counts):
    print(f"Label: {label}, Count: {count}")


Label: 0, Count: 100
Label: 1, Count: 100
Label: 2, Count: 100
Label: 3, Count: 100
Label: 4, Count: 100
Label: 5, Count: 100
Label: 6, Count: 100
Label: 7, Count: 100
Label: 8, Count: 100


In [None]:
from datasets import Dataset
# Pivot the list of {'image', 'label'} records into per-column lists, then
# wrap the columns in a `datasets.Dataset` (rebinding `balanced_test_dataset`).
balanced_test_dict = {
    column: [record[column] for record in balanced_test_dataset]
    for column in ("image", "label")
}
balanced_test_dataset = Dataset.from_dict(balanced_test_dict)

NameError: name 'balanced_test_dataset' is not defined