# Pretrained model for feature extraction

Goal: we first test a pretrained embedding model from Owkin.

In [None]:
from urllib.request import urlopen

import timm
import torch
from PIL import Image

# Fetch the example histology image. The HTTP response is closed as soon as
# the pixels have been decoded; convert("RGB") forces that decode and mirrors
# how the batch pipeline in this notebook loads its patches.
with urlopen(
    "https://github.com/owkin/HistoSSLscaling/raw/main/assets/example.tif"
) as response:
    img = Image.open(response).convert("RGB")

# Load the pretrained model from the hub and switch it to inference mode
model = timm.create_model(
    model_name="hf-hub:1aurent/vit_base_patch16_224.owkin_pancancer",
    pretrained=True,
)
model.eval()

# Get model specific transforms (normalization, resize)
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(**data_config, is_training=False)

# Input is a (batch_size, num_channels, img_size, img_size) shaped tensor
data = transforms(img).unsqueeze(0)

# Output is a (batch_size, num_features) shaped tensor. Gradients are not
# needed for feature extraction, so skip building the autograd graph.
with torch.no_grad():
    output = model(data)

output.shape

## Creating an embedding pipeline

Convert JPG/PNG patches into .npy files

In [None]:
import os
from pathlib import Path
from urllib.request import urlopen

import numpy as np
import timm
import torch
from PIL import Image
from torchvision import transforms as T
from tqdm import tqdm


def main(
    image_folder: Path,
    output_folder: Path,
    device: "str | None" = None,
    extensions: tuple = (".jpg", ".jpeg", ".png"),
) -> None:
    """Embed every image patch of a folder and save one ``.npy`` file per patch.

    Each image is converted to RGB, passed through the model-specific
    transforms and the pretrained model, and the resulting embedding is
    written to ``output_folder / (<image stem> + ".npy")``.

    Args:
        image_folder: Folder containing the image patches to embed.
        output_folder: Destination folder; created (with parents) if missing.
        device: Torch device to run on (e.g. ``"mps"``, ``"cuda"``, ``"cpu"``).
            When ``None``, the first available of CUDA, MPS, CPU is used.
        extensions: File suffixes (matched case-insensitively) that are
            treated as images. Two images that share a stem but differ in
            suffix would write to the same ``.npy`` file.

    Raises:
        FileNotFoundError: If ``image_folder`` does not exist.

    Errors on individual images are reported and skipped so that one bad
    patch does not abort the whole run.
    """
    # Ensure output_folder exists
    output_folder.mkdir(parents=True, exist_ok=True)

    # Collect the images first, in a deterministic order, so that an empty
    # folder is reported before the (slow) model load.
    suffixes = tuple(ext.lower() for ext in extensions)
    imgs = sorted(
        path
        for path in image_folder.iterdir()
        if path.is_file() and path.suffix.lower() in suffixes
    )
    if not imgs:
        print(f"No images with extensions {suffixes} found in {image_folder}")
        return

    run_device = _resolve_device(device)

    # Load model from the hub
    model = timm.create_model(
        model_name="hf-hub:1aurent/vit_base_patch16_224.owkin_pancancer",
        pretrained=True,
    )
    model.eval()
    model.to(run_device)

    # Get model specific transforms (normalization, resize)
    data_config = timm.data.resolve_model_data_config(model)
    transforms = timm.data.create_transform(**data_config, is_training=False)

    # Process each image in the folder
    failed = []
    for image_path in tqdm(imgs, total=len(imgs)):
        try:
            # Load image; the context manager releases the file handle
            with Image.open(image_path) as raw:
                img = raw.convert("RGB")

            # Apply transformations and add the batch dimension
            data = transforms(img).unsqueeze(0).to(run_device)

            # Generate embeddings
            with torch.no_grad():
                output = model(data)

            # Store embeddings (batch dimension removed)
            embedding = output.squeeze(0).cpu().numpy()
            np.save(output_folder / (image_path.stem + ".npy"), embedding)
        except Exception as e:
            # Best effort: report the failure and continue with the next patch
            failed.append(image_path.name)
            print(f"Error processing {image_path.name}: {e}")

    print(f"Embeddings saved to {output_folder}")
    if failed:
        print(f"{len(failed)} of {len(imgs)} images could not be processed")


def _resolve_device(device=None):
    """Return the torch device to use, auto-detecting one when none is given."""
    if device is not None:
        return torch.device(device)
    if torch.cuda.is_available():
        return torch.device("cuda")
    # torch.backends.mps may be absent on older torch builds
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")


Let's embed all the patches of a slide

Speed on an M2: 5569 patches in [02:29<00:00, 37.16 patches/s]

In [None]:
# Input patches of the slide to embed
patches_folder = "../processed/2qj5MlLLBT_a/patches"
# Alternative input: patches_folder = "../image_to_embed/"

# Destination folder for the .npy embeddings
output_folder = "../embeddings/"

main(image_folder=Path(patches_folder), output_folder=Path(output_folder))

We can then read the `.npy` embeddings back as NumPy arrays (and convert them to tensors if needed).

In [None]:
# Open one npy file
import numpy as np

# Path of a single saved embedding to inspect
embedding_file = "../embeddings/27_589_6912_150784_0_256_256.npy"

# Read the array back from disk, then show its values followed by its shape
embeddings = np.load(file=embedding_file)
print(embeddings, f"Shape of the embeddings: {embeddings.shape}", sep="\n")


# Embedding analyses

Let's analyze how the patches are distributed in the latent space, using UMAP for dimensionality reduction.

In [None]:
# Collect only the saved embeddings (skipping stray files such as .DS_Store)
# in a sorted, reproducible order.
npy_files = sorted(Path(output_folder).glob("*.npy"))
if not npy_files:
    raise FileNotFoundError(f"No .npy embeddings found in {output_folder}")

# Stack into one array with a leading per-file axis; np.stack raises if any
# file has a different shape instead of silently building a ragged array.
# Presumably each file holds one 1-D feature vector as written by main().
embeddings = np.stack([np.load(npy_file) for npy_file in npy_files])
embeddings.shape

In [None]:
import umap

# UMAP is stochastic: fix the seed so the projection (and any cluster seen in
# it) is reproducible across kernel restarts. Per the UMAP documentation this
# trades away multi-threading for determinism.
reducer = umap.UMAP(random_state=42)

In [None]:
# Fit the reducer on the patch embeddings and project them in one step;
# the two columns of `reduced` are what the scatter plot below draws.
reduced = reducer.fit_transform(X=embeddings)
reduced.shape

In [None]:
import matplotlib.pyplot as plt

# Explicit figure/axes interface, with labels so the figure stands alone.
# Small, semi-transparent markers keep thousands of patches readable.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(reduced[:, 0], reduced[:, 1], s=4, alpha=0.5)
ax.set_title("UMAP projection of patch embeddings")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
plt.show()

# Clusters appear; it would be interesting to analyze them.
# One cluster is presumably glass (slide background) - TODO verify.