Before starting, you will need to install some packages to reproduce the baseline.

In [None]:
!pip install tqdm
!pip install scikit-learn

In [None]:
from pathlib import Path
from tqdm import tqdm

import logging

import numpy as np
import pandas as pd

import torch

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

In [None]:
# import data
PATH_COLAB = '/content/drive/MyDrive/challenge_ens_2023_small/moco_features.zip'
PATH_DEVICE = '..'
try:
    from google.colab import drive
    logging.info('Working on Colab.')
    
    # connect your drive to the session
    drive.mount('/content/drive')

    %cd /content/drive/MyDrive/challenge_data_ens_small/

    # unzip data into the colab session
    ! unzip $PATH_COLAB -d /content
    logging.info('Data unziped in your Drive.')

    %cd /content

    %cp -R drive/MyDrive/challenge_ens_2023_small/supplementary_data/ .
    %cp drive/MyDrive/challenge_ens_2023_small/train_output.csv .


except:
    logging.info('Working on your device.')
    
    data_exists = os.path.exists(PATH_DEVICE + '/train_input') and os.path.exists(PATH_DEVICE + '/test_input') and os.path.exists(PATH_DEVICE + '/train_output.csv')
    
    if data_exists:
        logging.info(f"Dataset found on device at : '{PATH_DEVICE}.'") 
    else:
        raise FileNotFoundError(f"Data folder not found at '{PATH_DEVICE}'")

In [None]:
# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Data architecture

After downloading or unzipping the downloaded files, your data tree must have the following architecture in order to properly run the notebook:
```
your_data_dir/
├── train_output.csv
├── train_input/
│   ├── images/
│       ├── ID_001/
│           ├── ID_001_tile_000_17_170_43.jpg
...
│   └── moco_features/
│       ├── ID_001.npy
...
├── test_input/
│   ├── images/
│       ├── ID_003/
│           ├── ID_003_tile_000_16_114_93.jpg
...
│   └── moco_features/
│       ├── ID_003.npy
...
├── supplementary_data/
│   ├── baseline.ipynb
│   ├── test_metadata.csv
│   └── train_metadata.csv
```

For instance, `your_data_dir = /storage/DATA_CHALLENGE_ENS_2022/`


This notebook aims to reproduce the baseline method on this challenge called `MeanPool`. This method consists in a logistic regression learnt on top of tile-level MoCo V2 features averaged over the slides.

For a given slide $s$ with $N_s=1000$ tiles and corresponding MoCo V2 features $\mathbf{K_s} \in \mathbb{R}^{(1000,\,2048)}$, a slide-level average is performed over the tile axis.

For $j=1,...,2048$:

$$\overline{\mathbf{k_s}}(j) = \frac{1}{N_s} \sum_{i=1}^{N_s} \mathbf{K_s}(i, j) $$

Thus, the training input data for MeanPool consists of $S_{\text{train}}=344$ mean feature vectors $\overline{\mathbf{k_s}}$, $s=1,...,S_{\text{train}}$, where $S_{\text{train}}$ denotes the number of training samples.

## Data loading

In [None]:
# put your own path to the data root directory (see example in `Data architecture` section)
data_dir = Path("../")

# directories holding the MoCo V2 feature files of the training and testing sets
train_features_dir = data_dir.joinpath("train_input", "moco_features")
test_features_dir = data_dir.joinpath("test_input", "moco_features")

# metadata tables of the training and testing sets
df_train = pd.read_csv(data_dir.joinpath("supplementary_data", "train_metadata.csv"))
df_test = pd.read_csv(data_dir.joinpath("supplementary_data", "test_metadata.csv"))

# attach the labels to the training metadata, matching rows on "Sample ID"
y_train = pd.read_csv(data_dir.joinpath("train_output.csv"))
df_train = pd.merge(df_train, y_train, on="Sample ID")

print(f"Training data dimensions: {df_train.shape}")  # (344, 4)
df_train.head()

## Data processing

We now load the features matrices $\mathbf{K_s} \in \mathbb{R}^{(1000,\,2048)}$ for $s=1,...,344$ and perform slide-level averaging. This operation should take at most 5 minutes on your laptop.

In [None]:
# The feature matrices are not loaded in this cell: `MyDataset` reads each
# sample's file from disk when the sample is requested.
# Only the center and patient IDs are collected here; they drive the
# patient-level cross-validation split further down.
# `.tolist()` followed by `np.array` yields the same array dtype as building the
# arrays from a Python list of row values.
centers_train = np.array(df_train["Center ID"].tolist())
patients_train = np.array(df_train["Patient ID"].tolist())

## Multilayer perceptron with a max over an image

In [None]:
from torch import nn
from torch.functional import F

class Perceptron(torch.nn.Module):
    """Tile-level MLP whose slide score is the maximum tile probability.

    Every tile feature vector (2048 values) goes through
    Linear(2048, 512) -> ReLU -> Dropout(0.3) -> Linear(512, 1) -> Sigmoid.
    The resulting per-tile probabilities are then reduced with a max over the
    tile axis, giving one probability per slide.
    """

    def __init__(self):
        super().__init__()

        # attribute names are kept as-is: they are the keys of the state dict
        self.input = nn.Linear(2048, 512)
        self.hidden = nn.Linear(512, 1)
        self.dropout = nn.Dropout(p=0.3)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of slides.

        Args:
            x: tensor of shape (batch, n_tiles, 2048).

        Returns:
            Tensor of shape (batch,) holding, for each slide, the largest
            tile probability.
        """
        x = self.relu(self.input(x))
        x = self.dropout(x)
        x = self.sigmoid(self.hidden(x))

        # Drop only the trailing singleton axis. A bare `squeeze()` would also
        # drop the batch axis when the batch holds a single slide, and the max
        # over dim=1 below would then fail.
        x = x.squeeze(-1)

        # max over the tiles of each slide
        return torch.max(x, dim=1).values

In [None]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Dataset serving the MoCo V2 tile features of one sample per item.

    Args:
        df: table with a "Sample ID" column holding the feature file name of
            each sample and, unless `test` is True, a "Target" column.
        features_dir: directory containing the feature files (a `Path`).
        test: when True, items carry no label.
    """

    def __init__(self, df, features_dir, test = False):
        self.df_train = df
        self.features_dir = features_dir
        self.test = test

    def __len__(self):
        return len(self.df_train)

    def __getitem__(self, idx):
        """Return `{'x': features}` (test mode) or `{'x': features, 'y': label}`."""
        # Read the row from the table given to THIS dataset, not from a
        # notebook-level frame: subsets, folds and the test set each carry
        # their own rows.
        row = self.df_train.iloc[idx]

        # load the coordinates and features (n_tiles, 3+2048)
        _features = np.load(self.features_dir / row["Sample ID"])
        # the first 3 columns are the tile coordinates (zoom level, x, y);
        # only the MoCo V2 features that follow are served
        features = _features[:, 3:]

        if self.test:
            return {'x': features}
        return {'x': features, 'y': row["Target"]}

In [None]:
# Hold-out split by row position: the first `N_split` rows are used for training,
# the remaining rows for validation.
N_split = 150
train_dataset = MyDataset(df_train.head(N_split), train_features_dir)
val_dataset = MyDataset(df_train.iloc[N_split:], train_features_dir)


In [None]:
from torch.utils.data import DataLoader


# mini-batches of 43 samples, served in dataset order (no shuffling)
val_loader = DataLoader(val_dataset, batch_size=43)
loader = DataLoader(train_dataset, batch_size=43)



In [None]:
from torch.nn import BCELoss
from torch.optim import Adam

# the classifier, its binary cross-entropy loss and its optimizer
model = Perceptron()
model = model.to(device)
criterion = BCELoss()
criterion = criterion.to(device)
optimizer = Adam(params=model.parameters(), lr=1e-3)

In [None]:
def train(loader):
    """Run one training epoch and return the mean batch loss.

    Uses the notebook-level `model`, `criterion`, `optimizer` and `device`.

    Args:
        loader: iterable of batches, each a mapping holding the inputs under
            'x' and the binary labels under 'y'.

    Returns:
        float: arithmetic mean of the per-batch losses (0.0 for an empty loader).
    """
    model.train()
    total_loss = 0.0
    n_batches = 0
    for batch in loader:
        labels = batch['y'].to(device)
        x = batch['x'].to(device)
        optimizer.zero_grad()
        output = model(x)

        loss = criterion(output, labels.float())
        loss.backward()
        optimizer.step()

        # accumulate plain Python floats so no autograd tensor is kept alive
        total_loss += loss.item()
        n_batches += 1
    return total_loss / n_batches if n_batches else 0.0

In [None]:
def train_fold(model, loader, val_loader=None, n_epoch = 10, verbose=1):
    """Train for `n_epoch` epochs, optionally reporting a validation loss.

    Training itself is delegated to `train(loader)`, which operates on the
    notebook-level `model`, `criterion` and `optimizer`.
    NOTE(review): the `model` argument is only used for the validation pass, so
    callers must make sure it is the same object as the notebook-level `model`.

    Args:
        model: network evaluated on `val_loader`.
        loader: training batches, forwarded to `train`.
        val_loader: optional iterable of validation batches (mappings with
            'x' and 'y'); when omitted or empty the reported validation loss is 0.
        n_epoch: number of epochs to run.
        verbose: when truthy, print the losses after every epoch.
    """
    if val_loader is None:
        val_loader = []

    for epoch in range(n_epoch):
        loss = train(loader)

        val_loss = 0.0
        n_val_batches = 0
        # no gradients are needed to measure the validation loss
        with torch.no_grad():
            for val in val_loader:
                # `train` switches the model back to training mode every epoch
                model.eval()

                labels = val['y'].to(device)
                x = val['x'].to(device)

                output = model(x)

                val_loss += criterion(output, labels.float()).item()
                n_val_batches += 1
        if n_val_batches:
            val_loss /= n_val_batches

        if verbose:
            print(f"Epoch {epoch} - loss {loss:.4f} - val loss {val_loss:.4f}")

In [None]:
# Pass the hold-out loader as well, so that the validation loss printed after each
# epoch is measured on `val_loader` rather than left at 0.
train_fold(model, loader, val_loader=val_loader, n_epoch=10, verbose=1)

In [None]:
# AUC of the trained model on the batches served by `loader`.
# NOTE(review): `loader` wraps the training split, so this is a training AUC;
# iterate over `val_loader` instead if a hold-out estimate is intended.
y_trues = []
preds = []
model.eval()
with torch.no_grad():  # inference only
    for val in loader:
        x = val['x'].to(device)

        # move the predictions back to the CPU before converting them:
        # `Tensor.numpy()` does not accept CUDA tensors
        pred = model(x).cpu().numpy().tolist()
        y_trues.extend(val['y'].numpy().tolist())
        preds.extend(pred)

auc = roc_auc_score(y_trues, preds)

print(auc)

## 5-fold cross validation

In [None]:
# /!\ we perform splits at the patient level so that all samples from the same patient
# are found in the same split

# Labels are taken from `df_train` so that they form a 1-d array aligned row by row
# with `patients_train` and `centers_train`; `y_train` is the label table itself
# (a DataFrame) and cannot be averaged with a boolean mask.
labels_train = df_train["Target"].to_numpy()

patients_unique = np.unique(patients_train)
# one label per patient: the mean label of the patient's samples
y_unique = np.array(
    [np.mean(labels_train[patients_train == p]) for p in patients_unique]
)
# one center per patient: the center of the patient's first sample
centers_unique = np.array(
    [centers_train[patients_train == p][0] for p in patients_unique]
)

print(
    "Training set specifications\n"
    "---------------------------\n"
    f"{len(df_train)} unique samples\n"
    f"{len(patients_unique)} unique patients\n"
    f"{len(np.unique(centers_unique))} unique centers"
)

In [None]:
aucs = []
models = []
# 5-fold CV is repeated 5 times with different random states
for k in range(5):
    kfold = StratifiedKFold(5, shuffle=True, random_state=k)
    # split is performed at the patient-level
    for fold, (train_idx_, val_idx_) in enumerate(
        kfold.split(patients_unique, y_unique)
    ):
        # retrieve the row positions of the samples belonging to the
        # patients in `train_idx_` and `val_idx_`
        train_idx = np.flatnonzero(
            np.isin(patients_train, patients_unique[train_idx_])
        )
        val_idx = np.flatnonzero(
            np.isin(patients_train, patients_unique[val_idx_])
        )

        # set the training and validation folds
        df_fold_train = df_train.iloc[train_idx]
        data_fold_train = MyDataset(df_fold_train, train_features_dir)
        loader_fold_train = DataLoader(data_fold_train, batch_size=40, shuffle=True)

        df_fold_val = df_train.iloc[val_idx]
        data_fold_val = MyDataset(df_fold_val, train_features_dir)
        # shuffling brings nothing at evaluation time
        loader_fold_val = DataLoader(data_fold_val, batch_size=40, shuffle=False)

        # instantiate a fresh model, loss and optimizer for this fold.
        # These must stay notebook-level names: `train` reads `model`,
        # `criterion` and `optimizer` as globals.
        model = Perceptron().to(device)
        criterion = BCELoss().to(device)
        optimizer = Adam(model.parameters(), lr=0.001)

        train_fold(model, loader_fold_train, n_epoch=20)

        # get the predictions (1-d probability)
        y_trues = []
        preds = []
        model.eval()
        with torch.no_grad():  # inference only
            for val in loader_fold_val:
                x = val['x'].to(device)

                # `Tensor.numpy()` does not accept CUDA tensors, hence `.cpu()`
                pred = model(x).cpu().numpy().tolist()
                y_trues.extend(val['y'].numpy().tolist())
                preds.extend(pred)

        auc = roc_auc_score(y_trues, preds)

        print(f"AUC on split {k} fold {fold}: {auc:.3f}")
        aucs.append(auc)
        # keep the trained network for the test-time ensemble
        models.append(model)
    print("----------------------------")
print(
    f"5-fold cross-validated AUC averaged over {k+1} repeats: "
    f"{np.mean(aucs):.3f} ({np.std(aucs):.3f})"
)

# Submission

Now we evaluate the previous models trained through cross-validation in order to produce a submission file that can be uploaded directly to the data challenge platform.

## Data processing

In [None]:
# unlabeled test samples, served in the row order of `df_test`
dataset_test = MyDataset(df=df_test, features_dir=test_features_dir, test=True)

loader_test = DataLoader(dataset=dataset_test, batch_size=20, shuffle=False)

## Inference

In [None]:
preds_test = 0

# loop over the classifiers trained during cross-validation
for fold_model in models:
    # predict with the classifier of the current iteration (and not with the
    # notebook-level `model`, which is only the last one that was trained)
    fold_model.eval()

    preds = []
    with torch.no_grad():  # inference only
        for val in loader_test:
            x = val['x'].to(device)

            # `Tensor.numpy()` does not accept CUDA tensors, hence `.cpu()`
            pred = fold_model(x).cpu().numpy().tolist()
            preds.extend(pred)

    preds_test = preds_test + np.array(preds)
# and take the average (ensembling technique)
preds_test = preds_test / len(models)

## Saving predictions

In [None]:
# one row per test sample, ordered by sample ID
submission = pd.DataFrame(
    {"Sample ID": df_test["Sample ID"].values, "Target": preds_test}
)
submission = submission.sort_values("Sample ID")

# sanity checks
assert submission["Target"].between(0, 1).all(), "`Target` values must be in [0, 1]"
assert submission.shape == (149, 2), "Your submission file must be of shape (149, 2)"
assert submission.columns.tolist() == [
    "Sample ID",
    "Target",
], "Your submission file must have columns `Sample ID` and `Target`"

# write the submission next to the data, without the index column
submission.to_csv(data_dir / "benchmark_test_output.csv", index=None)
submission.head()

# Dealing with images

The following code aims to load and manipulate the images provided as part of  this challenge.

## Scanning images paths on disk

This operation can take up to 5 minutes.

In [None]:
# recursively collect every `.jpg` tile under the train and test image folders
train_images_dir = data_dir.joinpath("train_input", "images")
test_images_dir = data_dir.joinpath("test_input", "images")

train_images_files = [path for path in train_images_dir.rglob("*.jpg")]
test_images_files = [path for path in test_images_dir.rglob("*.jpg")]

print(
    f"Number of images\n",
    "-----------------\n",
    f"Train: {len(train_images_files)}\n",  # 344 x 1000 = 344,000 tiles
    f"Test: {len(test_images_files)}\n",  # 149 x 1000 = 149,000 tiles
    f"Total: {len(train_images_files) + len(test_images_files)}\n",  # 493 x 1000 = 493,000 tiles
    sep="",
)

## Reading

Now we can load some of the `.jpg` images for a given sample, say `ID_001`.

In [None]:
# Tiles of sample ID_001. They are sorted by file name because `rglob` yields
# paths in no particular order, while the cells below look at the "first" tiles.
# The `ID_001_` prefix (underscore included) keeps only this sample's files.
ID_001_tiles = sorted(
    (p for p in train_images_files if p.name.startswith('ID_001_')),
    key=lambda p: p.name,
)

In [None]:
fig, axes = plt.subplots(5, 5)
fig.set_size_inches(12, 12)

# hide the ticks of every panel, including panels left empty when the sample
# has fewer than 25 tiles
for ax in axes.flat:
    ax.set_xticks([])
    ax.set_yticks([])

for ax, img_file in zip(axes.flat, ID_001_tiles[:25]):
    # Get the metadata from the file name only, e.g. `ID_001_tile_000_17_170_43`:
    # the last four `_`-separated fields are the tile number, zoom level, x and y.
    # Parsing the stem instead of the full path keeps this working when a parent
    # directory name happens to contain `tile_`.
    _, id_tile, _level, x, y = img_file.stem.rsplit("_", 4)
    img = plt.imread(img_file)
    ax.imshow(img)
    ax.set_title(f"Tile {id_tile} ({x}, {y})")
plt.show()

## Mapping with features

Note that the coordinates in the feature matrices and the tile numbers are aligned.

In [None]:
# feature file of sample ID_001: the first 3 columns hold the coordinates,
# the remaining columns hold the features
sample = "ID_001.npy"
_features = np.load(train_features_dir / sample)
coordinates = _features[:, :3]
features = _features[:, 3:]
print("xy features coordinates")
# the last two coordinate columns of the first 10 rows, as integers
coordinates[:10, 1:].astype(int)

In [None]:
# file names of the first 10 tiles, to compare with the coordinates printed above
print("Tiles numbering and features coordinates\n")
[tile_path.name for tile_path in ID_001_tiles[:10]]