Before starting, you will need to install some packages to reproduce the baseline.

In [None]:
!pip install tqdm
!pip install scikit-learn

In [None]:
!pip install xgboost joblib

from pathlib import Path
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from xgboost import XGBClassifier
import optuna
from joblib import Parallel, delayed


# Data architecture

After downloading or unzipping the downloaded files, your data tree must have the following architecture in order to properly run the notebook:
```
your_data_dir/
├── train_output.csv
├── train_input/
│   ├── images/
│       ├── ID_001/
│           ├── ID_001_tile_000_17_170_43.jpg
...
│   └── moco_features/
│       ├── ID_001.npy
...
├── test_input/
│   ├── images/
│       ├── ID_003/
│           ├── ID_003_tile_000_16_114_93.jpg
...
│   └── moco_features/
│       ├── ID_003.npy
...
├── supplementary_data/
│   ├── baseline.ipynb
│   ├── test_metadata.csv
│   └── train_metadata.csv
```

For instance, `your_data_dir = /storage/DATA_CHALLENGE_ENS_2022/`


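This notebook builds on the baseline method of this challenge, called `MeanPool`. This method consists of a logistic regression learnt on top of tile-level MoCo V2 features averaged over the slides. The code below extends the baseline: the mean, standard deviation and max of the tile features are concatenated, and an XGBoost classifier tuned with Optuna replaces the logistic regression.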

For a given slide $s$ with $N_s=1000$ tiles and corresponding MoCo V2 features $\mathbf{K_s} \in \mathbb{R}^{(1000,\,2048)}$, a slide-level average is performed over the tile axis.

For $j=1,...,2048$:

$$\overline{\mathbf{k_s}}(j) = \frac{1}{N_s} \sum_{i=1}^{N_s} \mathbf{K_s}(i, j) $$

Thus, the training input data for MeanPool consists of $S_{\text{train}}=344$ mean feature vectors $\overline{\mathbf{k_s}}$, $s=1,...,S_{\text{train}}$, where $S_{\text{train}}$ denotes the number of training samples.

## Data loading

In [None]:
# Root directory of the challenge data; replace it with your own location
# (see the example in the `Data architecture` section).
data_dir = Path("Data")

# Directories holding the pre-computed MoCo feature files of each split.
train_features_dir = data_dir.joinpath("train_input", "moco_features")
test_features_dir = data_dir.joinpath("test_input", "moco_features")

# Metadata tables of the training and testing sets.
df_train = pd.read_csv(data_dir.joinpath("supplementary_data", "train_metadata.csv"))
df_test = pd.read_csv(data_dir.joinpath("supplementary_data", "test_metadata.csv"))

# Attach the training labels to the training metadata, matching rows on "Sample ID".
y_train = pd.read_csv(data_dir.joinpath("train_output.csv"))
df_train = pd.merge(df_train, y_train, on="Sample ID")

print(f"Training data dimensions: {df_train.shape}")  # (344, 4)
df_train.head()

## Data processing

We now load the feature matrices $\mathbf{K_s} \in \mathbb{R}^{(1000,\,2048)}$ for $s=1,...,344$ and perform slide-level pooling (mean, standard deviation and max over the tile axis). This operation should take at most 5 minutes on your laptop.

In [None]:
def process_sample(sample_info, features_dir):
    """Load one sample's feature file and pool its tile features into one vector.

    Parameters
    ----------
    sample_info : sequence of 4 items
        ``(sample, label, center, patient)``; ``sample`` is the name of the
        feature file inside ``features_dir``.
    features_dir : pathlib.Path
        Directory containing the ``.npy`` feature files.

    Returns
    -------
    tuple
        ``(pooled, label, center, patient)`` where ``pooled`` concatenates the
        per-column mean, standard deviation and max of the tile features.
    """
    sample, label, center, patient = sample_info
    matrix = np.load(features_dir / sample)
    # The first 3 columns hold the tile coordinates; only the remaining columns are pooled.
    tile_features = matrix[:, 3:]

    # Enhanced Pooling: Mean + Std + Max, each reduced over the tile axis
    pooled = np.concatenate(
        [reducer(tile_features, axis=0) for reducer in (np.mean, np.std, np.max)]
    )
    return pooled, label, center, patient

print("Starting parallel feature extraction (Train)...")

# One row per training sample: (sample file name, label, center, patient).
train_rows = df_train[["Sample ID", "Target", "Center ID", "Patient ID"]].values

# Pool every sample in parallel worker processes (n_jobs=-1 uses all cores).
pool_one = delayed(process_sample)
results = Parallel(n_jobs=-1)(
    pool_one(row, train_features_dir) for row in tqdm(train_rows)
)

# Split the (features, label, center, patient) tuples into one array per field.
X_train, y_train, centers_train, patients_train = (
    np.array([res[field] for res in results]) for field in range(4)
)

print(f"X_train shape: {X_train.shape}")


## 5-fold cross validation

In [None]:
# /!\ Splits are performed at the patient level, so that all samples from the
# same patient end up in the same split.

patients_unique = np.unique(patients_train)

# For each patient: the mean label of its samples and the center of its first sample.
patient_labels, patient_centers = [], []
for patient in patients_unique:
    belongs_to_patient = patients_train == patient
    patient_labels.append(np.mean(y_train[belongs_to_patient]))
    patient_centers.append(centers_train[belongs_to_patient][0])

y_unique = np.array(patient_labels)
centers_unique = np.array(patient_centers)

n_samples = len(X_train)
n_patients = len(patients_unique)
n_centers = len(np.unique(centers_unique))
print(
    "Training set specifications\n"
    "---------------------------\n"
    f"{n_samples} unique samples\n"
    f"{n_patients} unique patients\n"
    f"{n_centers} unique centers"
)

In [None]:
def objective(trial):
    """Optuna objective: mean validation AUC of XGBoost over a patient-level 5-fold CV.

    Hyper-parameters are sampled from ``trial``. One ``XGBClassifier`` is fitted per
    fold on the module-level ``X_train`` / ``y_train`` arrays. Folds are built on
    ``patients_unique`` (stratified on ``y_unique``) and then mapped back to sample
    indices, so that all samples from the same patient land in the same split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean ROC AUC over the 5 validation folds.
    """
    # `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`,
    # and plain `suggest_float` replaces the deprecated `suggest_uniform`.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5), # Small colsample for high dim
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'use_label_encoder': False,
        'eval_metric': 'auc',
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 0
    }

    aucs = []
    # 5-fold CV
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Loop-invariant helpers for mapping patient folds back to sample indices
    sample_positions = np.arange(len(X_train))
    patient_of_sample = pd.Series(patients_train)

    for train_idx_, val_idx_ in kfold.split(patients_unique, y_unique):
        # Map patient split to sample split
        train_idx = sample_positions[
            patient_of_sample.isin(patients_unique[train_idx_])
        ]
        val_idx = sample_positions[
            patient_of_sample.isin(patients_unique[val_idx_])
        ]

        X_fold_train, y_fold_train = X_train[train_idx], y_train[train_idx]
        X_fold_val, y_fold_val = X_train[val_idx], y_train[val_idx]

        model = XGBClassifier(**params)
        model.fit(X_fold_train, y_fold_train)
        # Probability of the positive class, scored with ROC AUC on the held-out fold
        preds_val = model.predict_proba(X_fold_val)[:, 1]
        aucs.append(roc_auc_score(y_fold_val, preds_val))

    return np.mean(aucs)

print("Starting Optuna optimization...")
# Seed the sampler so that the hyper-parameter search is reproducible across runs
# (the CV splitter and the classifiers are already seeded with 42).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20) # 20 trials to save time, increase if needed

print(f"Best params: {study.best_params}")

# Retrain with best parameters: the tuned values plus the fixed, non-tuned settings
best_params = {
    **study.best_params,
    'use_label_encoder': False,
    'eval_metric': 'auc',
    'random_state': 42,
    'n_jobs': -1,
}

lrs = [] # Reuse variable name for compatibility
aucs = []

print("Retraining ensemble with best params...")
# Same splitter configuration (n_splits, shuffle, random_state) as in `objective`
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Loop-invariant helpers for mapping patient folds back to sample indices
sample_positions = np.arange(len(X_train))
patient_of_sample = pd.Series(patients_train)

for train_idx_, val_idx_ in kfold.split(patients_unique, y_unique):
    # Map the patient-level split to a sample-level split
    train_idx = sample_positions[
        patient_of_sample.isin(patients_unique[train_idx_])
    ]
    val_idx = sample_positions[
        patient_of_sample.isin(patients_unique[val_idx_])
    ]

    X_fold_train = X_train[train_idx]
    y_fold_train = y_train[train_idx]
    X_fold_val = X_train[val_idx]
    y_fold_val = y_train[val_idx]

    model = XGBClassifier(**best_params)
    model.fit(X_fold_train, y_fold_train)

    preds_val = model.predict_proba(X_fold_val)[:, 1]
    auc = roc_auc_score(y_fold_val, preds_val)
    aucs.append(auc)
    # Keep every fold's model: they are ensembled at inference time
    lrs.append(model)

print(f"Mean AUC with Best XGBoost: {np.mean(aucs):.3f}")


# Submission

Now we evaluate the models previously trained through cross-validation in order to produce a submission file that can be uploaded directly to the data challenge platform.

## Data processing

In [None]:
def process_test_sample(sample, features_dir):
    """Load one test sample's feature file and pool its tile features.

    Parameters
    ----------
    sample : str
        Name of the feature file inside ``features_dir``.
    features_dir : pathlib.Path
        Directory containing the ``.npy`` feature files.

    Returns
    -------
    numpy.ndarray
        Concatenation of the per-column mean, standard deviation and max of the
        tile features (all columns after the first 3, which hold the coordinates).
    """
    tile_features = np.load(features_dir / sample)[:, 3:]
    return np.concatenate(
        (
            np.mean(tile_features, axis=0),
            np.std(tile_features, axis=0),
            np.max(tile_features, axis=0),
        )
    )

print("Starting parallel feature extraction (Test)...")

# Pool every test sample in parallel worker processes (n_jobs=-1 uses all cores).
test_samples = df_test["Sample ID"].values
pool_one_test = delayed(process_test_sample)
X_test_list = Parallel(n_jobs=-1)(
    pool_one_test(name, test_features_dir) for name in tqdm(test_samples)
)

# Stack the per-sample vectors into a single array.
X_test = np.array(X_test_list)
print(f"X_test shape: {X_test.shape}")


## Inference

In [None]:
# Ensembling: sum the positive-class probabilities predicted by every classifier
# kept from cross-validation, then divide by the number of classifiers.
preds_test = sum(lr.predict_proba(X_test)[:, 1] for lr in lrs) / len(lrs)

## Saving predictions

In [None]:
# One row per test sample with its ensembled prediction, ordered by sample identifier.
submission = pd.DataFrame(
    {"Sample ID": df_test["Sample ID"].values, "Target": preds_test}
)
submission = submission.sort_values(by="Sample ID")

# Write the submission as a csv file (without the index column), then preview it.
submission.to_csv(data_dir / "xgb_enhanced_submission.csv", index=False)
submission.head()


# Dealing with images

The following code aims to load and manipulate the images provided as part of this challenge.

## Scanning images paths on disk

This operation can take up to 5 minutes.

In [None]:
train_images_dir = data_dir / "train_input" / "images"
test_images_dir = data_dir / "test_input" / "images"

# Recursively collect every .jpg file found below each images directory.
train_images_files = [*train_images_dir.rglob("*.jpg")]
test_images_files = [*test_images_dir.rglob("*.jpg")]

n_train_images = len(train_images_files)  # 344 x 1000 = 344,000 tiles
n_test_images = len(test_images_files)  # 149 x 1000 = 149,000 tiles
print(
    f"Number of images\n"
    "-----------------\n"
    f"Train: {n_train_images}\n"
    f"Test: {n_test_images}\n"
    f"Total: {n_train_images + n_test_images}\n"  # 493 x 1000 = 493,000 tiles
)

## Reading

Now we can load some of the `.jpg` images for a given sample, say `ID_001`.

In [None]:
# Keep only the tiles of sample ID_001. The match is anchored on the "ID_001_"
# file-name prefix (a bare substring test could also match other names that merely
# contain "ID_001"), and the paths are sorted by file name so that the order is
# deterministic instead of depending on the directory scan order.
ID_001_tiles = sorted(
    (p for p in train_images_files if p.name.startswith("ID_001_")),
    key=lambda p: p.name,
)

In [None]:
# 5 x 5 grid showing the first 25 tiles of the sample
fig, axes = plt.subplots(5, 5)
fig.set_size_inches(12, 12)

for i, img_file in enumerate(ID_001_tiles[:25]):
    # Get the metadata from the file name only (not the full path), so that a
    # "tile_" substring in a parent directory name cannot break the unpacking.
    # Name layout, e.g. ID_001_tile_000_17_170_43.jpg -> id_tile, level, x, y
    _, metadata = img_file.stem.split("tile_", 1)
    id_tile, level, x, y = metadata.split("_")
    img = plt.imread(img_file)
    # Row-major placement in the grid
    ax = axes[i//5, i%5]
    ax.imshow(img)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(f"Tile {id_tile} ({x}, {y})")
plt.show()

## Mapping with features

Note that the coordinates in the feature matrices and the tile numbers are aligned.

In [None]:
sample = "ID_001.npy"
_features = np.load(train_features_dir / sample)
# Split the columns after the third one: coordinates on the left, features on the right.
coordinates, features = np.hsplit(_features, [3])
print("xy features coordinates")
# Show columns 1 and 2 of the first 10 coordinate rows as integers.
coordinates[:10, 1:].astype(int)

In [None]:
print("Tiles numbering and features coordinates\n")
# File names of the first 10 tiles, to compare with the coordinates printed from the features.
[tile_path.name for tile_path in ID_001_tiles[:10]]