In [None]:
!kaggle competitions download -c histopathologic-cancer-detection --path kaggle --quiet
!cd kaggle; unzip -qq histopathologic-cancer-detection.zip

In [None]:
import pandas as pd
import cv2
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms as T, models

from pl_flash import Flash
from pl_flash.vision import ImageClassificationData
from pytorch_lightning import Trainer
import pytorch_lightning.metrics.functional as FM

### Data Augmentation

Challenge-specific data augmentation. According to the challenge, "A positive label indicates that the center 32x32px region of a patch contains at least one pixel of tumor tissue. Tumor tissue in the outer region of the patch does not influence the label." For this reason, we center-crop the images to 49x49px, which keeps the label-relevant 32x32px center plus a small margin of surrounding context.

In [None]:
# Per-channel mean/std normalisation; these are the standard ImageNet
# statistics that torchvision's pretrained models expect.
normalize = T.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Training pipeline: random flips and a rotation of up to +/-20 degrees are
# applied to the full patch first, then the patch is centre-cropped to
# 49x49, converted to a tensor and normalised.
train_transforms = T.Compose(
    [
        T.RandomHorizontalFlip(),
        T.RandomVerticalFlip(),
        T.RandomRotation(degrees=20),
    ]
    + [
        T.CenterCrop(size=(49, 49)),
        T.ToTensor(),
        normalize,
    ]
)

# Validation pipeline: the same crop/tensor/normalise steps, with no random
# augmentation.
valid_transforms = T.Compose(
    [
        T.CenterCrop(size=(49, 49)),
        T.ToTensor(),
        normalize,
    ]
)

### Data

We can use a `pl_flash.vision.ImageClassificationData` object to load our data:

In [None]:
# Label table shipped with the competition data; the cells below read its
# "id" and "label" columns.
df = pd.read_csv("kaggle/train_labels.csv")

# Positional 80/20 split in file order (no shuffling): the first 80% of rows
# are used for training, the remainder for validation.
split = int(len(df) * 0.8)
train_df = df.iloc[:split]
val_df = df.iloc[split:]

def filepaths(df):
    """Return the image path for every entry in ``df["id"]``.

    Each id is mapped to ``kaggle/train/<id>.tif``; the order of the
    returned list follows the iteration order of ``df["id"]``.
    """
    paths = []
    for image_id in df["id"]:
        paths.append(f"kaggle/train/{image_id}.tif")
    return paths

# Resolve file paths and labels for each split up front so the constructor
# call below reads as a plain mapping of split -> (paths, labels, transform).
train_paths = filepaths(train_df)
train_targets = list(train_df["label"])
val_paths = filepaths(val_df)
val_targets = list(val_df["label"])

data = ImageClassificationData.from_filepaths(
    # training split: augmented pipeline
    train_filepaths=train_paths,
    train_labels=train_targets,
    train_transform=train_transforms,
    # validation split: deterministic pipeline
    valid_filepaths=val_paths,
    valid_labels=val_targets,
    valid_transform=valid_transforms,
)

### Model

For our model architecture, we'll use a pretrained ResNet-50 as the backbone, followed by a small classification head (max pooling, batch normalization, dropout, and two linear layers). We can then create a ready-to-train `Flash` model, simply by specifying our loss function, optimizer, learning rate, and desired metric(s).

In [None]:
# ImageNet-pretrained ResNet-50; the width of its original classifier input
# is reused as the input width of the new head.
resnet = models.resnet50(pretrained=True)
in_features = resnet.fc.in_features

# Convolutional trunk only: drop the last two children of the torchvision
# ResNet (its average-pool layer and its fully connected classifier).
backbone = nn.Sequential(*list(resnet.children())[:-2])

model = nn.Sequential(
    backbone,
    # Collapse the spatial feature map to one value per channel.
    nn.AdaptiveMaxPool2d(output_size=1),
    nn.Flatten(),
    # Classification head: BN -> dropout -> linear, twice, ending in 2 logits.
    nn.BatchNorm1d(num_features=in_features),
    nn.Dropout(p=0.5),
    nn.Linear(in_features, 512),
    nn.ReLU(),
    nn.BatchNorm1d(num_features=512),
    nn.Dropout(p=0.5),
    nn.Linear(512, 2),
)

def auroc(x, y):
    """Compute AUROC from two-class logits.

    ``x`` is softmaxed along dimension 1 and the column at index 1 is used
    as the score passed to ``FM.auroc`` together with the targets ``y``.
    Assumes ``x`` is shaped (batch, 2), matching the model's final linear
    layer, and that ``y`` holds the binary labels - confirm against how
    ``Flash`` invokes metrics.
    """
    class_probs = F.softmax(x, dim=1)
    positive_scores = class_probs[:, 1]
    return FM.auroc(positive_scores, y)

# Wrap the plain nn.Module in a Flash task: cross-entropy over the two
# logits, AUROC reported under the name "auroc", and SGD with lr 0.01.
# NOTE(review): the optimizer is passed by name; presumably Flash resolves
# "SGD" to the torch optimizer of that name - confirm against pl_flash.
flash_model = Flash(
    model,
    loss=F.cross_entropy,
    metrics=dict(auroc=auroc),
    optimizer="SGD",
    learning_rate=0.01,
)

### Train

Now our `Flash` model can be used like any other PyTorch Lightning model:

In [None]:
# Train for at most 20 epochs on a single GPU.
trainer = Trainer(max_epochs=20, gpus=1)

# `data` supplies the training and validation loaders built above.
trainer.fit(flash_model, data)

In [None]:
# Final evaluation pass. It runs on the validation split, since this
# notebook loads no separate labelled test set.
val_loader = data.val_dataloader()
trainer.test(flash_model, val_loader)

AUROC on the validation split: 0.9739, which corresponds to roughly the top 10% on the leaderboard.

See the resulting training logs on [TensorBoard.dev](https://tensorboard.dev/experiment/ewumij9mQDy1wg46jegVDw/).