Loading data from Kaggle

In [None]:


import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'pretrained-pytorch-models:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2847%2F4958%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240407%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240407T232853Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2a8474fe76ee211e2cfa9ee3c8e7dc22f46ae7b29489e82f5c8cb0bf6adf7d5df83f271b0d237fa152c9fb8fc9fd958dee0921b77369d028ba144d3d2fb02dcb4e1218a8bb46ae2a2d80a0439204ec97026aef186ad69b6cdfdf283ea184df0d02f8017bc9bc31a6cd95ac171fe37f125f9038be629493069b6844039f931e9d79c9ad5d61b19d20517f28f6cb5a4332e2f99fc2f2e57429f4d3951fd630939a30485a5c59370ebdeb50e2cd1fc7060ccffe8fe670b4b865a2c8f62d683b5e34b9504af7125e58dd652da21810fc8add946c3332c890335477ed815a04448d78105480c94f6193739fd1a0ead9bc213a1826b3b41dc2e89d677339399cc7c23a,breast-histopathology-images:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F7415%2F10564%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240407%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240407T232853Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9deceb82ff80442191eb8b09f97da393680c902d986e1b7db39e0b46599a394d0b0dcfe3d130b36f3a21ae3e0bf918e27d7d8fb7aecbc6d0597d02dbad0eb5cf89ea15f423280b8747e033033949c84bea04d01a3aee3454e20c4c7fe86ab2bd8e5859bbb20d8ce04f746fad778230a863926e4cc322e9d410a33d14da22dac3a334a804cbdb605b582235823273d11f9b58e1fc3d847bc8cd482fc14a9dbc93b43e19d74e373b75659bedb8a5d2c862956ac801f01b6361eeecde40359981fe8d9538817d348a44bfcb63ac9cdb035696167cc0c49ef96891e88dcf7b64a8878f9c3422b4371cdf4e4bf37bd0282255dca87dfe8b8d0da906ec7aaa92e1c757,breastcancermodel:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F448448%2F1362309%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA2
56%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240407%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240407T232853Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D19163ad330f8617dd54369dd55fff08d0119c0d5accd64956595057d3d2a521431eb44b90b626156580f2153c7434776d0b40af659249d3674e19c89ed93099472d26a09dd7099ceddb3a82f94a03fb59133519966075b45d947be57ab76af97d664d180f363e7585a6501fbbc74a8adb3aa1b99c1fe65e279b7cd1aa0b075f88edf154cf7b1b2c9b392f37259a934efb048d3f655cba87806919484f24cd22bf86464fd2891aabc6a7241efa9913c71b4d8962e5ab370710a4e7e6794f8b24234a65c98a831689f52831b89898f2ad8a6267f029d896a8b49cc1b8a1f1ff23d13c4a45b38c333072ca64956e74ed0ea836fbbc3ddb307e04b71a905f4e27382'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /val/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack it below KAGGLE_INPUT_PATH/<directory>. A failing source is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split only once so that a ':' inside the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header may be absent (e.g. chunked responses); then no bar can be scaled.
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            chunk = fileres.read(CHUNK_SIZE)
            while len(chunk) > 0:
                dl += len(chunk)
                tfile.write(chunk)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                chunk = fileres.read(CHUNK_SIZE)
            # Make sure all buffered bytes are on disk before the archive is read back,
            # in particular for the tar branch which reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name: binding the handle to `tarfile` would replace
                # the imported module for the following loop iterations.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Signed URLs carry an expiry, so an HTTP error most likely means an expired link.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Importing Libraries


In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR, CyclicLR
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight


from glob import glob
from skimage.io import imread
from os import listdir

import time
import copy
from tqdm import tqdm_notebook as tqdm

In [None]:
# Switches for the expensive stages of the notebook.
# run_training guards the pretrained-weight loading, the training run and the prediction run.
run_training = False
# retrain is not referenced by any cell visible in this notebook.
retrain = False
# find_learning_rate selects between running the learning-rate search and reading its saved CSV.
find_learning_rate = False

In [None]:
# Root folder of the extracted image dataset; each entry is later used as a patient id.
base_path = "../input/breast-histopathology-images/IDC_regular_ps50_idx5/"
folder = listdir(base_path)
# Displayed as the cell output: the number of entries found under base_path.
len(folder)

279

In [None]:
# Count the files stored in the "0" and "1" class folders of every patient.
total_images = 0
for patient_id in folder:
    patient_path = base_path + patient_id
    for label in (0, 1):
        total_images += len(listdir(patient_path + "/" + str(label) + "/"))

In [None]:
# Display the number of image files counted over all patients and both class folders.
total_images

277524

In [None]:
# Build one row per image patch: the patient it belongs to, the file path and the
# class label taken from the name of the class folder ("0" or "1").
# The frame has to be populated here: every later cell (dtype casts, patient split,
# datasets) reads real values from it, and an all-NaN frame cannot be cast to int.
patch_records = []
for patient_id in folder:
    for label in (0, 1):
        class_path = base_path + patient_id + "/" + str(label) + "/"
        for image_name in listdir(class_path):
            patch_records.append({"patient_id": patient_id,
                                  "path": class_path + image_name,
                                  "target": label})

data = pd.DataFrame(patch_records, columns=["patient_id", "path", "target"])
data.head()

Unnamed: 0,patient_id,path,target
0,10295,../input/breast-histopathology-images/IDC_regu...,0
1,10295,../input/breast-histopathology-images/IDC_regu...,0
2,10295,../input/breast-histopathology-images/IDC_regu...,0
3,10295,../input/breast-histopathology-images/IDC_regu...,0
4,10295,../input/breast-histopathology-images/IDC_regu...,0


In [None]:
# Store the label as a plain integer. The builtin `int` is what the removed NumPy
# alias `np.int` used to point to, so the resulting dtype is unchanged.
data["target"] = data["target"].astype(int)

In [None]:
# Draw 50 distinct row labels from the positive patches and 50 from the negative ones.
positive_rows = data.loc[data.target == 1].index.to_numpy()
negative_rows = data.loc[data.target == 0].index.to_numpy()
pos_selection = np.random.choice(positive_rows, size=50, replace=False)
neg_selection = np.random.choice(negative_rows, size=50, replace=False)

Some of the code below is adapted from a Kaggle notebook:
Fink, L. (2020). Breast Cancer [Kaggle notebook]. Kaggle.
https://www.kaggle.com/code/allunia/breast-cancer

In [None]:
def extract_coords(df):
    """Add integer ``x`` and ``y`` columns parsed from the file names in ``df.path``.

    The path is split from the right on "_" into five parts; the third and fourth
    parts carry the coordinates as "x<number>" and "y<number>".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``path``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with the columns ``x`` and ``y``.
    """
    parts = df.path.str.rsplit("_", n=4, expand=True)
    for axis_name, part_column in (("x", 2), ("y", 3)):
        # Strip the axis letter (either case) and convert with the builtin int; the
        # former `np.int` alias no longer exists in current NumPy releases.
        numbers = parts[part_column].str.replace(axis_name, "", case=False).astype(int)
        # Assign by position so the result does not depend on index alignment.
        df[axis_name] = numbers.values
    return df

def get_cancer_dataframe(patient_id, cancer_id):
    """List the patches of one patient and one class with their coordinates.

    Parameters
    ----------
    patient_id : str
        Name of the patient folder below ``base_path``.
    cancer_id : str
        Name of the class folder ("0" or "1"); also stored as the integer target.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``x``, ``y`` (integers parsed from the
        file name), ``target`` and ``path``. Empty, with the same columns, when the
        folder contains no files.
    """
    path = base_path + patient_id + "/" + cancer_id
    files = listdir(path)
    if not files:
        # Splitting an empty name list yields no columns to select from.
        return pd.DataFrame(columns=["x", "y", "target", "path"])

    # File names are split from the right: the third and fourth parts are the
    # coordinates written as "x<number>" and "y<number>".
    name_parts = pd.Series(files).str.rsplit("_", n=4, expand=True)
    dataframe = pd.DataFrame({
        "x": name_parts[2].str.replace("x", "", case=False).astype(int),
        "y": name_parts[3].str.replace("y", "", case=False).astype(int),
    })
    # Builtin int replaces the removed `np.int` alias.
    dataframe["target"] = int(cancer_id)
    dataframe["path"] = [path + "/" + name for name in files]
    return dataframe

def get_patient_dataframe(patient_id):
    """Return the patches of both classes of one patient in a single frame.

    The rows of class "0" come first, followed by the rows of class "1"; each part
    keeps its own row labels, exactly as the former ``DataFrame.append`` call did.
    """
    df_0 = get_cancer_dataframe(patient_id, "0")
    df_1 = get_cancer_dataframe(patient_id, "1")
    # DataFrame.append no longer exists in pandas 2.x; pd.concat is the replacement.
    return pd.concat([df_0, df_1])

In [None]:
# Build the coordinate table for the patient of the first row in `data` and preview it.
example = get_patient_dataframe(data.patient_id.values[0])
example.head()

Unnamed: 0,x,y,target,path
0,1351,1101,0,../input/breast-histopathology-images/IDC_regu...
1,1501,501,0,../input/breast-histopathology-images/IDC_regu...
2,1501,1101,0,../input/breast-histopathology-images/IDC_regu...
3,451,901,0,../input/breast-histopathology-images/IDC_regu...
4,801,451,0,../input/breast-histopathology-images/IDC_regu...


In [None]:
# Mini-batch size used by the DataLoaders and by the prediction bookkeeping.
BATCH_SIZE = 32
# Number of output classes of the classifier heads.
NUM_CLASSES = 2

# Prefix of the weight file written after training; a device suffix is appended later.
OUTPUT_PATH = ""
# Folder from which saved weights and the learning-rate search CSV are read.
MODEL_PATH = "../input/breastcancermodel/"
# Folder from which saved loss curves and prediction CSVs are read.
LOSSES_PATH = "../input/breastcancermodel/"

In [None]:
# Seed the global PyTorch and NumPy random generators for reproducibility.
# NOTE(review): the np.random.choice sampling in an earlier cell runs before this seed is set.
torch.manual_seed(0)
np.random.seed(0)

In [None]:
# Store the labels as strings ("0" / "1"). The column is replaced as a whole instead
# of being written through .loc, which would try to put strings into the existing
# integer column. Builtin `str` replaces the removed `np.str` alias.
data["target"] = data["target"].astype(str)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 277524 entries, 0 to 277523
Data columns (total 3 columns):
patient_id    277524 non-null object
path          277524 non-null object
target        277524 non-null object
dtypes: object(3)
memory usage: 18.5+ MB


In [None]:
# The split is made on patient ids, not on individual patches, so all patches of a
# patient end up in the same subset.
patients = data.patient_id.unique()

# 30% of the patients are held out ...
train_ids, sub_test_ids = train_test_split(patients,
                                           test_size=0.3,
                                           random_state=0)
# ... and that hold-out part is halved into test and validation patients.
test_ids, val_ids = train_test_split(sub_test_ids, test_size=0.5, random_state=0)

So far we cannot stratify on the targets because we are splitting on patient ids. To include target information in the split, we would need to create a per-patient feature that lets us balance the subsets.

In [None]:
# Share of patients (in percent) in the train, validation and test subsets.
n_patients = patients.shape[0]
print(*(len(ids) / n_patients * 100 for ids in (train_ids, val_ids, test_ids)))

69.89247311827957 15.053763440860216 15.053763440860216


In [None]:
# Number of patients in the train, validation and test subsets.
print(*map(len, (train_ids, val_ids, test_ids)))

195 42 42


In [None]:
# Select the rows of each patient subset as an independent copy and attach the
# x / y coordinates parsed from the file names.
train_df, test_df, val_df = (
    extract_coords(data.loc[data.patient_id.isin(subset_ids), :].copy())
    for subset_ids in (train_ids, test_ids, val_ids)
)

Some of the code below is adapted from a Kaggle notebook:
Fink, L. (2020). Breast Cancer [Kaggle notebook]. Kaggle.
https://www.kaggle.com/code/allunia/breast-cancer

In [None]:
def my_transform(key="train", plot=False):
    """Return the torchvision transform pipeline registered under ``key``.

    ``"train"`` resizes to 50x50 and applies random horizontal and vertical flips;
    ``"val"`` only resizes. Unless ``plot`` is set, both pipelines additionally
    convert to a tensor and normalise with the fixed channel means and deviations,
    so ``plot=True`` yields images that can still be displayed.
    """
    pipelines = {
        'train': [transforms.Resize((50,50)),
                  transforms.RandomHorizontalFlip(),
                  transforms.RandomVerticalFlip()],
        'val': [transforms.Resize((50,50))],
    }
    if plot==False:
        for steps in pipelines.values():
            steps.extend([
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

    composed = {name: transforms.Compose(steps) for name, steps in pipelines.items()}
    return composed[key]

In [None]:
class BreastCancerDataset(Dataset):
    """Dataset that serves one image patch per row of a patch table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``patient_id``, ``x``, ``y`` and ``path``; a
        ``target`` column is optional.
    transform : callable, optional
        Applied to the RGB image before it is returned.
    """

    def __init__(self, df, transform=None):
        self.states = df
        self.transform=transform

    def __len__(self):
        return len(self.states)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict.

        Keys: ``image`` (transformed if a transform was given), ``label`` (int, or
        ``None`` when the table has no ``target`` column), ``patient_id``, ``x``, ``y``.
        """
        patient_id = self.states.patient_id.values[idx]
        x_coord = self.states.x.values[idx]
        y_coord = self.states.y.values[idx]
        image_path = self.states.path.values[idx]
        image = Image.open(image_path)
        image = image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        if "target" in self.states.columns.values:
            # Builtin int replaces the removed `np.int` alias; the targets may be
            # stored as the strings "0" / "1".
            target = int(self.states.target.values[idx])
        else:
            target = None

        return {"image": image,
                "label": target,
                "patient_id": patient_id,
                "x": x_coord,
                "y": y_coord}

In [None]:
# The training set uses the "train" pipeline (with random flips); validation and test
# both use the "val" pipeline.
train_dataset = BreastCancerDataset(train_df, transform=my_transform(key="train"))
val_dataset = BreastCancerDataset(val_df, transform=my_transform(key="val"))
test_dataset = BreastCancerDataset(test_df, transform=my_transform(key="val"))

In [None]:
# Look-up tables keyed by phase name: the dataset objects and their sample counts.
image_datasets = dict(train=train_dataset, val=val_dataset, test=test_dataset)
dataset_sizes = {phase: len(dataset) for phase, dataset in image_datasets.items()}

## Creating pytorch dataloaders <a class="anchor" id="dataloaders"></a>

Because the gradients for each learning step are computed over a batch, we benefit from reshuffling the training data after every epoch. This way each batch is composed differently and the model does not learn from a fixed sequence of images. For training and validation we drop the last batch, which usually contains fewer images than the batch size.

In [None]:
# Only the training loader shuffles its samples.
# The training and validation loaders discard a final incomplete batch (drop_last=True);
# the test loader keeps it.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

In [None]:
# Phase name -> DataLoader; train_loop and evaluate_model look their loader up here.
dataloaders = {"train": train_dataloader, "val": val_dataloader, "test": test_dataloader}

In [None]:
# Number of batches per phase.
print(len(dataloaders["train"]), len(dataloaders["val"]), len(dataloaders["test"]))

6127 1262 1284


## Defining the model structure <a class="anchor" id="model_structure"></a>

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

This model is adapted from a Kaggle notebook:
Fink, L. (2020). Breast Cancer [Kaggle notebook]. Kaggle.
https://www.kaggle.com/code/allunia/breast-cancer

In [None]:
# Backbone: a ResNet-18 created without downloading weights.
model = torchvision.models.resnet18(pretrained=False)
# The weight file from the attached dataset is loaded only when training is enabled.
# NOTE(review): run_training is False in the flags cell and is only set to True in the
# training cell further down, so a top-to-bottom run skips this load — confirm intent.
if run_training:
    model.load_state_dict(torch.load("../input/pretrained-pytorch-models/resnet18-5c106cde.pth"))
# Input width of the backbone's original classification layer.
num_features = model.fc.in_features
print(num_features)

# Replace the classification layer by a three-layer MLP head with NUM_CLASSES outputs.
model.fc = nn.Sequential(
    nn.Linear(num_features, 512),
    nn.ReLU(),
    nn.BatchNorm1d(512),
    nn.Dropout(0.5),

    nn.Linear(512, 256),
    nn.ReLU(),
    nn.BatchNorm1d(256),
    nn.Dropout(0.5),

    nn.Linear(256, NUM_CLASSES))

def init_weights(m):
    """Xavier-initialise the weight of an ``nn.Linear`` module and set its bias to 0.01.

    Modules of any other type are left untouched.
    """
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)

# apply() calls init_weights on every sub-module of the model.
model.apply(init_weights)
model = model.to(device)

512


In [None]:
# NOTE(review): repeats the NUM_CLASSES value already set in the constants cell above.
NUM_CLASSES=2

This model was built by our group.

In [None]:
# Second variant: the same ResNet-18 backbone with a narrower head and different
# dropout rates.
model2 = torchvision.models.resnet18(pretrained=False)
if run_training:
    model2.load_state_dict(torch.load("../input/pretrained-pytorch-models/resnet18-5c106cde.pth"))
num_features = model2.fc.in_features
print(num_features)

model2.fc = nn.Sequential(
    # Take the input width from the backbone instead of hard-coding 512.
    nn.Linear(num_features, 240),
    nn.ReLU(),
    nn.BatchNorm1d(240),
    nn.Dropout(0.6),

    nn.Linear(240, 128),
    nn.ReLU(),
    nn.BatchNorm1d(128),
    nn.Dropout(0.4),

    # The output width must equal NUM_CLASSES: the weighted cross-entropy criterion
    # carries one weight per class and rejects a head with three outputs.
    nn.Linear(128, NUM_CLASSES))

# init_weights is the helper defined together with the first model; it is reused
# here instead of being defined a second time.
model2.apply(init_weights)
model2 = model2.to(device)

This model was also built by our group.

In [None]:

# Third variant: backbone whose classification layer is reduced to a 240-wide
# projection, followed by a separate two-layer classification head.
model3_backbone = torchvision.models.resnet18(pretrained=False)
if run_training:
    model3_backbone.load_state_dict(torch.load("../input/pretrained-pytorch-models/resnet18-5c106cde.pth"))
num_features = model3_backbone.fc.in_features
print(num_features)

# Modify Fully Connected Layer
# No pooling layer here: the backbone already pools and flattens before `fc`, so the
# tensor is 2-D (batch, features) and a 2-D adaptive pooling layer cannot process it.
model3_backbone.fc = nn.Sequential(
    nn.Linear(num_features, 240),
    nn.ReLU(),
    nn.BatchNorm1d(240),
    nn.Dropout(0.5),
)

# Update num_features for the next layer
num_features = 240

# Classification head. It is kept as its own module and only chained below, so its
# parameters are registered exactly once.
model3_head = nn.Sequential(
    nn.Linear(num_features, 128),
    nn.ReLU(),
    nn.BatchNorm1d(128),
    nn.Dropout(0.5),

    nn.Linear(128, NUM_CLASSES)
)

# Initialize Weights (init_weights is the helper defined together with the first model)
model3_backbone.fc.apply(init_weights)
model3_head.apply(init_weights)

# Combine the two parts: backbone features -> classification head
model3 = nn.Sequential(
    model3_backbone,
    model3_head
)

model3 = model3.to(device)

512


In [None]:
# Balanced class weights for the loss. The class list is sorted by numeric label so
# that weights[i] always belongs to class index i; the order of first appearance in
# the training frame (what .unique() returns) is not guaranteed to match.
observed_labels = train_df.target.unique()
class_labels = np.array(sorted(observed_labels, key=int), dtype=observed_labels.dtype)
weights = compute_class_weight(y=train_df.target.values, class_weight="balanced", classes=class_labels)
# float32 tensor created directly on the active device.
class_weights = torch.tensor(weights, dtype=torch.float32, device=device)
print(class_weights)

tensor([0.6967, 1.7710])


In [None]:
# Display the distinct target labels present in the training frame.
train_df.target.unique()

array(['0', '1'], dtype=object)

You can see that class 1 (positive cancer) has a higher weight.

In [None]:
# Cross-entropy loss that weights each class by the balanced class weights computed above.
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [None]:
def f1_score(preds, targets):
    """Compute the F1 score of binary predictions given as 0/1 tensors.

    ``preds`` and ``targets`` must have the same shape. A small epsilon in every
    denominator keeps the result finite (and equal to 0) when there are no positives.
    Returns a 0-dim float32 tensor.
    """
    epsilon = 1e-7

    true_pos = (preds * targets).sum().to(torch.float32)
    false_pos = ((1 - targets) * preds).sum().to(torch.float32)
    false_neg = (targets * (1 - preds)).sum().to(torch.float32)

    precision = true_pos / (true_pos + false_pos + epsilon)
    recall = true_pos / (true_pos + false_neg + epsilon)

    return 2 * precision * recall / (precision + recall + epsilon)

This code block is adapted from a Kaggle notebook, with some hyperparameters modified:

Fink, L. (2020). Breast Cancer [Kaggle notebook]. Kaggle.
https://www.kaggle.com/code/allunia/breast-cancer

In [None]:
def train_loop(model, criterion, optimizer, lr_find=False, scheduler=None, num_epochs = 3, lam=0.0):
    """Train ``model`` and track loss and accuracy for every phase.

    Each epoch runs the phases "train", "val" and "test" (only "train" when
    ``lr_find`` is set), reading the batches from the module-level ``dataloaders``
    dict and moving them to the module-level ``device``. The weights with the best
    validation accuracy are restored before returning.

    Parameters
    ----------
    model : torch.nn.Module
    criterion : callable
        Loss taking ``(outputs, labels)``.
    optimizer : torch.optim.Optimizer
    lr_find : bool
        Learning-rate search mode: train only and record the learning rate and an
        exponentially smoothed loss after every batch (requires ``scheduler``).
    scheduler : optional
        Stepped after every training batch (cyclical schedule).
    num_epochs : int
    lam : float
        Kept for backward compatibility; no regularisation term is applied here.

    Returns
    -------
    dict
        ``model``, ``loss_dict`` (epoch losses per phase), ``running_loss_dict``
        (running mean loss after each batch, per phase) and ``lr_find``
        (``{"lr": [...], "loss": [...]}``).
    """
    since = time.time()
    phases = ["train"] if lr_find else ["train", "val", "test"]

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    loss_dict = {"train": [], "val": [], "test": []}
    running_loss_dict = {"train": [], "val": [], "test": []}

    lr_find_loss = []
    lr_find_lr = []
    smoothing = 0.2

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in phases:
            is_train = phase == "train"
            # train(False) is the same as eval().
            model.train(is_train)

            loader = dataloaders[phase]
            running_loss = 0.0
            running_corrects = 0
            # Samples actually processed. Loaders built with drop_last=True see fewer
            # samples than the dataset holds and the last batch may be incomplete,
            # so the averages are taken over this count.
            samples_seen = 0

            tk0 = tqdm(loader, total=len(loader))

            for batch_idx, d in enumerate(tk0):
                inputs = d["image"].to(device, dtype=torch.float)
                labels = d["label"].to(device, dtype=torch.long)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; gradient history is only tracked in the training phase
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()
                        # cyclical lr schedule is invoked after each batch
                        if scheduler is not None:
                            scheduler.step()
                            if lr_find:
                                lr_find_lr.append(optimizer.param_groups[0]["lr"])
                                if batch_idx == 0:
                                    lr_find_loss.append(loss.item())
                                else:
                                    smoothed_loss = smoothing * loss.item() + (1 - smoothing) * lr_find_loss[-1]
                                    lr_find_loss.append(smoothed_loss)

                # statistics
                n_batch = inputs.size(0)
                running_loss += loss.item() * n_batch
                running_corrects += torch.sum(preds == labels).item()
                samples_seen += n_batch

                mean_loss = running_loss / samples_seen
                tk0.set_postfix({'loss': mean_loss,
                                 'accuracy': running_corrects / samples_seen})
                running_loss_dict[phase].append(mean_loss)

            # Guard against an empty loader.
            denominator = max(samples_seen, 1)
            epoch_loss = running_loss / denominator
            loss_dict[phase].append(epoch_loss)
            epoch_acc = running_corrects / denominator
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    results = {"model": model,
               "loss_dict": loss_dict,
               "running_loss_dict": running_loss_dict,
               "lr_find": {"lr": lr_find_lr, "loss": lr_find_loss}}
    return results

## Searching for an optimal cyclical learning rate <a class="anchor" id="lr_cycle_optima"></a>

The learning rate is one of the most important hyperparameters for tuning neural networks. A rate that is too high will lead to jumps to higher values in the training loss during optimization. If it's too small the learning process is too slow and will probably stop too early in the case we have defined a minimum required loss change. Take a look at the paper [Cyclical Learning Rates for Training Neural Networks](https://arxiv.org/abs/1506.01186).

This code block is adapted from a Kaggle notebook:

Fink, L. (2020). Breast Cancer [Kaggle notebook]. Kaggle.
https://www.kaggle.com/code/allunia/breast-cancer

In [None]:
# Lower and upper bound of the learning rate, used by the learning-rate search and by the
# cyclical schedule of the training run.
start_lr = 1e-6
end_lr = 0.1

In [None]:
def get_lr_search_scheduler(optimizer, min_lr, max_lr, max_iterations):
    """Build the triangular CyclicLR schedule used for the learning-rate search.

    ``max_iterations`` is the length of the rising and of the falling half of the
    cycle. Passing ``num_epochs * iterations_per_epoch`` makes the rate climb
    linearly from ``min_lr`` to ``max_lr`` over the whole search.
    """
    cycle_settings = dict(base_lr=min_lr,
                          max_lr=max_lr,
                          step_size_up=max_iterations,
                          step_size_down=max_iterations,
                          mode="triangular")
    return torch.optim.lr_scheduler.CyclicLR(optimizer=optimizer, **cycle_settings)

def get_scheduler(optimiser, min_lr, max_lr, stepsize):
    """Build a triangular CyclicLR schedule whose full cycle spans ``stepsize`` steps.

    The rate rises from ``min_lr`` to ``max_lr`` during the first half of the cycle
    and falls back during the second half. A suggested ``stepsize`` is
    ``2 * num_iterations_within_epoch``.
    """
    # Builtin int replaces the removed `np.int` alias. At least one step per half
    # cycle is required, otherwise the cycle length is zero and CyclicLR divides by it.
    stepsize_up = max(1, int(stepsize / 2))
    scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer=optimiser,
                                               base_lr=min_lr,
                                               max_lr=max_lr,
                                               step_size_up=stepsize_up,
                                               step_size_down=stepsize_up,
                                               mode="triangular")
    return scheduler

In [None]:
import math

if find_learning_rate:
    lr_find_epochs=1
    # The optimiser has to hold the parameters of the model that is trained. It used to
    # be built from `model` while `model2` was passed to train_loop, so no weight was
    # ever updated during the search. `model` is also the network trained further below.
    optimizer = optim.SGD(model.fc.parameters(), start_lr)
    scheduler = get_lr_search_scheduler(optimizer, start_lr, end_lr, lr_find_epochs*len(train_dataloader))
    results = train_loop(model, criterion, optimizer, lr_find=True, scheduler=scheduler, num_epochs=lr_find_epochs)
    lr_find_lr, lr_find_loss = results["lr_find"]["lr"], results["lr_find"]["loss"]

    # One row per training batch: smoothed loss and the learning rate in effect.
    find_lr_df = pd.DataFrame({"smoothed loss": lr_find_loss, "lr": lr_find_lr})
    find_lr_df.to_csv("learning_rate_search.csv", index=False)
else:
    # Reuse the result of an earlier search.
    find_lr_df = pd.read_csv(MODEL_PATH + "learning_rate_search.csv")

## Performing the training or loading results <a class="anchor" id="run"></a>

In [None]:
# torch.device() cannot be called without arguments; select the device the same way
# as in the model-structure section.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

This code block is adapted from a Kaggle notebook, with some hyperparameters modified:

Fink, L. (2020). Breast Cancer [Kaggle notebook]. Kaggle.
https://www.kaggle.com/code/allunia/breast-cancer

In [None]:
print(run_training)
print(device)
# NOTE(review): this overrides the flag from the settings cell. The model cells above
# were built while it was still False, so they skipped loading the pretrained weights.
run_training=True

# Device-specific suffix of the weight file. `device` is a torch.device, so compare its
# .type; comparing the object itself with the string "cpu" never selected the CPU name.
weights_suffix = ".pth" if device.type == "cpu" else "_cuda.pth"

if run_training:
    NUM_EPOCHS = 2
    optimizer = optim.SGD(model.fc.parameters(), lr=0.01)
    # NOTE(review): get_scheduler suggests a stepsize of 2 * iterations per epoch, but
    # 2 * NUM_EPOCHS is passed here, i.e. a cycle of only 4 batches — confirm.
    scheduler = get_scheduler(optimizer, start_lr, end_lr, 2*NUM_EPOCHS)
    results = train_loop(model, criterion, optimizer, scheduler=scheduler, num_epochs = NUM_EPOCHS)
    model, loss_dict, running_loss_dict = results["model"], results["loss_dict"], results["running_loss_dict"]

    # Build the file name locally: appending to OUTPUT_PATH in place made the name grow
    # every time this cell was re-run.
    save_path = OUTPUT_PATH + weights_suffix
    torch.save(model.state_dict(), save_path)

    # Epoch-level losses, one column per phase.
    losses_df = pd.DataFrame({phase: loss_dict[phase] for phase in ("train", "val", "test")})
    losses_df.to_csv("losses_breastcancer.csv", index=False)

    # Batch-level running losses; the phases differ in length, shorter columns are
    # padded with NaN.
    running_losses_df = pd.DataFrame(
        {phase: pd.Series(running_loss_dict[phase], dtype=float) for phase in ("train", "val", "test")})
    running_losses_df.to_csv("running_losses_breastcancer.csv", index=False)
else:
    load_path = MODEL_PATH + weights_suffix
    model.load_state_dict(torch.load(load_path, map_location='cpu'))
    model.eval()

    losses_df = pd.read_csv(LOSSES_PATH + "losses_breastcancer.csv")
    running_losses_df = pd.read_csv(LOSSES_PATH + "running_losses_breastcancer.csv")

True
cpu
Epoch 0/1
----------


HBox(children=(IntProgress(value=0, max=6127), HTML(value='')))

This code block is adapted from a Kaggle notebook, with some hyperparameters modified:

Fink, L. (2020). Breast Cancer [Kaggle notebook]. Kaggle.
https://www.kaggle.com/code/allunia/breast-cancer

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), evaluated element-wise for array input."""
    denominator = 1 + np.exp(-x)
    return 1. / denominator

def evaluate_model(model, predictions_df, key):
    """Run ``model`` over ``dataloaders[key]`` and collect per-patch predictions.

    Parameters
    ----------
    model : torch.nn.Module
    predictions_df : pandas.DataFrame
        Frame indexed 0..n-1 that is filled batch by batch with the columns
        ``proba``, ``true``, ``predicted``, ``x``, ``y`` and ``patient_id``.
    key : str
        Phase name used to look up the loader ("train", "val" or "test").

    Returns
    -------
    pandas.DataFrame
        The filled rows; rows that were never written (for example those of a
        dropped last batch) are removed.

    Notes
    -----
    ``proba`` is the sigmoid of the class-1 logit alone, whereas ``predicted`` is
    the arg-max over both logits.
    NOTE(review): the two can disagree around 0.5; a softmax over the logits would
    make them consistent — confirm before changing stored results.
    """
    was_training = model.training
    model.eval()

    loader = dataloaders[key]
    batch_size = loader.batch_size

    try:
        with torch.no_grad():
            for i, batch in enumerate(loader):
                inputs = batch["image"].to(device)

                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)

                # .loc slices include the end label, hence the "- 1".
                rows = slice(i * batch_size, (i + 1) * batch_size - 1)
                # Builtin float / int replace the removed np.float / np.int aliases.
                logits = outputs.cpu().numpy().astype(float)
                predictions_df.loc[rows, "proba"] = sigmoid(logits[:, 1])
                predictions_df.loc[rows, "true"] = batch["label"].numpy().astype(int)
                predictions_df.loc[rows, "predicted"] = preds.cpu().numpy().astype(int)
                predictions_df.loc[rows, "x"] = batch["x"].numpy()
                predictions_df.loc[rows, "y"] = batch["y"].numpy()
                predictions_df.loc[rows, "patient_id"] = batch["patient_id"]
    finally:
        # Hand the model back in the mode it was received in.
        model.train(was_training)

    predictions_df = predictions_df.dropna()
    return predictions_df

In [None]:
if run_training:
    # Empty result tables, one row per sample; evaluate_model fills them and adds the
    # coordinate and patient columns.
    prediction_columns = ["true", "predicted", "proba"]
    val_predictions = pd.DataFrame(index = np.arange(0, dataset_sizes["val"]), columns = prediction_columns)
    test_predictions = pd.DataFrame(index = np.arange(0, dataset_sizes["test"]), columns = prediction_columns)

    val_predictions = evaluate_model(model, val_predictions, "val")
    test_predictions = evaluate_model(model, test_predictions, "test")

    val_predictions.to_csv("val_predictions.csv", index=False)
    test_predictions.to_csv("test_predictions.csv", index=False)

else:
    # Reuse the predictions stored by an earlier run.
    val_predictions = pd.read_csv(LOSSES_PATH + "val_predictions.csv")
    test_predictions = pd.read_csv(LOSSES_PATH + "test_predictions.csv")

    # Patient ids are strings elsewhere in the notebook, but read_csv parses them as
    # numbers. Builtin str replaces the removed `np.str` alias.
    val_predictions["patient_id"] = val_predictions["patient_id"].astype(str)

In [None]:
from sklearn.metrics import confusion_matrix

def get_confusion_matrix(y_true, y_pred):
    """Return the row-normalised 2x2 confusion matrix as a labelled DataFrame.

    Parameters
    ----------
    y_true, y_pred : iterable of {0, 1}
        True and predicted labels (1 = cancer, 0 = no cancer).

    Returns
    -------
    pandas.DataFrame
        Rows are the actual classes, columns the predicted classes; every row is
        divided by its total, so it holds the fractions of that actual class. A
        class without any sample yields a row of NaN.
    """
    transdict = {1: "cancer", 0: "no cancer"}
    y_t = np.array([transdict[x] for x in y_true])
    y_p = np.array([transdict[x] for x in y_pred])

    labels = ["no cancer", "cancer"]
    index_labels = ["actual no cancer", "actual cancer"]
    col_labels = ["predicted no cancer", "predicted cancer"]
    confusion = confusion_matrix(y_t, y_p, labels=labels)
    # Build the frame as float from the start: writing fractions row by row into an
    # integer frame relies on a silent dtype upcast that pandas has deprecated.
    confusion_df = pd.DataFrame(confusion, index=index_labels, columns=col_labels, dtype=float)
    row_totals = confusion_df.sum(axis=1)
    return confusion_df.div(row_totals, axis=0)

In [None]:
# Row-normalised confusion matrix of the test-set predictions.
get_confusion_matrix(test_predictions["true"],test_predictions["predicted"])

Unnamed: 0,predicted no cancer,predicted cancer
actual no cancer,0.854794,0.145206
actual cancer,0.213635,0.786365


# Conclusion <a class="anchor" id="conclusion"></a>