In [1]:
from __future__ import division
import argparse
import torch
import torch.nn.functional as F
import torchvision.datasets as dset
import torchvision.transforms as T
import torchvision.models as models
import pandas as pd
import os
import pydicom
import numpy as np
import h5py
import time
from tqdm import tqdm as tqdm
import cv2
import pickle
from shutil import copyfile
from matplotlib import pyplot as plt
import seaborn as sns
import random

In [2]:
# Module-level slot filled by `hook` on every forward pass (batched features).
activation = None


def hook(model, input, output):
    """Forward hook that captures what is fed INTO the hooked module.

    The positional inputs of the hooked module are stored untouched in the
    module-level ``activation`` variable, so callers read the feature tensor
    as ``activation[0]`` after running a forward pass. The module's output is
    ignored.
    """
    global activation
    activation = input

number_of_samples = 100000

# Intensity normalisation applied to every slice: (x - mean) / std.
global_mean = 0.0
global_std = 500.0
transform = T.Normalize(mean=[global_mean], std=[global_std])

# HDF5 file that receives the extracted feature vectors.
features_filename = '/scratch/efficientnet-test-features.hdf5'

device = 'cuda'

from efficientnet_pytorch import EfficientNet

# EfficientNet-B0 with a single-logit head. The model is assembled on the CPU
# and moved to `device` exactly once, after the checkpoint has been loaded.
cnn = EfficientNet.from_pretrained('efficientnet-b0', num_classes=1)

# Swap the stem for a 1-input-channel convolution (the dataset yields
# single-channel images); its weights come from the checkpoint loaded below.
cnn._conv_stem = torch.nn.Conv2d(1, 32, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
# map_location makes the load independent of the device the file was saved from.
cnn.load_state_dict(
    torch.load(
        '/projectnb/ece601/kaggle-pulmonary-embolism/jiamingy/efficientnetb0/model-efficientb0-40.pth',
        map_location=device,
    )
)
model = cnn.to(device)

# Capture the input of the final linear layer (`_fc`) on every forward pass.
model._fc.register_forward_hook(hook)


Loaded pretrained weights for efficientnet-b0


<torch.utils.hooks.RemovableHandle at 0x2b459164f750>

In [3]:
valid_samples = 26540
samples_per_split = 100000
train_samples = 70000

class KagglePEDataset(torch.utils.data.Dataset):
    """Kaggle PE dataset: one split of 2D slices backed by an HDF5 file.

    The rows of the csv are partitioned into consecutive splits of
    ``samples_per_split`` rows. An instance serves the rows of a single split
    and reads pixel data from ``/scratch/npy-<split + 1>.hdf5``.
    """

    def __init__(self, csv_file, stage, transform=None, split=0):
        """
        Args:
            csv_file (string): Path to the csv file.
            stage (string): 'train' enables random crop + random flip; any
                other value uses a deterministic center crop.
            transform (callable, optional): Optional transform to be applied
                on a sample. When None the image tensor is returned as-is.
            split (int): Zero-based index of the split served by this instance.
        """
        self.pedataframe = pd.read_csv(csv_file)
        self.pos_df = self.pedataframe[self.pedataframe.pe_present_on_image == 1]
        self.split_df = self.pedataframe[split*samples_per_split : (split+1)*samples_per_split]
        self.neg_df = self.split_df[self.split_df.pe_present_on_image == 0]
        self.transform = transform
        self.stage = stage

        # data is divided into sets of 100,000 2D slices
        # use set 16, 17, and 18 to test

        self.split = split

        self.split_hdf5_filename = '/scratch/npy-' + str(split + 1) + '.hdf5'

    def _split_rows(self):
        """Return the dataframe rows that belong to this instance's split."""
        start = self.split * samples_per_split
        # iloc slicing truncates at the end of the frame, so the last
        # (shorter) split needs no special casing.
        return self.pedataframe.iloc[start : start + samples_per_split]

    def destroy(self):
        ''' For copy on the fly (when scratch space insufficient): delete this split's HDF5 file.'''
        os.remove(self.split_hdf5_filename)

    def __len__(self):
        """ Return number of 2D images in this split. (Each CT slice is an independent image.)"""
        remaining = len(self.pedataframe) - self.split * samples_per_split
        # Clamp to [0, samples_per_split]: the last split may be shorter, and a
        # split past the end of the csv is empty rather than negative.
        return max(0, min(samples_per_split, remaining))

    def get_class_weights(self):
        """Return inverse class frequencies ``[1/n_neg, 1/n_pos]`` for this split."""
        labels = self._split_rows().pe_present_on_image
        n_pos = int((labels == 1).sum())
        n_neg = int((labels == 0).sum())
        return 1. / torch.tensor([n_neg, n_pos], dtype=torch.float)

    def get_targets(self):
        """Return the labels of this split as a long tensor, in dataset order."""
        labels = self._split_rows().pe_present_on_image
        # to_numpy() drops the pandas index, which does not start at 0 for split > 0.
        return torch.tensor(labels.to_numpy(), dtype=torch.long)

    def center_crop(self, crop_size, img):
        """Return the central ``crop_size`` x ``crop_size`` window over the last two axes."""
        row = (img.shape[-2] - crop_size) // 2
        col = (img.shape[-1] - crop_size) // 2
        return img[..., row : row + crop_size, col : col + crop_size]

    def random_crop(self, crop_size, img):
        """Return a ``crop_size`` x ``crop_size`` window at a uniformly random offset."""
        row = random.randint(0, img.shape[-2] - crop_size)
        col = random.randint(0, img.shape[-1] - crop_size)
        return img[..., row : row + crop_size, col : col + crop_size]

    def random_flip(self, img):
        """Flip the image along its first axis with probability 1/2."""
        if random.randint(0, 1) == 0:
            # copy so the result owns contiguous memory instead of a negative-stride view
            img = np.copy(np.flipud(img))
        return img

    def __getitem__(self, idx):
        ''' idx is the position within this split.

        Returns ``(image, row)`` where ``image`` is a 1x224x224 float tensor
        (after the optional transform) and ``row`` is the index into the full
        dataframe.
        '''
        # translate the within-split position into a row of the full dataframe
        idx = idx + self.split * samples_per_split

        data_identifier = self.pedataframe.StudyInstanceUID[idx] + \
            '/' + self.pedataframe.SOPInstanceUID[idx]

        # Read the slice from this split's HDF5 file; the context manager
        # closes the handle even if the lookup fails.
        with h5py.File(self.split_hdf5_filename, "r") as h5py_file:
            img = h5py_file[data_identifier][:]

        # resize to 256x256 (source slices are presumably 512x512 - confirm)
        img = cv2.resize(img, (256, 256), interpolation=cv2.INTER_AREA)

        # 224x224 crop: random crop + flip augmentation for training,
        # deterministic center crop otherwise
        if self.stage == 'train':
            img = self.random_crop(224, img)
            img = self.random_flip(img)
        else:
            img = self.center_crop(224, img)

        # unsqueeze to add channel dimension
        img = torch.tensor(img, dtype=torch.float).unsqueeze(0)

        if self.transform is not None:
            img = self.transform(img)
        return img, idx

In [4]:
# Root of the RSNA pulmonary-embolism data and the paths derived from it.
data_dir = '/projectnb/ece601/kaggle-pulmonary-embolism/rsna-str-pulmonary-embolism-detection/'
train_csv = os.path.join(data_dir, 'train.csv')
train_dir = os.path.join(data_dir, 'train/')
train_df = pd.read_csv(train_csv)

# Per-image normalisation: (x - global_mean) / global_std on the single channel.
global_mean, global_std = 0.0, 500.0
transform = T.Normalize(mean=[global_mean], std=[global_std])

In [5]:
# Extract the features fed into the model's final linear layer for the three
# held-out splits and store one HDF5 dataset per slice.
model.eval()  # inference mode for the whole extraction
batch_size = 32

for test_set_idx in [17, 16, 15]:
    pe_dataset = KagglePEDataset(csv_file=train_csv, stage='valid', transform=transform, split=test_set_idx)
    loader = torch.utils.data.DataLoader(pe_dataset, batch_size=batch_size, num_workers=1)

    # The context manager flushes and closes the file even if a batch fails.
    # Mode "a" appends: re-running against an existing file makes
    # create_dataset fail on the first name that is already present.
    with h5py.File(features_filename, "a") as h5py_features_file, torch.no_grad():
        for (data, idx) in tqdm(loader):
            data = data.to(device)
            model(data)  # the forward hook refreshes the global `activation`

            # activation[0] is the batch of feature vectors (B x feature_dim).
            # NOTE(review): an earlier comment said Bx2048, which matches the
            # ResNeXt cell; the EfficientNet-B0 width is presumably different - confirm.
            # A separate name keeps the hook's `activation` slot untouched.
            features = activation[0].detach().cpu().numpy()

            for sample in range(features.shape[0]):
                sample_idx = idx[sample] # index into dataframe
                # NOTE(review): sample_idx is a 0-dim tensor, so the dataset
                # name is 'tensor(<row>)', not '<row>'. Kept as-is because
                # readers of this file are not visible here; switch to
                # str(sample_idx.item()) together with them.
                h5py_features_file.create_dataset(str(sample_idx), data=features[sample, :])

100%|██████████| 2832/2832 [02:50<00:00, 16.65it/s]
100%|██████████| 3125/3125 [03:05<00:00, 16.84it/s]
100%|██████████| 3125/3125 [03:06<00:00, 16.77it/s]


In [8]:
# Target device and output file for the ResNeXt feature extraction.
device = 'cuda'
features_filename = '/scratch/resnet-test-features.hdf5'

# Reset the slot that `hook` fills on every forward pass (batched features).
activation = None


def hook(model, input, output):
    """Forward hook that captures what is fed INTO the hooked module.

    The positional inputs of the hooked module are stored untouched in the
    module-level ``activation`` variable, so callers read the feature tensor
    as ``activation[0]`` after running a forward pass. The module's output is
    ignored.
    """
    global activation
    activation = input

# get best model
# Only the architecture is needed here: load_state_dict below is strict, so
# every parameter and buffer is overwritten by the checkpoint. Requesting the
# ImageNet weights would only trigger a download that is thrown away.
model = models.resnext50_32x4d(pretrained=False)
# single-logit head and a 1-input-channel stem, matching the checkpoint
model.fc = torch.nn.Linear(model.fc.in_features, 1)
model.conv1 = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

best_model_path = '/projectnb/ece601/kaggle-pulmonary-embolism/cliao25/EC601-Pulmonary-Embolism/SequenceModeling/exp-4-SGD/model-resnext-50-28.pth'
# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
# map_location makes the load independent of the device the file was saved from.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.to(device)

# Capture the input of the final linear layer (`fc`) on every forward pass.
model.fc.register_forward_hook(hook)


<torch.utils.hooks.RemovableHandle at 0x2b459176ced0>

In [9]:
# Extract the features fed into the model's final linear layer for the three
# held-out splits and store one HDF5 dataset per slice.
model.eval()  # inference mode for the whole extraction
batch_size = 32

for test_set_idx in [17, 16, 15]:
    pe_dataset = KagglePEDataset(csv_file=train_csv, stage='valid', transform=transform, split=test_set_idx)
    loader = torch.utils.data.DataLoader(pe_dataset, batch_size=batch_size, num_workers=1)

    # The context manager flushes and closes the file even if a batch fails.
    # Mode "a" appends: re-running against an existing file makes
    # create_dataset fail on the first name that is already present.
    with h5py.File(features_filename, "a") as h5py_features_file, torch.no_grad():
        for (data, idx) in tqdm(loader):
            data = data.to(device)
            model(data)  # the forward hook refreshes the global `activation`

            # activation[0] is the batch of feature vectors (Bx2048 per the
            # original note). A separate name keeps the hook's `activation`
            # slot untouched.
            features = activation[0].detach().cpu().numpy()

            for sample in range(features.shape[0]):
                sample_idx = idx[sample] # index into dataframe
                # NOTE(review): sample_idx is a 0-dim tensor, so the dataset
                # name is 'tensor(<row>)', not '<row>'. Kept as-is because
                # readers of this file are not visible here; switch to
                # str(sample_idx.item()) together with them.
                h5py_features_file.create_dataset(str(sample_idx), data=features[sample, :])

100%|██████████| 2832/2832 [05:07<00:00,  9.21it/s]
100%|██████████| 3125/3125 [05:37<00:00,  9.25it/s]
100%|██████████| 3125/3125 [05:36<00:00,  9.28it/s]
