In [77]:
from res.plot_lib import plot_data, plot_model, set_default
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import numpy as np
import pandas as pd
import helper
import os
from sklearn.metrics import roc_curve, auc

from skimage.transform import rotate
from skimage.util import random_noise
from skimage.filters import gaussian
from skimage.io import imread, imsave
from tqdm import tqdm
from scipy import ndimage
from torchsummary import summary

os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [78]:
### Code adapted from Yann LeCun and Alfredo Canziani 2019 Spring NYU Deep Learning Course
set_default()
def get_n_params(model):
    np=0
    for p in list(model.parameters()):
        np += p.nelement()
    return np
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [79]:
data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
data.set_index("patient_id", inplace=True)
data["diagnosis"].value_counts()
data.head()

Unnamed: 0_level_0,image_name,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
IP_7279968,ISIC_2637011,male,45.0,head/neck,unknown,benign,0
IP_3075186,ISIC_0015719,female,45.0,upper extremity,unknown,benign,0
IP_2842074,ISIC_0052212,female,50.0,lower extremity,nevus,benign,0
IP_6890425,ISIC_0068279,female,45.0,head/neck,unknown,benign,0
IP_8723313,ISIC_0074268,female,55.0,upper extremity,unknown,benign,0


In [114]:
transform = transforms.Compose([transforms.Resize(255),
                                transforms.CenterCrop(224),
                                transforms.ToTensor()])


### Code by Andrew Jong https://gist.github.com/andrewjong/6b02ff237533b3b2c554701fb53d5c4d

class ImageFolderWithPaths(datasets.ImageFolder):
    """Custom dataset that includes image file paths. Extends
    torchvision.datasets.ImageFolder
    """

    # override the __getitem__ method. this is the method that dataloader calls
    def __getitem__(self, index):
        # this is what ImageFolder normally returns 
        original_tuple = super(ImageFolderWithPaths, self).__getitem__(index)
        # the image file path
        path = self.imgs[index][0][-16:-4]
        # make a new tuple that includes original and the path
        tuple_with_path = (original_tuple + (path,))
        return tuple_with_path

dataset = ImageFolderWithPaths("/Users/jinmeng1/Desktop/College/Grad School/First Year Masters/Fall Semester/Intro to Data Science/Final/images/train_folder", transform=transform)

In [122]:
transform = transforms.Compose([transforms.Resize(255),
                                transforms.CenterCrop(224),
                                transforms.ToTensor()])
dataset = datasets.ImageFolder("/Users/jinmeng1/Desktop/College/Grad School/First Year Masters/Fall Semester/Intro to Data Science/Final/images/train_folder", transform=transform)
test_dataset = ImageFolderWithPaths("/Users/jinmeng1/Desktop/College/Grad School/First Year Masters/Fall Semester/Intro to Data Science/Final/images/test_folder", transform=transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=40, shuffle=True)

## Image Augmentation

Rotating melanoma images to number of samples with melanoma

In [115]:
mel = np.array(dataset.targets) == 1

# for i in mel.nonzero()[0]:
#     rotated_images_batch = []
#     for j in range(1, 41): # range depends on batch size
#         rotated_images_batch.append((torch.tensor(rotate(dataloader.dataset.__getitem__(i)[0], angle=13*j, mode= 'wrap')), 1))

rotated_images = []
for i in mel.nonzero()[0]:
    for j in range(1, 26): 
        rotated_images.append((torch.tensor(rotate(dataloader.dataset.__getitem__(i)[0], angle=13*j, mode= 'wrap')), 1))
    
final_images = dataset.__add__(rotated_images)
dataloader = torch.utils.data.DataLoader(final_images, batch_size=40, shuffle=True)

### Fully Connected NN and ConvNet Class

In [117]:
input_size = 3*224*224
output_size = 2

class FC2Layer(nn.Module):
    def __init__(self, input_size, n_hidden, output_size):
        super(FC2Layer, self).__init__()
        self.input_size = input_size
        self.network = nn.Sequential(
            nn.Linear(input_size, n_hidden), 
            nn.ReLU(), 
            nn.Linear(n_hidden, n_hidden), 
            nn.ReLU(), 
            nn.Linear(n_hidden, output_size), 
            nn.LogSoftmax(dim=1)
        )

    def forward(self, x):
        x = x.view(-1, self.input_size)
        return self.network(x)

class CNN(nn.Module):
    def __init__(self, input_size, n_feature, output_size):
        super(CNN, self).__init__()
        self.n_feature = n_feature
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=n_features, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(n_features, n_features, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(self.n_feature*53*53, 50)
        self.fc2 = nn.Linear(50, output_size)
        
    def forward(self, x, verbose=False):
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2)
        x = x.view(-1, self.n_feature*53*53)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.log_softmax(x, dim=1)
        return x
    
class leaky_CNN(nn.Module):
    def __init__(self, input_size, n_feature, output_size):
        super(CNN, self).__init__()
        self.n_feature = n_feature
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=n_features, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(n_features, n_features, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(self.n_feature*53*53, 50)
        self.fc2 = nn.Linear(50, output_size)
        
    def forward(self, x, verbose=False):
        x = self.conv1(x)
        x = F.leaky_relu(x)
        x = F.max_pool2d(x, kernel_size=2)
        x = self.conv2(x)
        x = F.leaky_relu(x)
        x = F.max_pool2d(x, kernel_size=2)
        x = x.view(-1, self.n_feature*53*53)
        x = self.fc1(x)
        x = F.leaky_relu(x)
        x = self.fc2(x)
        x = F.log_softmax(x, dim=1)
        return x

### Training and Testing Functions

In [125]:
accuracy_list = []

def train(epoch, model, perm = torch.arange(0,150528).long()):
    model.train()
    for batch_idx, (data, target, img_id) in enumerate(dataloader):
        # send to device
        data, target = data.to(device), target.to(device)
        
        # permute pixels
        data = data.view(-1, 3*224*224)[:, perm].view(-1,3,224,224)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(dataloader.dataset),
                100. * batch_idx / len(dataloader), loss.item()))
            
def test(model, perm = torch.arange(0,150528).long()):
    model.eval()
    test_loss = 0
    correct = 0
    test_auc_list = []
    for data, target, img_id in test_loader:
        # send to device
        data, target = data.to(device), target.to(device)
        
        # permute pixels
        data = data.view(-1, 3*224*224)[:, perm].view(-1,3,224,224)
        output = model(data)
        test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss                                                               
        
        output_array = np.array([np.max(output.detach().numpy()[i]) for i in range(output.size()[0])])
        fpr, tpr, _ = roc_curve(target.detach().numpy(), output_array)
        test_auc_list.append(auc(fpr, tpr))
        pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability                                                                 
        correct += pred.eq(target.data.view_as(pred)).cpu().sum().item()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    accuracy_list.append(accuracy)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)   AUC: ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        accuracy, np.array(test_auc_list).mean() ))

In [36]:
# Fully connected network

n_hidden = 8 # number of hidden units

model_fnn = FC2Layer(input_size, n_hidden, output_size)
model_fnn.to(device)
optimizer = optim.SGD(model_fnn.parameters(), lr=0.01, momentum=0.5)
print('Number of parameters: {}'.format(get_n_params(model_fnn)))

for epoch in range(0, 3):
    train(epoch, model_fnn)
    test(model_fnn)

Number of parameters: 1204322

Test set: Average loss: 0.0001, Accuracy: 2000/2000 (100%)


Test set: Average loss: 0.0001, Accuracy: 2000/2000 (100%)


Test set: Average loss: 0.0001, Accuracy: 2000/2000 (100%)



In [37]:
# ConvNet
# Training settings 
n_features = 8 # number of feature maps

model_cnn = CNN(input_size, n_features, output_size)
model_cnn.to(device)
optimizer = optim.SGD(model_cnn.parameters(), lr=0.01, momentum=0.5)
print('Number of parameters: {}'.format(get_n_params(model_cnn)))

for epoch in range(0, 1):
    train(epoch, model_cnn)
    test(model_cnn)

Number of parameters: 1125968

Test set: Average loss: 0.3550, Accuracy: 2000/2000 (100%)


Test set: Average loss: 0.2152, Accuracy: 2000/2000 (100%)


Test set: Average loss: 0.1457, Accuracy: 2000/2000 (100%)



#### Leaky CNN (Can do this last / skip if no time)

In [None]:
# ConvNet with Leaky ReLU Activation Function
# Training settings 
n_features = 8 # number of feature maps

model_cnn = leakyCNN(input_size, n_features, output_size)
model_cnn.to(device)
optimizer = optim.SGD(model_cnn.parameters(), lr=0.01, momentum=0.5)
print('Number of parameters: {}'.format(get_n_params(model_cnn)))

for epoch in range(0, 1):
    train(epoch, model_cnn)
    test(model_cnn)

## AutoEncoder
As a preprocessing step, we will try to run the images through an autoencoder to reduce image noise and use the outputs as inputs for the CNN. This will hopefully lead to increased classification performance.

In [126]:
# ### Define AutoEncoder Class
input_size = 3*224*224
output_size = 2
d = 500
n_feature = 8

class Autoencoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, d),
            nn.Tanh(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(d, input_size),
            nn.Tanh(),
        )
        
        self.n_feature = n_feature
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=n_features, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(n_features, n_features, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(self.n_feature*53*53, 50)
        self.fc2 = nn.Linear(50, output_size)

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        x = x.view(-1, 3*224*224)[:, perm].view(-1,3,224,224)
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, kernel_size=2)
        x = x.view(-1, self.n_feature*53*53)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.log_softmax(x, dim=1)
        return x

    
    
class leaky_Autoencoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, d),
            nn.Tanh(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(d, input_size),
            nn.Tanh(),
        )
        
        self.n_feature = n_feature
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=n_features, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(n_features, n_features, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(self.n_feature*53*53, 50)
        self.fc2 = nn.Linear(50, output_size)

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        x = x.view(-1, 3*224*224)[:, perm].view(-1,3,224,224)
        x = self.conv1(x)
        x = F.leaky_relu(x)
        x = F.max_pool2d(x, kernel_size=2)
        x = self.conv2(x)
        x = F.leaky_relu(x)
        x = F.max_pool2d(x, kernel_size=2)
        x = x.view(-1, self.n_feature*53*53)
        x = self.fc1(x)
        x = F.leaky_relu(x)
        x = self.fc2(x)
        x = F.log_softmax(x, dim=1)
        return x

#### Training AE Model

In [153]:
AE_model = Autoencoder().to(device)
criterion = nn.MSELoss()

perm = torch.arange(0,150528).long()

### Configure the optimiser

learning_rate = 1e-3

optimizer = torch.optim.Adam(
    AE_model.parameters(),
    lr=learning_rate,
)

img_ids = []
prob_list = []

AE_model.train()
for batch_idx, (data, target, img_id) in enumerate(dataloader):
    img_ids.append(img_id)
    # send to device
    data, target = data.to(device), target.to(device)

    # permute pixels
    data = data.view(-1, 3*224*224)[:, perm].view(-1,3,224,224)

    optimizer.zero_grad()
    output = AE_model(data.view(-1, 3*224*224))
    loss = F.nll_loss(output, target)
    loss.backward()
    optimizer.step()
    prob_list.append(np.array([i[1]/i.sum() for i in np.abs(output.detach().numpy())]))
    if batch_idx % 10 == 0:
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * len(data), len(dataloader.dataset),
            100. * batch_idx / len(dataloader), loss.item()))



#### Testing AE Model

In [485]:
AE_model.eval()
test_loss = 0
correct = 0
test_auc_list = []

img_ids = []
prob_list = []

for data, target, img_id in test_loader:
    img_ids.append(img_id)
    # send to device
    data, target = data.to(device), target.to(device)

    # permute pixels
    data = data.view(-1, 3*224*224)[:, perm].view(-1,3,224,224)
    output = AE_model(data.view(-1, 3*224*224))
    test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss                                                               

#     output_array = np.array([np.max(output.detach().numpy()[i]) for i in range(output.size()[0])])
    fpr, tpr, _ = roc_curve(target.detach().numpy(), output_array)
    test_auc_list.append(auc(fpr, tpr))
    pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability                                                                 
    correct += pred.eq(target.data.view_as(pred)).cpu().sum().item()
    prob_list.append(np.array([i[1]/i.sum() for i in np.abs(output.detach().numpy())]))

test_loss /= len(test_loader.dataset)
accuracy = 100. * correct / len(test_loader.dataset)
accuracy_list.append(accuracy)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)   AUC: ({:.0f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset),
    accuracy, np.array(test_auc_list).mean() ))




Test set: Average loss: 0.0000, Accuracy: 2000/2000 (100%)   AUC: (nan%)



#### Training Leaky AE

In [167]:
img_ids = []
prob_list = []

AE_model = leaky_Autoencoder().to(device)
criterion = nn.MSELoss()
### Configure the optimiser

learning_rate = 1e-3

optimizer = torch.optim.Adam(
    AE_model.parameters(),
    lr=learning_rate,
)


AE_model.train()
for batch_idx, (data, target, img_id) in enumerate(dataloader):
    img_ids.append(img_id)
    # send to device
    data, target = data.to(device), target.to(device)

    # permute pixels
    data = data.view(-1, 3*224*224)[:, perm].view(-1,3,224,224)

    optimizer.zero_grad()
    output = AE_model(data.view(-1, 3*224*224))
    loss = F.nll_loss(output, target)
    loss.backward()
    optimizer.step()
    prob_list.append(np.array([i[1]/i.sum() for i in np.abs(output.detach().numpy())]))
    if batch_idx % 10 == 0:
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * len(data), len(dataloader.dataset),
            100. * batch_idx / len(dataloader), loss.item()))



#### Testing Leaky AE

#### New Input

In [None]:
transform = transforms.Compose([transforms.Resize(255),
                                transforms.CenterCrop(224),
                                transforms.ToTensor()])


### Code by Andrew Jong https://gist.github.com/andrewjong/6b02ff237533b3b2c554701fb53d5c4d

class ImageFolderWithPaths(datasets.ImageFolder):
    """Custom dataset that includes image file paths. Extends
    torchvision.datasets.ImageFolder
    """

    # override the __getitem__ method. this is the method that dataloader calls
    def __getitem__(self, index):
        # this is what ImageFolder normally returns 
        original_tuple = super(ImageFolderWithPaths, self).__getitem__(index)
        # the image file path
        path = self.imgs[index][0][-16:-4]
        # make a new tuple that includes original and the path
        tuple_with_path = (original_tuple + (path,))
        return tuple_with_path

dataset = ImageFolderWithPaths("/Users/jinmeng1/Desktop/College/Grad School/First Year Masters/Fall Semester/Intro to Data Science/Final/images/train_folder", transform=transform)

transform = transforms.Compose([transforms.Resize(255),
                                transforms.CenterCrop(224),
                                transforms.ToTensor()])
#dataset = datasets.ImageFolder("/Users/jinmeng1/Desktop/College/Grad School/First Year Masters/Fall Semester/Intro to Data Science/Final/images/train_folder", transform=transform)
test_dataset = ImageFolderWithPaths("/Users/jinmeng1/Desktop/College/Grad School/First Year Masters/Fall Semester/Intro to Data Science/Final/images/test_folder", transform=transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=40, shuffle=True)

mel = np.array(dataset.targets) == 1

# for i in mel.nonzero()[0]:
#     rotated_images_batch = []
#     for j in range(1, 41): # range depends on batch size
#         rotated_images_batch.append((torch.tensor(rotate(dataloader.dataset.__getitem__(i)[0], angle=13*j, mode= 'wrap')), 1))

rotated_images = []
for i in mel.nonzero()[0]:
    for j in range(1, 26): 
        rotated_images.append((torch.tensor(rotate(dataloader.dataset.__getitem__(i)[0], angle=13*j, mode= 'wrap')), 1))
    
final_images = dataset.__add__(rotated_images)
dataloader = torch.utils.data.DataLoader(final_images, batch_size=40, shuffle=True)

In [168]:
img_ids = []
prob_list = []

AE_model.eval()
test_loss = 0
correct = 0
test_auc_list = []
for data, target, img_id in test_loader:
    img_ids.append(img_id)
    # send to device
    data, target = data.to(device), target.to(device)

    # permute pixels
    data = data.view(-1, 3*224*224)[:, perm].view(-1,3,224,224)
    output = AE_model(data.view(-1, 3*224*224))
    test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss                                                               

    output_array = np.array([np.max(output.detach().numpy()[i]) for i in range(output.size()[0])])
    fpr, tpr, _ = roc_curve(target.detach().numpy(), output_array)
    test_auc_list.append(auc(fpr, tpr))
    pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability                                                                 
    correct += pred.eq(target.data.view_as(pred)).cpu().sum().item()
    prob_list.append(np.array([i[1]/i.sum() for i in np.abs(output.detach().numpy())]))
    
test_loss /= len(test_loader.dataset)
accuracy = 100. * correct / len(test_loader.dataset)
accuracy_list.append(accuracy)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)   AUC: ({:.0f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset),
    accuracy, np.array(test_auc_list).mean() ))




Test set: Average loss: 0.0000, Accuracy: 2000/2000 (100%)   AUC: (nan%)



In [169]:
kaggle_submission = pd.DataFrame({"img_id":np.concatenate(img_ids), "mel_prob":np.concatenate(prob_list)})
kaggle_submission.to_csv(file_path)

NameError: name 'file_path' is not defined