In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import matplotlib.pylab as pltlab
import pydicom
from datetime import datetime
from tqdm import tqdm

In [None]:
# Load the training metadata CSV into a DataFrame.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
csv_train_file = pd.read_csv('/home/malmason/datasets/siim-isic-melanoma-classification/train.csv')

In [None]:
# Shuffle all rows and renumber the index from 0.
# No random_state is passed, so the row order differs on every run.
csv_train_file = csv_train_file.sample(frac=1).reset_index(drop=True)

In [None]:
# Folder the training images are read from (file names are '<image_name>.jpg').
img_train_folder = '/home/malmason/datasets/siim-isic-melanoma-classification/ycbcr/norm/train/'

In [None]:
# Series of image names used for looping over the images.
# It is reassigned from the training split in a later cell before images are loaded.
X_train_img = csv_train_file['image_name']

In [None]:
# Peek at the first two rows.
csv_train_file.head(2)

In [None]:
# Row count and number of distinct patient_id values (value_counts ignores NaN by default).
print(f"Train set size: {len(csv_train_file)}, \tUnique patients: {len(csv_train_file['patient_id'].value_counts())}")

In [None]:
# Fill missing values with an explicit "Unknown" category.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on a column selection: that form is chained assignment
# and is not guaranteed to update the parent frame under pandas copy-on-write.
csv_train_file["anatom_site_general_challenge"] = csv_train_file["anatom_site_general_challenge"].fillna("Unknown")
csv_train_file["sex"] = csv_train_file["sex"].fillna("Unknown")

In [None]:
# Per-site subsets of the training metadata.
site_column = 'anatom_site_general_challenge'
head_neck = csv_train_file.loc[csv_train_file[site_column] == 'head/neck']
upper_extremity = csv_train_file.loc[csv_train_file[site_column] == 'upper extremity']
lower_extremity = csv_train_file.loc[csv_train_file[site_column] == 'lower extremity']
oral_genital = csv_train_file.loc[csv_train_file[site_column] == 'oral/genital']
palms_soles = csv_train_file.loc[csv_train_file[site_column] == 'palms/soles']
torso = csv_train_file.loc[csv_train_file[site_column] == 'torso']
none = csv_train_file.loc[csv_train_file[site_column] == 'Unknown']


def _class_counts(frame):
    """Return (normal, melanoma) row counts of `frame` from its 'target' column.

    A class that does not occur is reported as 0, so a subset containing only
    one class no longer breaks tuple unpacking.
    """
    counts = frame['target'].value_counts().reindex([0, 1], fill_value=0)
    return int(counts.loc[0]), int(counts.loc[1])


def _ratio_percent(melanoma, normal):
    """Return melanoma / normal as a percentage rounded to 2 decimals.

    This is the melanoma-to-normal ratio (not the share of the total), as in
    the original report. Returns NaN when there are no normal rows.
    """
    if normal == 0:
        return float('nan')
    return np.round(100 * (melanoma / normal), 2)


zero, one = _class_counts(csv_train_file)
print(f'\nTotal train set: {zero+one}\tNormal: {zero}\tMelanoma: {one}\tPercent: {_ratio_percent(one, zero)}%')

print("\nTrain set breakdown\n-------------------")
# (label, subset) pairs in report order; labels are padded to 18 characters.
site_report = [
    ("Head neck melanoma", head_neck),
    ("lower_extremity", lower_extremity),
    ("oral_genital", oral_genital),
    ("palms_soles", palms_soles),
    ("torso", torso),
    ("upper_extremity", upper_extremity),
    ("Unknown", none),
]
for site_label, site_rows in site_report:
    zero, one = _class_counts(site_rows)
    print(f"{site_label:<18}\tTotal: {zero+one}\tNormal: {zero}\tMelanoma: {one}\tPercent: {_ratio_percent(one, zero)}%")
print(f"\nNull values for location: {csv_train_file['anatom_site_general_challenge'].isnull().sum()}")

# Used to order the anatom_site_general_challenge categories from 0 upwards
# when they are encoded as numbers.

In [None]:
# Melanoma-to-normal ratio for every distinct (non-null) approximate age.
age_range = np.sort(csv_train_file['age_approx'].dropna().unique())
for some_var in age_range:
    age = csv_train_file.loc[csv_train_file['age_approx'] == some_var]
    # Look up each class explicitly instead of unpacking a groupby result
    # inside a bare `except:`, which would also hide unrelated errors.
    age_counts = age['target'].value_counts()
    zero, one = age_counts.get(0, 0), age_counts.get(1, 0)
    if zero and one:
        print(f"Age approx:  {some_var}\tNormal: {zero}\tMelanoma: {one}\tPercent: {np.round(100*(one / zero),2)}%")
    else:
        print(f'{some_var} is missing either 0 or 1')
# Ages classified 0 to 1 in order of age, as older more likely to gain skin cancer

In [None]:
# Melanoma-to-normal ratio per value of the 'sex' column.
male = csv_train_file.loc[csv_train_file['sex'] == 'male']
female = csv_train_file.loc[csv_train_file['sex'] == 'female']
unknown = csv_train_file.loc[csv_train_file['sex'] == 'Unknown']

# Each group is checked for both classes explicitly. The original guarded only
# the 'Unknown' group, and did so with a bare `except:`.
for sex_label, sex_rows in (("male     ", male), ("female   ", female), ("unknown:  ", unknown)):
    sex_counts = sex_rows['target'].value_counts()
    zero, one = sex_counts.get(0, 0), sex_counts.get(1, 0)
    if zero and one:
        print(f"{sex_label}\tTotal: {zero+one}\tNormal: {zero}\tMelanoma: {one}\tPercent: {np.round(100*(one / zero),2)}%")
    else:
        print(f'missing either 0 or 1')

In [None]:
# Greater risk has higher value
# 'malignant' is accepted alongside the original 'melanoma' key.
# NOTE(review): the benign_malignant column is presumed to hold
# 'benign'/'malignant' — confirm against train.csv.
malignent_cat = {'benign':0, 'malignant':1, 'melanoma':1}
# Missing sex values are filled with "Unknown" (capital U) in an earlier cell,
# so that spelling must be a key; the lowercase key is kept for compatibility.
sex_cat = {'female':0, 'male':1, 'Unknown':2, 'unknown':2}
localization_cat = {'palms/soles':0, 'lower extremity':1, 'torso':2, 'Unknown':3, 'upper extremity':4, 'oral/genital':5, 'head/neck':6}

# Any age that is not a key here (including NaN) maps to NaN.
age_cat = {0.0:0, 10.0:1, 40.0:2, 35.0:3, 20:4, 30.0:5, 25.0:6, 45.0:7, 50.0:8, 15.0:9, 55.0:10,
                            60.0:11, 65.0:12, 70.0:13, 85.0:14, 75.0:15, 80.0:16, 90.0:17}

# Replace the raw values with their numeric codes. This is not idempotent:
# re-running the cell maps the codes again and yields NaN.
csv_train_file['benign_malignant'] = csv_train_file['benign_malignant'].map(malignent_cat).astype(float)
csv_train_file['sex'] = csv_train_file['sex'].map(sex_cat).astype(float)
csv_train_file['age_approx'] = csv_train_file['age_approx'].map(age_cat).astype(float)
csv_train_file['anatom_site_general_challenge'] = csv_train_file['anatom_site_general_challenge'].map(localization_cat).astype(float)

In [None]:
# Show the first rows after the categorical columns were replaced by numeric codes.
csv_train_file.head()

In [None]:
# Sort rows by label so all target == 0 rows come before target == 1 rows;
# the split cell that follows slices the frame by position.
csv_train_file = csv_train_file.sort_values('target')

In [None]:
# Stratified 80/20 split: the first 80% of each class goes to train and the
# remainder to validation.
# NOTE(review): rows are split per image, not per patient_id, so images of one
# patient may land in both splits.
TRAIN_FRACTION = .8

# Select each class explicitly rather than relying on the frequency order of
# value_counts() and on the frame having been sorted by target beforehand.
zero_rows = csv_train_file.loc[csv_train_file['target'] == 0]
one_rows = csv_train_file.loc[csv_train_file['target'] == 1]
zer_value, one_value = len(zero_rows), len(one_rows)

zer_value_num = int(zer_value * TRAIN_FRACTION)
one_value_num = int(one_value * TRAIN_FRACTION)
tot_value = len(csv_train_file)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
csv_train_file_train = pd.concat([zero_rows.iloc[:zer_value_num], one_rows.iloc[:one_value_num]])
csv_train_file_val = pd.concat([zero_rows.iloc[zer_value_num:], one_rows.iloc[one_value_num:]])

zer_value_tr = int((csv_train_file_train['target'] == 0).sum())
one_value_tr = int((csv_train_file_train['target'] == 1).sum())
zer_value_va = int((csv_train_file_val['target'] == 0).sum())
one_value_va = int((csv_train_file_val['target'] == 1).sum())
print(f'Zero and one train: {zer_value_tr, one_value_tr}, Zero and one val: {zer_value_va, one_value_va}')

In [None]:
# Oversample the melanoma rows of the training split. Each pass appends a copy
# of every current target == 1 row, so three passes give 8x the original
# number of melanoma rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Re-running this cell oversamples again, because it rebinds the same name.
N_OVERSAMPLE_DOUBLINGS = 3
for _ in range(N_OVERSAMPLE_DOUBLINGS):
    csv_train_file_train = pd.concat(
        [csv_train_file_train, csv_train_file_train.loc[csv_train_file_train['target'] == 1]])

In [None]:
# Re-count the classes after oversampling.
# value_counts() orders by frequency (largest first), so this unpacking assumes
# class 0 is still the majority and that both classes are present.
zer_value_tr, one_value_tr = csv_train_file_train.target.value_counts()
zer_value_va, one_value_va = csv_train_file_val.target.value_counts()
print(f'Zero and one train: {zer_value_tr, one_value_tr}, Zero and one val: {zer_value_va, one_value_va}')

In [None]:
# Shuffle both splits and renumber their indices from 0.
# No random_state is passed, so the order differs on every run.
csv_train_file_train = csv_train_file_train.sample(frac=1).reset_index(drop=True)
csv_train_file_val = csv_train_file_val.sample(frac=1).reset_index(drop=True)

In [None]:
X_train_data = csv_train_file_train.drop(['image_name', 'patient_id', 'diagnosis', 'benign_malignant', 'target'], axis=1)
X_val_data = csv_train_file_val.drop(['image_name', 'patient_id', 'diagnosis', 'benign_malignant', 'target'], axis=1)
Y_train = csv_train_file_train['target']
Y_val = csv_train_file_val['target']

In [None]:
print(X_train_data.shape, Y_train.shape, X_val_data.shape, Y_val.shape)

In [None]:
# Standardise the tabular features to zero mean / unit variance.
# Statistics come from the TRAINING split only and are applied to both splits.
# Scaling the validation split with its own mean/std (as before) puts the two
# splits on different scales and leaks validation statistics.
mean = X_train_data.mean(axis=0)
# A constant column has std 0; divide by 1 instead so it becomes 0 rather than NaN/inf.
std = X_train_data.std(axis=0).replace(0, 1)

X_train_data = (X_train_data - mean) / std
X_val_data = (X_val_data - mean) / std

In [None]:
# Image names of each split, in the same (shuffled) row order as the labels
# taken from the same frames. These drive the image-loading loops below.
X_train_img = csv_train_file_train['image_name']
X_val_img = csv_train_file_val['image_name']

In [None]:
# Flattened feature count of the 3x3-kernel branch of CNN for a 224x224 input.
# An unpadded 3x3 convolution removes 2 pixels per side length; a 2x2 max-pool
# halves it and rounds DOWN, hence floor division rather than "/ 2".
sizing = 224 - 2             # conv1
sizing = (sizing - 2) // 2   # conv2 + pool
sizing = (sizing - 2)        # conv3
sizing = (sizing - 2) // 2   # conv4 + pool
sizing = (sizing - 2)        # conv5
sizing = (sizing - 2) // 2   # conv6 + pool
sizing = (sizing - 2)        # conv7
sizing = (sizing - 2) // 2   # conv8 + pool
sizing = (sizing - 2) // 2   # conv9 + pool
print(512*(int(sizing)*int(sizing)))

In [None]:
# Flattened feature count of the 5x5-kernel branch of CNN for a 224x224 input.
# That branch has five stages (conv11, conv21, conv31, conv41, conv51), each an
# unpadded 5x5 convolution (-4) followed by a 2x2 max-pool (halve, round down).
sizing = 224
for _ in range(5):
    sizing = (sizing - 4) // 2
print(512*(int(sizing)*int(sizing)))

In [None]:
class CNN(nn.Module):
  """Two-branch convolutional network whose flattened features are concatenated.

  Branch A stacks nine unpadded 3x3 convolutions (conv1..conv9) and branch B
  five unpadded 5x5 convolutions (conv11..conv51). Both read the same
  3-channel input. Their flattened outputs are concatenated and passed through
  three fully connected layers. The first fully connected layer expects
  8192 + 4608 features, which is what the two branches produce for a
  224x224 input.

  Args:
    n_output_neurons: width of the final linear layer (raw scores, no
      activation is applied).
  """

  def __init__(self, n_output_neurons):
    super(CNN, self).__init__()

    # Branch A: 3x3 kernels.
    self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1)
    self.conv2 = nn.Conv2d(32, 32, kernel_size=3, stride=1)
    self.bn1   = nn.BatchNorm2d(32)
    self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1)
    self.conv4 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
    self.bn2   = nn.BatchNorm2d(64)
    self.conv5 = nn.Conv2d(64, 128, kernel_size=3, stride=1)
    self.conv6 = nn.Conv2d(128, 128, kernel_size=3, stride=1)
    self.bn3   = nn.BatchNorm2d(128)
    self.conv7 = nn.Conv2d(128, 256, kernel_size=3, stride=1)
    self.conv8 = nn.Conv2d(256, 256, kernel_size=3, stride=1)
    self.bn4   = nn.BatchNorm2d(256)
    self.conv9 = nn.Conv2d(256, 512, kernel_size=3, stride=1)
    self.bn5   = nn.BatchNorm2d(512)

    # Branch B: 5x5 kernels.
    self.conv11 = nn.Conv2d(3, 32, kernel_size=5, stride=1)
    self.bn11   = nn.BatchNorm2d(32)
    self.conv21 = nn.Conv2d(32, 64, kernel_size=5, stride=1)
    self.bn21   = nn.BatchNorm2d(64)
    self.conv31 = nn.Conv2d(64, 128, kernel_size=5, stride=1)
    self.bn31   = nn.BatchNorm2d(128)
    self.conv41 = nn.Conv2d(128, 256, kernel_size=5, stride=1)
    self.bn41   = nn.BatchNorm2d(256)
    self.conv51 = nn.Conv2d(256, 512, kernel_size=5, stride=1)
    # Branch B gets its own final batch norm. Reusing bn5 for both branches
    # shared one set of affine parameters and running statistics between two
    # different feature distributions. This adds 'bn51.*' entries to the
    # state dict.
    self.bn51   = nn.BatchNorm2d(512)

    # Classifier head on the concatenated branch features.
    self.fc1   = nn.Linear(8192+4608, 128)
    self.fc2   = nn.Linear(128, 16)
    self.fc3   = nn.Linear(16, n_output_neurons)

  def forward(self, x):
    """Run both branches on `x` and return the raw outputs of the final layer.

    Dropout is active only in training mode (`self.training`).
    """
    # Branch A: conv-relu, conv-pool-relu, batch norm, repeated four times,
    # then a final conv-pool-relu and batch norm.
    xa = F.relu(self.conv1(x))
    xa = F.relu(F.max_pool2d(self.conv2(xa), 2))
    xa = self.bn1(xa)
    xa = F.relu(self.conv3(xa))
    xa = F.relu(F.max_pool2d(self.conv4(xa), 2))
    xa = self.bn2(xa)
    xa = F.relu(self.conv5(xa))
    xa = F.relu(F.max_pool2d(self.conv6(xa), 2))
    xa = self.bn3(xa)
    xa = F.relu(self.conv7(xa))
    xa = F.relu(F.max_pool2d(self.conv8(xa), 2))
    xa = self.bn4(xa)
    xa = F.relu(F.max_pool2d(self.conv9(xa), 2))
    xa = self.bn5(xa)

    # Branch B: conv-pool-relu followed by batch norm, five times.
    xb = F.relu(F.max_pool2d(self.conv11(x), 2))
    xb = self.bn11(xb)
    xb = F.relu(F.max_pool2d(self.conv21(xb), 2))
    xb = self.bn21(xb)
    xb = F.relu(F.max_pool2d(self.conv31(xb), 2))
    xb = self.bn31(xb)
    xb = F.relu(F.max_pool2d(self.conv41(xb), 2))
    xb = self.bn41(xb)
    xb = F.relu(F.max_pool2d(self.conv51(xb), 2))
    xb = self.bn51(xb)

    # Flatten each branch to (batch, features) and join them.
    xa = xa.view(xa.size(0), -1)
    xb = xb.view(xb.size(0), -1)
    x = torch.cat((xa, xb), dim=1)

    # F.dropout defaults to training=True, so it must be told the module's
    # mode explicitly or it keeps dropping units during evaluation.
    x = F.relu(self.fc1(x))
    x = F.dropout(x, p=0.3, training=self.training)
    x = F.relu(self.fc2(x))
    x = F.dropout(x, p=0.2, training=self.training)
    x = self.fc3(x)

    return x

In [None]:
# n_channels is not passed to CNN below (its first convolutions are built with
# 3 input channels directly); n_output_neurons is the size of the final layer.
n_channels = 3
n_output_neurons = 1

In [None]:
# Build the network with a single output, used as a logit by the BCE-with-logits loss below.
model = CNN(n_output_neurons)

In [None]:
# Use the first CUDA device when available, otherwise the CPU, and move the model there.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

In [None]:
# Peek at the (encoded, sorted) training metadata.
csv_train_file.head(2)

In [None]:
# Load every training image and apply a random zoom augmentation: upscale by a
# factor in [1, 1.2), crop the margins, and resize to 224x224.
# NOTE(review): the crop margins are derived from the constant 224, so this
# presumably assumes 224x224 source images — confirm.
X_train_image = []
for image_get in X_train_img:
    image_path = img_train_folder + '{}.jpg'.format(image_get)
    img_train = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when a file cannot be read.
    if img_train is None:
        raise FileNotFoundError(f'Could not read training image: {image_path}')

    rand_num = np.random.uniform(low=1, high=1.2)
    zoomed = cv2.resize(img_train, None, fx=rand_num, fy=rand_num, interpolation=cv2.INTER_LINEAR)

    H_crop = ((224*rand_num)-224)/2
    V_crop = ((224*rand_num)-224)/2

    # np.int was removed in NumPy 1.24; it was an alias of the builtin int.
    cropped = zoomed[int(H_crop):int(zoomed.shape[0]-H_crop),
                     int(V_crop):int(zoomed.shape[1]-int(V_crop))]
    img_train = cv2.resize(cropped, (224, 224))

    X_train_image.append(img_train)

In [None]:
# Load every validation image unchanged (no augmentation, no resizing).
X_val_image = []
for image_get in X_val_img:
    image_path = img_train_folder + '{}.jpg'.format(image_get)
    img_val = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when a file cannot be read;
    # fail here rather than later when the list is converted to an array.
    if img_val is None:
        raise FileNotFoundError(f'Could not read validation image: {image_path}')

    X_val_image.append(img_val)

In [None]:
# Stack the per-image arrays into one array per split and convert to float32.
# NOTE(review): validation images are not resized when loaded, so this presumably
# relies on all files sharing one size — confirm.
X_train_image = np.array(X_train_image)
X_train_image = X_train_image.astype(np.float32)
X_val_image = np.array(X_val_image)
X_val_image = X_val_image.astype(np.float32)

In [None]:
# Labels as NumPy arrays (same row order as the image-name series).
Y_train = np.array(Y_train)
Y_val = np.array(Y_val)

In [None]:
# Scale pixel values by 1/255.
X_train_image = X_train_image / 255
X_val_image = X_val_image / 255

In [None]:
print(f'Train shape: {X_train_image.shape}, Vl shape: {X_val_image.shape}')

In [None]:
# Binary cross-entropy computed on raw logits (the model applies no final sigmoid),
# optimised with Adam at a learning rate of 5e-6.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.000005)

In [None]:
# Move the channel axis from last to second position: (0,1,2,3) -> (0,3,1,2).
X_train_image_t = np.transpose(X_train_image, (0,3,1,2))

In [None]:
# Image tensor and labels; labels are reshaped to one column and cast to float
# so they line up with the single-output model for the loss.
input_train = torch.from_numpy(X_train_image_t)
target_train = torch.from_numpy(Y_train).reshape(-1,1).float()

In [None]:
# Same axis reordering for the validation images.
X_val_image_t = np.transpose(X_val_image, (0,3,1,2))

In [None]:
input_val = torch.from_numpy(X_val_image_t)
target_val = torch.from_numpy(Y_val).reshape(-1,1).float()

In [None]:
# Tabular (metadata) features as float32 arrays.
X_train_data_np = X_train_data.values
X_train_data_np = X_train_data_np.astype(np.float32)

In [None]:
# NOTE(review): input_train_data / input_val_data are not referenced by any
# later cell shown here; the datasets below are built from images only.
input_train_data = torch.from_numpy(X_train_data_np)

In [None]:
X_val_data_np = X_val_data.values
X_val_data_np = X_val_data_np.astype(np.float32)

In [None]:
input_val_data = torch.from_numpy(X_val_data_np)

In [None]:
# Number of training epochs. The two lists are placeholders that get replaced
# by the arrays returned from batch_gd.
n_epochs = 25
train_losses = []
val_losses = []

In [None]:
# Mini-batch size and the loader settings shared by both splits.
batch_size = 32
loader_options = {'batch_size': batch_size, 'num_workers': 2}

# Pair the image tensors with their labels.
training_set = torch.utils.data.TensorDataset(input_train, target_train)
val_set = torch.utils.data.TensorDataset(input_val, target_val)

# Training batches are reshuffled each epoch; validation keeps its order.
train_loader = torch.utils.data.DataLoader(dataset=training_set, shuffle=True, **loader_options)
val_loader = torch.utils.data.DataLoader(dataset=val_set, shuffle=False, **loader_options)

In [None]:
def batch_gd(model, criterion, optimizer, train_loader, val_loader, n_epochs):
    """Train `model` with mini-batch gradient descent and track per-epoch losses.

    Each epoch runs one optimisation pass over `train_loader` followed by one
    evaluation pass over `val_loader`, then prints a summary line.

    Args:
        model: the network to train; batches are moved to the device its
            parameters live on (CPU if it has no parameters).
        criterion: loss called as `criterion(outputs, targets)`.
        optimizer: optimizer already bound to `model`'s parameters.
        train_loader: iterable of `(inputs, targets)` training batches.
        val_loader: iterable of `(inputs, targets)` validation batches.
        n_epochs: number of epochs to run.

    Returns:
        `(train_losses, val_losses)`: two NumPy arrays of length `n_epochs`
        holding the mean batch loss of each epoch.

    Note:
        The model is left in evaluation mode when the function returns.
    """
    train_losses = np.zeros(n_epochs)
    val_losses = np.zeros(n_epochs)

    # Use the model's own device instead of depending on a notebook-level global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    for it in range(n_epochs):
        t0 = datetime.now()

        # Training pass: dropout and batch-norm batch statistics are enabled.
        model.train()
        train_loss = []
        for inputs, targets in tqdm(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        train_loss = np.mean(train_loss)

        # Validation pass: evaluation mode, and no autograd graph is built, so
        # batch-norm running statistics are not updated and memory use stays low.
        model.eval()
        val_loss = []
        with torch.no_grad():
            for inputs, targets in tqdm(val_loader):
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)

                val_loss.append(loss.item())

        val_loss = np.mean(val_loss)

        train_losses[it] = train_loss
        val_losses[it] = val_loss

        dt = datetime.now() - t0

        print(f'Epoch {it+1}/{n_epochs}, Time: {dt}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

    return train_losses, val_losses

In [None]:
# Run the training loop; the returned arrays hold one mean loss per epoch and
# replace the placeholder lists defined earlier.
train_losses, val_losses = batch_gd(
    model, criterion, optimizer, train_loader, val_loader, n_epochs)

In [None]:
# Plot training and validation loss against the epoch index.
plt.plot(train_losses, label='train loss')
plt.plot(val_losses, label='val loss')
plt.legend()
plt.show()

In [None]:
n_correct = 0.
n_total = 0.
for inputs, targets in train_loader:

  inputs, targets = inputs.to(device), targets.to(device)

  outputs = model(inputs)

  _, train_predictions = torch.max(outputs, 1)
  
  n_correct += (train_predictions == targets).sum().item()
  n_total += targets.shape[0]

train_acc = n_correct / n_total


n_correct = 0.
n_total = 0.
for inputs, targets in val_loader:

  inputs, targets = inputs.to(device), targets.to(device)

  outputs = model(inputs)

  _, val_predictions = torch.max(outputs, 1)

  n_correct += (val_predictions == targets).sum().item()
  n_total += targets.shape[0]

val_acc = n_correct / n_total
print(f"Train acc: {train_acc:.4f}, Val acc: {val_acc:.4f}")

In [None]:
print(n_total)

## Start of test data

In [None]:
# Load the test metadata CSV.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
csv_test_file = pd.read_csv('/home/malmason/datasets/siim-isic-melanoma-classification/test.csv')

In [None]:
# Folder the test images are read from (file names are '<image_name>.jpg').
img_test_folder = '/home/malmason/datasets/siim-isic-melanoma-classification/ycbcr/norm/test/'

In [None]:
# Image names used for looping over the test images.
X_test_img = csv_test_file['image_name']

In [None]:
csv_test_file["anatom_site_general_challenge"].fillna("Unknown", inplace = True)
csv_test_file["sex"].fillna("Unknown", inplace = True) 

In [None]:
csv_test_file.sex = csv_test_file.sex.map(sex_cat).astype(float)
csv_test_file.age_approx = csv_test_file.age_approx.map(age_cat).astype(float)
csv_test_file.anatom_site_general_challenge = csv_test_file.anatom_site_general_challenge.map(localization_cat).astype(float)

In [None]:
X_test_data = csv_train_file.drop(['image_name', 'patient_id'], axis=1)

In [None]:
X_test_img = csv_test_file['image_name']

In [None]:
csv_test_file.head()

In [None]:
X_test_image = []
for image_get in X_test_img:
    img_test = cv2.imread(img_test_folder + '{}.jpg'.format(image_get))
    X_test_image.append(img_test)

In [None]:
X_test_image = np.array(X_test_image.astype(np.float32))

In [None]:
X_test_image = X_test_image / 255