<a href="https://colab.research.google.com/github/GuidoGiacomoMussini/DeepLearning-automated_diagnosis_of_pigmented_skin_lesions/blob/main/Binary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# **Skin Lesion Project - Notebook 2**

---
## - Data Augmentation

## - SMOTE 

## - Binary Classification:

* Benign      
* Malignant  

## - Dropout Tuning

---
Guido Giacomo Mussini 988273

---

# **0 - Initialization**

## 0.1 Libraries

In [None]:
pip install torchmetrics

In [None]:
import os
import shutil
from glob import glob

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from PIL import Image

import random as rnd
from IPython.display import clear_output 
from tqdm.notebook import tqdm_notebook
from collections import defaultdict, Counter, OrderedDict
from imblearn.over_sampling import SMOTE

import torch
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from torchmetrics.classification import MulticlassAccuracy, BinaryAccuracy
from torch import nn
from torch import optim
from torchvision import models
from torchsummary import summary
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from torchtext.vocab import vocab

import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools

## 0.2 Functions

Model prediction: 

* train the model and test the performance.
* Input: the model, the train and test loaders, the number of epochs, the loss function and the optimizer. `show_progress` is a boolean that, if true, shows the accuracy and the loss epoch by epoch
* It returns 2 dictionaries containing information about Accuracy and Loss of training and test set
* inspired by: https://github.com/dash-ka/DL_Natural_Language_Processing

In [None]:
def Prediction(model, train_loader, test_loader, n_epochs, criterion, optimizer, show_progress):
  """Train `model` for `n_epochs` and evaluate it on `test_loader` after every epoch.

  Args:
    model: network called as `model(images)`; its output is rounded with
      `torch.round` to obtain the predicted class, so it is expected to emit
      probabilities with the same shape as the labels.
    train_loader: iterable of (images, labels) batches used for the optimisation steps.
    test_loader: iterable of (images, labels) batches used for the per-epoch evaluation.
    n_epochs: number of passes over `train_loader`.
    criterion: loss called as `criterion(predictions, labels.float())`.
    optimizer: optimizer bound to the parameters of `model`.
    show_progress: if True, print loss and accuracy after every epoch.

  Returns:
    (loss_dict, acc_dict): two `defaultdict(list)` objects holding one value per epoch
    under the keys "training_loss"/"validation_loss" and
    "training_accuracy"/"validation_accuracy".

  Notes:
    Reads the notebook-level names `device` and `scheduler`; both must be defined
    before the call. Batches are moved to `device`, the model itself is not, so it
    is expected to live on `device` already.
  """
  model_name = model.__class__.__name__
  loss_dict, acc_dict = defaultdict(list), defaultdict(list)

  for epoch in tqdm_notebook(range(n_epochs)):
    # ---- training pass ----
    # The counters are reset at every epoch: keeping them across epochs would turn
    # the reported accuracy into a running average over all the epochs seen so far.
    tr_acc, tr_tot = 0, 0
    rtrain_loss = 0
    model.train()
    for b_index, t_batch in enumerate(train_loader, start = 1):
      images, labels = t_batch
      images = images.to(device)
      labels = labels.to(device).float()

      y_pred = model(images)
      tloss = criterion(y_pred, labels)
      optimizer.zero_grad()
      tloss.backward()
      optimizer.step()

      # incremental mean of the batch losses
      rtrain_loss += (tloss.item() - rtrain_loss) / b_index

      # accuracy: round the predicted probability to the nearest class
      y_pred_tag = torch.round(y_pred.detach())
      tr_acc += (y_pred_tag == labels).sum().item()
      tr_tot += labels.size(0)

    train_accuracy = tr_acc / tr_tot if tr_tot else 0
    acc_dict["training_accuracy"].append(train_accuracy)
    loss_dict["training_loss"].append(rtrain_loss)
    # adaptive learning rate, driven by the mean training loss of the epoch
    scheduler.step(rtrain_loss)

    # ---- evaluation pass ----
    val_acc, val_tot = 0, 0
    rval_loss = 0
    model.eval()
    with torch.no_grad():
      for b_index, v_batch in enumerate(test_loader, start = 1):
        images, labels = v_batch
        images = images.to(device)
        labels = labels.to(device).float()

        y_test_pred = model(images)
        vloss = criterion(y_test_pred, labels)

        rval_loss += (vloss.item() - rval_loss) / b_index

        y_pred_tag = torch.round(y_test_pred)
        val_acc += (y_pred_tag == labels).sum().item()
        val_tot += labels.size(0)

    validation_accuracy = val_acc / val_tot if val_tot else 0
    acc_dict["validation_accuracy"].append(validation_accuracy)
    loss_dict["validation_loss"].append(rval_loss)

    if show_progress == True:
      print('[Model: %s ] -> [epoch %d]: \n [train Loss %.5f], [val Loss %.5f] \n [train Acc  %.5f], [val Acc  %.5f]' \
            % (model_name, epoch, rtrain_loss, rval_loss, train_accuracy, validation_accuracy ))
      print("---------------------------------")
  return loss_dict, acc_dict

Function to compute the prediction on the test set
* inspired by: https://www.kaggle.com/code/unstructuredrahul/deep-learning-pytorch-binary-classification?scriptVersionId=67067211&cellId=38

In [None]:
def test_prediction(model, test_loader):
  """Return the rounded predictions of `model` for every batch of `test_loader`.

  Args:
    model: network called as `model(images)`; its output is rounded to the nearest integer.
    test_loader: iterable of (images, labels) batches; the labels are ignored.

  Returns:
    A list with one numpy array per batch, in the order in which the loader yields
    the batches.

  Notes:
    Reads the notebook-level name `device`. The predictions are copied to the CPU
    before the numpy conversion, because `Tensor.numpy()` is not available for
    tensors stored on a GPU.
  """
  preds = []
  model.eval()
  with torch.no_grad():
    for images, _labels in test_loader:
      batch_pred = model(images.to(device)).round()
      preds.append(batch_pred.detach().cpu().numpy())
  return preds

Function to fix the row-index of the datasets

In [None]:
def re_index(data):
  """Return `data` with its row index replaced by 0, 1, ..., len(data) - 1.

  The input frame is left untouched; the columns and the row order are preserved.
  """
  fresh_positions = pd.Series(range(len(data)))
  return data.set_index(fresh_positions)

## 0.3 - Import the Data From Kaggle (estimated time: 4 m)

In [None]:
!mkdir WD

! #mkdir ~/.kaggle
! #cp kaggle.json ~/.kaggle/
! #chmod 600 ~/.kaggle/kaggle.json
os.environ['KAGGLE_USERNAME'] = "guidomussini"
os.environ['KAGGLE_KEY'] = "f7b24d630bc3e7e7fda7a5a1b32f4582"
! kaggle datasets download -d kmader/skin-cancer-mnist-ham10000
! unzip /content/skin-cancer-mnist-ham10000.zip -d /content/WD

#remove useless data 
shutil.rmtree('/content/WD/ham10000_images_part_1')
shutil.rmtree('/content/WD/ham10000_images_part_2')
! rm '/content/WD/hmnist_28_28_L.csv'
! rm '/content/WD/hmnist_28_28_RGB.csv'
! rm '/content/WD/hmnist_8_8_L.csv'
! rm '/content/WD/hmnist_8_8_RGB.csv'
! rm '/content/skin-cancer-mnist-ham10000.zip'

# **1 - Dataset Definition**

##1.1 - Merge the images from the 2 folders

*   Code taken by: https://www.kaggle.com/code/sid321axn/step-wise-approach-cnn-model-77-0344-accuracy

In [None]:
# Build a lookup "image id -> file path" over every JPEG found one folder below the dataset root
base_skin_dir = os.path.join('..', 'content/WD')
imageid_path_dict = {}
for jpg_path in glob(os.path.join(base_skin_dir, '*', '*.jpg')):
    # the image id is the file name without its extension
    imageid_path_dict[os.path.splitext(os.path.basename(jpg_path))[0]] = jpg_path

sdf = pd.read_csv(os.path.join(base_skin_dir, 'HAM10000_metadata.csv'))


Create a column in which each row contains the path to an image

In [None]:
# Reload the metadata and attach the file path of each image;
# ids missing from the lookup get None (dict.get default)
metadata_csv = os.path.join(base_skin_dir, 'HAM10000_metadata.csv')
sdf = pd.read_csv(metadata_csv).assign(
    path=lambda frame: frame['image_id'].map(imageid_path_dict.get)
)

## 1.2 - Metadata Handling 

check for duplicates

In [None]:
# keep the first row of every lesion, then the first row of every image id
sdf = (
    sdf
    .drop_duplicates(subset=['lesion_id'])
    .drop_duplicates(subset=['image_id'])
)

print(len(sdf)) #now i have 7470 lesions

7470


check for missing values

In [None]:
# isnull().sum() counts the MISSING entries of each column, so the message says "NA"
print("number of NA data per columns:\n",sdf.isnull().sum())
#some NA in age -> since that column will be removed from the dataset, i don't impute them

number of non-NA data per columns:
 lesion_id        0
image_id         0
dx               0
dx_type          0
age             52
sex              0
localization     0
path             0
dtype: int64


## 1.3 - Create the binary label:


*   **M**: If the lesion is Malignant
*   **B**: If the lesion is Benign



In [None]:
# Diagnosis codes grouped into the two target classes
Benign = ["nv", "bkl", "vasc", "df"]
Malignant = ["mel", "bcc", "akiec"]
# one boolean mask per class; a diagnosis outside both lists keeps its original code
m1, m2 = (sdf['dx'].isin(group) for group in (Benign, Malignant))
sdf['type'] = np.select(condlist=[m1, m2], choicelist=["B", "M"], default=sdf['dx'])

# **2 - Data Visualisation**

Malignant\Benign Distribution

* **Green** if the lesion is **Benign**

* **Red** if the lesion is **Malignant**

In [None]:
# share of each class, rounded to two decimals
binclass_perc = sdf["type"].value_counts().div(len(sdf["type"])).round(2)
#B    0.84
#M    0.16
plt.bar(x=binclass_perc.index, height=binclass_perc, color=["green", "red"])

#The 2 classes are unbalanced --> SMOTE

# **3 - Data Manipulation**

## 3.1 - Generate the Training (train + val) and Test set

Encode the response variable

In [None]:
# Encode the class as an integer code and keep only the columns needed downstream.
# re_index (defined in the section 'functions') replaces the row indices with 0..n-1,
# since the download (and, later, the sampling) leaves them in an inconvenient state.
sdf = re_index(
    sdf.assign(label=pd.Categorical(sdf['type']).codes)[['path', 'label']]
)

sample the training and the test set

In [None]:
# training (train + val): 70% of the rows, sampled with a fixed seed
train_val = sdf.sample(frac=0.7, random_state=19)
# array form, needed to merge it later with the augmented data
x_train_val = train_val[['path']].to_numpy()

# test: every row that was not sampled above
test = sdf.drop(index=train_val.index)

# give both frames a fresh 0..n-1 index
train_val, test = re_index(train_val), re_index(test)

## 3.2 - Store the images as arrays (Expected time: 3m)
 
 The images are resized to 32x32, since this speeds up the computation while maintaining a good amount of information




In [None]:
sh_x, sh_y = 32, 32

def _as_resized_array(image_path):
    """Open the image at `image_path`, resize it to (sh_x, sh_y) and return it as a numpy array."""
    return np.asarray(Image.open(image_path).resize((sh_x, sh_y)))

train_val['image'] = train_val['path'].map(_as_resized_array)
test['image'] = test['path'].map(_as_resized_array)

# from here on only the pixels and the label are needed
train_val = train_val[['image', 'label']]
x_train_val = train_val[['image']]
test = test[['image', 'label']]

## 3.3 - Data Augmentation

Since the training set contains around 5000 observations, a good practice is to generate synthetic images in order to increase the number of observations.

I use a randomized mix of affine transformations, vertical and horizontal flips.

This mix has been applied to each image, so that the dataset cardinality is doubled.

**Note that** the augmented data have been used only in the training set (train + validation), while the test set contains only original data.

In [None]:
torch.manual_seed(19)
# NOTE: every probability is 1, so both flips are applied to every image
transforms = T.RandomApply(torch.nn.ModuleList([T.RandomVerticalFlip(p=1.0), \
                                                T.RandomHorizontalFlip(p=1.0)]), p=1)

#augumenter = T.RandomPerspective(distortion_scale=0.6, p=1.0)
def _augment(image):
    """Apply `transforms` to one (H, W, C) image and return an (H, W, C) numpy array.

    The transforms work on channel-first tensors, so the image is permuted to
    (C, H, W) and permuted back afterwards. Leaving the result channel-first and
    later reshaping it to (H, W, C) would scramble the pixels instead of moving
    the channel axis.
    """
    channels_first = torch.tensor(image).permute(2, 0, 1)
    flipped = transforms(channels_first)
    return flipped.permute(1, 2, 0).contiguous().numpy()

aug = train_val["image"].map(_augment)

Create the dataset containing the original data and the augumented ones

In [None]:
# augmented images: Series of arrays -> one array of shape (n, sh_x, sh_y, 3)
aug = np.asarray(aug.tolist())
aug = aug.reshape((len(aug), sh_x, sh_y, 3))

# original images: one-column frame of arrays -> one array of shape (n, sh_x, sh_y, 3)
x_train_val = np.asarray(x_train_val.to_numpy().tolist())
x_train_val = x_train_val.reshape((len(x_train_val), sh_x, sh_y, 3))

# augmented images first, originals second
df_images = np.concatenate((aug, x_train_val))

# the labels are repeated twice so that they line up with both halves of df_images
df_labels = np.concatenate([train_val[['label']].to_numpy()] * 2)

# dataset containing the training and validation examples
coln1 = dict(image=list(df_images), label=list(df_labels))
trainval = pd.DataFrame(data=coln1)

## 3.4 - Split the training set in Train and Validation set

In [None]:
# 70/30 split with a fixed seed; both parts get a fresh 0..n-1 index
train, val = (
    re_index(part)
    for part in train_test_split(trainval, test_size=0.30, random_state=19)
)
print("train:", len(train), "\nval:", len(val))

train: 7320 
val: 3138


## 3.5 - Define features and response variable for train, validation and test set

In [None]:
# For every split: stack the image arrays into one array and take the labels as an array

#train
x_train, y_train = np.asarray(train['image'].tolist()), np.array(train["label"])

#validation
x_val, y_val = np.asarray(val['image'].tolist()), np.array(val["label"])

#test
x_test, y_test = np.asarray(test['image'].tolist()), np.array(test["label"])

## 3.6 - SMOTE

As noticed before, the data are strongly unbalanced. SMOTE is an oversampling technique which balances the classes by generating synthetic observations of the less represented class.

**Note that** only the data contained in the train set have been oversampled in this way, since in the real world the data are not evenly distributed.

In [None]:
#Smote: it works on flat feature vectors, so every image becomes one row of sh_x*sh_y*3 values
sm = SMOTE(random_state=42)
x_train, y_train = sm.fit_resample(
    x_train.reshape(x_train.shape[0], sh_x * sh_y * 3),
    y_train.astype('int'),
)

print("x_train:", len(x_train))

x_train: 12328


## 3.7 - Reshape and convert the images in the correct format

all the images have been shaped as 32x32x3 images

In [None]:
# bring every split (back) to the (n, sh_x, sh_y, 3) image layout
x_train, x_val, x_test = (
    images.reshape(images.shape[0], sh_x, sh_y, 3)
    for images in (x_train, x_val, x_test)
)

In [None]:
# images -> float32 torch tensors
x_train, x_val, x_test = (
    torch.from_numpy(images.astype(np.float32))
    for images in (x_train, x_val, x_test)
)

# labels -> float32 torch tensors with a trailing dimension of size 1
y_train, y_val, y_test = (
    torch.from_numpy(labels.astype(np.float32)).unsqueeze(1)
    for labels in (y_train, y_val, y_test)
)

## 3.8 - Normalization 

All the images will be normalized based on the mean and std dev of the train set



In [None]:
# The statistics are computed once, on the train set, BEFORE any tensor is overwritten,
# and the same values are applied to all three splits, so that validation and test data
# go through exactly the transformation the model is trained on.
# NOTE: the cell is not idempotent; re-running it normalizes the tensors a second time.
train_mean, train_std = x_train.mean(), x_train.std()
x_train = TF.normalize(x_train, train_mean, train_std)
x_val = TF.normalize(x_val, train_mean, train_std)
x_test = TF.normalize(x_test, train_mean, train_std)

# **4 - Convolutional Neural Network**

## 4.1 Pytorch Dataset and Dataloader

use GPU if possible

In [None]:
# default to the CPU and switch to the GPU only when CUDA is available
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using {device} device")

Using cpu device


Define the class Data

In [None]:
class Data(Dataset):
    """Dataset over an image tensor and its label tensor.

    `X` is permuted with (0, 3, 1, 2), i.e. the last axis becomes the second one,
    and `y` is converted to a LongTensor.
    """

    def __init__(self, X, y):
        channels_first = X.permute(0, 3, 1, 2)
        self.X = channels_first
        self.y = y.type(torch.LongTensor)
        self.len = channels_first.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        sample, target = self.X[index], self.y[index]
        return sample, target

Define the Train, Validation and Test Loader

In [None]:
batch_size = 64 

# only the training batches are shuffled
train_set = Data(x_train, y_train)
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)

# Evaluation loaders keep the original order: the test predictions are later compared
# position by position with y_test, which is only valid if the loader does not shuffle.
val_set = Data(x_val, y_val)
val_loader = DataLoader(dataset=val_set, batch_size=batch_size, shuffle=False)

test_set = Data(x_test, y_test)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=False)

## 4.2 Binary Model 1





In [None]:
class Binary_1(nn.Module):
    """CNN for binary classification: four convolutional stages and four linear layers.

    The first two stages are followed by a 3x3 max-pooling. `Linear(1152, 64)` matches
    128 channels x 3 x 3, i.e. 32x32 inputs after the two poolings. The output is a
    sigmoid of shape (batch, 1).
    """

    def __init__(self):
        super().__init__()

        # parameter-free layers shared by all the stages
        # (despite their names, both dropout layers use p = 0.20)
        self.ReLU = nn.ReLU()
        self.drop50 = nn.Dropout(0.20)
        self.drop30 = nn.Dropout(0.20)
        self.pool3 = nn.MaxPool2d(kernel_size=(3, 3))
        self.sigmoid = torch.nn.Sigmoid()

        # convolutional stages: 3 -> 16 -> 32 -> 64 -> 128 channels
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=(3, 3), padding="same")
        self.norm2d1 = nn.BatchNorm2d(16)

        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=(3, 3), padding="same")
        self.norm2d2 = nn.BatchNorm2d(32)

        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(3, 3), padding="same")
        self.norm2d3 = nn.BatchNorm2d(64)

        self.conv4 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3), padding="same")
        self.norm2d4 = nn.BatchNorm2d(128)

        self.flat = nn.Flatten()

        # classifier head: 1152 -> 64 -> 32 -> 16 -> 1
        self.linear1 = nn.Linear(in_features=1152, out_features=64)
        self.norm1d1 = nn.BatchNorm1d(64)

        self.linear2 = nn.Linear(in_features=64, out_features=32)
        self.norm1d2 = nn.BatchNorm1d(32)

        self.linear3 = nn.Linear(in_features=32, out_features=16)
        self.norm1d3 = nn.BatchNorm1d(16)

        self.linear4 = nn.Linear(in_features=16, out_features=1)

    def forward(self, x):
        # each stage reads inside-out: conv -> ReLU -> (pool) -> batch-norm -> dropout
        x = self.drop30(self.norm2d1(self.pool3(self.ReLU(self.conv1(x)))))
        x = self.drop30(self.norm2d2(self.pool3(self.ReLU(self.conv2(x)))))
        x = self.drop30(self.norm2d3(self.ReLU(self.conv3(x))))
        x = self.drop30(self.norm2d4(self.ReLU(self.conv4(x))))

        x = self.flat(x)

        # hidden linear layers: linear -> ReLU -> batch-norm -> dropout
        hidden_layers = (
            (self.linear1, self.norm1d1),
            (self.linear2, self.norm1d2),
            (self.linear3, self.norm1d3),
        )
        for linear, norm in hidden_layers:
            x = self.drop50(norm(self.ReLU(linear(x))))

        return self.sigmoid(self.linear4(x))

## 4.3 - Binary Model 2

In [None]:
class Binary_2(nn.Module):
    """Smaller CNN for binary classification: two convolutional stages and two linear layers.

    Both stages are followed by a 3x3 max-pooling. `Linear(1152, 32)` matches
    128 channels x 3 x 3, i.e. 32x32 inputs after the two poolings. The output is a
    sigmoid of shape (batch, 1).
    """

    def __init__(self):
        super().__init__()

        # parameter-free layers shared by all the stages
        self.ReLU = nn.ReLU()
        self.drop = nn.Dropout(0.30)
        self.pool = nn.MaxPool2d(kernel_size=(3, 3))
        self.Sigmoid = nn.Sigmoid()

        # convolutional stages: 3 -> 64 -> 128 channels
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=(3, 3), padding="same")
        # despite its name this is the 2-D batch-norm that follows conv1
        self.norm1d2 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3), padding="same")
        self.norm2d2 = nn.BatchNorm2d(128)

        self.flat = nn.Flatten()

        # classifier head: 1152 -> 32 -> 1
        self.linear1 = nn.Linear(in_features=1152, out_features=32)
        self.norm1d1 = nn.BatchNorm1d(32)
        self.linear2 = nn.Linear(in_features=32, out_features=1)

    def forward(self, x):
        # each stage reads inside-out: conv -> ReLU -> pool -> batch-norm -> dropout
        x = self.drop(self.norm1d2(self.pool(self.ReLU(self.conv1(x)))))
        x = self.drop(self.norm2d2(self.pool(self.ReLU(self.conv2(x)))))

        x = self.flat(x)

        # linear -> ReLU -> batch-norm -> dropout, then the sigmoid output unit
        x = self.drop(self.norm1d1(self.ReLU(self.linear1(x))))
        return self.Sigmoid(self.linear2(x))

## 4.4 - Train the model 

Model 1

In [None]:
#Accuracy: 0.731 
n_epochs = 50
learning_rate = 0.1

model1 = Binary_1()
optimizer = torch.optim.SGD(model1.parameters(), lr=learning_rate)
# the learning rate is multiplied by 0.1 after 10 epochs without improvement (read by Prediction)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True, min_lr=0.000001
)
criterion = nn.BCELoss()

loss1, acc1 = Prediction(
    model1, train_loader, val_loader, n_epochs, criterion, optimizer,
    show_progress=True,
)

Model 2

In [None]:
#Accuracy: 0.673
n_epochs = 100
learning_rate = 0.01

model2 = Binary_2()
optimizer = torch.optim.SGD(model2.parameters(), lr=learning_rate)
# the learning rate is multiplied by 0.1 after 5 epochs without improvement (read by Prediction)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=5, verbose=False, min_lr=0.000001
)
criterion = nn.BCELoss()

loss2, acc2 = Prediction(
    model2, train_loader, val_loader, n_epochs, criterion, optimizer,
    show_progress=True,
)

Summary


In [None]:
#summary(Binary_2(), (3,32,32))

## 4.5 - Results visualization

Accuracy

In [None]:
# model 1: accuracy per epoch; each legend label is the dictionary key with a space
for curve in ("training_accuracy", "validation_accuracy"):
    plt.plot(acc1[curve], label=curve.replace("_", " "))
plt.legend()
plt.title("Accuracy")

Loss

In [None]:
# model 1: loss per epoch; each legend label is the dictionary key with a space
for curve in ("training_loss", "validation_loss"):
    plt.plot(loss1[curve], label=curve.replace("_", " "))
plt.legend()
plt.title("Loss")


model 2


In [None]:
# model 2: loss per epoch; each legend label is the dictionary key with a space
for curve in ("training_loss", "validation_loss"):
    plt.plot(loss2[curve], label=curve.replace("_", " "))
plt.legend()
plt.title("Loss")

In [None]:
# model 2: accuracy per epoch; each legend label is the dictionary key with a space
for curve in ("training_accuracy", "validation_accuracy"):
    plt.plot(acc2[curve], label=curve.replace("_", " "))
plt.legend()
plt.title("Accuracy")


## 4.6 - Prediction on the test set

In [None]:
# one array of rounded predictions per test batch (swap in model2 to evaluate the second model)
preds = test_prediction(model=model1, test_loader=test_loader)

Transform the data in order to build the confusion matrix

In [None]:
#Need to store the data in a proper manner to visualize the metrics of interest
# Every per-batch array is flattened and the pieces are concatenated. Unlike a vstack of
# the full batches plus a special case for the last one, this is correct whatever the size
# of the last batch (1 element, a partial batch or a full one).
y_pred_list = [np.asarray(a).reshape(-1) for a in preds]
ytest_pred = np.concatenate(y_pred_list).tolist() if y_pred_list else []

# NOTE(review): predictions and labels are compared position by position, so test_loader
# must yield the samples in the same order as y_test (i.e. it must not shuffle).
y_true_test = y_test.ravel()

Confusion Matrix

In [None]:
a = conf_matrix = confusion_matrix(y_true_test, ytest_pred)

# rows are the actual classes, columns the predicted ones
adf = pd.DataFrame(
    a,
    index=['Actual_Benign', 'Actual_Malignant'],
    columns=['Pred_Benign', 'Pred_Malignant'],
)
sns.heatmap(adf, annot=True, fmt = 'g', cmap="Blues")

Accuracy

In [None]:
# accuracy = the two diagonal cells of the confusion matrix over the total count
correct = a[0, 0] + a[1, 1]
Accuracy = round(correct / a.sum(), 3)
print("Accuracy:", Accuracy)

# **5 - Dropout Tuning**


## 5.1 - Define the model

Based on Model 1 structure

In [None]:
class Binary_Tune(nn.Module):
    """Same layer structure as Binary_1, with one dropout layer whose rate is a parameter.

    Args:
        drp: dropout probability used after every stage; also stored as `self.dr`.
    """

    def __init__(self, drp):
        super().__init__()
        self.dr = drp

        # parameter-free layers shared by all the stages
        self.ReLU = nn.ReLU()
        self.drop = nn.Dropout(drp)
        self.pool3 = nn.MaxPool2d(kernel_size=(3, 3))
        self.sigmoid = torch.nn.Sigmoid()

        # convolutional stages: 3 -> 16 -> 32 -> 64 -> 128 channels
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=(3, 3), padding="same")
        self.norm2d1 = nn.BatchNorm2d(16)

        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=(3, 3), padding="same")
        self.norm2d2 = nn.BatchNorm2d(32)

        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(3, 3), padding="same")
        self.norm2d3 = nn.BatchNorm2d(64)

        self.conv4 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3), padding="same")
        self.norm2d4 = nn.BatchNorm2d(128)

        self.flat = nn.Flatten()

        # classifier head: 1152 -> 64 -> 32 -> 16 -> 1
        self.linear1 = nn.Linear(in_features=1152, out_features=64)
        self.norm1d1 = nn.BatchNorm1d(64)

        self.linear2 = nn.Linear(in_features=64, out_features=32)
        self.norm1d2 = nn.BatchNorm1d(32)

        self.linear3 = nn.Linear(in_features=32, out_features=16)
        self.norm1d3 = nn.BatchNorm1d(16)

        self.linear4 = nn.Linear(in_features=16, out_features=1)

    def forward(self, x):
        # each stage reads inside-out: conv -> ReLU -> (pool) -> batch-norm -> dropout
        x = self.drop(self.norm2d1(self.pool3(self.ReLU(self.conv1(x)))))
        x = self.drop(self.norm2d2(self.pool3(self.ReLU(self.conv2(x)))))
        x = self.drop(self.norm2d3(self.ReLU(self.conv3(x))))
        x = self.drop(self.norm2d4(self.ReLU(self.conv4(x))))

        x = self.flat(x)

        # hidden linear layers: linear -> ReLU -> batch-norm -> dropout
        hidden_layers = (
            (self.linear1, self.norm1d1),
            (self.linear2, self.norm1d2),
            (self.linear3, self.norm1d3),
        )
        for linear, norm in hidden_layers:
            x = self.drop(norm(self.ReLU(linear(x))))

        return self.sigmoid(self.linear4(x))

## 5.2 - Tuning

The idea is to try different values of dropout in the model, since it is one of the main factors affecting overfitting, and then choose the model which minimizes the median squared difference between the training and the validation accuracy.

The dropout values tried go from 5% to 50% in steps of 5 percentage points.

In [None]:
#50 epochs to speed up the computation
n_epochs = 50
# one score per dropout rate, in the order in which the rates are tried
tune = list()

for i in tqdm_notebook(range(1, 11)):
  # dropout rates 0.05, 0.10, ..., 0.50
  drp = i*0.05
  model_tune = None
  model_tune = Binary_Tune(drp = drp)
  learning_rate = 0.1
  optimizer = torch.optim.SGD(model_tune.parameters(), lr=learning_rate)
  scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
      optimizer, factor=0.1, patience=10, verbose=False, min_lr=0.000001
  )

  # only the accuracy dictionary (second returned element) is needed here
  accT = Prediction(
      model_tune, train_loader, val_loader, n_epochs, criterion, optimizer,
      show_progress=False,
  )[1]

  # per-epoch gap between training and validation accuracy,
  # multiplied by 100 so that the values are (usually) > 1
  l1 = accT.get("training_accuracy")
  l2 = accT.get("validation_accuracy")
  diff = (np.asarray(l1) - np.asarray(l2))*100

  # score of this dropout rate: median of the squared gaps
  median_result = np.median(np.square(diff))

  print('[model: %d ==> Median accuracy difference %f:' % (i, median_result))
  print("---------------------------------")

  tune.append(median_result)


Plot of dropouts

In [None]:
fig = plt.figure()
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) # main axes
# tune[k] is the score of dropout rate (k + 1) * 0.05, so the rates are used as x values
# and as tick positions; tick labels set without positions would not match the points.
dropout_rates = [round((k + 1) * 0.05, 2) for k in range(len(tune))]
ax.plot(dropout_rates, tune)
ax.set_xticks(dropout_rates)
ax.set_xlabel("dropout rate")
ax.set_ylabel("median squared accuracy gap")
plt.show()

Best dropout

In [None]:
# tune[k] was produced with dropout (k + 1) * 0.05 (the tuning loop starts at 0.05, not 0),
# so the position of the minimum must be shifted by one before being converted to a rate
best_dropout = round((tune.index(min(tune)) + 1)*0.05, 2)
best_dropout

Re-train the best model

In [None]:
# same optimisation setup as in the tuning loop, with the selected dropout rate
learning_rate = 0.1
best_model = Binary_Tune(drp = best_dropout)
optimizer = torch.optim.SGD(best_model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=False, min_lr=0.000001
)

# per-epoch loss and accuracy on the train and validation sets
loss_best, acc_best = Prediction(
    best_model, train_loader, val_loader, n_epochs, criterion, optimizer,
    show_progress=False,
)


## 5.3 - Results on test set

In [None]:
# one array of rounded predictions per test batch, from the re-trained best model
preds = test_prediction(model=best_model, test_loader=test_loader)

In [None]:
#Need to store the data in a proper manner to visualize the metrics of interest
# Every per-batch array is flattened and the pieces are concatenated. Unlike a vstack of
# the full batches plus a special case for the last one, this is correct whatever the size
# of the last batch (1 element, a partial batch or a full one).
y_pred_list = [np.asarray(a).reshape(-1) for a in preds]
ytest_pred = np.concatenate(y_pred_list).tolist() if y_pred_list else []

# NOTE(review): predictions and labels are compared position by position, so test_loader
# must yield the samples in the same order as y_test (i.e. it must not shuffle).
y_true_test = y_test.ravel()

a = conf_matrix = confusion_matrix(y_true_test, ytest_pred)

# rows are the actual classes, columns the predicted ones
adf = pd.DataFrame(a, columns = ['Pred_Benign', 'Pred_Malignant'])
adf.index = ['Actual_Benign', 'Actual_Malignant']
sns.heatmap(adf, annot=True, fmt = 'g', cmap="Blues")



In [None]:
# accuracy = the two diagonal cells of the confusion matrix over the total count
correct = a[0, 0] + a[1, 1]
Accuracy = round(correct / a.sum(), 3)
print("Accuracy:", Accuracy)