<a href="https://colab.research.google.com/github/IciaCarroBarallobre/AdventOfCode/blob/main/notebooks/%5Bv2%5D_Models%2C_train%2C_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Models, train e test





## Drive

In [None]:
# Mount Google Drive inside the Colab runtime so that the paths under
# /content/drive/ used by the rest of the notebook are reachable.
# force_remount=True requests a fresh mount even if one is already active.
from google.colab import drive
drive.mount("/content/drive/", force_remount=True)

Mounted at /content/drive/


## Hiperparametros

In [None]:
# ---------------------------- DATASET ----------------------------
# Every path hangs from the project folder inside the mounted Drive.
ROOT = '/content/drive/My Drive/TFG/'
ROOT_DATA = f"{ROOT}data/"
ROOT_DATASETS = f"{ROOT_DATA}datasets/"
DATASET = 'subrolldataset'
OCT_DEVICE = 'SPECTRALIS'
# Label column to learn. Original note lists: DME - DRT SRD CME
CLASS = "SRD"
# Split strategy, one of:
# "total" | "by_id" | "proporcional_by_id" | "proporcional_by_id_giving_a_test"
RANDOM = "proporcional_by_id"
# Annotation file, relative to the dataset folder (alternative: "/info.csv").
LABELS_CSV = "/info_labels2.csv"

# --------------------------- THRESHOLD ---------------------------
# Doubt threshold, only applied to negative images. MAX_DOUBT stays None
# (no filtering) unless DOUBT is switched on and CLASS has an entry below.
DOUBT = False
_MAX_DOUBT_BY_CLASS = {
    "DRT": 0.4214,  # original note: 0.30 - 75, 90 -0.4214
    "CME": 0.3874,  # original note: 0.25 - 75, 90- 0.3874
    "SRD": 0.533,   # original note: 0.5 - 75, 90 - 0.533
}
MAX_DOUBT = _MAX_DOUBT_BY_CLASS.get(CLASS) if DOUBT else None

# ----------------------------- MODEL -----------------------------
MODEL_STR = 'Densenet161'
OPTIM = 'Adam'
LR = 0.001
DEBUG = 1
EPOCHS = 200
PATIENCE = 20
BATCH = 8  # original note: more than 16 -> CUDA out of memory

## Imports 

In [None]:
import copy
import math
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import sklearn
import torch
import torchvision
from PIL import Image
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from torchvision import models, transforms

## Dataset 


### <strong> OCTDataset:  </strong> Subclase de [dataset](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset)


**Dataset** é unha clase abstracta que representa a un conxunto de  **samples**. Un **sample** é unha **dupla** (input,label). 

As subclases de dataset poden sobrescribir as seguintes funcións:

*   `__getitem__(self, idx)` [OBL]: Obtén un sample para unha determinada clave *idx*.
*   `__len__()` [OPT]: Devolve o tamaño do conxunto de datos.


In [None]:
class OCTDataset(Dataset):
    """Image dataset driven by an annotations CSV.

    The CSV is read from ``root + dataset_name``. The code below reads the
    columns ``root`` (image path), ``name``, the label column named by the
    global ``CLASS``, and optionally ``device`` and ``doubt_percentage``.

    Args:
        root: Dataset folder; the CSV name is appended to it as-is.
        max_doubt: If given, rows with label 0 whose ``doubt_percentage`` is
            greater than this value are discarded.
        device: If given, only rows whose ``device`` column equals it are kept.
        transform: Optional callable applied to the RGB PIL image.
        dataset_name: CSV file name (including the leading slash).
    """

    def __init__(self, root, max_doubt= None, device = None,transform=None, dataset_name = LABELS_CSV):
        self.root = root
        self.transform = transform
        self.device = device
        self.max_doubt = max_doubt

        annotations = pd.read_csv(root + dataset_name)

        if device is not None:
            annotations = annotations[annotations["device"] == device]

        if max_doubt is not None:
            too_doubtful = (annotations[CLASS] == 0) & (annotations["doubt_percentage"] > max_doubt)
            annotations = annotations[~too_doubtful]

        # Always renumber rows 0..n-1 after filtering. Samples are addressed
        # with idx in range(len(self)), and the split helpers build Subset
        # indices from this frame's index, so labels must equal positions.
        self.annotations = annotations.reset_index(drop=True)

    def __len__(self):
        """Return the number of annotated samples left after filtering."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as ``(image, label, name)``.

        ``label`` is a one-element numpy array holding the ``CLASS`` value.
        """
        image = Image.open(self.annotations["root"].iloc[idx]).convert('RGB')
        if self.transform:
            image = self.transform(image)

        label = np.array([self.annotations[CLASS].iloc[idx]])
        name = self.annotations["name"].iloc[idx]

        return image, label, name

    def info(self):
        """Return the (filtered, re-indexed) annotations DataFrame."""
        return self.annotations

### **DataLoader**: Dataset to Iterable

Para poder iterar sobre un dataset necesitamos un [dataloader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader).  




#### Split by idxs

In [None]:
def random_split_by_idxs(dataset, split_list):
    """Randomly split ``dataset`` into train/val/test without sharing ids.

    All rows with the same ``id`` value end up in the same subset.

    Args:
        dataset: Object whose ``info()`` returns a DataFrame with an ``id``
            column and whose row index matches the dataset positions.
        split_list: ``[p_train, p_val]`` as fractions of the total number of
            ids; the test split receives the remaining ``1 - p_train - p_val``.

    Returns:
        ``(train, val, test)`` as ``Subset`` objects over ``dataset``.
    """
    p_train, p_val = split_list[0], split_list[1]

    df = dataset.info()
    ids = list(df.groupby("id").size().index)

    rest, test_ids = train_test_split(ids, test_size=1 - p_train - p_val)
    # p_val is a fraction of the whole, but this second split only sees the
    # train+val remainder, so the fraction has to be rescaled.
    train_ids, val_ids = train_test_split(rest, test_size=p_val / (p_train + p_val))

    train_idx = df[df.id.isin(train_ids)]
    val_idx = df[df.id.isin(val_ids)]
    test_idx = df[df.id.isin(test_ids)]

    train = Subset(dataset, indices=list(train_idx.index))
    val = Subset(dataset, indices=list(val_idx.index))
    test = Subset(dataset, indices=list(test_idx.index))

    return train, val, test

In [None]:
# Overrides the DEBUG level assigned in the hyperparameter cell; EarlyStopping
# only prints its counter when DEBUG > 1. (Original note: "test".)
DEBUG  = 0

def proporcional_random_split_by_idxs(dataset, split_list):
    """Split by id, keeping the positive/negative id ratio in every subset.

    An id is "positive" when the sum of its ``CLASS`` column is greater than
    zero and "negative" when the sum is zero. Each group is split separately
    with the same fractions and the results are merged.

    Args:
        dataset: Object whose ``info()`` returns a DataFrame with ``id`` and
            ``CLASS`` columns and whose row index matches dataset positions.
        split_list: ``[p_train, p_val]`` as fractions of the total number of
            ids; the test split receives the remaining ``1 - p_train - p_val``.

    Returns:
        ``(train, val, test)`` as ``Subset`` objects over ``dataset``.
    """
    p_train, p_val = split_list[0], split_list[1]
    test_fraction = 1 - p_train - p_val
    # p_val refers to the whole set; the second split only sees train+val.
    val_fraction = p_val / (p_train + p_val)

    df = dataset.info()
    # Aggregate only the label column instead of summing every column.
    positives_per_id = df.groupby("id")[CLASS].sum()
    positive_ids = list(positives_per_id[positives_per_id > 0].index)
    negative_ids = list(positives_per_id[positives_per_id == 0].index)

    train_ids, val_ids, test_ids = [], [], []
    for group_ids in (positive_ids, negative_ids):
        rest, group_test = train_test_split(group_ids, test_size=test_fraction)
        group_train, group_val = train_test_split(rest, test_size=val_fraction)
        train_ids += group_train
        val_ids += group_val
        test_ids += group_test

    # The id lists are only used for membership tests, so shuffling them
    # (as the previous version did) has no effect on the resulting subsets.
    train_idx = df[df.id.isin(train_ids)]
    val_idx = df[df.id.isin(val_ids)]
    test_idx = df[df.id.isin(test_ids)]

    train = Subset(dataset, indices=list(train_idx.index))
    val = Subset(dataset, indices=list(val_idx.index))
    test = Subset(dataset, indices=list(test_idx.index))

    return train, val, test

In [None]:
def proporcional_random_split_by_idxs_giving_a_test(dataset, val):
    """Use a predefined test set and split the rest into train/val by id.

    Rows whose ``"test_" + CLASS`` column is True form the test subset. The
    remaining rows are grouped by ``id``; positive ids (label sum > 0) and
    negative ids (label sum == 0) are split separately so both subsets keep
    the same proportion.

    Args:
        dataset: Object whose ``info()`` returns a DataFrame with ``id``,
            ``CLASS`` and ``"test_" + CLASS`` columns and whose row index
            matches dataset positions.
        val: Fraction of the non-test ids assigned to validation.

    Returns:
        ``(train, val, test)`` as ``Subset`` objects over ``dataset``.
    """
    df = dataset.info()

    # NOTE(review): the previous code read "test_" + CLASS on one line and
    # "test" on the next; the per-class column is assumed to be the intended
    # one -- confirm against the CSV schema.
    is_test = df["test_" + CLASS].eq(True)
    test_rows = df[is_test]
    remaining = df[~is_test]
    # NOTE(review): an id with both test and non-test rows would appear in
    # test and in train/val; verify the CSV flags whole ids.

    positives_per_id = remaining.groupby("id")[CLASS].sum()
    positive_ids = list(positives_per_id[positives_per_id > 0].index)
    negative_ids = list(positives_per_id[positives_per_id == 0].index)

    # The first returned part has size `val`, the second one `1 - val`.
    positive_val_ids, positive_train_ids = train_test_split(positive_ids, test_size=1 - val)
    negative_val_ids, negative_train_ids = train_test_split(negative_ids, test_size=1 - val)

    train_ids = negative_train_ids + positive_train_ids
    val_ids = negative_val_ids + positive_val_ids

    train_rows = remaining[remaining.id.isin(train_ids)]
    val_rows = remaining[remaining.id.isin(val_ids)]

    train_subset = Subset(dataset, indices=list(train_rows.index))
    val_subset = Subset(dataset, indices=list(val_rows.index))
    test_subset = Subset(dataset, indices=list(test_rows.index))

    return train_subset, val_subset, test_subset

#### Dataloader

In [None]:
def data_loader_fun(root_dataset_dir):
    """Build the train/val/test ``DataLoader`` objects for one dataset folder.

    The dataset is filtered by the globals ``OCT_DEVICE`` and ``MAX_DOUBT``
    and split 60/20/20 according to the strategy named by ``RANDOM``.

    Args:
        root_dataset_dir: Folder that contains the annotations CSV.

    Returns:
        Dict with keys ``'train'``, ``'val'`` and ``'test'`` mapping to
        shuffled ``DataLoader`` objects with batch size ``BATCH``.

    Raises:
        ValueError: If ``RANDOM`` is not one of the supported strategies.
    """
    # NOTE(review): the random flip is part of the shared transform, so it is
    # also applied to validation and test images.
    image_transforms = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(p=0.5),  # patterns may appear in any direction
        transforms.ToTensor(),  # the previous steps work on PIL images, not tensors
    ])

    dataset = OCTDataset(root_dataset_dir, device=OCT_DEVICE, max_doubt=MAX_DOUBT, transform=image_transforms)

    percentage_train = 0.6
    percentage_val = 0.2

    if RANDOM == "total":
        total = len(dataset)
        n_train = math.floor(total * percentage_train)
        n_val = math.floor(total * percentage_val)
        splits = random_split(dataset, [n_train, n_val, total - n_train - n_val])
    elif RANDOM == "by_id":
        splits = random_split_by_idxs(dataset, [percentage_train, percentage_val])
    elif RANDOM == "proporcional_by_id":
        splits = proporcional_random_split_by_idxs(dataset, [percentage_train, percentage_val])
    elif RANDOM == "proporcional_by_id_giving_a_test":
        splits = proporcional_random_split_by_idxs_giving_a_test(dataset, percentage_val)
    else:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError(f"Unknown RANDOM split strategy: {RANDOM!r}")

    train_set, val_set, test_set = splits
    n_train, n_val, n_test = len(train_set), len(val_set), len(test_set)
    total = n_train + n_val + n_test

    print("Total images in dataset: ",total,"\n Train:",n_train," Val:",n_val," Test:", n_test )

    data_generator = {'train': train_set, 'val': val_set, 'test': test_set}

    return {k: DataLoader(data_generator[k], batch_size=BATCH, shuffle=True)
            for k in ['train', 'val', 'test']}



In [None]:
# Smoke run: builds the loaders once so the split sizes are printed; the
# returned dict is not stored, only displayed as the cell output.
data_loader_fun(ROOT_DATASETS+DATASET)

Total images in dataset:  2623 
 Train: 1667  Val: 426  Test: 530


{'test': <torch.utils.data.dataloader.DataLoader at 0x7fe2abb61fd0>,
 'train': <torch.utils.data.dataloader.DataLoader at 0x7fe2abb92190>,
 'val': <torch.utils.data.dataloader.DataLoader at 0x7fe2abb920d0>}

## Train

### EarlyStopping

In [None]:
class EarlyStopping:
    """Stop training when the validation loss stops improving.

    Call the instance once per epoch with the validation loss and the model.
    ``early_stop`` becomes True after ``patience`` consecutive calls without
    an improvement. A snapshot of the weights from the best call is kept in
    ``best_state`` and is what ``save_checkpoint`` writes.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        delta: Minimum change for a call to count as an improvement.
        verbose: Print the counter on non-improving calls. When None, falls
            back to the module-level ``DEBUG`` flag (``DEBUG > 1``) if one
            is defined.
    """

    def __init__(self, patience, delta=0, verbose=None):
        self.patience = patience
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.inf
        self.delta = delta
        self.verbose = verbose
        # Filled on the first call; the constructor must not depend on a
        # global `model` existing.
        self.model = None
        self.best_state = None

    def __call__(self, val_loss, model):
        """Record one validation result and update the stopping state."""
        score = -val_loss
        improved = self.best_score is None or score >= self.best_score + self.delta
        if improved:
            self.best_score = score
            self.val_loss_min = val_loss
            self.model = model
            # `model` keeps training in place, so the best weights have to be
            # copied; a plain reference would always hold the latest weights.
            self.best_state = copy.deepcopy(model.state_dict())
            self.counter = 0
            return

        self.counter += 1
        if self._is_verbose():
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
        if self.counter >= self.patience:
            self.early_stop = True

    def _is_verbose(self):
        """Resolve the verbosity flag, honouring the notebook's DEBUG global."""
        if self.verbose is not None:
            return bool(self.verbose)
        return globals().get("DEBUG", 0) > 1

    def save_checkpoint(self, path):
        """Save the best weights seen so far to ``path``.

        Raises:
            RuntimeError: If the instance was never called with a model.
        """
        if self.best_state is not None:
            state = self.best_state
        elif self.model is not None:
            state = self.model.state_dict()
        else:
            raise RuntimeError("EarlyStopping has not seen any model yet")
        torch.save(state, path)

### Fun. activacion 



[Softmax](https://pytorch.org/docs/master/_modules/torch/nn/modules/activation.html#Softmax): Emprégase para "comprimir" un vector K-dimensional $z$ de valores reais arbitrarios nun vector K-dimensional $\sigma(z)$ de valores reais no rango $[0, 1]$ que suman 1.

$f(s)_{i} = \frac{e^{s_{i}}}{\sum_{j}^{C} e^{s_{j}}}$

Sendo $s_{j}$ a puntuación inferida pola rede para cada clase $j$ en $C$. O resultado de softmax para unha clase depende das puntuacións do resto de clases.

 ![Softmax](https://drive.google.com/uc?id=18p8tB8pn_tl7SgB2CRTWXsx1KQ3MX0M8)

 [**Sigmoid**:](https://pytorch.org/docs/stable/nn.html#torch.nn.Sigmoid) $\frac{1}{1+e^{(-x)}}$

 ![Sigmoid](https://drive.google.com/uc?id=1GnIcpt1T6I17Z8OZ6D7JNy5UBwRsOdvl)




### Loss function






**CrossEntropy**

$CE = -\sum_{i}^{C}t_{i} log (s_{i})$

Onde $t_{i}$ son as labels e $s_{i}$ son as puntuacións para cada clase $i$ en $C$.

Sóese aplicar unha función de activación (Sigmoid / Softmax) antes da CE.

*   Binary cross entropy [(nn.BCELoss)](https://pytorch.org/docs/master/generated/torch.nn.BCELoss.html)
*   Cross entropy en Torch [(nn.CrossEntropyLoss)](https://pytorch.org/docs/master/generated/torch.nn.CrossEntropyLoss.html) usa:
  *   [ nn.LogSoftmax()](https://pytorch.org/docs/master/generated/torch.nn.LogSoftmax.html)

  *   [ nn.NLLLoss()](https://pytorch.org/docs/master/generated/torch.nn.NLLLoss.html)


### Train

In [None]:
########### Load Bar
def print_bar(now, maximum, max_length = 20):
    """Redraw a one-line text progress bar on the current console line.

    Args:
        now: Current step.
        maximum: Total number of steps.
        max_length: Width of the bar in characters.
    """
    fraction = now / maximum
    filled = "█" * round(fraction * max_length)
    pending = "░" * round(max_length * (1 - fraction))
    percentage = str(round(fraction * 100, 2))
    # The leading carriage return makes successive calls overwrite each other.
    print("\r" + filled + pending + " " + percentage + "% ", end='')

In [None]:
def train_model(device, model, data_loaders, criterion, optimizer, epochs = 50):
    """Train ``model`` with early stopping on the validation loss.

    Each epoch runs a training pass over ``data_loaders['train']`` and an
    evaluation pass over ``data_loaders['val']``. Loaders must yield
    ``(inputs, labels, name)`` triples.

    Args:
        device: Torch device the batches are moved to.
        model: Network to optimise (updated in place).
        data_loaders: Dict with ``'train'`` and ``'val'`` loaders.
        criterion: Loss taking ``(outputs, labels)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        epochs: Maximum number of epochs.

    Returns:
        ``(train_loss, val_loss, stop_epoch, acc_train, acc_val, early_stopping)``
        where the lists hold one value per completed epoch, ``stop_epoch`` is
        the number of epochs actually run, and ``early_stopping`` is the
        ``EarlyStopping`` instance (check its ``early_stop`` flag to know
        whether training was cut short).
    """
    early_stopping = EarlyStopping(patience=PATIENCE)
    train_loss, val_loss, acc_train, acc_val = [], [], [], []

    for epoch in range(epochs):
        print_bar(epoch+1, epochs)
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            if is_training:
                model.train()
            else:
                model.eval()
            running_loss, correct = 0.0, 0

            for inputs, labels, _ in data_loaders[phase]:
                inputs = inputs.to(device)
                labels = torch.flatten(labels).long().to(device)
                optimizer.zero_grad()

                # Gradients are only needed during the training phase.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # The loss is a batch mean: weight it by the batch size so the
                # epoch value is a per-sample average.
                running_loss += loss.item() * inputs.size(0)
                correct += torch.sum(preds == labels).item()

            n_samples = len(data_loaders[phase].dataset)
            epoch_loss = running_loss / n_samples
            epoch_acc = correct / n_samples

            if is_training:
                train_loss.append(epoch_loss)
                acc_train.append(epoch_acc)
            else:
                val_loss.append(epoch_loss)
                acc_val.append(epoch_acc)
                early_stopping(epoch_loss, model)
                if early_stopping.early_stop:
                    # Report the epoch where training stopped, not the maximum.
                    return train_loss, val_loss, epoch + 1, acc_train, acc_val, early_stopping

    # Return the stopper here as well so callers can always read .early_stop.
    return train_loss, val_loss, epochs, acc_train, acc_val, early_stopping

## Test 

In [None]:
def test(device, model, optimizer, data_loaders):
    """Evaluate ``model`` on ``data_loaders['test']``.

    Args:
        device: Torch device the batches are moved to.
        model: Trained network; switched to eval mode.
        optimizer: Unused; kept for interface compatibility.
        data_loaders: Dict whose ``'test'`` loader yields
            ``(inputs, labels, names)`` triples.

    Returns:
        ``(y_pred, y_true, score, list_fp, list_fn)`` where ``score`` is the
        softmax probability of the *predicted* class for every sample (not
        the positive-class probability), and ``list_fp`` / ``list_fn`` hold
        the names of the false positives / false negatives.
    """
    model.eval()
    y_true, y_pred, score = [], [], []
    list_fp, list_fn = [], []

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for inputs, labels, names in data_loaders["test"]:
            inputs = inputs.to(device)
            labels = torch.flatten(labels).long().to(device)

            outputs = model(inputs)
            probabilities = torch.softmax(outputs, dim=1)
            # Softmax is monotonic, so the arg-max matches that of the logits.
            confidence, preds = torch.max(probabilities, 1)

            batch_pred = preds.tolist()
            batch_true = labels.tolist()
            score.extend(confidence.tolist())
            y_pred.extend(batch_pred)
            y_true.extend(batch_true)

            for predicted, actual, name in zip(batch_pred, batch_true, names):
                if predicted == 1 and actual == 0:
                    list_fp.append(name)
                elif predicted == 0 and actual == 1:
                    list_fn.append(name)

    return y_pred, y_true, score, list_fp, list_fn

### Metricas



*   **Especificidad**: Fracción de verdaderos negativos. $\frac{VN}{VN+FP}$
*   **Sensibilidade/Recall**: Fracción de verdaderos positivos. $\frac{VP}{VP+FN}$
*   **F1 Score**: Media armónica de precision y recall. Tiene en cuenta tanto los falsos positivos como los falsos negativos: vale 1 cuando precision y recall son perfectos y se acerca a 0 cuando cualquiera de los dos es bajo.  $\frac{2 \times precision \times recall}{precision + recall}$
  

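*   **A área baixo a curva ROC (AUC - Area Under ROC)**: A curva ROC é a representación gráfica da sensibilidade fronte a (1 − especificidade), é dicir, a taxa de verdadeiros positivos fronte á taxa de falsos positivos ao variar o limiar de decisión. AUC é a área baixo esa curva.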



## Graph of test and train

In [None]:
def _plot_history(position, title, series, start_value, y_label, y_limits, early_epoch):
    """Draw one subplot with the per-epoch curves listed in ``series``."""
    plt.subplot(position)
    plt.title(title)
    for label, values in series:
        # Epoch 0 is a synthetic starting point so the curves begin on the axis.
        plt.plot(range(0, len(values) + 1), [start_value] + list(values), label=label)
    if early_epoch is not None:
        plt.axvline(early_epoch, 0, 1, label='Early stopping', color="r")
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.ylim(*y_limits)
    plt.legend()


def graph_train_val(model_name,train_loss,valid_loss, early, acc_train,acc_val, save):
    """Plot loss and accuracy curves for training and validation.

    Args:
        model_name: Title used for both subplots.
        train_loss: Per-epoch training loss.
        valid_loss: Per-epoch validation loss.
        early: Truthy when training was stopped early; a vertical line is
            then drawn ``PATIENCE`` epochs before the last one.
        acc_train: Per-epoch training accuracy.
        acc_val: Per-epoch validation accuracy.
        save: Unused; kept for interface compatibility.
    """
    early_epoch = len(acc_val) - PATIENCE if early else None

    _plot_history(211, model_name,
                  [('Training Loss', train_loss), ('Validation Loss', valid_loss)],
                  1, 'Loss', (0, 1), early_epoch)
    _plot_history(212, model_name,
                  [('Training acc', acc_train), ('Validation acc', acc_val)],
                  0.5, 'Accuracy', (0.5, 1), early_epoch)

    # Keep the lower subplot title from overlapping the upper x-axis label.
    plt.tight_layout()
    plt.show()

In [None]:
def conf_matrix_plot(y_true, y_pred, title):
    """Show the confusion matrix as an annotated heatmap and return it.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        title: Plot title.

    Returns:
        The matrix produced by ``confusion_matrix(y_true, y_pred)``.
    """
    matrix = confusion_matrix(y_true, y_pred)

    axes = plt.axes()
    sn.heatmap(matrix, annot=True, fmt="d", ax=axes)
    axes.set(xlabel="Predicted", ylabel="Real", title=title)
    plt.show()

    return matrix

## Main



### Funciones auxiliares

In [None]:
def select_network(name, pretrained):
    """Instantiate one of the supported torchvision architectures.

    Args:
        name: One of ``'Resnet50'``, ``'Resnet101'``, ``'VGG16'``,
            ``'VGG19'`` or ``'Densenet161'``.
        pretrained: Forwarded to the torchvision constructor.

    Returns:
        The torchvision model.

    Raises:
        ValueError: If ``name`` is not a supported architecture.
    """
    constructors = {
        'Resnet50': 'resnet50',
        'Resnet101': 'resnet101',
        'VGG16': 'vgg16',
        'VGG19': 'vgg19',
        'Densenet161': 'densenet161',
    }
    if name not in constructors:
        # Returning None here would only fail later with an obscure error.
        raise ValueError(
            f"Unknown network {name!r}; expected one of {sorted(constructors)}")
    return getattr(models, constructors[name])(pretrained=pretrained)
    
def select_optimizer(model, opt, lr):
    """Create the optimizer named by ``opt`` for ``model``'s parameters.

    Args:
        model: Network whose ``parameters()`` are optimised.
        opt: Optimizer name; only ``'Adam'`` is supported.
        lr: Learning rate.

    Raises:
        NotImplementedError: For any optimizer other than ``'Adam'``.
    """
    if opt == 'Adam':
        return optim.Adam(model.parameters(), lr=lr)
    # Raising a plain string is itself a TypeError in Python 3.
    raise NotImplementedError(f"Optimizer {opt!r} is not implemented yet")

In [None]:
def set_model(model, string, n_classes = 2):
    """Replace the final classification layer so it outputs ``n_classes``.

    Args:
        model: Network to modify in place.
        string: Architecture name. Names starting with ``'VGG'`` replace the
            last entry of ``model.classifier``, names starting with
            ``'Densenet'`` replace ``model.classifier``, anything else
            replaces ``model.fc``.
        n_classes: Number of output units of the new layer.

    Returns:
        The same ``model`` object.
    """
    if string.startswith('VGG'):
        # Read the input width from the layer being replaced instead of
        # hard-coding 4096, matching the other branches.
        in_features = model.classifier[-1].in_features
        model.classifier[-1] = nn.Linear(in_features, n_classes, bias=True)
    elif string.startswith('Densenet'):
        model.classifier = nn.Linear(model.classifier.in_features, n_classes, bias=True)
    else:
        model.fc = nn.Linear(model.fc.in_features, n_classes, bias=True)

    return model

### Main a ejecutar

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("Chosen device :", device)

# Build the train/val/test loaders for the configured dataset folder.
data_loaders = data_loader_fun(ROOT_DATASETS+DATASET)

# Instantiate the configured architecture (second argument is `pretrained`)
# and swap its classification head for one with set_model's default of
# two outputs.
model = select_network(MODEL_STR, True)
model = set_model(model, MODEL_STR)
model.to(device)

# Every parameter stays trainable: the whole network is fine-tuned.
for param in model.parameters():
    param.requires_grad = True
    
criterion = nn.CrossEntropyLoss()
optimizer = select_optimizer(model, opt = OPTIM, lr = LR)

Chosen device : cuda:0
Total images in dataset:  2623 
 Train: 1685  Val: 400  Test: 538


In [None]:
###### Train ######
# Runs the full training loop (at most EPOCHS epochs) and unpacks the
# per-epoch histories plus the value train_model returns for early stopping.
result_train = train_model(device = device, model = model, 
                           data_loaders = data_loaders, criterion = criterion, 
                           optimizer= optimizer, epochs = EPOCHS)

train_loss, valid_loss, stop_epoch, acc_train,acc_val, early_stopping = result_train


████░░░░░░░░░░░░░░░░ 21.5% 

In [None]:
# `early_stopping` can be None when training ran through every epoch, so guard
# the attribute access instead of assuming an EarlyStopping instance.
early = bool(early_stopping is not None and early_stopping.early_stop)
title = CLASS +" " + MODEL_STR + '- LR =' + str(LR)+' - '+OPTIM
graph_train_val(title,train_loss,valid_loss, early, acc_train,acc_val, True)

In [None]:
###### Test ######
# Evaluate on the test loader and summarise the scores returned by test()
# (the softmax probability of the predicted class for each sample).
result_test = test(device, model, optimizer, data_loaders)
y_pred, y_true, y_score, list_fp, list_fn  = result_test
print("----- Probability of the labels")
print("Mean:", np.around(np.mean(y_score), decimals = 3))
print("Standard Deviation:", np.around(np.std(y_score), decimals = 3))

In [None]:
# Per-class precision/recall/F1 as a DataFrame (columns: Neg, Pos, accuracy, ...).
dic_metr = classification_report(y_true, y_pred,
                                 target_names=['Neg', 'Pos'], output_dict=True)

metricas = pd.DataFrame(dic_metr)

auc = None
if len(np.unique(y_true)) > 1:  # AUC is undefined when only one class is present
    # y_score is the probability of the *predicted* class. roc_auc_score needs
    # the probability of the positive class, which for this two-output softmax
    # is the score itself when the prediction is 1 and its complement otherwise.
    positive_score = [s if p == 1 else 1 - s for s, p in zip(y_score, y_pred)]
    auc = roc_auc_score(y_true, positive_score)

result = conf_matrix_plot(y_true, y_pred, title)

# Rows are the real class, columns the predicted class.
tn, fp = result[0][0], result[0][1]
fn, tp = result[1][0], result[1][1]

tn, fp, fn, tp

In [None]:

# Sensitivity = TP / (TP + FN); specificity = TN / (TN + FP), computed from
# the confusion-matrix counts unpacked in the metrics cell.
print("Sensibilidad: ", str(tp/(tp+fn)))
print("Especificidade: ", str(tn/(tn+fp))) 

### Save results, test and train

Dependencies: 


*   Train cells
*   Test + Metric cells



In [None]:
# CSV recorded next to every false positive / false negative name.
root_dataset_info = ROOT_DATASETS+DATASET+'/info.csv'
# Accumulated results table for this class / dataset / architecture.
root = ROOT_DATA+'results/'+CLASS+'/'+DATASET+"/"+MODEL_STR+"/results.csv"

# Load the existing table, or start an empty one on the first run. `id` is
# the row number this run will get and is reused as the file name of the
# per-run CSVs and the saved model.
# NOTE(review): `id` shadows the Python builtin; it is read by later cells,
# so renaming it has to be done across the whole notebook.
try:
  df = pd.read_csv(root)
  id = len(df)

except FileNotFoundError:
  df = pd.DataFrame([], columns = ['model', 'lr', 'optim','class', 'EarlyStopping',
         'recall','specificity',  'auc', 'accuracy','fn','fp','tn','tp']) 
  id = 0

# Output folders for the per-run false-positive, false-negative and
# training-history CSVs.
model_root = ROOT_DATA+'results/'+CLASS+'/'+DATASET+"/"+MODEL_STR+"/"
root_fp = model_root +"fp/"
root_fn = model_root+"fn/"
root_train = model_root+"train/"

In [None]:
########## Checkpoint
save = ROOT_DATA+'results/'+CLASS+'/'+DATASET+"/"+MODEL_STR+"/"
if os.path.isdir(save):
    print("ya existia esa carpeta:", save)

# Create each folder independently and with its parents: with a shared
# try/except, an already existing fp/ folder used to skip fn/ and train/.
for folder in (save, root_fp, root_fn, root_train):
    os.makedirs(folder, exist_ok=True)

In [None]:
####### Warning: the result folders must exist before running this cell

########## MODEL RESULTS
row = {'model': MODEL_STR,
       'EarlyStopping': str(stop_epoch)+"/"+str(EPOCHS),
       # The 'accuracy' column repeats one value on every row; take the first.
       'accuracy': metricas['accuracy'].iloc[0],
       'auc': round(auc, 3) if auc is not None else np.nan,
       'class': CLASS,
       'specificity': metricas['Neg']['recall'],  # binary case: recall of the negative class
       'recall': metricas['Pos']['recall'],
       'lr': LR, 'optim': OPTIM,
       'batch_size': BATCH,
       'fn': fn, 'fp': fp,
       'tn': tn, 'tp': tp,
       "doubt_percentage": MAX_DOUBT,
}

# DataFrame.append was removed in pandas 2.0; concat is the replacement.
df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
df.to_csv(root, index=False)

########## TRAIN RESULTS
df_train = pd.DataFrame({
    'epoch': list(range(1, len(acc_train) + 1)),
    'acc_train': acc_train,
    'acc_val': acc_val,
    'train_loss': train_loss,
    'val_loss': valid_loss,
})
df_train.to_csv(root_train+str(id)+".csv", index=False)

########### FP
# Separate names keep the fp / fn counts intact, so re-running this cell
# still writes numbers into the results row.
df_fp = pd.DataFrame({'fp': list_fp,
                      'dataset': [root_dataset_info] * len(list_fp)})
df_fp.to_csv(root_fp+str(id)+".csv", index=False)

########## FN
df_fn = pd.DataFrame({'fn': list_fn,
                      'dataset': [root_dataset_info] * len(list_fn)})
df_fn.to_csv(root_fn+str(id)+".csv", index=False)

print(id)



In [None]:
########## Checkpoint
save = ROOT+'models/'+CLASS+'/'+DATASET+"/"+MODEL_STR+"/"
if os.path.isdir(save):
    print("ya existia esa carpeta:", save)
else:
    # makedirs also creates missing parents (models/<class>/<dataset>/),
    # which a plain mkdir cannot do on a first run.
    os.makedirs(save)

if early:
    early_stopping.save_checkpoint(save+"checkpoint.pt")
else:
    torch.save(model.state_dict(), save+str(id)+".pt")

    # NOTE(review): "Done" is only printed on this branch -- confirm intended.
    print("Done")