In [1]:
%load_ext autoreload
%autoreload 2

import utils
import pandas as pd
import config
from PIL import Image
import json
from patch_classifier import PatchClassifier
import lightning as L
from dataloader import CustomDataModule
import config
import utils
from pytorch_lightning.loggers import TensorBoardLogger
from torchvision.models import resnet50, ResNet50_Weights
import torch
from torch.utils.data import ConcatDataset, DataLoader

In [2]:
# CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# HPARAMS
batch_size = 32
learning_rate= 0.001

# TRAINER
tensorboard_logger = TensorBoardLogger("lightning_logs", name="performance_final")
trainer = L.Trainer(
    max_epochs=8,
    enable_progress_bar=True,
    accelerator="gpu" if device == "cuda" else "cpu",
    logger=tensorboard_logger
)

# DATAMODULES
data_module = CustomDataModule(csv_file=config.EXPORT3, num_workers=4, batch_size=batch_size, mode='patch')
data_module.setup()

train_loader = data_module.train_dataloader()   
val_loader = data_module.val_dataloader()

# MODEL DEFINITION
resnet = resnet50(weights=ResNet50_Weights.DEFAULT)
model = PatchClassifier(feature_extractor=resnet, learning_rate=learning_rate)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Using device: cuda


In [3]:
def create_and_train_model(batch_size, learning_rate):

    data_module = CustomDataModule(csv_file=config.EXPORT3, num_workers=4, batch_size=batch_size, mode='patch')
    data_module.setup()

    train_loader = data_module.train_dataloader()   
    val_loader = data_module.val_dataloader()

    model = PatchClassifier(feature_extractor=resnet, learning_rate=learning_rate)

    trainer = L.Trainer(
        max_epochs=8,
        enable_progress_bar=True,
        accelerator="gpu" if device == "cuda" else "cpu"
    )

    trainer.fit(model, train_loader, val_loader)

    metrics = trainer.callback_metrics 

    val_loss = metrics.get('val_loss', None)    
    val_accuracy = metrics.get('val_accuracy', None) 
    val_fbeta = metrics.get('val_fbeta', None)
    val_precision = metrics.get('val_precision', None)
    val_recall = metrics.get('val_recall', None)

    return {
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "val_loss": val_loss,
        "val_accuracy": val_accuracy,
        "val_fbeta": val_fbeta,
        "val_precision": val_precision,
        "val_recall": val_recall
    }


batch_sizes = [32, 64]
learning_rates = [0.01, 0.005, 0.001] 
best_fbeta = 0.0
best_parameters = None

for batch_size in batch_sizes:
    for learning_rate in learning_rates:

        grid_result = create_and_train_model(batch_size=batch_size, learning_rate=learning_rate)
        fbeta = grid_result['val_fbeta']

        # compare with the best validation f-beta
        if fbeta > best_fbeta:
            best_fbeta = fbeta
            best_parameters = grid_result
            print(f"New best model saved with F-beta: {best_fbeta} at batch size {batch_size} and learning rate {learning_rate}")

Trainer will use only 1 of 4 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=4)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


You are using a CUDA device ('NVIDIA L40S') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name              | Type             | Params | Mode 
---------------------------------------------------------------
0 | feature_extractor | ResNet           | 25.6 M | eval 
1 | classifier        | Sequential       | 611 K  | train
2 | fb                | BinaryFBetaScore | 0      | train
3 | recall            | BinaryRecall     | 0      | train
4 | precision         | BinaryPrecision  | 0      | train
5 | roc               | BinaryROC        | 0      | train
6 | accuracy          | BinaryAccuracy   | 0      | train
---------------------------------------------------------------
611 K  

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=8` reached.
Trainer will use only 1 of 4 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=4)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name              | Type             | Params | Mode 
---------------------------------------------------------------
0 | feature_extractor | ResNet           | 25.6 M | eval 
1 | classifier        | Sequential       | 611 K  | train
2 | fb                | BinaryFBetaScore | 0      | train
3 | recall            | BinaryRecall     | 0      | train
4 | precision         | BinaryPrecision  | 0      | train
5 | roc               | BinaryROC        | 0      | train
6 | accuracy          | BinaryAccuracy

New best model saved with F-beta: 0.9470765590667725 at batch size 32 and learning rate 0.01


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=8` reached.
Trainer will use only 1 of 4 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=4)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name              | Type             | Params | Mode 
---------------------------------------------------------------
0 | feature_extractor | ResNet           | 25.6 M | eval 
1 | classifier        | Sequential       | 611 K  | train
2 | fb                | BinaryFBetaScore | 0      | train
3 | recall            | BinaryRecall     | 0      | train
4 | precision         | BinaryPrecision  | 0      | train
5 | roc               | BinaryROC        | 0      | train
6 | accuracy          | BinaryAccuracy

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=8` reached.
Trainer will use only 1 of 4 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=4)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name              | Type             | Params | Mode 
---------------------------------------------------------------
0 | feature_extractor | ResNet           | 25.6 M | eval 
1 | classifier        | Sequential       | 611 K  | train
2 | fb                | BinaryFBetaScore | 0      | train
3 | recall            | BinaryRecall     | 0      | train
4 | precision         | BinaryPrecision  | 0      | train
5 | roc               | BinaryROC        | 0      | train
6 | accuracy          | BinaryAccuracy

New best model saved with F-beta: 0.9605153203010559 at batch size 32 and learning rate 0.001


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=8` reached.
Trainer will use only 1 of 4 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=4)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name              | Type             | Params | Mode 
---------------------------------------------------------------
0 | feature_extractor | ResNet           | 25.6 M | eval 
1 | classifier        | Sequential       | 611 K  | train
2 | fb                | BinaryFBetaScore | 0      | train
3 | recall            | BinaryRecall     | 0      | train
4 | precision         | BinaryPrecision  | 0      | train
5 | roc               | BinaryROC        | 0      | train
6 | accuracy          | BinaryAccuracy

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=8` reached.
Trainer will use only 1 of 4 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=4)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name              | Type             | Params | Mode 
---------------------------------------------------------------
0 | feature_extractor | ResNet           | 25.6 M | eval 
1 | classifier        | Sequential       | 611 K  | train
2 | fb                | BinaryFBetaScore | 0      | train
3 | recall            | BinaryRecall     | 0      | train
4 | precision         | BinaryPrecision  | 0      | train
5 | roc               | BinaryROC        | 0      | train
6 | accuracy          | BinaryAccuracy

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=8` reached.


In [4]:
best_parameters

{'batch_size': 32,
 'learning_rate': 0.001,
 'val_loss': tensor(0.1567),
 'val_accuracy': None,
 'val_fbeta': tensor(0.9605),
 'val_precision': tensor(0.9593),
 'val_recall': tensor(0.9682)}

RETRAINING

In [5]:
trainer = L.Trainer(
    max_epochs=8,
    enable_progress_bar=True,
    accelerator="gpu" if device == "cuda" else "cpu",
    logger=tensorboard_logger
)

model = PatchClassifier(feature_extractor=resnet, learning_rate=best_parameters['learning_rate'])

combined_dataset = ConcatDataset([train_loader.dataset, val_loader.dataset])
combined_loader = DataLoader(combined_dataset, batch_size=best_parameters['batch_size'], shuffle=True, num_workers=4)

trainer.fit(model, combined_loader)

best_model_path = "models/final_classifier.pth"
trainer.save_checkpoint(best_model_path)


Trainer will use only 1 of 4 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=4)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name              | Type             | Params | Mode 
---------------------------------------------------------------
0 | feature_extractor | ResNet           | 25.6 M | eval 
1 | classifier        | Sequential       | 611 K  | train
2 | fb                | BinaryFBetaScore | 0      | train
3 | recall            | BinaryRecall     | 0      | tr

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=8` reached.


HOLD-OUT

In [3]:
trainer = L.Trainer(
    max_epochs=8,
    enable_progress_bar=True,
    accelerator="gpu" if device == "cuda" else "cpu",
    logger=tensorboard_logger
)

model = PatchClassifier(feature_extractor=resnet, learning_rate=0.001)

trainer.fit(model, train_loader)

trainer.validate(val_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type                  | Params | Mode 
--------------------------------------------------------------------
0 | feature_extractor | ResNet                | 25.6 M | eval 
1 | confusion_matrix  | BinaryConfusionMatrix | 0      | train
2 | classifier        | Sequential            | 611 K  | train
3 | fb                | BinaryFBetaScore      | 0      | train
4 | recall            | BinaryRecall          | 0      | train
5 | precision         | BinaryPrecision       | 0      | train
6 | roc               | BinaryROC             | 0      | train
7 | accuracy          | BinaryAccuracy        | 0      | train
--------------------------------------------------------------------
611 K     Trainable params
25.6 M    Non-traina

Training: |          | 0/? [00:00<?, ?it/s]