In [1]:
import os 
import torch 
import torch.nn as nn 

import lightning as pl 
from lightning.pytorch.callbacks import (
    ModelSummary,
    ModelCheckpoint,
    LearningRateMonitor,
    LearningRateFinder
)
from lightning.pytorch.tuner import Tuner


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from src.yolov3 import YOLOV3
from src.loss import YOLOLoss
from src.data import PASCALDataModule
from src.config import (
    DEVICE,
    PASCAL_CLASSES,
    NUM_CLASSES,
    NUM_EPOCHS,
    NUM_WORKERS,
    LEARNING_RATE,
    WEIGHT_DECAY,
    SAVE_MODEL,
    LOAD_MODEL,
    IMAGE_DIR,
    IMAGE_SIZE,
    PIN_MEMORY,
    DATASET,
    BATCH_SIZE,
    LABEL_DIR,
    SHUFFLE
    )

In [3]:
batch_size:int = BATCH_SIZE
shuffle:bool   = SHUFFLE
num_workers:int= NUM_WORKERS
learing_rate:float = LEARNING_RATE
epochs:int     = 3
num_classes:int = NUM_CLASSES
num_devices    = os.cpu_count()-1

In [4]:
datamodule = PASCALDataModule(
    train_csv_path= str(os.path.join('..','..','data','PASCAL_VOC','train.csv')),
    test_csv_path =  str(os.path.join('..','..','data','PASCAL_VOC','test.csv')),
)

In [5]:
datamodule.setup()

In [6]:
maxlr = LEARNING_RATE
scheduler_steps = len(datamodule.train_dataloader())

In [7]:
callbacks = [
    ModelSummary(max_depth=10),
    ModelCheckpoint(
        dirpath="model_checkpoints",
        filename="yolov3_{epoch}",
        monitor="train_loss",
        mode="min",
        save_last=True,
    ),
]


In [8]:
model = YOLOV3(in_channel=3,
    num_classes=num_classes,
    epochs=6,
    loss_fn=YOLOLoss,
    data_module=datamodule,
    learning_rate=LEARNING_RATE,
    maxlr=maxlr,
    scheduler_steps=scheduler_steps,
    device_count=num_devices,
)

In [9]:
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[0],
    num_nodes=1,
    max_epochs=6,
    callbacks=callbacks,
    precision="16-mixed",
    check_val_every_n_epoch=10,
    num_sanity_val_steps=0,
    enable_checkpointing=True,
)

Using 16bit Automatic Mixed Precision (AMP)
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
c:\Users\muthu\miniconda3\envs\venv\lib\site-packages\lightning\pytorch\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [10]:
# tuner = Tuner(trainer)

# lr_finder = tuner.lr_find(
#     model=model, min_lr=1e-5, train_dataloaders=datamodule.train_dataloader()
# )
# maxlr = lr_finder.suggestion()
# fig = lr_finder.plot(suggest=True)

In [11]:
model.learning_rate = 0.0007079457843841378
model.maxlr = 0.0007079457843841378

trainer.fit(model=model, datamodule=datamodule)

You are using a CUDA device ('NVIDIA GeForce RTX 4050 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

    | Name                      | Type            | Params
----------------------------------------------------------------
0   | layers                    | ModuleList      | 61.6 M
1   | layers.0                  | CNNBlock        | 928   
2   | layers.0.conv             | Conv2d          | 864   
3   | layers.0.bn               | BatchNorm2d     | 64    
4   | layers.0.relu             | LeakyReLU       | 0     
5   | layers.1                  | CNNBlock        | 18.6 K
6   | layers.1.conv             | Conv2d          | 18.4 K
7   | layers.1.bn               | BatchNorm2d

Epoch 0:   0%|          | 0/65 [00:00<?, ?it/s] 

RuntimeError: Caught RuntimeError in pin memory thread for device 0.
Original Traceback (most recent call last):
  File "c:\Users\muthu\miniconda3\envs\venv\lib\site-packages\torch\utils\data\_utils\pin_memory.py", line 36, in do_one_step
    data = pin_memory(data, device)
  File "c:\Users\muthu\miniconda3\envs\venv\lib\site-packages\torch\utils\data\_utils\pin_memory.py", line 72, in pin_memory
    return type(data)([pin_memory(sample, device) for sample in data])  # type: ignore[call-arg]
  File "c:\Users\muthu\miniconda3\envs\venv\lib\site-packages\torch\utils\data\_utils\pin_memory.py", line 72, in <listcomp>
    return type(data)([pin_memory(sample, device) for sample in data])  # type: ignore[call-arg]
  File "c:\Users\muthu\miniconda3\envs\venv\lib\site-packages\torch\utils\data\_utils\pin_memory.py", line 72, in pin_memory
    return type(data)([pin_memory(sample, device) for sample in data])  # type: ignore[call-arg]
  File "c:\Users\muthu\miniconda3\envs\venv\lib\site-packages\torch\utils\data\_utils\pin_memory.py", line 72, in <listcomp>
    return type(data)([pin_memory(sample, device) for sample in data])  # type: ignore[call-arg]
  File "c:\Users\muthu\miniconda3\envs\venv\lib\site-packages\torch\utils\data\_utils\pin_memory.py", line 57, in pin_memory
    return data.pin_memory(device)
RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



In [None]:
os.path.isfile("../../../data/PASCAL_VOC//labels/2009_001854.txt")
os.path.isfile("../../data/PASCAL_VOC//labels/2009_001854.txt")

True