In [1]:
%reload_ext autoreload
%autoreload 2
from ship_detector.scripts.train_unet_rle import UNetShipSegmentation, create_data_loaders
from ship_detector.scripts.utils import load_config

import os

import torch
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run settings for the whole notebook; the cells below only read these names.
output_dir = "outputs/unet_rle"  # root folder for checkpoints and TensorBoard logs
config_path = "configs/unet_rle.yaml"  # experiment configuration handed to load_config
manifest_path = "data/airbus-ship-detection/train_ship_segmentations_v2.csv"  # CSV read by pandas and by create_data_loaders
verify_data = True  # forwarded to create_data_loaders as verify_first_batch

In [3]:
# Load the experiment configuration from config_path. Later cells index
# config['training'] for max_epochs, early_stopping_patience and, optionally,
# precision, and hand the whole object to the data loaders and the model.
config = load_config(config_path)

In [4]:
# Seed the Python, NumPy and PyTorch RNGs in one call so reruns are comparable.
# workers=True additionally lets Lightning install a worker_init_fn on the
# DataLoaders passed to the Trainer, so randomness inside worker processes is
# reproducible too (it has no effect on loaders that already define their own).
SEED = 42
pl.seed_everything(SEED, workers=True)

Seed set to 42


42

In [5]:
# Make sure the output folder exists before anything writes into it.
# parents=True creates missing intermediate folders; exist_ok=True makes
# re-running this cell a no-op instead of an error.
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [6]:
# Read the segmentation manifest into a DataFrame for inspection in this notebook.
# NOTE(review): create_data_loaders is called with manifest_path, not with this
# frame, so columns added to df here do not reach the loaders as far as this
# notebook shows — confirm whether df is still needed.
df = pd.read_csv(manifest_path)

In [7]:
# Binary flag per manifest row: 1 where EncodedPixels holds a value,
# 0 where the cell is null.
df['has_ship'] = (~df['EncodedPixels'].isna()).astype(int)

In [8]:
# Build the on-disk image path for every manifest row.
# The image folder is derived from the manifest's own directory instead of being
# repeated as a literal, so relocating the dataset only means editing manifest_path.
# as_posix() keeps forward slashes on every OS, matching the original strings.
image_dir = (Path(manifest_path).parent / 'train_v2').as_posix()
# Vectorised concatenation avoids one Python lambda call per manifest row;
# astype(str) mirrors the string conversion the f-string applied to each ImageId.
df['patch_path'] = image_dir + '/' + df['ImageId'].astype(str)

In [9]:
# Build the training and validation loaders directly from the manifest CSV.
# NOTE(review): the captured log for this cell reports an all-zero first-batch
# mask ("Mask range: [0.00, 0.00]") although the manifest was filtered to ship
# patches — confirm mask decoding before committing to a long training run.
train_loader, val_loader = create_data_loaders(
    manifest_path=manifest_path,
    config=config,
    verify_first_batch=verify_data,
)

INFO:ship_detector.scripts.train_unet_rle:Loaded manifest with 231723 entries
INFO:ship_detector.scripts.train_unet_rle:Filtered to 81723 ship patches
INFO:ship_detector.scripts.train_unet_rle:Training samples: 65378
INFO:ship_detector.scripts.train_unet_rle:Validation samples: 16345
INFO:ship_detector.scripts.train_unet_rle:Dataset initialized with 65378 patches
INFO:ship_detector.scripts.train_unet_rle:Dataset initialized with 16345 patches
INFO:ship_detector.scripts.train_unet_rle:Verifying first batch...
INFO:ship_detector.scripts.train_unet_rle:✓ First batch loaded successfully
INFO:ship_detector.scripts.train_unet_rle:  Images shape: torch.Size([16, 3, 768, 768])
INFO:ship_detector.scripts.train_unet_rle:  Masks shape: torch.Size([16, 1, 768, 768])
INFO:ship_detector.scripts.train_unet_rle:  Image range: [-2.12, 2.64]
INFO:ship_detector.scripts.train_unet_rle:  Mask range: [0.00, 0.00]
INFO:ship_detector.scripts.train_unet_rle:  Mask unique values: [0.0]...


In [10]:
# Instantiate the segmentation module from the loaded config; this is the
# object later handed to trainer.fit.
model = UNetShipSegmentation(config)

In [11]:
# Keep the three best checkpoints ranked by validation IoU, plus the latest one.
checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join(output_dir, 'checkpoints'),
    filename='unet-{epoch:02d}-{val_iou:.3f}',
    monitor='val_iou',
    mode='max',
    save_top_k=3,
    save_last=True,
)

# Stop once validation loss has gone without improvement for the configured patience.
# NOTE(review): stopping tracks val_loss while checkpointing tracks val_iou —
# confirm the two criteria are meant to differ.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=config['training']['early_stopping_patience'],
    mode='min',
)

# Log the learning rate once per epoch.
lr_monitor_callback = LearningRateMonitor(logging_interval='epoch')

callbacks = [checkpoint_callback, early_stopping_callback, lr_monitor_callback]

In [12]:
# TensorBoard logger rooted at output_dir, with runs grouped under 'unet_logs'.
logger_tb = TensorBoardLogger(save_dir=output_dir, name='unet_logs')

In [13]:
# Use the GPU backend when CUDA is visible, otherwise fall back to CPU.
trainer_accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
# Numeric precision comes from the config; 32 is the fallback when it is unset.
trainer_precision = config['training'].get('precision', 32)

# Single-device trainer wired to the callbacks and TensorBoard logger built above.
trainer = pl.Trainer(
    max_epochs=config['training']['max_epochs'],
    accelerator=trainer_accelerator,
    devices=1,
    callbacks=callbacks,
    logger=logger_tb,
    log_every_n_steps=10,
    precision=trainer_precision,
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
# Launch training. The captured log shows a short validation sanity check
# running before epoch 0 starts.
# NOTE(review): the run log suggests calling
# torch.set_float32_matmul_precision('medium' | 'high') on Tensor Core GPUs —
# consider it before this cell if the precision trade-off is acceptable.
trainer.fit(model, train_loader, val_loader)

You are using a CUDA device ('NVIDIA GeForce RTX 5060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type        | Params | Mode 
--------------------------------------------------
0 | model     | Unet        | 20.2 M | train
1 | criterion | DiceBCELoss | 0      | train
--------------------------------------------------
20.2 M    Trainable params
0         Non-trainable params
20.2 M    Total params
80.903    Total estimated model params size (MB)
565       Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:ship_detector.scripts.train_unet_rle:Saved predictions to predictions_epoch_0.png


Epoch 0:   0%|          | 0/4086 [00:00<?, ?it/s]                          

INFO:ship_detector.scripts.train_unet_rle:First batch - Images: torch.Size([16, 3, 768, 768]), Masks: torch.Size([16, 1, 768, 768])
