In [26]:
%reload_ext autoreload
%autoreload 2


from ship_detector.scripts.train_vit_efficient import (create_efficient_data_loaders,
                                                       EfficientViTClassifier,
                                                       MemoryMonitor)

import os
import yaml
import timm
import torch
import torch.nn as nn
from pathlib import Path
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor, EarlyStopping

In [3]:
# Locations used by this run. All three are relative paths, so they resolve
# against the kernel's current working directory.
output_dir = "outputs/efficient"  # created a few cells below if missing
manifest_path = "data/airbus-ship-detection/train_ship_segmentations_v2.csv"
config_path = "configs/vit_efficient.yaml"

In [4]:
# Load the run configuration. YAML is UTF-8 by specification, so the encoding
# is pinned instead of being left to the platform's locale default.
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# safe_load returns None for an empty document (and a list/scalar for
# non-mapping YAML). Fail here with a clear message rather than with an
# AttributeError on the `.get` call below.
if not isinstance(config, dict):
    raise ValueError(
        f"Expected a YAML mapping in {config_path!r}, got {type(config).__name__}"
    )

# Seed Python, NumPy and torch RNGs (default seed 42 when the config has none).
# workers=True also asks Lightning to seed DataLoader worker processes, so
# random augmentation is reproducible and not duplicated across workers.
pl.seed_everything(config.get('seed', 42), workers=True)

# Make sure the output directory exists; a no-op when it is already there.
Path(output_dir).mkdir(parents=True, exist_ok=True)

Seed set to 42


In [5]:
# Global PyTorch backend switches for this kernel session.
# TF32 trades a little float32 matmul precision for speed on GPUs that support it.
torch.backends.cuda.matmul.allow_tf32 = True
# Let cuDNN benchmark candidate algorithms and cache the fastest one; this pays
# off when input shapes stay constant between iterations.
torch.backends.cudnn.benchmark = True

In [6]:
# Snapshot current memory usage before any data or model is loaded.
memory_monitor = MemoryMonitor()
for key, value in memory_monitor.get_memory_usage().items():
    # Not every metric is a size: keys ending in "_percent" (e.g. ram_percent)
    # are percentages and were previously mislabelled as GB.
    unit = "%" if key.endswith("_percent") else "GB"
    print(f"{key}: {value:.2f} {unit}")

ram_used_gb: 16.11 GB
ram_available_gb: 18.05 GB
ram_percent: 47.20 GB
process_ram_gb: 0.76 GB
gpu_allocated_gb: 0.00 GB
gpu_reserved_gb: 0.00 GB
gpu_free_gb: 17.10 GB


In [7]:
# Build the training and validation loaders from the manifest CSV. The shared
# memory monitor is handed to the factory along with the loaded config.
loader_kwargs = {
    "manifest_path": manifest_path,
    "config": config,
    "memory_monitor": memory_monitor,
}
train_loader, val_loader = create_efficient_data_loaders(**loader_kwargs)

Training samples: 185378 (Ships: 65378)
Validation samples: 46345 (Ships: 16345)


In [33]:
# Instantiate the classifier from the loaded config.
# NOTE(review): this cell's execution count jumps well past the preceding
# cells, so it was run out of order -- confirm it works on a fresh
# Restart & Run All before relying on the recorded output.
model = EfficientViTClassifier(config)

LoRA Setup Complete:
  Trainable parameters: 2,359,296
  Total parameters: 88,157,952
  Trainable ratio: 0.0268


In [34]:
# Keep the two checkpoints with the highest validation accuracy, and also
# save the most recent one (save_last=True).
checkpoint_callback = ModelCheckpoint(
    dirpath=os.path.join(output_dir, 'lora/checkpoints'),
    filename='vit-{epoch:02d}-{val_acc:.3f}',
    monitor='val_acc',
    mode='max',
    save_top_k=2,
    save_last=True,
)

# Stop training once validation loss has not improved for the configured
# number of validation checks. Note this watches val_loss, while
# checkpointing above ranks by val_acc.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=config['training']['early_stopping_patience'],
)

# Log the learning rate once per epoch.
lr_monitor_callback = LearningRateMonitor(logging_interval='epoch')

callbacks = [checkpoint_callback, early_stopping_callback, lr_monitor_callback]

In [35]:
# All trainer knobs live under the 'training' section of the config; only
# max_epochs is mandatory, the rest fall back to the defaults given here.
training_cfg = config['training']

# Train on a single GPU when CUDA is available, otherwise on CPU.
accelerator_name = 'gpu' if torch.cuda.is_available() else 'cpu'

trainer = pl.Trainer(
    accelerator=accelerator_name,
    devices=1,
    max_epochs=training_cfg['max_epochs'],
    precision=training_cfg.get('precision', 16),
    accumulate_grad_batches=training_cfg.get('accumulate_grad_batches', 1),
    gradient_clip_val=training_cfg.get('gradient_clip_val', 1.0),
    val_check_interval=training_cfg.get('val_check_interval', 1.0),
    limit_train_batches=training_cfg.get('limit_train_batches', 1.0),
    limit_val_batches=training_cfg.get('limit_val_batches', 1.0),
    log_every_n_steps=10,
    callbacks=callbacks,
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..
`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


In [None]:
# Run the training loop, validating on val_loader at the interval configured
# on the trainer. This is the long-running cell of the notebook.
trainer.fit(
    model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type              | Params | Mode 
---------------------------------------------------------
0 | backbone   | VisionTransformer | 88.2 M | train
1 | classifier | Sequential        | 769    | train
2 | criterion  | BCEWithLogitsLoss | 0      | train
---------------------------------------------------------
2.4 M     Trainable params
85.8 M    Non-trainable params
88.2 M    Total params
352.635   Total estimated model params size (MB)
328       Modules in train mode
0         Modules in eval mode


Epoch 0:  42%|████▏     | 2411/5794 [06:00<08:26,  6.69it/s, v_num=1]      