In [20]:
%reload_ext autoreload
%autoreload 2


from ship_detector.scripts.train_vit_efficient import (create_efficient_data_loaders,
                                                       EfficientViTClassifier,
                                                       MemoryMonitor)

import os
import yaml
import  torch
from pathlib import Path
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor, EarlyStopping

In [2]:
# Input/output locations; all are relative paths, so they resolve against the
# kernel's current working directory.
# NOTE(review): the manifest lives under "airbus-ship-detection", while the
# loaded config's data.train / data.val entries say "airbus-ship-detector" —
# confirm which directory name is the right one.
config_path, manifest_path, output_dir = (
    "configs/vit_efficient.yaml",
    "data/airbus-ship-detection/train_ship_segmentations_v2.csv",
    "outputs/efficient",
)

In [12]:
# Parse the YAML experiment configuration. The encoding is pinned so the file
# is decoded the same way regardless of the platform's locale default.
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# safe_load returns None for an empty document and a list/scalar for a
# non-mapping one; fail here with a clear message rather than with an
# AttributeError/TypeError on the first config lookup.
if not isinstance(config, dict):
    raise ValueError(
        f"Expected a mapping at the top level of {config_path!r}, "
        f"got {type(config).__name__}"
    )

# Seed the RNGs through Lightning; 42 is used when the config has no
# top-level 'seed' key.
# NOTE(review): the config shown in this notebook has data.random_seed —
# confirm whether a separate top-level 'seed' entry is intended.
pl.seed_everything(config.get('seed', 42))

# Make sure the output directory exists before later cells write into it.
Path(output_dir).mkdir(parents=True, exist_ok=True)

Seed set to 42


In [13]:
# Echo the parsed configuration dict as the cell output for inspection.
config

{'model': {'name': 'vit_base_patch16_224',
  'pretrained': True,
  'use_lora': True,
  'lora_rank': 16,
  'lora_alpha': 16.0,
  'lora_target_modules': ['qkv', 'proj', 'fc1', 'fc2'],
  'gradient_checkpointing': False,
  'freeze_backbone_epochs': 0},
 'training': {'batch_size': 32,
  'max_epochs': 30,
  'early_stopping_patience': 5,
  'pos_weight': 3.0,
  'accumulate_grad_batches': 4,
  'gradient_clip_val': 1.0,
  'precision': 16,
  'limit_train_batches': 1.0,
  'limit_val_batches': 1.0,
  'val_check_interval': 1.0},
 'optimizer': {'name': 'adamw', 'lr': 0.0005, 'weight_decay': 0.01},
 'scheduler': {'name': 'cosine', 'T_max': 30, 'eta_min': 1e-06},
 'data': {'use_streaming': False,
  'cache_size': 100,
  'chunk_size': 1000,
  'num_workers': 2,
  'pin_memory': True,
  'persistent_workers': False,
  'prefetch_factor': 2,
  'val_split': 0.2,
  'random_seed': 42,
  'train': 'data/airbus-ship-detector/train_v2',
  'val': 'data/airbus-ship-detector/test_v2'},
 'profiles': {'low_memory': {'trai

In [14]:
# Process-wide PyTorch backend switches, set before the model is created.
# Permit TF32 for CUDA matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True
# Enable cuDNN benchmark mode.
torch.backends.cudnn.benchmark = True

In [15]:
# Snapshot memory usage before the data loaders and the model are created.
memory_monitor = MemoryMonitor()
for key, value in memory_monitor.get_memory_usage().items():
    # Pick the unit from the metric name. The previous version appended "GB"
    # to every value, so the percentage metric was printed as
    # "ram_percent: 53.00 GB".
    if key.endswith("_percent"):
        unit = "%"
    elif key.endswith("_gb"):
        unit = " GB"
    else:
        unit = ""  # unknown metric: print the bare number rather than guess
    print(f"{key}: {value:.2f}{unit}")

ram_used_gb: 18.11 GB
ram_available_gb: 16.05 GB
ram_percent: 53.00 GB
process_ram_gb: 0.96 GB
gpu_allocated_gb: 0.00 GB
gpu_reserved_gb: 0.00 GB
gpu_free_gb: 17.10 GB


In [16]:
# Build the training and validation loaders from the manifest CSV; the parsed
# config and the shared memory monitor are handed to the loader factory.
train_loader, val_loader = create_efficient_data_loaders(
    config=config, manifest_path=manifest_path, memory_monitor=memory_monitor
)

Training samples: 185378 (Ships: 65378)
Validation samples: 46345 (Ships: 16345)


In [18]:
# Instantiate the classifier from the parsed config.
# NOTE(review): the summary printed by this cell reports 2,359,296 trainable
# parameters, which looks like the LoRA adapter weights only — confirm the
# classification head is meant to stay frozen.
model = EfficientViTClassifier(config)

LoRA enabled with rank=16
Total parameters: 88,158,721
Trainable parameters: 2,359,296
Reduction: 97.3%


In [22]:
# Training callbacks, in order: checkpointing, early stopping, LR logging.
# NOTE(review): checkpoints are ranked by val_acc while early stopping watches
# val_loss — confirm the two criteria are meant to differ.
callbacks = [
    # Keep the two checkpoints with the highest val_acc plus the latest one.
    ModelCheckpoint(
        monitor='val_acc',
        mode='max',
        save_top_k=2,
        save_last=True,
        dirpath=os.path.join(output_dir, 'lora/checkpoints'),
        filename='vit-{epoch:02d}-{val_acc:.3f}',
    ),
    # Stop when val_loss has not improved for the configured patience.
    EarlyStopping(
        monitor='val_loss',
        mode='min',
        patience=config['training']['early_stopping_patience'],
    ),
    # Log the learning rate once per epoch.
    LearningRateMonitor(logging_interval='epoch'),
]

In [24]:
# All tunables come from the 'training' section of the config; the .get()
# fallbacks apply only when a key is missing from that section.
training_cfg = config['training']

trainer = pl.Trainer(
    # Hardware: a single device, GPU when CUDA is available.
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    devices=1,
    precision=training_cfg.get('precision', 16),
    # Schedule and optimisation.
    max_epochs=training_cfg['max_epochs'],
    accumulate_grad_batches=training_cfg.get('accumulate_grad_batches', 1),
    gradient_clip_val=training_cfg.get('gradient_clip_val', 1.0),
    # Validation cadence and dataset fractions.
    val_check_interval=training_cfg.get('val_check_interval', 1.0),
    limit_train_batches=training_cfg.get('limit_train_batches', 1.0),
    limit_val_batches=training_cfg.get('limit_val_batches', 1.0),
    # Logging and callbacks.
    log_every_n_steps=10,
    callbacks=callbacks,
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..
`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


In [25]:
# Run the training loop with the train/validation loaders built earlier.
# NOTE(review): the saved output of this cell ends in
# "RuntimeError: element 0 of tensors does not require grad and does not have
# a grad_fn", i.e. the tensor passed to backward is not attached to the
# autograd graph. The cause is presumably inside EfficientViTClassifier
# (not visible in this notebook) — TODO confirm and fix before re-running.
trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type              | Params | Mode 
--------------------------------------------------------
0 | model     | VisionTransformer | 88.2 M | train
1 | criterion | BCEWithLogitsLoss | 0      | train
--------------------------------------------------------
2.4 M     Trainable params
85.8 M    Non-trainable params
88.2 M    Total params
352.635   Total estimated model params size (MB)
374       Modules in train mode
0         Modules in eval mode


Epoch 0:   0%|          | 0/5794 [00:00<?, ?it/s]                          

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn