## PATH SETUP

In [1]:
# MUST be the first cell: select the "spawn" multiprocessing start method
# (the only one available on Windows) before any worker process is created.
import torch.multiprocessing as mp

# force=True replaces a previously selected start method instead of raising
# RuntimeError("context has already been set"), so re-running this cell is
# safe and no exception handler is required.
mp.set_start_method("spawn", force=True)

In [2]:
import os
import sys
from pathlib import Path

# FOR LOCAL USE, KEEP THESE LINES:
# use ./src when it exists under the current working directory, otherwise
# fall back to the parent of the working directory.
current = Path.cwd()
src_path = current / "src" if (current / "src").exists() else current.parent

# FOR COLAB, USE THESE LINES INSTEAD
# BRANCH_NAME = "main"  # Change this to switch branches
# !git clone -b {BRANCH_NAME} https://github.com/MatteoCamillo-code/GeoLoc-CVCS.git
# !cd /content/GeoLoc-CVCS && git pull origin {BRANCH_NAME} && cd ..
# src_path = Path("/content/GeoLoc-CVCS/src").resolve()


def _prepend_to_sys_path(directory):
    """Move ``directory`` to the front of ``sys.path`` without duplicating it.

    Re-running the cell (or passing the same directory twice) therefore keeps
    a single entry, while the directory still takes import priority.
    """
    entry = str(directory)
    while entry in sys.path:
        sys.path.remove(entry)
    sys.path.insert(0, entry)


# The project-local import below only resolves once src_path is importable.
_prepend_to_sys_path(src_path)

from utils.paths import find_project_root

# Set working directory and sys.path properly
project_root = find_project_root(src_path)
data_dir = project_root / "data"
os.chdir(project_root)
_prepend_to_sys_path(project_root / "src")
print("CWD:", Path.cwd())

CWD: F:\InfTech\Prodotti\Python\GeoLocGit\GeoLoc-CVCS


## IMPORT

In [3]:
import pandas as pd
import torch
import torch.nn as nn
from torchvision.models import resnet50, ResNet50_Weights
from torch.optim.lr_scheduler import StepLR

from configs.baseline_multi_head_ISN import TrainConfig

from utils.seed import seed_everything
from utils.io import save_json
from training.runner import fit

from src.utils.logging import get_logger
from src.utils.paths import abs_path

from models.multi_head_classifier import MultiHeadClassifier


In [4]:
# Build the training configuration and seed the run from it.
cfg = TrainConfig()
seed_everything(cfg.seed)

# Use the configured device only when CUDA is actually available;
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = cfg.device
else:
    device = "cpu"
print("Device:", device)


Device: cuda


In [5]:
import os

import kagglehub

# Fetch the dataset through kagglehub; the call returns the local directory
# that holds the downloaded files.
dataset_root = kagglehub.dataset_download("josht000/osv-mini-129k")

# os.path.join uses the platform's separator, so Windows paths no longer mix
# "\" and "/" the way plain string concatenation with "/" did.
path = os.path.join(dataset_root, "osv5m")
print("Path to dataset files:", path)

# Directory containing the training images inside the dataset.
image_root = os.path.join(path, "train_images")


  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\camil\.cache\kagglehub\datasets\josht000\osv-mini-129k\versions\1/osv5m


In [6]:
# Both metadata CSVs live in the same geo-cell folder under the data directory.
geocell_meta_dir = data_dir / "metadata" / "s2-geo-cells"
train_val_path = geocell_meta_dir / "train_val_split_geocells.csv"
cell_centers_path = geocell_meta_dir / "cell_center_dataset.csv"

# Load the train/val split table first, then the cell-center table.
train_val_meta = pd.read_csv(train_val_path)
cell_centers_df = pd.read_csv(cell_centers_path)

# Echo the resolved locations so the run records which files were used.
print("Train/val CSV:", train_val_path)
print("Cell centers CSV:", cell_centers_path)


Train/val CSV: F:\InfTech\Prodotti\Python\GeoLocGit\GeoLoc-CVCS\data\metadata\s2-geo-cells\train_val_split_geocells.csv
Cell centers CSV: F:\InfTech\Prodotti\Python\GeoLocGit\GeoLoc-CVCS\data\metadata\s2-geo-cells\cell_center_dataset.csv


## DATALOADER

In [7]:
from dataset.dataloader_utils import create_dataloaders

IMG_SIZE = 224
TRAIN_SUBSET_PCT = 100.0  # Percentage of the training data to use (e.g. 10.0 for a 10% subset)
VAL_SUBSET_PCT = 100.0    # Percentage of the validation data to use

# A single call builds the loaders for every scene listed in cfg.scenes.
# NOTE(review): persistent_workers is switched off when there are no workers,
# but prefetch_factor stays at 4 — confirm create_dataloaders tolerates that
# combination when cfg.num_workers == 0.
loader_dict = create_dataloaders(
    # data sources
    image_root=image_root,
    csv_path=train_val_path,
    scenes=cfg.scenes,
    coarse_label_idx=cfg.coarse_label_idx,
    # sampling / preprocessing
    img_size=IMG_SIZE,
    augment=True,
    seed=cfg.seed,
    train_subset_pct=TRAIN_SUBSET_PCT,
    val_subset_pct=VAL_SUBSET_PCT,
    # loading performance
    batch_size=cfg.batch_size,
    num_workers=cfg.num_workers,
    prefetch_factor=4,
    persistent_workers=cfg.num_workers > 0,
)

## MODEL

In [8]:
# Build one multi-head classifier per scene, each on its own ImageNet-pretrained
# ResNet-50 backbone.
weights = ResNet50_Weights.IMAGENET1K_V2
models = {}

for sc in cfg.scenes:
    resnet = resnet50(weights=weights)

    # One head per selected coarse label level; the head size is the number of
    # entries in that level's label map for this scene.
    label_maps = loader_dict[sc]["label_maps"]
    num_classes = [
        len(label_maps[f"label_config_{idx + 1}"])
        for idx in cfg.coarse_label_idx
    ]

    # Input width of the original classification layer (2048 for ResNet-50),
    # read from the network instead of being hard-coded.
    FEAT_DIM = resnet.fc.in_features

    # Keep everything except the final fully connected layer and flatten the
    # pooled output into a (batch, FEAT_DIM) feature matrix.
    backbone = nn.Sequential(
        *list(resnet.children())[:-1],
        nn.Flatten(1),
    )

    # The classifier is moved to the device as a whole, which also moves the
    # backbone; the discarded `fc` layer is intentionally left where it is.
    model = MultiHeadClassifier(
        backbone=backbone,
        feat_dim=FEAT_DIM,
        head_dims=num_classes,
        dropout=cfg.dropout,
        coarse_level_idx=cfg.coarse_label_idx,
    ).to(device)

    # Optional (may cause issues on Windows / some PyTorch versions):
    # model = torch.compile(model, backend="aot_eager")

    models[sc] = model

    print(f"Output classes {sc}:", num_classes)


Output classes urban: [3101, 1850, 1091]
Output classes natural: [4561, 2472, 1328]


In [9]:
# Loss shared by every scene; targets equal to -1 are ignored.
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)

# Per-scene training state, keyed by scene name.
optimizers = {
    name: torch.optim.AdamW(net.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
    for name, net in models.items()
}
schedulers = {
    name: StepLR(opt, step_size=cfg.scheduler_step_size, gamma=cfg.scheduler_gamma)
    for name, opt in optimizers.items()
}
# NOTE(review): the scaler is created for cfg.device, while the models live on
# the resolved `device` (which falls back to "cpu") — confirm this is intended.
scalers = {
    name: torch.amp.GradScaler(device=cfg.device, enabled=cfg.amp)
    for name in models
}

torch.backends.cudnn.benchmark = True
print(f"Initialized training components for {len(models)} scenes: {list(models.keys())}")

Initialized training components for 2 scenes: ['urban', 'natural']


## TRAINING LOOP

In [10]:
# All scenes write to the same log file under the configured output directory.
train_log_path = abs_path(cfg.output_dir, "logs", "train.log")
logger = get_logger(log_file=str(train_log_path))

# Fit the scene models one after another and keep each returned history.
histories = {}
for scene, scene_model in models.items():
    scene_loaders = loader_dict[scene]
    history = fit(
        cfg=cfg,
        scene=scene,
        model=scene_model,
        train_loader=scene_loaders["train_loader"],
        val_loader=scene_loaders["val_loader"],
        criterion=criterion,
        optimizer=optimizers[scene],
        scheduler=schedulers[scene],
        scaler=scalers[scene],
        logger=logger,
        use_tqdm=cfg.use_tqdm,
    )
    histories[scene] = history

[22:32:52] INFO - Starting training baseline_multi_head_ISN for scene urban ...
[22:34:29] INFO - Epoch 1/25 | train loss=6.7759 acc=2.87% | val loss=6.2849 acc=5.89% | time=96.87s
[22:35:24] INFO - Epoch 2/25 | train loss=5.7024 acc=12.27% | val loss=5.8295 acc=9.41% | time=54.19s
[22:36:18] INFO - Epoch 3/25 | train loss=5.0323 acc=20.42% | val loss=5.5118 acc=12.12% | time=54.37s
[22:37:12] INFO - Epoch 4/25 | train loss=4.4944 acc=28.80% | val loss=5.2835 acc=13.59% | time=54.04s
[22:38:07] INFO - Epoch 5/25 | train loss=4.0440 acc=36.35% | val loss=5.0888 acc=15.06% | time=54.17s
[22:39:01] INFO - Epoch 6/25 | train loss=3.6780 acc=44.72% | val loss=5.0096 acc=16.49% | time=53.56s
[22:39:55] INFO - Epoch 7/25 | train loss=3.5000 acc=48.15% | val loss=4.9513 acc=16.70% | time=53.61s
[22:40:48] INFO - Epoch 8/25 | train loss=3.3411 acc=50.96% | val loss=4.8837 acc=17.35% | time=53.59s
[22:41:42] INFO - Epoch 9/25 | train loss=3.1936 acc=53.56% | val loss=4.8243 acc=18.11% | time=53.

In [11]:
# Write one JSON history file per scene into outputs/history under the project root.
history_dir = project_root / "outputs" / "history"
for scene in histories:
    history = histories[scene]
    history_path = history_dir / f"{cfg.model_name}_{scene}_history.json"
    save_json(obj=history, path=history_path)
    print(f"Saved history for {scene}: {history_path}")

Saved history for urban: F:\InfTech\Prodotti\Python\GeoLocGit\GeoLoc-CVCS\outputs\history\baseline_multi_head_ISN_urban_history.json
Saved history for natural: F:\InfTech\Prodotti\Python\GeoLocGit\GeoLoc-CVCS\outputs\history\baseline_multi_head_ISN_natural_history.json
