In [1]:
import os
import win32com.client
def resolve_shortcut(path):
    shell = win32com.client.Dispatch("WScript.Shell")
    shortcut = shell.CreateShortCut(path)
    return shortcut.Targetpath

data_path = resolve_shortcut(r"G:\\My Drive\\dc4data.lnk")
benthic_path = resolve_shortcut(data_path+r"\\benthic_datasets.lnk")
coralbleaching_path = resolve_shortcut(data_path+r"\\coral_bleaching.lnk")
if not os.path.exists(r"G:\.shortcut-targets-by-id\1v4g4qOrbisBvrpqOxLrYn96nd_gPG_Ge\dc4data\coralscapes"):
     coralscapes_path = resolve_shortcut(data_path+r"\\coralscapes.lnk")
else:
        coralscapes_path = r"G:\.shortcut-targets-by-id\1v4g4qOrbisBvrpqOxLrYn96nd_gPG_Ge\dc4data\coralscapes"
for p in [data_path, benthic_path, coralbleaching_path, coralscapes_path]:
    if os.path.exists(p):
        print(f"Path exists: {p}")
    if not os.path.exists(p):
        raise FileNotFoundError(f"Path does not exist: {p}")
    

Path exists: G:\.shortcut-targets-by-id\1v4g4qOrbisBvrpqOxLrYn96nd_gPG_Ge\dc4data
Path exists: G:\.shortcut-targets-by-id\1mx2OJcVKp1mRbTbjezqWucDXpbGrd_OA\benthic_datasets
Path exists: G:\.shortcut-targets-by-id\1jGkNA1n0znoxKnQBHTJZuPgvkiu_OBM8\coral_bleaching
Path exists: G:\.shortcut-targets-by-id\1v4g4qOrbisBvrpqOxLrYn96nd_gPG_Ge\dc4data\coralscapes


## Benthic Datset

##

In [2]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch
import os

benthic_paths = [r"G:\.shortcut-targets-by-id\1mx2OJcVKp1mRbTbjezqWucDXpbGrd_OA\benthic_datasets\mask_labels\reef_support\SEAFLOWER_BOLIVAR",
r"G:\.shortcut-targets-by-id\1mx2OJcVKp1mRbTbjezqWucDXpbGrd_OA\benthic_datasets\mask_labels\reef_support\SEAFLOWER_COURTOWN",
r"G:\.shortcut-targets-by-id\1mx2OJcVKp1mRbTbjezqWucDXpbGrd_OA\benthic_datasets\mask_labels\reef_support\SEAVIEW_PAC_USA",
r"G:\.shortcut-targets-by-id\1mx2OJcVKp1mRbTbjezqWucDXpbGrd_OA\benthic_datasets\mask_labels\reef_support\SEAVIEW_IDN_PHL",
r"G:\.shortcut-targets-by-id\1mx2OJcVKp1mRbTbjezqWucDXpbGrd_OA\benthic_datasets\mask_labels\reef_support\SEAVIEW_PAC_AUS",
r"G:\.shortcut-targets-by-id\1mx2OJcVKp1mRbTbjezqWucDXpbGrd_OA\benthic_datasets\mask_labels\reef_support\TETES_PROVIDENCIA",
r"G:\.shortcut-targets-by-id\1mx2OJcVKp1mRbTbjezqWucDXpbGrd_OA\benthic_datasets\mask_labels\reef_support\SEAVIEW_ATL",
r"G:\.shortcut-targets-by-id\1mx2OJcVKp1mRbTbjezqWucDXpbGrd_OA\benthic_datasets\mask_labels\reef_support\UNAL_BLEACHING_TAYRONA",]

class SegmentationDataset(Dataset):
    def __init__(self, img_dir, mask_dir, transform=None):
        self.img_dir = img_dir
        self.mask_dir = mask_dir
        self.images = os.listdir(img_dir)
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.images[idx])
        mask_path = os.path.join(self.mask_dir, self.images[idx])
        image = Image.open(img_path).convert("RGB")
        mask = Image.open(mask_path)

        if self.transform:
            image = self.transform(image)
            mask = self.transform(mask)

        return image, mask

def get_mask(benthic_folder):
    mask_path = os.path.join(benthic_folder, 'masks')
    return mask_path

def get_image(benthic_folder):
    image_path = os.path.join(benthic_folder, 'images')
    return image_path


#DATASETS
SEAFLOWER_BOLIVAR = SegmentationDataset(get_image(benthic_paths[0]), get_mask(benthic_paths[0]))
SEAFLOWER_COURTOWN = SegmentationDataset(get_image(benthic_paths[1]), get_mask(benthic_paths[1]))
SEAVIEW_PAC_USA = SegmentationDataset(get_image(benthic_paths[2]), get_mask(benthic_paths[2]))
SEAVIEW_IDN_PHL = SegmentationDataset(get_image(benthic_paths[3]), get_mask(benthic_paths[3]))
SEAVIEW_PAC_AUS = SegmentationDataset(get_image(benthic_paths[4]), get_mask(benthic_paths[4]))
TETES_PROVIDENCIA = SegmentationDataset(get_image(benthic_paths[5]), get_mask(benthic_paths[5]))
SEAVIEW_ATL = SegmentationDataset(get_image(benthic_paths[6]), get_mask(benthic_paths[6]))
UNAL_BLEACHING_TAYRONA = SegmentationDataset(get_image(benthic_paths[7]), get_mask(benthic_paths[7]))

## Coral Scapes

In [3]:
from pathlib import Path
from collections import OrderedDict

class CoralScapesParquet(Dataset):
    """
    Reads shards produced by your pipeline:
      columns: split (str), index (int), image_png (bytes), label_health_rgb_png (bytes)
    Returns: (C,H,W) float tensors in [0,1] for image and mask.
    """
    def __init__(self, parquet_dir: str | Path, return_masks_as_long: bool = False, cache_files: int = 2):
        self.dir = Path(parquet_dir)
        self.paths = sorted(self.dir.glob("*.parquet"))
        if not self.paths:
            raise FileNotFoundError(f"No parquet files in {self.dir}")
        # Build global index: which file and which row
        self.file_row_offsets = []  # cumulative row counts
        self.row_index = []         # list of (file_idx, row_idx)
        self._meta_row_counts = []
        for fidx, p in enumerate(self.paths):
            pf = pq.ParquetFile(p)
            nrows = pf.metadata.num_rows
            self._meta_row_counts.append(nrows)
            self.row_index.extend([(fidx, r) for r in range(nrows)])
        # Simple LRU cache for loaded tables
        self._cache = OrderedDict()
        self._cache_limit = max(1, int(cache_files))
        self.return_masks_as_long = return_masks_as_long


root = Path("data_preprocessing/coralscapes_export_with_images/parquet")
coral_scapes_datasets = {s: CoralScapesParquet(root / s, return_masks_as_long=True) for s in ("test", "train", "validation")}

FileNotFoundError: No parquet files in data_preprocessing\coralscapes_export_with_images\parquet\test

## Coral Bleaching

In [4]:
coral_bleaching_images = r"g:\.shortcut-targets-by-id\1jGkNA1n0znoxKnQBHTJZuPgvkiu_OBM8\coral_bleaching\reef_support\UNAL_BLEACHING_TAYRONA\images"
coral_bleaching_combined_masks = r"data_preprocessing/coralbleaching/combined_masks"
coral_bleaching_single_masks = r"data_preprocessing/coralbleaching/single_masks"


In [5]:
from pathlib import Path
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

def pil_to_tensor(img):
    a = np.asarray(img.convert("RGB"), dtype=np.uint8)  # (H,W,3)
    return torch.from_numpy(a).permute(2,0,1).float()/255.0  # (3,H,W)

class CoralBleachingDataset(Dataset):
    def __init__(self, images_dir, combined_dir, single_dir):
        self.images_dir = Path(images_dir)
        self.combined_dir = Path(combined_dir)
        self.single_bleached = Path(single_dir) / "bleached_blue"
        self.single_non = Path(single_dir) / "non_bleached_red"

        imgs = []
        for e in ("*.png","*.jpg","*.jpeg"):
            imgs += list(self.images_dir.glob(e))
        self.images = sorted(imgs)

        self.pairs = self._match_pairs()

    def _match_pairs(self):
        def index_dir(d):
            out={}
            for e in ("*.png","*.jpg","*.jpeg"):
                for p in d.glob(e): out[p.stem.lower()] = p
            return out
        cmb = index_dir(self.combined_dir)
        ble = index_dir(self.single_bleached)
        non = index_dir(self.single_non)

        pairs=[]
        for img in self.images:
            key = img.stem.lower()
            k_cmb = f"{key}_combined"
            if k_cmb in cmb: pairs.append((img, cmb[k_cmb])); continue
            cand = [p for k,p in ble.items() if k.startswith(key) or key in k]
            if cand: pairs.append((img, cand[0])); continue
            cand = [p for k,p in non.items() if k.startswith(key) or key in k]
            if cand: pairs.append((img, cand[0]))
        return pairs

    def __len__(self): return len(self.pairs)
    def __getitem__(self, i):
        ip, mp = self.pairs[i]
        x = pil_to_tensor(Image.open(ip))
        y = pil_to_tensor(Image.open(mp))
        return x, y  # (3,H,W), (3,H,W)

def pad_collate(batch):
    # batch: list of (img, mask) with varying H,W
    imgs, masks = zip(*batch)
    C = imgs[0].shape[0]
    H = max(t.shape[1] for t in imgs)
    W = max(t.shape[2] for t in imgs)
    xb = torch.zeros(len(imgs), C, H, W, dtype=imgs[0].dtype)
    yb = torch.zeros(len(masks), C, H, W, dtype=masks[0].dtype)
    for i, (x, y) in enumerate(zip(imgs, masks)):
        h, w = x.shape[1], x.shape[2]
        xb[i, :, :h, :w] = x
        yb[i, :, :h, :w] = y
    return xb, yb

# ---- use it ----
dataset = CoralBleachingDataset(
    images_dir=r"g:\.shortcut-targets-by-id\1jGkNA1n0znoxKnQBHTJZuPgvkiu_OBM8\coral_bleaching\reef_support\UNAL_BLEACHING_TAYRONA\images",
    combined_dir=r"data_preprocessing/coralbleaching/combined_masks",
    single_dir=r"data_preprocessing/coralbleaching/single_masks"
)
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=0, collate_fn=pad_collate)

xb, yb = next(iter(loader))
print(xb.shape, yb.shape)  # -> (B,3,H_max,W_max) (B,3,H_max,W_max)


  return torch.from_numpy(a).permute(2,0,1).float()/255.0  # (3,H,W)


torch.Size([8, 3, 3217, 4301]) torch.Size([8, 3, 3217, 4301])
