In [1]:
# Show what the offline package bundle under /kaggle/input contains.
!ls /kaggle/input/pyvips-python-and-deb-package
# install the deb packages
!dpkg -i --force-depends /kaggle/input/pyvips-python-and-deb-package/linux_packages/archives/*.deb
# install the python wrapper
# (-f points pip at the local wheel directory and --no-index disables the package index lookup)
!pip install pyvips -f /kaggle/input/pyvips-python-and-deb-package/python_packages/ --no-index

'ls' is not recognized as an internal or external command,
operable program or batch file.
'dpkg' is not recognized as an internal or external command,
operable program or batch file.


Looking in links: /kaggle/input/pyvips-python-and-deb-package/python_packages/

ERROR: Could not find a version that satisfies the requirement pyvips (from versions: none)
ERROR: No matching distribution found for pyvips





In [2]:
import os, glob
import pyvips
import numpy as np
from PIL import Image

# NOTE(review): presumably this raises the size above which libvips decodes an
# image via a temporary disc file instead of in memory — confirm against the
# libvips documentation, including whether it must be set before `import pyvips`.
os.environ['VIPS_DISC_THRESHOLD'] = '9gb'

# Input locations: full-resolution training images and the supplemental
# annotation masks (one PNG per image, matched by file name further below).
DATASET_IMAGES = "/kaggle/input/UBC-OCEAN/train_images"
DATASET_MASKS = "/kaggle/input/ubc-ovarian-cancer-competition-supplemental-masks"

# NOTE(review): these directories live under /kaggle/temp, whereas every later
# cell reads and writes under /kaggle/tmp — they look unused; confirm and remove.
!mkdir -p /kaggle/temp/images
!mkdir -p /kaggle/temp/annotations
!mkdir -p /kaggle/temp/masks

# Keep a copy of the competition metadata next to the notebook outputs.
!cp /kaggle/input/UBC-OCEAN/train.csv .

ModuleNotFoundError: No module named 'pyvips'

In [3]:
# Collect the mask PNGs in sorted (deterministic) order and keep only the
# first 75 of them; everything downstream is limited to this subset.
ls_masks = sorted(glob.glob(os.path.join(DATASET_MASKS, "*.png")))[:75]
print(f"found masks: {len(ls_masks)}")

found masks: 75


# Decompose image to tiles/grid 🖽

In [4]:
def extract_image_tiles(
    p_img, folder, size: int = 2048, scale: float = 0.5,
    drop_thr: float = 0.85, inds = None
) -> tuple:
    """Cut an image into square tiles, rescale them and save each one as PNG.

    Args:
        p_img: path of the image to open with pyvips.
        folder: existing directory that receives the tile PNGs.
        size: edge length, in source pixels, of every tile.
        scale: factor applied to ``size`` to get the saved tile resolution.
        drop_thr: fraction of pure-black pixels (all kept channels sum to 0)
            at or above which a tile is skipped; ``None`` disables the check.
            The fraction is measured on the cropped region, before padding.
        inds: optional iterable of ``(y, y_end, x, x_end)`` tuples selecting
            the tiles to export. ``None`` means "the full grid covering the
            image". An explicitly passed empty sequence exports nothing.

    Returns:
        ``(files, idxs)``: the written tile paths and, aligned with them, the
        index tuples of the tiles that were kept.
    """
    im = pyvips.Image.new_from_file(p_img)
    w = h = size
    # Only fall back to the full grid when the caller gave no selection at
    # all. Testing truthiness here would turn an empty selection (e.g. every
    # tile of the paired image was dropped) into "export everything".
    if inds is None:
        # https://stackoverflow.com/a/47581978/4521646
        inds = [(y, y + h, x, x + w)
                for y in range(0, im.height, h)
                for x in range(0, im.width, w)]
    # Output resolution is the same for every tile, so compute it once.
    new_size = int(size * scale), int(size * scale)
    files, idxs, k = [], [], 0
    for idx in inds:
        y, y_, x, x_ = idx
        # Clamp the crop to the image bounds; keep at most the first 3 channels.
        # https://libvips.github.io/pyvips/vimage.html#pyvips.Image.crop
        tile = im.crop(x, y, min(w, im.width - x), min(h, im.height - y)).numpy()[..., :3]
        if drop_thr is not None:
            mask_bg = np.sum(tile, axis=2) == 0
            if np.sum(mask_bg) >= (np.prod(mask_bg.shape) * drop_thr):
                # Almost empty tile: skip it without consuming a counter value.
                continue
        if tile.shape[:2] != (h, w):
            # Border tile: zero-pad on the bottom/right up to the full size.
            tile_ = tile
            tile_size = (h, w) if tile.ndim == 2 else (h, w, tile.shape[2])
            tile = np.zeros(tile_size, dtype=tile.dtype)
            tile[:tile_.shape[0], :tile_.shape[1], ...] = tile_
        # File name: running counter of kept tiles, then the tile's end
        # coordinates divided by the tile size (column first, then row).
        p_tile = os.path.join(folder, f"{k:05}_{int(x_ / w)}-{int(y_ / h)}.png")
        Image.fromarray(tile).resize(new_size, Image.LANCZOS).save(p_tile)
        files.append(p_tile)
        idxs.append(idx)
        k += 1
    return files, idxs

## Show the image tiles with segmentations

In [5]:
from PIL import Image
import matplotlib.pyplot as plt

# Convert RGB annotation to labels

In [6]:
def convert_rgb_to_labels(img_path: str, folder: str):
    """Turn a colour-coded annotation tile into a single-channel label image.

    A constant plane of value 128 is stacked in front of the image channels
    and the label of each pixel is the index of the largest value in that
    stack: 0 when no channel exceeds 128 (the constant plane also wins ties,
    as ``argmax`` returns the first maximum), otherwise 1 + the index of the
    strongest channel. Expects an image with a channel axis (H x W x C).

    Args:
        img_path: path of the annotation image to read.
        folder: directory where the label image is saved under the same
            file name as the input.

    Returns:
        Path of the written label image.
    """
    rgb = np.array(Image.open(img_path))
    height, width = rgb.shape[0], rgb.shape[1]
    background = np.full((height, width, 1), 128.0)
    stacked = np.concatenate((background, rgb), axis=2)
    labels = np.argmax(stacked, axis=2).astype(np.uint8)
    out_path = os.path.join(folder, os.path.basename(img_path))
    Image.fromarray(labels).save(out_path)
    return out_path



## Export all image tiles

In [7]:
# Root output directories; they match the default folder arguments of
# extract_tiles_masks, which creates one sub-folder per image inside each.
!mkdir -p /kaggle/tmp/train_images
!mkdir -p /kaggle/tmp/train_annotations
!mkdir -p /kaggle/tmp/train_masks

In [8]:
def extract_tiles_masks(
    idx_name,
    folder_img: str = "/kaggle/tmp/train_images",
    folder_seg: str = "/kaggle/tmp/train_annotations",
    folder_mask: str = "/kaggle/tmp/train_masks",
    size: int = 2048, scale: float = 0.5, drop_thr: float = 0.6
) -> None:
    """Export image tiles, annotation tiles and label masks for one image.

    Args:
        idx_name: ``(index, name)`` pair; ``name`` is the file stem shared by
            the image in ``DATASET_IMAGES`` and its mask in ``DATASET_MASKS``.
        folder_img: root folder for the image tiles.
        folder_seg: root folder for the colour annotation tiles.
        folder_mask: root folder for the converted label tiles.
        size: tile edge length in source pixels.
        scale: rescaling factor applied to each saved tile.
        drop_thr: background fraction above which an image tile is skipped.

    A sub-folder named after the image is created under each root folder.
    Nothing is returned; all results are written to disk.
    """
    idx, name = idx_name
    print(f"processing #{idx}: {name}")

    # One sub-folder per image under each of the three output roots.
    targets = []
    for root in (folder_img, folder_seg, folder_mask):
        target = os.path.join(root, name)
        os.makedirs(target, exist_ok=True)
        targets.append(target)
    dir_img, dir_seg, dir_mask = targets

    source_file = f"{name}.png"

    # Tile the image first: the background filter decides which tiles survive.
    _, kept = extract_image_tiles(
        os.path.join(DATASET_IMAGES, source_file),
        dir_img, size=size, scale=scale,
        drop_thr=drop_thr,
    )
    # Cut the annotation at the surviving positions, with the filter disabled.
    seg_tiles, _ = extract_image_tiles(
        os.path.join(DATASET_MASKS, source_file),
        dir_seg, size=size, scale=scale,
        drop_thr=None, inds=kept,
    )
    for seg_path in seg_tiles:
        convert_rgb_to_labels(seg_path, dir_mask)

Run the cutting in parallel with multiple workers

In [9]:
from tqdm.auto import tqdm
from joblib import Parallel, delayed

# File stems of the selected masks; the matching image is looked up by stem.
names = [os.path.splitext(os.path.basename(p))[0] for p in ls_masks]
    
# Tile every image with two worker processes. The results are discarded
# because extract_tiles_masks only writes files and returns None.
# tqdm wraps the task generator that Parallel consumes, so the bar tracks
# tasks being handed out rather than tasks being completed.
_= Parallel(n_jobs=2)(
    delayed(extract_tiles_masks)
    (id_name, size=2048, drop_thr=0.6, scale=0.5)
    for id_name in tqdm(enumerate(names), total=len(names))
)

  0%|          | 0/75 [00:00<?, ?it/s]

## Show some samples

In [10]:
# All exported label tiles, across every image sub-folder.
masks = sorted(glob.glob('/kaggle/tmp/train_masks/*/*.png'))

# Keep the label tiles holding at least one pixel of label 1, i.e. the label
# convert_rgb_to_labels assigns when the first colour channel dominates.
imgs_with_masks = [
    mask_path
    for mask_path in tqdm(masks)
    if np.sum(np.array(Image.open(mask_path)) == 1) > 0
]

  0%|          | 0/17721 [00:00<?, ?it/s]

In [11]:
import shutil

# Copy the image tile paired with every selected label tile into a parallel
# tree, one sub-folder per source image. Plain Python file operations replace
# the shell calls: no subprocess per tile, no unquoted paths interpolated
# into a command line, and a failed copy raises instead of being ignored.
filter_root = "/kaggle/tmp/train_images_filter"
os.makedirs(filter_root, exist_ok=True)

for p in tqdm(imgs_with_masks):
    # The parent folder of a tile is named after its source image.
    dst_dir = os.path.join(filter_root, os.path.basename(os.path.dirname(p)))
    os.makedirs(dst_dir, exist_ok=True)
    # Image and label tiles share file names; only the root folder differs.
    shutil.copy(p.replace("train_masks", "train_images"), dst_dir)

  0%|          | 0/7095 [00:00<?, ?it/s]

processing #0: 10143
processing #3: 10252
processing #4: 10800
processing #6: 1101
processing #7: 11263
processing #9: 11557
processing #10: 12442
processing #14: 14401
processing #16: 14542
processing #17: 15139
processing #19: 15209
processing #22: 15671
processing #23: 16064
processing #25: 16876
processing #27: 17174
processing #29: 17738
processing #30: 17854
processing #33: 18607
processing #34: 19030
processing #35: 1925
processing #37: 1952
processing #38: 19569
processing #40: 21432
processing #43: 21929
processing #44: 22155
processing #47: 22425
processing #48: 22489
processing #50: 24617
processing #52: 26219
processing #55: 26950
processing #57: 27315
processing #58: 27950
processing #60: 28562
processing #63: 30738
processing #64: 31033
processing #66: 32432
processing #67: 33708
processing #69: 34247
processing #71: 35239
processing #74: 36499
processing #1: 1020
processing #2: 10246
processing #5: 10896
processing #8: 11431
processing #11: 1252
processing #12: 12522
pro

In [12]:
# Pack the filtered image tiles into a gzip-compressed archive in the working
# directory; -C makes the stored paths relative to the filter folder.
!tar -zcf train_images.tar.gz -C "/kaggle/tmp/train_images_filter/" .