In [3]:
# Install every conda package archive (*.tar.bz2) found in the attached
# "how-to-use-pyvips-offline" input dataset, i.e. from local files.
# NOTE(review): presumably this is what provides `pyvips`, imported below - confirm.
!conda install /kaggle/input/how-to-use-pyvips-offline/*.tar.bz2



Downloading and Extracting Packages
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
######################################################################## | 100% 
########################################################################

In [7]:
import os
import sys
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import pyvips
import warnings
import random
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

from tqdm import tqdm
from PIL import Image
from random import randrange

from pathlib import Path
from glob import glob

from skimage.exposure import is_low_contrast
from scipy.ndimage import zoom, rotate
from skimage.io import imread, imsave

from collections import defaultdict
from openslide import OpenSlide

# Seed Python's built-in RNG. `random.seed` is a function and must be called;
# assigning to it would replace the function and leave the generator unseeded.
random.seed(40)

TensorFlow version: 2.6.4


In [8]:
# Root folder of the competition data and the sub-folder holding the training slides.
data_path = '../input/mayo-clinic-strip-ai/'
train_path = f'{data_path}train/'

# Label table (one row per training image) and the list of slide files on disk.
train_label_df = pd.read_csv(f'{data_path}train.csv')
train_images_data = glob(f'{train_path}*')

print(f"Number of images in a training set: {len(train_images_data)}")

Number of images in a training set: 754


In [9]:
# Preview the first rows of the label table.
train_label_df.head()

Unnamed: 0,image_id,center_id,patient_id,image_num,label
0,006388_0,11,006388,0,CE
1,008e5c_0,11,008e5c,0,CE
2,00c058_0,11,00c058,0,LAA
3,01adc5_0,11,01adc5,0,LAA
4,026c97_0,4,026c97,0,CE


In [10]:
# Class balance: number of training images per label.
train_label_df['label'].value_counts()

CE     547
LAA    207
Name: label, dtype: int64

In [11]:
def get_img_info(images_data, label_df):
    img_prop = defaultdict(list)    
    for i, path in enumerate(images_data):

        img_path = images_data[i]
        slide = OpenSlide(img_path)    

        big_dim = 'none'
        max_min_dim_ratio = 1.0

        img_width = slide.dimensions[0]
        img_height = slide.dimensions[1]

        if(img_width > img_height):
            big_dim = 'width'
            max_min_dim_ratio = round(img_width/img_height, 2)
        elif(img_width < img_height):
            big_dim = 'height'
            max_min_dim_ratio = round(img_height/img_width, 2)

        img_prop['image_id'].append(img_path[-12:-4])
        img_prop['width'].append(img_width)
        img_prop['height'].append(img_height)
        img_prop['big_dim'].append(big_dim)
        #img_prop['size'].append(round(os.path.getsize(img_path) / 1e6, 2))
        img_prop['max_min_dim_ratio'].append(max_min_dim_ratio)
        
        #if(max_min_dim_ratio < 2.0):
        #    split_size = round(max_min_dim_ratio)
        #else:
        #    split_size = math.floor(max_min_dim_ratio)
        split_size = round(max_min_dim_ratio)
        img_prop['split_size'].append(split_size)
        img_prop['path'].append(img_path)
        
        img_info = pd.DataFrame(img_prop)
        img_info.sort_values(by='image_id', inplace=True)
        img_info.reset_index(inplace=True, drop=True)
        img_info = img_info.merge(label_df, on='image_id')

    return img_info

In [12]:
# Build the per-slide metadata table for the training images (opens every slide file).
train_img_info = get_img_info(train_images_data, train_label_df)

In [13]:
# Distinct split counts, i.e. into how many strips a slide would be cut.
print(train_img_info['split_size'].unique())

[2 5 4 1 3 6 7 8]


In [14]:
# Summing split_size over all slides gives the number of strips before any
# filtering or augmentation.
train_images_count = train_img_info['split_size'].sum()
print('total number of train images after split: ', train_images_count)

total number of train images after split:  1713


In [16]:
# Side length, in pixels, that every tile is resized to (used for both width and height).
IMG_SIZE = 224
# Channel count; not referenced by the cells visible here.
IMG_CHANNELS = 3

In [None]:
# function to check if the image contains useful information after splitting.
def get_score(img):
    """Return the number of contours found in a binarised copy of ``img``.

    The image is converted to an array, turned to grayscale with the
    BGR->gray conversion, thresholded at 127 (pixels above become 255,
    the rest 0) and all contours are listed without hierarchy filtering.
    """
    gray = cv2.cvtColor(np.array(img), cv2.COLOR_BGR2GRAY)
    # cv2.THRESH_BINARY has the value 0.
    _, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
    found = cv2.findContours(binary, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    contours, _hierarchy = found
    return len(contours)

In [None]:
def save_tile(path, name, vips_tile, label):
    """Write a tile and its rotated copies to disk as TIFF files.

    Files are named ``path + name + '_<k>.tif'``; ``path`` is concatenated
    as-is, so it has to end with a path separator.

    * ``_0``: the tile itself, ``_1``: rotated by 90 degrees (always written).
    * ``_2``/``_3``/``_4``: rotated by 135/180/225 degrees, written only when
      ``label`` is ``'LAA'``. The 135 and 225 degree versions keep the
      original shape and fill the corners by reflection.
      NOTE(review): presumably the extra copies offset the smaller LAA
      class - confirm.
    """
    pixels = vips_tile.numpy()
    base_image = Image.fromarray(pixels)
    prefix = path + name

    base_image.save(prefix + '_0.tif')
    base_image.rotate(90).save(prefix + '_1.tif')

    if label == 'LAA':
        turned_135 = rotate(pixels, 135, reshape=False, mode='reflect')
        Image.fromarray(turned_135).save(prefix + '_2.tif')
        base_image.rotate(180).save(prefix + '_3.tif')
        turned_225 = rotate(pixels, 225, reshape=False, mode='reflect')
        Image.fromarray(turned_225).save(prefix + '_4.tif')

    print(name, 'done saving a tile...')

In [None]:
def split_save_tiles(row, last_img_index, path_tiles):
    """Cut one slide into strips along its longer side, shrink and save them.

    Parameters
    ----------
    row : mapping (e.g. a row of the table returned by ``get_img_info``)
        Must provide ``image_id``, ``width``, ``height``, ``big_dim``,
        ``split_size``, ``path`` and ``label``.
    last_img_index : int
        Running count of tiles saved so far.
    path_tiles : str
        Output folder prefix handed to ``save_tile``.

    Returns
    -------
    int
        ``last_img_index`` increased by the number of tiles that were saved
        (low-contrast tiles are skipped and not counted).

    Notes
    -----
    The slide is divided into ``split_size`` equal strips along ``big_dim``
    using integer division, so up to ``split_size - 1`` trailing pixels are
    dropped. Each strip is force-resized to ``IMG_SIZE`` x ``IMG_SIZE``
    (aspect ratio is not preserved).
    """
    image_id = row['image_id']
    width = row['width']
    height = row['height']
    big_dim = row['big_dim']
    split_size = row['split_size']
    input_train_path = row['path']
    label = row['label']

    n_across = 1
    n_down = 1
    if split_size == 1:
        crop_width, crop_height = width, height
    elif big_dim == 'width':
        crop_width, crop_height = width // split_size, height
        n_across = split_size
    else:
        crop_width, crop_height = width, height // split_size
        n_down = split_size

    for x in range(n_across):
        for y in range(n_down):
            # Sequential access only promises top-to-bottom reads, so a single
            # handle must not be evaluated once per crop. Opening is cheap
            # (header only), hence a fresh handle for every tile.
            vips_img = pyvips.Image.new_from_file(input_train_path, access='sequential')
            if split_size > 1:
                vips_tile = vips_img.crop(x * crop_width, y * crop_height, crop_width, crop_height)
            else:
                vips_tile = vips_img
            print(last_img_index, image_id, 'processing image with splits(', split_size, ')' , crop_width, ' X ', crop_height)

            # Render the small thumbnail into memory exactly once. Otherwise the
            # lazy pipeline would be executed again by every consumer (the
            # contrast check and the numpy conversion in save_tile), decoding
            # the slide region repeatedly.
            vips_tile = vips_tile.thumbnail_image(IMG_SIZE, height=IMG_SIZE, size='force').copy_memory()

            if is_low_contrast(vips_tile):
                continue

            # Only one of n_across / n_down exceeds 1, so x + y is the strip index.
            tile_name = image_id + '_' + str(x + y)
            save_tile(path_tiles, tile_name, vips_tile, label)

            last_img_index += 1

    return last_img_index

In [None]:
def process_tiles(img_info=None, path_tiles='output/train/tiles2/'):
    """Split every slide listed in ``img_info`` into tiles and save them.

    Parameters
    ----------
    img_info : pandas.DataFrame, optional
        Table with one row per slide, passed row by row to
        ``split_save_tiles``. Defaults to the notebook-level
        ``train_img_info``.
    path_tiles : str, optional
        Output folder; created (including parents) when missing. It is used
        as a plain string prefix downstream, so keep the trailing separator.

    Returns
    -------
    None
        Progress and the final number of saved tiles are printed.
    """
    if img_info is None:
        img_info = train_img_info

    if not os.path.exists(path_tiles):
        print('creating folder', path_tiles)
    # exist_ok avoids failing if the folder appears between check and creation.
    os.makedirs(path_tiles, exist_ok=True)

    last_img_index = 0
    for ind, row in img_info.iterrows():
        print('started processing image:', str(ind + 1))
        last_img_index = split_save_tiles(row, last_img_index, path_tiles)
    print('processed train images:', last_img_index)

In [None]:
# Run the tiling pipeline over every row of train_img_info; by default tiles
# are written under output/train/tiles2/.
process_tiles()

In [None]:
#tiles = os.listdir('output/train/tiles2/')
#os.listdir('output/train/tiles2/')

In [None]:
# def show_crop(path):
#     image = Image.open(path)
#     print(image.size)
#     image = np.asarray(image)
#     plt.figure()
#     plt.imshow(image, cmap='gray')
#     plt.colorbar()
#     plt.grid(False)
#     plt.show()

In [None]:
# show_crop('output/train/tiles2/' + tiles[14])