In [None]:
###
# This code processes Whole Slide Images (WSI) that have been scanned in formats such as NDPI, VSI, and SVS, 
# then compressed to 1/8 of their original dimensions and saved as PNG images. 
# Using these compressed images along with their corresponding YOLO annotations, 
# the code generates patch images. 
# The example provided uses a crop size of 680 pixels, which can be adjusted as needed.
###

In [None]:
import os, glob, shutil, tifffile, json, pathlib
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageFilter, ImageDraw 
from joblib import Parallel, delayed
from tqdm import tqdm_notebook as tqdm
from collections import Counter
import warnings
warnings.simplefilter('ignore', Image.DecompressionBombWarning)

seed = 0

Image.MAX_IMAGE_PIXELS = None
%matplotlib inline

In [None]:
# Images (*.png) and their label files (*.txt) are expected side by side in one folder.

img_d = '/PATH/TO/YOUR/DIRECTORY'
label_d = img_d
imgs = sorted(glob.glob('{}/*.png'.format(img_d)))
labels = sorted(glob.glob('{}/*.txt'.format(label_d)))
# One count per line: images first, then labels.
print(len(imgs), len(labels), sep='\n')

In [None]:
# Compare image and label stems as sets. Walking the two sorted lists with zip()
# stops at the shorter list and shifts every later pair as soon as one file is
# missing, so it can both hide and misreport mismatches.
imgs_base = [os.path.basename(i) for i in imgs]
labels_base = [os.path.basename(i) for i in labels]

img_stems = {os.path.splitext(name)[0] for name in imgs_base}
label_stems = {os.path.splitext(name)[0] for name in labels_base}

for i in imgs_base:
    if os.path.splitext(i)[0] not in label_stems:
        print(i)  # image that has no label file with the same stem
for j in labels_base:
    if os.path.splitext(j)[0] not in img_stems:
        print(j)  # .txt file that has no image with the same stem

In [None]:
### duplicate check ####

In [None]:
def find_duplicates(file_path):
    """Tell whether a text file contains the same line more than once.

    Lines are compared after stripping leading/trailing whitespace.

    Args:
        file_path: Path of the text file to inspect.

    Returns:
        True if at least one stripped line repeats, otherwise False.
    """
    with open(file_path, 'r') as handle:
        stripped_rows = [row.strip() for row in handle.readlines()]

    already_seen = set()
    for entry in stripped_rows:
        if entry in already_seen:
            # First repeat is enough; no need to look further.
            return True
        already_seen.add(entry)

    return False

def scan_directory_for_duplicates(directory):
    """List the ``.txt`` files in ``directory`` that contain repeated lines.

    Args:
        directory: Folder whose direct entries are examined.

    Returns:
        File names (not full paths) for which find_duplicates returned a true
        value, in the order os.listdir yields them.
    """
    return [
        entry
        for entry in os.listdir(directory)
        if entry.endswith('.txt')
        and find_duplicates(os.path.join(directory, entry))
    ]

# Report which label files contain repeated lines.
directory_path = label_d
duplicate_files = scan_directory_for_duplicates(directory_path)

print("file with duplicate:")
if duplicate_files:
    # One file name per line, exactly as a per-item print would show them.
    print('\n'.join(duplicate_files))

In [None]:
def remove_duplicates(file_path):
    """Rewrite a text file in place, keeping only the first copy of each line.

    Two lines count as the same when they are equal after stripping
    leading/trailing whitespace. The surviving lines keep their original text
    and their original relative order.

    Args:
        file_path: Path of the text file to rewrite.
    """
    with open(file_path, 'r') as source:
        original_rows = source.readlines()

    # dict keeps insertion order, and setdefault never overwrites, so each
    # stripped key maps to the first raw line that produced it.
    first_occurrence = {}
    for raw in original_rows:
        first_occurrence.setdefault(raw.strip(), raw)

    with open(file_path, 'w') as target:
        target.writelines(first_occurrence.values())

def process_directory(directory):
    """Apply remove_duplicates to every ``.txt`` entry directly inside ``directory``.

    Args:
        directory: Folder whose ``.txt`` files are rewritten in place.
    """
    txt_names = (name for name in os.listdir(directory) if name.endswith('.txt'))
    for name in txt_names:
        remove_duplicates(os.path.join(directory, name))
            
# Rewrite every .txt file in the label directory in place so that each line
# appears only once (this modifies the files on disk).
directory_path = label_d
process_directory(directory_path)

In [None]:
# Run the duplicate scan on the label directory again and list whatever it flags.
directory_path = label_d
duplicate_files = scan_directory_for_duplicates(directory_path)

# Header first, then one file name per line.
print("file with duplicate:", *duplicate_files, sep='\n')

In [None]:
# Refresh the sorted image / label path lists and show how many of each exist.
imgs = sorted(glob.glob('{}/*.png'.format(img_d)))
labels = sorted(glob.glob('{}/*.txt'.format(label_d)))
for found in (imgs, labels):
    print(len(found))

In [None]:
##### 

In [None]:
# Collect the pixel width (x) and height (y) of every image, then print the
# largest and smallest value of each.
x = []
y = []
for i in imgs:
    # Use a context manager so each file handle is released right away instead
    # of staying open for the rest of the session.
    with Image.open(i) as temp:
        x.append(temp.size[0])
        y.append(temp.size[1])
print(max(x))
print(max(y))
print(min(x))
print(min(y))

In [None]:
# Overlay the width and height distributions; labels and a legend make the two
# histograms distinguishable.
fig, ax = plt.subplots()
ax.hist(x, alpha=0.6, label='width')
ax.hist(y, alpha=0.6, label='height')
ax.set_xlabel('pixels')
ax.set_ylabel('number of images')
ax.legend();

In [None]:
def _read_yolo_boxes(label_path, img_w, img_h):
    """Parse a YOLO label file into pixel-space (x1, y1, x2, y2, class) tuples."""
    boxes = []
    with open(label_path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines carry no annotation; skip them instead of failing.
                continue
            x_center = int(float(fields[1]) * img_w)
            y_center = int(float(fields[2]) * img_h)
            x_width = int(float(fields[3]) * img_w)
            y_width = int(float(fields[4]) * img_h)
            boxes.append((int(x_center - (0.5 * x_width)),
                          int(y_center - (0.5 * y_width)),
                          int(x_center + (0.5 * x_width)),
                          int(y_center + (0.5 * y_width)),
                          int(fields[0])))
    return boxes


def _clip_boxes_to_tile(boxes, left, top, grid):
    """Intersect each box with one grid x grid tile; return tile-local boxes."""
    right = left + grid
    bottom = top + grid
    clipped = []
    for bx1, by1, bx2, by2, cls in boxes:
        ix1 = max(bx1, left)
        iy1 = max(by1, top)
        ix2 = min(bx2, right)
        iy2 = min(by2, bottom)
        # Keep only overlaps with positive area; this also drops boxes that
        # merely touch the tile edge.
        if ix1 < ix2 and iy1 < iy2:
            clipped.append((ix1 - left, iy1 - top, ix2 - left, iy2 - top, cls))
    return clipped


def process_image(i, j, grid, crop_save_dir):
    """Cut one image into square tiles and write per-tile bounding boxes.

    The image is split into ``grid`` x ``grid`` tiles starting at the top-left
    corner; only full tiles are produced (``width // grid`` by
    ``height // grid``), so any remainder at the right/bottom edge is not tiled.

    For every tile three files are written to ``crop_save_dir``, all named
    ``<image stem>_x_<col>_y_<row>`` with zero-padded indices:

    * ``.png``            - the cropped tile,
    * ``.npy``            - the boxes overlapping the tile, clipped to it, as rows
                            ``(x1, y1, x2, y2, class)`` in tile pixel coordinates
                            (an empty array when nothing overlaps),
    * ``_overlayed.png``  - the tile with each clipped box filled in white.

    Each box is clipped by intersecting it with the tile rectangle, so a box
    contributes exactly once to every tile it overlaps, however many tiles it
    spans, and never as a zero-area rectangle.

    Args:
        i: Path of the source image.
        j: Path of the YOLO label file (``class xc yc w h`` per line, normalized
            to the image size). Blank lines are ignored; a line with fewer than
            five fields or non-numeric fields raises IndexError / ValueError.
        grid: Tile edge length in pixels.
        crop_save_dir: Existing directory that receives the output files.
    """
    stem = os.path.basename(os.path.splitext(i)[0])

    # The context manager releases the source file once all tiles are written.
    with Image.open(i) as img_pil:
        img_w, img_h = img_pil.size
        boxes = _read_yolo_boxes(j, img_w, img_h)

        for x in range(img_w // grid):
            for y in range(img_h // grid):
                tile_base = (crop_save_dir + '/' + stem +
                             '_x_' + str(x).zfill(3) + '_y_' + str(y).zfill(3))

                cropped_img = img_pil.crop((x*grid, y*grid, (x+1)*grid, (y+1)*grid))
                cropped_img.save(tile_base + '.png')

                cor_temp = _clip_boxes_to_tile(boxes, x*grid, y*grid, grid)
                np.save(tile_base + '.npy', cor_temp)

                # The clean tile is already on disk, so drawing on the same
                # object only affects the overlay copy saved below.
                draw = ImageDraw.Draw(cropped_img)
                for c in cor_temp:
                    draw.rectangle((c[0], c[1], c[2], c[3]), fill=(255, 255, 255))
                cropped_img.save(tile_base + '_overlayed.png')

In [None]:
grid = 680 # crop size
crop_save_dir = img_d + '/crop_save_0' + str(grid)
# exist_ok=True keeps this cell re-runnable: without it a second run stops with
# FileExistsError because the directory was created the first time.
os.makedirs(crop_save_dir, exist_ok=True)

In [None]:
# Run process_image for every (image, label) pair through joblib.
# zip() pairs the two sorted lists by position, so they must line up one-to-one.
Parallel(n_jobs=-1)(delayed(process_image)(i, j, grid, crop_save_dir) for i, j in zip(imgs, labels))

In [None]:
######

In [None]:
# Build the tile directory name from `grid` so it follows the crop size chosen
# above instead of being fixed to 680.
d = os.path.join(img_d, 'crop_save_0' + str(grid))
img_f = sorted(glob.glob(d + '/*.png'))
# Leave out the '*_overlayed.png' preview images; only the plain tiles are used.
img_f = [i for i in img_f if not i.endswith('overlayed.png')]
npy_f = sorted(glob.glob(d + '/*.npy'))
print(len(img_f))
print(len(img_f) == len(npy_f))

In [None]:
# True only when each tile image and the .npy at the same position share a base name.
temp = [
    os.path.basename(os.path.splitext(png)[0]) == os.path.basename(os.path.splitext(npy)[0])
    for png, npy in zip(img_f, npy_f)
]
all(temp)

In [None]:
# List every tile whose width and height differ. A single 'OK' is printed when
# all tiles are square, instead of one 'OK' line per tile.
non_square = []
for i in img_f:
    # Context manager closes each file once its size has been read.
    with Image.open(i) as img:
        if img.size[0] != img.size[1]:
            non_square.append(i)

for i in non_square:
    print(i)
if not non_square:
    print('OK')

In [None]:
# Output folder for the resized tiles and their label files.
target_d = d + '/resized'
# exist_ok=True so re-running the cell does not fail on the existing folder.
os.makedirs(target_d, exist_ok=True)

In [None]:
SIZE = 640

def resize_img0640(i, n):
    """Resize one tile to SIZE x SIZE and convert its boxes to a YOLO label file.

    Outputs go to the module-level ``target_d`` directory and are named
    ``<tile stem>_0680_resized_to_<SIZE>`` (the ``_0680_`` part is fixed text).
    The ``.png`` is always written. The ``.txt`` is written only when the tile
    has at least one box, so tiles without boxes end up with no label file.

    Box coordinates are normalized by the tile width for both axes, i.e. the
    tile is treated as square.

    Args:
        i: Path of the tile image.
        n: Path of the ``.npy`` file holding rows ``(x1, y1, x2, y2, class)`` in
            tile pixel coordinates.
    """
    out_base = (target_d + '/' +
                os.path.basename(os.path.splitext(i)[0]) +
                '_0680_resized_to_' + str(SIZE))

    # Close the source tile as soon as the resized copy has been saved.
    with Image.open(i) as img:
        imgsize = img.size[0]
        img.resize((SIZE, SIZE), Image.LANCZOS).save(out_base + '.png')

    temp_npy = np.load(n).astype('float32')
    result = []
    for t in temp_npy:
        x_min = t[0]/imgsize
        y_min = t[1]/imgsize
        x_max = t[2]/imgsize
        y_max = t[3]/imgsize
        label = int(t[4])
        x_center = x_min + (x_max - x_min)/2
        y_center = y_min + (y_max - y_min)/2
        width = x_max - x_min
        height = y_max - y_min
        result.append(' '.join(str(v) for v in (label, x_center, y_center, width, height)) + '\n')

    # Write the label file once, after all rows are collected, rather than
    # rewriting it for every box. No file is created for a tile without boxes.
    if result:
        with open(out_base + '.txt', 'w') as f:
            f.writelines(result)

In [None]:
# Run resize_img0640 for every (tile image, .npy) pair through joblib.
# zip() pairs the two sorted lists by position.
Parallel(n_jobs=-1)(delayed(resize_img0640)(i, n) for i, n in zip(img_f, npy_f))

In [None]:
# List the resized images and label files and print how many of each exist.
img_f = sorted(glob.glob('{}/*.png'.format(target_d)))
txt_f = sorted(glob.glob('{}/*.txt'.format(target_d)))
print(len(img_f), len(txt_f), sep='\n')

In [None]:
# Folder that receives resized images that have no label file.
img_only_d = d + '/resized_imgonly'
# exist_ok=True so re-running the cell does not fail on the existing folder.
os.makedirs(img_only_d, exist_ok=True)

In [None]:
# Move every resized image that has no matching .txt into img_only_d.
txt_base = [os.path.basename(os.path.splitext(i)[0]) for i in txt_f]
# Build the lookup set once, outside the loop, instead of once per image.
txt_stems = set(txt_base)
for i in img_f:
    if os.path.basename(os.path.splitext(i)[0]) not in txt_stems:
        shutil.move(i, img_only_d)

In [None]:
# Re-list what is left in target_d; the cell output is True when the number of
# images equals the number of label files.
img_f = sorted(glob.glob(target_d + '/*.png'))
txt_f = sorted(glob.glob(target_d + '/*.txt'))
len(img_f) == len(txt_f)

In [None]:
# True only when each image and the label at the same position share a base name.
temp = [
    os.path.basename(os.path.splitext(png)[0]) == os.path.basename(os.path.splitext(txt)[0])
    for png, txt in zip(img_f, txt_f)
]
all(temp)

In [None]:
# Final destinations: images go to target_img, label files go to target_lab.
target_img_d = os.path.join(img_d, 'target_img')
target_lab_d = os.path.join(img_d, 'target_lab')
# exist_ok=True so re-running the cell does not fail on the existing folders.
os.makedirs(target_img_d, exist_ok=True)
os.makedirs(target_lab_d, exist_ok=True)
for i in img_f:
    shutil.move(i, target_img_d)
for i in txt_f:
    shutil.move(i, target_lab_d)

In [None]:
# Images that were moved to img_only_d because they had no label file.
img_only_f = sorted(glob.glob(img_only_d + '/*.png'))
print(len(img_only_f))

In [None]:
# Give every label-less image an empty .txt file with the same stem next to it.
for png_path in img_only_f:
    touch_file = pathlib.Path(png_path).with_suffix('.txt')
    touch_file.touch()

In [None]:
# List the image-only folder: print the image count, then whether every image
# now has a .txt file (by count).
img_f = sorted(glob.glob('{}/*.png'.format(img_only_d)))
txt_f = sorted(glob.glob('{}/*.txt'.format(img_only_d)))
print(len(img_f), len(img_f) == len(txt_f), sep='\n')

In [None]:
# Move the image-only files to the final folders: images first, then their
# empty label files.
for paths, destination in ((img_f, target_img_d), (txt_f, target_lab_d)):
    for path in paths:
        shutil.move(path, destination)