In [None]:
import matplotlib.pyplot as plt
import tifffile as tif
import pandas as pd
import PIL.Image as Image
import numpy as np
import colorsys
import zipfile
import os
from explore import *

In [None]:
import os, pickle, shutil

# Load the pickled object stored in modalities.pkl.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
modalities_path = "../data_preprocess/modalities/modalities.pkl"
with open(modalities_path, "rb") as pkl_file:  # pickle needs binary mode
    loaded_data = pickle.load(pkl_file)

# Total number of items across all values of the loaded mapping
sum(1 for group in loaded_data.values() for _item in group)

In [None]:
dataroot = "../../data"
zenodo = ZenodoNeurIPS('/zenodo')
raw_root = dataroot + "/raw"
zenodo_folder = zenodo.root + "/"
unzip_dataset(raw_root, folder=zenodo_folder)
files_by_type = list_dataset(raw_root, folder=zenodo_folder)

# Bar chart: number of listed files for each image type
file_counts = [len(files_by_type[ext]) for ext in IMAGE_TYPES]
plt.figure()
ax = plt.subplot(2, 1, 1)
plt.bar(IMAGE_TYPES, file_counts, label='Files')
plt.title("File types in dataset")
plt.xlabel("File type")
plt.ylabel("Count")
plt.legend()
plt.show()


In [None]:
# Bucket every listed file by dataset category and by file extension.
# The loop variable is called `ext` instead of `type` so that the built-in
# `type` is not overwritten in the notebook's global namespace.
categories = [LABELED, MASK, UNLABELED, SYNTHETIC]
file_types = {cat: {ext: set() for ext in IMAGE_TYPES} for cat in categories}

for ext in IMAGE_TYPES:
    for filepath in files_by_type[ext]:
        file_types[zenodo.categorize(filepath)][ext].add(filepath)

# Per category: total number of files, then the per-extension breakdown
for cat in categories:
    types = file_types[cat]
    counts = {k: len(s) for k, s in types.items()}
    print(cat, sum(counts.values()), counts)


In [None]:
x = np.arange(len(IMAGE_TYPES))  # the label locations
# Derive the bar width from the number of categories so that each group fills
# 80% of its slot. A fixed 0.25 with four categories fills the slot completely,
# which makes neighbouring groups touch.
n_categories = len(file_types)
width = 0.8 / max(n_categories, 1)

# Grouped bar chart: one bar per category inside each file-type group
plt.figure()
ax = plt.subplot(2, 1, 1)

for index, (cat, data) in enumerate(file_types.items()):
    plt.bar(x + index * width, [len(data[key]) for key in IMAGE_TYPES], width, label=cat)

plt.title("File types in dataset")
plt.xlabel("File type")
# Put each tick under the centre of its group of bars
plt.xticks(x + width * (n_categories - 1) / 2, IMAGE_TYPES)
plt.ylabel("Count")
plt.legend()
plt.show()

In [None]:
# Run the metaframe preparation, then process the masks of every frame it yields.
prepare_metaframes(dataroot)
for name, df in enumerate_frames(dataroot):
    # Frames whose name contains ".labels" are compared with the MASK files,
    # every other frame with the SYNTHETIC files.
    category = MASK if (".labels" in name) else SYNTHETIC
    # The "Mask" column must list exactly the .tiff files collected for that category
    assert set(df["Mask"]) == file_types[category][".tiff"]
    #preprocess_images(dataroot, df)
    preprocess_masks(dataroot, df)
    save_maskframes(dataroot, df)
    pass

In [None]:
def sanity_check_frame(mask_path, meta_path):
    """Check that a mask image and its metadata CSV describe the same frame.

    Args:
        mask_path: Path of the mask image, read with ``cv2.imread``.
        meta_path: Path of a CSV with the columns 'Left', 'Top', 'Right',
            'Bottom' and 'Area' (one row per box).

    Raises:
        AssertionError: If the image cannot be read, if its first two
            dimensions differ from the largest 'Bottom' / 'Right' values, if
            the areas do not sum to width * height, if that sum is zero, or
            if any row fails ``sanity_check_box``.
    """
    tensor = cv2.imread(mask_path)
    # cv2.imread reports failure by returning None instead of raising; fail
    # here with a readable message rather than on `.shape` below.
    assert tensor is not None, f"Could not read mask image: {mask_path}"
    df = pd.read_csv(meta_path)
    width = df['Right'].max()
    height = df['Bottom'].max()
    assert tensor.shape[0] == height
    assert tensor.shape[1] == width
    # The boxes' areas must add up to the full frame
    area = df['Area'].sum()
    assert (width * height) == area
    assert area != 0
    for index in range(len(df)):
        sanity_check_box(width, height, df.iloc[index])

def sanity_check_box(width, height, bdf):
    """Assert that a box is well-formed and lies inside a width x height frame.

    Args:
        width: Frame width.
        height: Frame height.
        bdf: Mapping-like box with the keys 'Left', 'Top', 'Right', 'Bottom'
            and 'Area'.

    Raises:
        AssertionError: If the box extent is negative or larger than the
            frame, if 'Area' is negative or exceeds the box extent, or if an
            edge lies outside the frame.
    """
    box_w = bdf['Right'] - bdf['Left']
    box_h = bdf['Bottom'] - bdf['Top']
    assert 0 <= box_w <= width
    assert 0 <= box_h <= height
    # The recorded area can never exceed the bounding rectangle
    assert 0 <= bdf['Area'] <= box_w * box_h
    for edge in ('Left', 'Right'):
        assert 0 <= bdf[edge] <= width
    for edge in ('Top', 'Bottom'):
        assert 0 <= bdf[edge] <= height

class DataSample:
    """One dataset row together with helpers that draw random crop boxes.

    Every box returned by the sampling helpers is a dict with the keys
    'Left', 'Top', 'Right', 'Bottom', 'Width', 'Height' and 'Area'.
    The ``random`` argument of those helpers must provide the methods they
    call (``randint``, ``triangular``, ``beta``, ``choice``); the notebook
    passes a ``numpy.random.RandomState``, whose ``randint`` excludes the
    upper bound.
    """

    def __init__(self, dataroot, df):
        """Store the data root and the row, then derive all file paths.

        Args:
            dataroot: Root directory prepended to every derived path.
            df: Row providing "Path" and "Mask"; other methods also read
                "Width" and "Height" from it.
        """
        self.dataroot = dataroot
        self.df = df
        self.init_paths(df["Path"], df["Mask"])

    def init_paths(self, image_path, mask_path):
        """Derive the raw, processed and normalized file paths of this sample.

        Paths are built from ``self.dataroot`` (not from a notebook-level
        ``dataroot`` global) so the instance always uses the root it was
        constructed with.

        Args:
            image_path: Image path appended to ``<dataroot>/raw``.
            mask_path: Mask path appended to ``<dataroot>/raw``.
        """
        root = self.dataroot
        self.raw_image = root + "/raw" + image_path
        self.raw_mask = root + "/raw" + mask_path
        self.norm_image = root + "/processed" + target_file(image_path, ".png")
        self.bw_mask = root + "/processed" + target_file(mask_path, ".png")
        self.meta_frame = root + "/processed" + target_file(mask_path, ".csv")
        self.normal_image = root + "/preprocessing_outputs/normalized_data" + target_file(image_path, ".png")
        self.normal_mask = root + "/preprocessing_outputs/normalized_data" + target_file(mask_path, ".png")

    def __str__(self):
        """Return the string form of the underlying row."""
        return str(self.df)

    # P(width, height) = width * height / maxArea
    # P(width, height | area) =
    def randboxbad4(self, random, width, height):
        """Draw a box whose sides follow a triangular distribution.

        Each side is drawn from ``triangular(0, size+1, size+1)`` (mode at the
        upper end) and truncated to int; the position is then drawn uniformly
        among the offsets that keep the box inside the frame.
        """
        rwidth = int(random.triangular(0, width+1, width+1))
        rheight = int(random.triangular(0, height+1, height+1))
        left = random.randint(0, width+1 - rwidth)
        right = left + rwidth
        top = random.randint(0, height+1 - rheight)
        bottom = top + rheight
        area = rwidth * rheight
        return { 'Left': left, 'Top': top, 'Right': right, 'Bottom': bottom, 'Width': rwidth, 'Height': rheight, 'Area': area }

    def randboxu(self, random, width, height):
        """Draw a box with the frame's aspect ratio and a randomly drawn area.

        The area is drawn with ``randint(0, width * height + 1)``; the sides
        are derived from it (truncated to int), so the returned 'Area' can be
        smaller than the drawn one.
        """
        # area = height**2 * ratio
        ratio = width/height if height else 1
        rarea = random.randint(0, width * height + 1)
        rheight = int(np.sqrt(rarea / ratio))
        rwidth = int(rheight * ratio)
        left = random.randint(0, width+1 - rwidth)
        right = left + rwidth
        top = random.randint(0, height+1 - rheight)
        bottom = top + rheight
        area = rwidth * rheight
        return { 'Left': left, 'Top': top, 'Right': right, 'Bottom': bottom, 'Width': rwidth, 'Height': rheight, 'Area': area }

    def randboxbad3(self, random, width, height, rwidth=None, rheight=None):
        """Draw a box with independently drawn (or caller-supplied) sides.

        A falsy ``rwidth`` / ``rheight`` (None or 0) is replaced by a value
        from ``randint(0, size+1)``.
        """
        rwidth = rwidth or random.randint(0, width+1)
        rheight = rheight or random.randint(0, height+1)
        left = random.randint(0, width+1 - rwidth)
        right = left + rwidth
        top = random.randint(0, height+1 - rheight)
        bottom = top + rheight
        area = rwidth * rheight
        return { 'Left': left, 'Top': top, 'Right': right, 'Bottom': bottom, 'Width': rwidth, 'Height': rheight, 'Area': area }

    def randboxbad2(self, random, width, height):
        """Draw a box from two independent coordinates per axis, ordered with min/max."""
        x1, x2 = random.randint(0, width+1), random.randint(0, width+1)
        left, right = min(x1, x2), max(x1, x2)
        y1, y2 = random.randint(0, height+1), random.randint(0, height+1)
        top, bottom = min(y1, y2), max(y1, y2)
        width = right - left
        height = bottom - top
        area = width * height
        return { 'Left': left, 'Top': top, 'Right': right, 'Bottom': bottom, 'Width': width, 'Height': height, 'Area': area }

    def randboxbad(self, random, width, height):
        """Draw a box by picking the near edge first, then the far edge at or beyond it."""
        left = random.randint(0, width+1)
        right = random.randint(left, width+1)
        top = random.randint(0, height+1)
        bottom = random.randint(top, height+1)
        width = right - left
        height = bottom - top
        area = width * height
        return { 'Left': left, 'Top': top, 'Right': right, 'Bottom': bottom, 'Width': width, 'Height': height, 'Area': area }

    def randbox(self, random, width, height, df):
        """Draw a box that contains the box described by ``df``.

        Each edge is drawn between the frame border and the matching edge of
        ``df`` ('Left', 'Top', 'Right', 'Bottom').
        """
        left = random.randint(0, df['Left']+1)
        right = random.randint(df['Right'], width+1)
        top = random.randint(0, df['Top']+1)
        bottom = random.randint(df['Bottom'], height+1)
        width = right - left
        height = bottom - top
        area = width * height
        return { 'Left': left, 'Top': top, 'Right': right, 'Bottom': bottom, 'Width': width, 'Height': height, 'Area': area }

    def lerp(self, a, b):
        """Return ``a + b - a * b``.

        Despite the name this is not a linear interpolation: for inputs in
        [0, 1] the result stays in [0, 1] and is at least as large as both.
        """
        return a + b - (a * b)

    def randobject(self, random, df):
        """Pick a row index of ``df`` with probability inversely proportional to 'Area'.

        Args:
            random: Generator providing ``choice(n, p=...)``.
            df: Frame with an 'Area' column.

        Returns:
            The sampled integer index in ``range(len(df))``.
        """
        weights = df['Area'].sum() / df['Area']
        weights = list(weights / weights.sum())
        # sample an integer according to given weights
        return random.choice(len(df), p=weights)

    def randboxsmart(self, random):
        """Draw a box for this sample's frame from the ``randscalar`` distribution."""
        width = self.df['Width']
        height = self.df['Height']
        scalar = self.randscalar(random)
        relbox = self.relbox(scalar)
        return self.absbox(width, height, relbox)

    def relscalar(self, box):
        """Convert a relative box into ``[x, y, sx, sy]`` form.

        ``sx`` / ``sy`` are the box extents; ``x`` / ``y`` are the offsets
        divided by the remaining free space ``1 - extent`` (0.5 when the
        extent is exactly 1 and no free space is left). Inverse of ``relbox``.

        Args:
            box: ``[left, top, right, bottom]`` in frame-relative units.
        """
        [left, top, right, bottom] = box
        sx = right - left
        sy = bottom - top
        x = left / (1 - sx) if sx != 1 else 0.5
        y = top / (1 - sy) if sy != 1 else 0.5
        return [x, y, sx, sy]

    def relbox(self, scalar):
        """Convert ``[x, y, sx, sy]`` back into ``[left, top, right, bottom]``.

        Args:
            scalar: Position within the free space (``x``, ``y``) and extents
                (``sx``, ``sy``), as produced by ``relscalar`` or ``randscalar``.
        """
        [x, y, sx, sy] = scalar
        # x = left/(1 - s)
        left = x * (1 - sx)
        right = left + sx
        top = y * (1 - sy)
        bottom = top + sy
        return [left, top, right, bottom]

    def absbox(self, width, height, box):
        """Scale a relative box to integer coordinates.

        Does not read ``self``, so it can be called as
        ``DataSample.absbox(None, ...)``.

        Args:
            width: Frame width used to scale the horizontal edges.
            height: Frame height used to scale the vertical edges.
            box: ``[left, top, right, bottom]`` in frame-relative units.

        Returns:
            Box dict whose edges are rounded to the nearest integer and whose
            'Width', 'Height' and 'Area' are derived from the rounded edges.
        """
        [left, top, right, bottom] = box
        left = int(np.rint(left * width))
        right = int(np.rint(right * width))
        top = int(np.rint(top * height))
        bottom = int(np.rint(bottom * height))
        width = right - left
        height = bottom - top
        area = width * height
        return { 'Left': left, 'Top': top, 'Right': right, 'Bottom': bottom, 'Width': width, 'Height': height, 'Area': area }

    def randscalar(self, random):
        """Draw a random ``[x, y, sx, sy]``.

        ``x`` and ``y`` come from Beta(0.5, 0.5), ``sx`` from Beta(2, 1), and
        ``sy`` is set equal to ``sx``.
        """
        x = random.beta(0.5, 0.5)
        y = random.beta(0.5, 0.5)
        sx = random.beta(2, 1)
        sy = sx #random.beta(2, 1)
        return [x, y, sx, sy]

    def select_slices(self, random):
        """Draw a crop box, usually one that is grown from a sampled object's box.

        Reads the CSV at ``self.meta_frame``, samples a row with
        ``randobject`` and enlarges that row's box by a random scale before
        converting it back to integer coordinates.

        Returns:
            Box dict as produced by ``absbox``.
        """
        df = pd.read_csv(self.meta_frame)
        choice = self.randobject(random, df)
        # Index 0 falls back to a box that is not tied to any row.
        # NOTE(review): presumably row 0 is the background entry - confirm.
        if not choice:
            return self.randboxsmart(random)
        df = df.iloc[choice]
        width, height = self.df['Width'], self.df['Height']
        relbox = [df['Left']/width, df['Top']/height, df['Right']/width, df['Bottom']/height]
        [x, y, sx, sy] = self.relscalar(relbox)
        [_, _, s, _] = self.randscalar(random)
        # Keep the position, grow both extents with the same random scale
        scalar = [x, y, self.lerp(sx, s), self.lerp(sy, s)]
        relbox = self.relbox(scalar)
        return self.absbox(width, height, relbox)
    
# Sampling experiment: draw many crop boxes for one sample and record every box field.
# absbox does not read `self`, so None is passed in its place; the all-zero box
# is only used to obtain the key names of a box dictionary.
empty = DataSample.absbox(None, 0, 0, [0, 0, 0, 0])
print(empty)
# One list per box field, filled with the values of every sampled crop
samples = {key:[] for key in empty.keys()}
# Accumulators that are only updated by the commented-out lines further down
tsamples = np.zeros((1536, 2040))
osamples = [] #[0] * 27
for name, df in enumerate_frames(dataroot):
    # NOTE(review): unseeded generator, so the sampled crops differ between runs
    random = np.random.RandomState()
    # NOTE(review): df[:1] bounds the loop to at most one iteration while
    # df[1:2] selects the frame's second row - confirm this row choice is intended.
    for index in range(len(df[:1])):
        sample = DataSample(dataroot, df[1:2].iloc[index])
        print(sample.df['Width'], sample.df['Height'])
        # Draw 1000 crops per object listed for this sample.
        # NOTE(review): tqdm is only imported explicitly in a later cell of this
        # notebook - confirm that `from explore import *` provides it here.
        for _ in tqdm(range(sample.df['Objects'] * 1000)):
            #sanity_check_frame(sample.bw_mask, sample.meta_frame)
            crop = sample.select_slices(random)#, rwidth=256, rheight=256)
            #print(sample.df['Width'], sample.df['Height'], crop)
            # Every crop must lie inside the sample's frame
            sanity_check_box(sample.df['Width'], sample.df['Height'], crop)
            for key in crop.keys():
                samples[key].append(crop[key])
            #tensor = tif.imread(sample.raw_mask)
            #tsamples[crop['Top']:crop['Bottom'], crop['Left']:crop['Right']] += 1
            #tensor = tensor[crop['Top']:crop['Bottom'], crop['Left']:crop['Right']]
            #tif.imshow(tensor, show = True)
            #osamples += list(np.unique(tensor))

    # Only the first frame yielded by enumerate_frames is processed
    break

In [None]:

#tsamples = np.log(tsamples)
#timg = Image.fromarray((tsamples / tsamples.max() * 255).astype('uint8'))
#timg.save("samples.png")

# `sample`, `samples` and `osamples` are left over from the sampling cell above,
# which must have been run first.
print(sample.meta_frame)
odf = pd.read_csv(sample.meta_frame)  # only referenced by the commented-out bar plot below
# osamples is only filled by commented-out code in the sampling cell
print(len(np.unique(osamples)), np.unique(osamples))

# Histogram of the 'Area' values of all sampled crops
plt.figure()
plt.subplot(2, 1, 1)
#plt.hist(osamples, bins=np.max(osamples)+1, label='Objects')
#plt.bar(list(range(20)), odf['Area'], label='Objects2')
#plt.hist(tsamples.flatten(), bins=100)
plt.hist(samples['Area'], bins=100)
#plt.hist([-np.log2(n[2]) for n in numbers], bins=100)
plt.title("Frequency of samples")
plt.xlabel("Value")
#plt.xticks([2.0 ** x for x in range(8)])
plt.ylabel("Count")
plt.show()


In [None]:
#sample = "/zenodo/Testing/Public/labels/OpenTest_006_label"
# Mask to inspect, as a path without extension; it is appended to
# dataroot + "/processed" for the CSV and passed with ".tiff" to save_hue_mask.
# Note: this rebinds `sample`, which earlier cells use for a DataSample instance.
sample = "/zenodo/Training-labeled/labels/cell_00854_label"
maskDF = pd.read_csv(dataroot + "/processed" + sample + ".csv")
save_hue_mask(dataroot + "/raw", dataroot + "/processed", sample + ".tiff", maskDF)
# Last expression: display the metadata frame as the cell output
maskDF

In [None]:
# enumerate_frames must yield exactly two (name, frame) pairs; only the frames are kept
data_map, synth_map = (frame for _name, frame in enumerate_frames(dataroot))

print("Valeurs aberrantes?")  # French for "Outliers?"
# Rows with more than 2000 objects
data_map.loc[data_map["Objects"] > 2000]

In [None]:
from skimage import io, segmentation, morphology, exposure
from tqdm import tqdm
from monai.data import PILReader

def load_image(img_path):
    """Read an image file into an array.

    Files whose extension is '.tif' or '.tiff' are read with tifffile; every
    other file is read through MONAI's ``PILReader``, the same way the
    normalized images are loaded elsewhere in this notebook.

    Args:
        img_path: Path of the image file.

    Returns:
        The image data returned by the selected reader.
        NOTE(review): the two readers may order the array axes differently -
        confirm before mixing their results.
    """
    _, _, ext = split_filepath(img_path)
    if ext in ['.tif', '.tiff']:
        return tif.imread(img_path)
    # `read` is an instance method and only opens the file; `get_data` turns
    # the opened image into an (array, metadata) pair.
    reader = PILReader()
    img, _meta = reader.get_data(reader.read(img_path))
    return img

# Compare the size of every normalized image with the extent recorded in its mask CSV.
exts = set()    # file extensions encountered among the image paths
shapes = set()  # shapes of the images whose size matches their metadata
# NOTE(review): the first 180 entries are skipped - presumably to resume an
# earlier run; confirm before relying on the results.
dataset = list(zip(data_map["Path"], data_map["Mask"]))[180:]
for filepath, maskpath in tqdm(dataset):
    # Paths containing "WSI" are skipped
    if "WSI" in filepath: continue
    folder, name, ext = split_filepath(filepath)
    exts.add(ext)
    # Load the normalized PNG that corresponds to the raw image path
    norm_target = f"{dataroot}/preprocessing_outputs/normalized_data"
    target = norm_target + folder + name + ".png"
    img, _ = PILReader().get_data(PILReader().read(target))
    #img = load_image(dataroot + "/raw" + filepath)

    # The mask's metadata CSV gives the expected extent (largest Right / Bottom)
    folder, name, ext = split_filepath(maskpath)
    meta_path = f"{dataroot}/processed" + folder + name + ".csv"
    df = pd.read_csv(meta_path)
    width, height = df['Right'].max(), df['Bottom'].max()
    #print(img.shape, width, height, os.path.split(meta_path)[1])
    # NOTE(review): width is compared with img.shape[0] and height with
    # img.shape[1], i.e. this assumes width-first arrays from PILReader - TODO confirm.
    if width == img.shape[0] and height == img.shape[1]:
        shapes.add(img.shape)
    else:
        # Mismatch: print what mask_frame returns for the raw mask
        print(mask_frame(f"{dataroot}/raw", maskpath))
# Last expression: display the collected extensions and shapes
exts, shapes #file_types[LABELED][".tiff"]

In [None]:
# Distinct sizes of the third dimension among the shapes that have one
{shape[2] for shape in shapes if len(shape) > 2}

In [None]:

# Files with more objects than this are reported as outliers and left out of the histogram
MAX_OBJECTS = 2000

objects_per_file = data_map["Objects"]
print(objects_per_file[objects_per_file > MAX_OBJECTS])
# Keep files with exactly MAX_OBJECTS objects: a strict `<` would drop them
# from both the outlier report above and the histogram below.
num_obj = objects_per_file[objects_per_file <= MAX_OBJECTS]
print(len(num_obj), sum(num_obj), set(num_obj))

# Histogram of the per-file object counts on a log2 scale
plt.figure()
plt.subplot(2, 1, 1)
# log2(0) is -inf, which cannot be binned, so files without any object are left out of the plot
plt.hist(np.log2(num_obj[num_obj > 0].to_numpy()), bins=100)
plt.title("Number of segmented objects per file")
plt.xlabel("Number of objects (log2)")
#plt.xticks([2.0 ** x for x in range(5)])
plt.ylabel("Count")
plt.show()

In [None]:
# Width x Height of every mask, computed once and reused below
area = data_map["Width"] * data_map["Height"]
print(len(set(area)))  # number of distinct mask sizes

print("Too big:", area[area > 10 ** 7].count())
#area = area[area < 10 ** 7]

# Histogram of the mask sizes on a log2 scale
plt.figure()
plt.subplot(2, 1, 1)
plt.hist(np.log2(area), bins=100)
plt.title("Size of mask files")
plt.xlabel("Size of file (log2)")
#plt.xticks([2.0 ** x for x in range(5)])
plt.ylabel("Count")
plt.show()

In [None]:
#print(set(n[2] for n in numbers))

# Share of each mask that is not background, assuming "Background" holds a
# pixel count: 1 - Background / Width / Height
background_share = data_map["Background"] / data_map["Width"] / data_map["Height"]
density = 1 - background_share

# Histogram of the per-file density
plt.figure()
plt.subplot(2, 1, 1)
plt.hist(density, bins=100)
plt.title("Density of segmented objects per file")
plt.xlabel("Density of objects")
plt.ylabel("Count")
plt.show()

In [None]:
# Radius of a disc whose area equals the density divided evenly among the
# file's objects: sqrt(density / Objects / pi). `density` comes from the previous cell.
# NOTE(review): a file with zero objects would divide by zero here - confirm none exist.
radius = (density / data_map["Objects"] / np.pi) ** .5

# Histogram of the per-file radius
plt.figure()
plt.subplot(2, 1, 1)
plt.hist(radius, bins=100)
#plt.hist([-np.log2(n[2]) for n in numbers], bins=100)
plt.title("Average radius of segmented objects per file")
# NOTE(review): the label says "%", but the values are not multiplied by 100 - confirm the intended unit.
plt.xlabel("Radius of objects (in %)")
#plt.xticks([2.0 ** x for x in range(8)])
plt.ylabel("Count")
plt.show()