In [None]:
import matplotlib.pyplot as plt
import tifffile as tif
import pandas as pd
import PIL.Image as Image
import numpy as np
import colorsys
import zipfile
import os
from explore import *

In [None]:
# Dataset location: raw files are expected under "<dataroot>/raw".
dataroot = "../../data"
zenodo = ZenodoNeurIPS('/zenodo')

# Unpack the archives, then index the resulting files by image type.
raw_root = dataroot + "/raw"
unzip_dataset(raw_root, folder=zenodo.root + "/")
files_by_type = list_dataset(raw_root, folder=zenodo.root + "/")

# Bar chart: number of files found for each image type.
files_per_type = [len(files_by_type[image_type]) for image_type in IMAGE_TYPES]

plt.figure()
ax = plt.subplot(2, 1, 1)
ax.bar(IMAGE_TYPES, files_per_type, label='Files')
ax.set_title("File types in dataset")
ax.set_xlabel("File type")
ax.set_ylabel("Count")
ax.legend()
plt.show()


In [None]:
# Group every file path by dataset category, then by image type:
#   file_types[category][image_type] -> set of file paths
# The loop variable is deliberately NOT called `type`: a top-level `for type in ...`
# rebinds the builtin `type` for the rest of the kernel session.
dataset_categories = [LABELED, MASK, UNLABELED, SYNTHETIC]
file_types = {
    cat: {image_type: set() for image_type in IMAGE_TYPES}
    for cat in dataset_categories
}

for image_type in IMAGE_TYPES:
    for filepath in files_by_type[image_type]:
        # zenodo.categorize is expected to return one of `dataset_categories`;
        # anything else raises KeyError here.
        file_types[zenodo.categorize(filepath)][image_type].add(filepath)

# One summary line per category: total number of files, then the per-type breakdown.
for cat in dataset_categories:
    counts = {image_type: len(paths) for image_type, paths in file_types[cat].items()}
    print(cat, sum(counts.values()), counts)


In [None]:
# Grouped bar chart: one group per image type, one bar per dataset category.
x = np.arange(len(IMAGE_TYPES))  # left-most bar position of each group

# Derive the bar width from the number of series so that each group fills 80% of
# its slot. A fixed width of 0.25 with ticks at `x + width` is only centred for
# three series, whereas `file_types` holds four categories.
n_series = len(file_types)
bar_width = 0.8 / max(n_series, 1)

plt.figure()
ax = plt.subplot(2, 1, 1)

for index, (cat, data) in enumerate(file_types.items()):
    ax.bar(
        x + index * bar_width,
        [len(data[key]) for key in IMAGE_TYPES],
        bar_width,
        label=cat,
    )

ax.set_title("File types in dataset")
ax.set_xlabel("File type")
# Put each tick at the centre of its group of bars.
ax.set_xticks(x + bar_width * (n_series - 1) / 2)
ax.set_xticklabels(IMAGE_TYPES)
ax.set_ylabel("Count")
ax.legend()
plt.show()

In [None]:
# Build the metadata frames, then process the masks referenced by each frame.
prepare_metaframes(dataroot)
for frame_name, frame in enumerate_frames(dataroot):
    # Frames whose name contains ".labels" are checked against the MASK files,
    # every other frame against the SYNTHETIC files.
    if ".labels" in frame_name:
        category = MASK
    else:
        category = SYNTHETIC
    # Sanity check: the frame must list exactly the .tiff files found on disk.
    assert set(frame["Mask"]) == file_types[category][".tiff"]
    # Image preprocessing is currently disabled:
    # preprocess_images(dataroot, frame)
    preprocess_masks(dataroot, frame)
    save_maskframes(dataroot, frame)

In [None]:
# Inspect a single sample: read its processed CSV and pass it to save_hue_mask.
# Alternative sample: "/zenodo/Testing/Public/labels/OpenTest_006_label"
sample = "/zenodo/Training-labeled/labels/cell_00854_label"
processed_root = f"{dataroot}/processed"
maskDF = pd.read_csv(f"{processed_root}{sample}.csv")
save_hue_mask(f"{dataroot}/raw", processed_root, f"{sample}.tiff", maskDF)
maskDF

In [None]:
# enumerate_frames must yield exactly two (name, frame) pairs for this unpacking
# to succeed; the first frame becomes data_map, the second synth_map.
data_map, synth_map = (frame for _frame_name, frame in enumerate_frames(dataroot))

print("Valeurs aberrantes?")  # French for "Outliers?"
data_map.loc[data_map["Objects"] > 2000]

In [None]:
from skimage import io, segmentation, morphology, exposure
from tqdm import tqdm
from monai.data import PILReader

def load_image(img_path):
    """Load an image file and return its pixel data.

    TIFF files (extension ".tif" or ".tiff") are read with tifffile; every
    other extension goes through MONAI's PILReader, used the same way as in
    the rest of this notebook (read, then get_data).

    Args:
        img_path: path of the image file to load.

    Returns:
        The result of ``tif.imread`` for TIFF files, otherwise the image
        element of the ``(image, metadata)`` pair returned by
        ``PILReader.get_data``.
        NOTE(review): PILReader may reorder the first two axes relative to
        tifffile depending on its ``reverse_indexing`` setting -- confirm
        before mixing arrays from both branches.
    """
    _dirpath, _name, ext = split_filepath(img_path)
    if ext in ('.tif', '.tiff'):
        return tif.imread(img_path)

    # PILReader.read is an instance method: calling it on the class binds the
    # path to `self` and raises TypeError. Instantiate the reader and convert
    # the loaded image to an array with get_data.
    reader = PILReader()
    img, _meta = reader.get_data(reader.read(img_path))
    return img

# Compare every normalized image with the extent recorded in its mask metadata.
# `exts` collects the original image extensions seen; `shapes` collects the
# shapes of the images whose first two dimensions match the mask metadata.
exts = set()
shapes = set()

# Loop invariants, built once instead of on every iteration.
norm_target = f"{dataroot}/preprocessing_outputs/normalized_data"
reader = PILReader()

# Only entries from index 180 onwards are checked.
# NOTE(review): the reason for skipping the first 180 entries is not visible here -- confirm.
dataset = list(zip(data_map["Path"], data_map["Mask"]))[180:]
for filepath, maskpath in tqdm(dataset):
    if "WSI" in filepath:
        continue

    # The normalized image is a PNG stored under the same relative folder/name.
    img_folder, img_name, img_ext = split_filepath(filepath)
    exts.add(img_ext)
    target = norm_target + img_folder + img_name + ".png"
    img, _meta = reader.get_data(reader.read(target))

    # The mask metadata is the processed CSV next to the mask's relative path.
    mask_folder, mask_name, _mask_ext = split_filepath(maskpath)
    meta_path = f"{dataroot}/processed" + mask_folder + mask_name + ".csv"
    objects_df = pd.read_csv(meta_path)
    mask_width, mask_height = objects_df['Right'].max(), objects_df['Bottom'].max()

    # The image array is compared as (width, height, ...): shape[0] against the
    # largest 'Right' value and shape[1] against the largest 'Bottom' value.
    if mask_width == img.shape[0] and mask_height == img.shape[1]:
        shapes.add(img.shape)
    else:
        # Mismatch: print the result of mask_frame for this mask so it can be inspected.
        print(mask_frame(f"{dataroot}/raw", maskpath))
exts, shapes

In [None]:
# Distinct sizes of the third dimension among the recorded shapes that have one.
{shape[2] for shape in shapes if len(shape) > 2}

In [None]:

# Distribution of the number of segmented objects per file.
max_objects = 2000  # files with more objects than this are reported as outliers

object_counts = data_map["Objects"]
print(object_counts[object_counts > max_objects])

# Keep everything that is not an outlier. `<=` (rather than `<`) so that a file
# with exactly `max_objects` objects is neither reported above nor silently dropped.
num_obj = object_counts[object_counts <= max_objects]
print(len(num_obj), num_obj.sum(), set(num_obj))

plt.figure()
ax = plt.subplot(2, 1, 1)
# log2(0) is -inf, which makes the histogram range non-finite; files without
# any object are therefore left out of the plot (they remain in `num_obj`).
ax.hist(np.log2(num_obj[num_obj > 0]), bins=100)
ax.set_title("Number of segmented objects per file")
ax.set_xlabel("Number of objects (log2)")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Distribution of Width * Height per entry of data_map.
area = data_map["Width"] * data_map["Height"]  # computed once and reused below

print(len(set(area)))  # number of distinct areas
print("Too big:", area[area > 10 ** 7].count())

plt.figure()
ax = plt.subplot(2, 1, 1)
ax.hist(np.log2(area), bins=100)
ax.set_title("Size of mask files")  # fixed typo: was "Size of of mask files"
ax.set_xlabel("Size of file (log2)")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Density = 1 - Background / Width / Height for every entry of data_map.
# The two divisions are applied one after the other, in this order.
background_share = data_map["Background"] / data_map["Width"] / data_map["Height"]
density = 1 - background_share

plt.figure()
ax = plt.subplot(2, 1, 1)
ax.hist(density, bins=100)
ax.set_title("Density of segmented objects per file")
ax.set_xlabel("Density of objects")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Average object radius per file, modelling each object as a disc:
#   pi * r^2 = density / number_of_objects
# `radius` is therefore a fraction (relative to the square root of the mask area).
radius = (density / data_map["Objects"] / np.pi) ** .5

# The axis is labelled in percent, so convert the fraction before plotting.
radius_percent = radius * 100
# Files without any object give NaN or inf (division by zero); such values make
# the histogram range non-finite, so they are left out of the plot only.
radius_percent = radius_percent[np.isfinite(radius_percent)]

plt.figure()
ax = plt.subplot(2, 1, 1)
ax.hist(radius_percent, bins=100)
ax.set_title("Average radius of segmented objects per file")
ax.set_xlabel("Radius of objects (in %)")
ax.set_ylabel("Count")
plt.show()