In [None]:
import matplotlib.pyplot as plt
import tifffile as tif
import pandas as pd
import PIL.Image as Image
import numpy as np
import colorsys
import zipfile
import os
from explore import *

In [None]:
# Root of the data tree; the raw files are read from "<dataroot>/raw".
dataroot = "../../data"

# One initially empty bucket of file paths per known extension.
files_by_type = dict((known_ext, set()) for known_ext in (IMAGE_TYPES + MISC_TYPES))

# File every enumerated path under its extension; entries without one are ignored.
# NOTE(review): an extension outside IMAGE_TYPES + MISC_TYPES raises KeyError here.
for filepath, dirpath, name, ext in enumerate_dataset(dataroot + "/raw", folder="/zenodo/"):
    if ext:
        files_by_type[ext].add(filepath)

# Bar chart of the number of files found for each image extension.
plt.figure()
ax = plt.subplot(2, 1, 1)
ax.bar(IMAGE_TYPES, [len(files_by_type[known_ext]) for known_ext in IMAGE_TYPES], label='Files')
ax.set_title("File types in dataset")
ax.set_xlabel("File type")
ax.set_ylabel("Count")
ax.legend()
plt.show()


In [None]:
# Category names used to bucket every dataset file.
LABELED = 'Labeled'
MASK = 'Mask'
UNLABELED = 'Unlabeled'
SYNTHETIC = 'Synthetic'

# file_types[category][extension] -> set of file paths (every set starts empty).
file_types = dict(
    (category, dict((extension, set()) for extension in (IMAGE_TYPES + MISC_TYPES)))
    for category in (LABELED, MASK, UNLABELED, SYNTHETIC)
)

def mask_map(category):
    """Yield (image folder, mask folder) path fragments for a mask category.

    For ``MASK`` the pairs link image folders to their label folders; for
    ``SYNTHETIC`` they link image folders to the ``osilab_seg`` folders.
    Any other category yields nothing.
    """
    if category == MASK:
        yield from (
            ("/Public/images", "/Public/labels"),
            ("/Public/WSI", "/Public/WSI-labels"),
            ("/Training-labeled/images", "/Training-labeled/labels"),
            ("/Tuning/images", "/Tuning/labels"),
        )
    if category == SYNTHETIC:
        yield from (
            ("/Hidden/images", "/Hidden/osilab_seg"),
            ("/Public/images", "/Public/1st_osilab_seg"),
            ("/Public/WSI", "/Public/osilab_seg_WSI"),
        )

def categorize(dirpath):
    """Classify a path as MASK, LABELED, SYNTHETIC or UNLABELED.

    The decision is a plain substring test of ``dirpath`` against the folder
    fragments produced by ``mask_map``. Within each MASK pair the mask folder
    is tested before the image folder, which matters because "/Public/WSI" is
    a prefix of "/Public/WSI-labels".
    """
    for image_dir, label_dir in mask_map(MASK):
        if label_dir in dirpath:
            return MASK
        if image_dir in dirpath:
            return LABELED
    # Only the mask side of the synthetic pairs is relevant for classification.
    if any(synth_dir in dirpath for _, synth_dir in mask_map(SYNTHETIC)):
        return SYNTHETIC
    return UNLABELED

# Distribute every file into file_types[category][extension].
# The loop variable is not named `type`: at notebook (module) level that would
# rebind the builtin `type` for every cell executed afterwards.
for file_ext, paths in files_by_type.items():
    for filepath in paths:
        file_types[categorize(filepath)][file_ext].add(filepath)

In [None]:
# Per-category summary: total number of files and the breakdown per extension.
for cat in [LABELED, MASK, UNLABELED, SYNTHETIC]:
    counts = {ext_key: len(paths) for ext_key, paths in file_types[cat].items()}
    print(cat, sum(counts.values()), counts)

x = np.arange(len(IMAGE_TYPES))  # one group of bars per image extension
n_groups = len(file_types)       # number of bars inside each group
# Share 80% of each unit slot between the bars so neighbouring groups do not touch.
width = 0.8 / n_groups

# Grouped bar chart: one bar per category for every image extension.
plt.figure()
ax = plt.subplot(2, 1, 1)

for index, (cat, data) in enumerate(file_types.items()):
    plt.bar(x + index * width, [len(data[key]) for key in IMAGE_TYPES], width, label=cat)

plt.title("File types in dataset")
plt.xlabel("File type")
# Bars are centred at x + i * width for i in 0..n_groups-1, so the middle of a
# group is x + width * (n_groups - 1) / 2; put the tick label there.
plt.xticks(x + width * (n_groups - 1) / 2, IMAGE_TYPES)
plt.ylabel("Count")
plt.legend()
plt.show()

In [None]:
# Image path -> ground-truth mask path; starts empty.
mask_assoc = dict()
# Copy of every known ".tiff" mask path; entries are removed as images claim them,
# so whatever is left afterwards is a mask without an image.
visited = {*file_types[MASK][".tiff"]}

def get_mask_path(img_path):
    """Return the expected ground-truth mask path for an image, or None.

    The mask is looked up in the sibling label folder ("/WSI" -> "/WSI-labels",
    "/images" -> "/labels") under the name "<name>_label.tiff". ``None`` is
    returned when the folder matches neither layout.

    NOTE(review): assumes ``split_filepath`` returns a folder that already ends
    with a path separator, since the file name is appended directly - confirm.
    """
    folder, name, _ext = split_filepath(img_path)
    # Test for exactly the fragment that gets replaced: a folder that merely
    # contains "WSI" (without the leading slash) would otherwise be returned
    # unchanged and point back into the image folder.
    if "/WSI" in folder:
        return folder.replace("/WSI", "/WSI-labels") + name + "_label.tiff"
    if "/images" in folder:
        return folder.replace("/images", "/labels") + name + "_label.tiff"
    return None

# Pair every labeled image with its mask file and tick the mask off in `visited`.
for ext in IMAGE_TYPES:
    for path in file_types[LABELED][ext]:
        mask_path = get_mask_path(path)
        if mask_path and os.path.exists(dataroot + "/raw" + mask_path):
            mask_assoc[path] = mask_path
            # discard() instead of remove(): two images with the same stem but
            # different extensions resolve to the same mask, and remove() would
            # abort the whole loop with a bare KeyError on the second one.
            visited.discard(mask_path)
        else:
            print("Missing mask: ", dataroot + "/raw", mask_path, path)

print(len(mask_assoc))
# Anything still listed here is a mask file that no labeled image points to.
print(visited)
assert not visited, "mask files without a matching labeled image"

In [None]:
# Image path -> synthetic (osilab) segmentation path; starts empty.
synth_assoc = dict()
# Copy of every known ".tiff" synthetic mask path; entries are removed as images
# claim them, so whatever is left afterwards is a mask without an image.
visited = {*file_types[SYNTHETIC][".tiff"]}

def get_synth_path(img_path):
    """Return the expected synthetic segmentation path for an image, or None.

    The first rule whose marker occurs in the image folder decides which
    folder fragment is swapped; the result is "<new folder><name>_label.tiff".
    ``None`` is returned when no rule applies.
    """
    folder, name, _ext = split_filepath(img_path)
    # (marker that must occur in the folder, fragment to replace, replacement)
    rules = (
        ("/Hidden", "/images", "/osilab_seg"),
        ("/Public/images", "/images", "/1st_osilab_seg"),
        ("/Public/WSI", "/WSI", "/osilab_seg_WSI"),
    )
    for marker, old_fragment, new_fragment in rules:
        if marker in folder:
            return folder.replace(old_fragment, new_fragment) + name + "_label.tiff"
    return None

# Pair unlabeled and labeled images with their synthetic segmentation, when one
# exists on disk, and tick the segmentation off in `visited`.
for cat in [UNLABELED, LABELED]:
    for ext in IMAGE_TYPES:
        for path in file_types[cat][ext]:
            synth_path = get_synth_path(path)
            if synth_path and os.path.exists(dataroot + "/raw" + synth_path):
                synth_assoc[path] = synth_path
                # discard() instead of remove(): two images with the same stem
                # but different extensions resolve to the same file, and
                # remove() would abort the loop with a bare KeyError.
                visited.discard(synth_path)

print(len(synth_assoc))
# Anything still listed here is a synthetic mask that no image points to.
print(visited)
assert not visited, "synthetic masks without a matching image"

In [None]:
# Build one frame per association table and save each as CSV next to the raw data.
# The ground-truth frame is written before the synthetic one is computed.
data_map = dataset_frame(f"{dataroot}/raw", mask_assoc)
data_map.to_csv(f"{dataroot}/zenodo.labels.csv")

synth_map = dataset_frame(f"{dataroot}/raw", synth_assoc)
synth_map.to_csv(f"{dataroot}/zenodo.synth.csv")

In [None]:
# For every dataset frame found under dataroot: save the mask frames, then
# preprocess the masks.
# Debug aid for a single file (left disabled):
#   save_hue_mask(dataroot + "/raw", dataroot + "/processed", "/zenodo/Testing/Public/labels/OpenTest_006_label.tiff")
for df in enumerate_datasets(dataroot):
    save_maskframes(dataroot, df)
    preprocess_masks(dataroot, df)

In [None]:
# Show the rows with more than 2000 objects (the printed heading is French for "Outliers?").
print("Valeurs aberrantes?")
data_map.loc[data_map["Objects"] > 2000]

In [None]:

# Object counts above this limit are listed as outliers and left out of the plot.
max_objects = 2000

all_counts = data_map["Objects"]
print(all_counts[all_counts > max_objects])
# `<=` so that a file with exactly `max_objects` objects is not lost between the
# outlier listing (strictly greater) and the histogram.
num_obj = all_counts[all_counts <= max_objects]
print(len(num_obj), sum(num_obj), set(num_obj))

# log2(0) is -inf and a non-finite value makes hist() fail, so files without
# any object are reported separately instead of being plotted.
plottable = num_obj[num_obj > 0]
print("Files without objects:", len(num_obj) - len(plottable))

# Histogram of the per-file object count on a log2 scale.
plt.figure()
plt.subplot(2, 1, 1)
plt.hist(np.log2(plottable), bins=100)
plt.title("Number of segmented objects per file")
plt.xlabel("Number of objects (log2)")
plt.ylabel("Count")
plt.show()

In [None]:
# Pixel area of every mask, computed once and reused below.
area = data_map["Width"] * data_map["Height"]

# Number of distinct areas in the dataset.
print(len(set(area)))
# Number of masks larger than 10 million pixels (they are still plotted).
print("Too big:", area[area > 10 ** 7].count())

# Histogram of the mask area on a log2 scale.
plt.figure()
plt.subplot(2, 1, 1)
plt.hist(np.log2(area), bins=100)
plt.title("Size of mask files")
plt.xlabel("Size of file (log2)")
plt.ylabel("Count")
plt.show()

In [None]:
# Fraction of each mask that is not background:
# 1 - Background / Width / Height, evaluated left to right.
density = 1 - (data_map["Background"] / data_map["Width"] / data_map["Height"])

# Histogram of that fraction over all files.
density_ax = plt.figure().add_subplot(2, 1, 1)
density_ax.hist(density, bins=100)
density_ax.set_title("Density of segmented objects per file")
density_ax.set_xlabel("Density of objects")
density_ax.set_ylabel("Count")
plt.show()

In [None]:
# Mean object radius relative to the image size: the covered area fraction is
# split evenly between the objects and each share is treated as a disc
# (area = pi * r^2), hence r = sqrt(density / objects / pi).
radius = (density / data_map["Objects"] / np.pi) ** .5

# A file with 0 objects produces NaN (0/0) or inf (x/0), and hist() cannot bin
# non-finite values, so only finite radii are plotted.
finite_radius = radius[np.isfinite(radius)]

plt.figure()
plt.subplot(2, 1, 1)
# `radius` is a fraction; scale it by 100 so the values match the "%" on the axis.
plt.hist(100 * finite_radius, bins=100)
plt.title("Average radius of segmented objects per file")
plt.xlabel("Radius of objects (in %)")
plt.ylabel("Count")
plt.show()