In [None]:
import matplotlib.pyplot as plt
import tifffile as tif
import numpy as np
import os

def split_filepath(filepath):
    """Break a path into its directory, base name and extension.

    Args:
        filepath: Path string to decompose.

    Returns:
        A ``(dirpath, name, ext)`` tuple. ``ext`` keeps its leading dot and is
        an empty string when the last path component has no extension.
    """
    stem, extension = os.path.splitext(os.path.basename(filepath))
    return os.path.dirname(filepath), stem, extension

# Root of the raw data; dataset paths used in this notebook are appended to it.
datapath_raw = "../../data/raw"


def enumerate_dataset(folder, root=None):
    """Recursively walk a dataset folder, yielding every entry found.

    Args:
        folder: Path of the entry relative to ``root``; it is concatenated
            to ``root`` as-is, so it is expected to start with "/".
        root: Directory the relative paths are resolved against. Defaults to
            the module-level ``datapath_raw``.

    Yields:
        ``(fullpath, dirpath, name, ext)`` for ``folder`` itself and then for
        every entry below it. An entry whose name has an extension is treated
        as a file and is not descended into.

    Raises:
        OSError: propagated from ``os.listdir`` (e.g. when the folder does not
            exist).
    """
    dirpath, name, ext = split_filepath(folder)
    yield folder, dirpath, name, ext
    if ext:
        return
    if root is None:
        root = datapath_raw
    # Regular files without an extension (e.g. "LICENSE") look like folders to
    # the extension test above; listing them would raise NotADirectoryError.
    if os.path.isfile(root + folder):
        return
    print("Enumerating folder: ", folder)
    # Sorted so the traversal order does not depend on the file system.
    for filename in sorted(os.listdir(root + folder)):
        yield from enumerate_dataset(folder + "/" + filename, root)


In [None]:

# File extensions recognised by the dataset survey.
img_types = [".bmp", ".png", ".tif", ".tiff"]
misc_types = [".md", ".zip", ".txt", ".csv", ".py"]

# One {extension: set of paths} mapping per category. Every mapping (and every
# set inside it) is a distinct object.
synthetic_types, labeled_types, mask_types, unlabeled_types = (
    {extension: set() for extension in img_types + misc_types}
    for _ in range(4)
)

# Substrings looked up in a file's directory path to decide its category.
synthetic_flags = ["osilab"]
mask_flags = ["labels"]
labeled_flags = ["images", "WSI"]
unlabeled_flags = ["Hidden"]

# Folders met during the walk, and archives (named without ".zip") found next to them.
folders = set()
zipped_folders = set()

# Walk the dataset once and file every entry under its category.
for datapath, dirpath, name, ext in enumerate_dataset("/zenodo"):
    if not ext:
        # Entries without an extension are recorded as folders.
        folders.add(dirpath + "/" + name)
        continue

    if ext == ".zip":
        # Stored without the ".zip" suffix so it can be compared to `folders`.
        zipped_folders.add(dirpath + "/" + name)
    if ext in misc_types:
        print("Misc file found: ", datapath)
    elif ext not in img_types:
        # Extensions outside the known lists used to abort the loop with a KeyError.
        print("Unknown file type: ", datapath)

    # Flags are substrings of the directory path; they are tested in this
    # order and the first match wins. Anything unmatched counts as unlabeled.
    if any(flag in dirpath for flag in synthetic_flags):
        bucket = synthetic_types
    elif any(flag in dirpath for flag in unlabeled_flags):
        bucket = unlabeled_types
    elif any(flag in dirpath for flag in mask_flags):
        bucket = mask_types
    elif any(flag in dirpath for flag in labeled_flags):
        bucket = labeled_types
    else:
        bucket = unlabeled_types
    bucket.setdefault(ext, set()).add(datapath)

# An archive with no folder of the same name has not been extracted yet.
for folder in sorted(zipped_folders):
    if folder not in folders:
        print("! WARNING ! Unzip: ", folder)

In [None]:
# Total number of files per category, followed by the per-extension listing.
print(sum(len(s) for s in labeled_types.values()), labeled_types)
print(sum(len(s) for s in mask_types.values()), mask_types)
print(sum(len(s) for s in unlabeled_types.values()), unlabeled_types)
print(sum(len(s) for s in synthetic_types.values()), synthetic_types)

# Number of files per image extension, one list per category.
synthetic_hist, labeled_hist, mask_hist, unlabeled_hist = [[len(data[key]) for key in img_types] for data in [synthetic_types, labeled_types, mask_types, unlabeled_types]]

bar_series = [
    ("Labeled", labeled_hist),
    ("Masks", mask_hist),
    ("Unlabeled", unlabeled_hist),
    ("Synthetic", synthetic_hist),
]

x = np.arange(len(img_types))  # the label locations
# Each group of bars spans 0.8 of a slot so neighbouring groups stay apart.
width = 0.8 / len(bar_series)

# Grouped bar chart: one group per extension, one bar per category.
plt.figure()
ax = plt.subplot(2, 1, 1)
for position, (series_label, counts) in enumerate(bar_series):
    ax.bar(x + position * width, counts, width, label=series_label)
ax.set_title("File types in dataset")
ax.set_xlabel("File type")
# Put each tick in the middle of its group of bars.
ax.set_xticks(x + width * (len(bar_series) - 1) / 2)
ax.set_xticklabels(img_types)
ax.set_ylabel("Count")
ax.legend()
plt.show()

In [None]:
# Image path -> mask path, filled in below for images whose mask file exists.
mask_assoc = dict()
# Starts as every known ".tiff" mask; matched masks are taken out below, so
# whatever remains at the end is a mask without a corresponding image.
visited = {*mask_types[".tiff"]}

def get_mask_path(img_path):
    """Derive the path where the mask of a labeled image is expected.

    Args:
        img_path: Path of the image.

    Returns:
        The mask path, named ``<image name>_label.tiff``. It lives in
        ``<folder>-labels`` when the image folder contains "WSI", otherwise in
        the folder obtained by replacing "/images" with "/labels".
        ``None`` when the folder matches neither pattern.
    """
    folder, name, _ = split_filepath(img_path)
    mask_name = name + "_label.tiff"
    if "WSI" in folder:
        mask_folder = folder + "-labels"
    elif "/images" in folder:
        mask_folder = folder.replace("/images", "/labels")
    else:
        return None
    return mask_folder + "/" + mask_name

# Pair every labeled image with its mask, when that mask exists on disk.
for ext in img_types:
    for path in labeled_types[ext]:
        mask_path = get_mask_path(path)
        if mask_path and os.path.exists(datapath_raw + mask_path):
            mask_assoc[path] = mask_path
            # discard(), not remove(): the same mask can be reached from two
            # images (e.g. same name, different extension) or may not have been
            # listed among the ".tiff" masks; neither case should abort the loop.
            visited.discard(mask_path)
        else:
            print("Missing mask: ", datapath_raw, mask_path)

# Number of image/mask pairs, then the masks that were never matched.
print(len(mask_assoc))
print(visited)

In [None]:
import pandas as pd

# One mask is read per associated image, so this is the real progress total
# (counting every file in mask_types overstated it).
expected = len(mask_assoc)

# One record of statistics per image/mask pair.
numbers = []
for img_path, mask_path in mask_assoc.items():
    print(len(numbers), "/", expected)
    imgT = tif.imread(datapath_raw + mask_path)
    numbers.append({
        "Path": img_path,
        "Mask": mask_path,
        # NOTE(review): assumes masks are 2-D arrays laid out as (rows, columns) - confirm.
        "Width": imgT.shape[1],
        "Height": imgT.shape[0],
        # Highest label value; presumably the object count when labels are
        # consecutive integers starting at 1 - verify against the dataset.
        "Objects": imgT.max(),
        # Number of pixels equal to 0.
        "Background": (imgT == 0).sum(),
    })

data_map = pd.DataFrame(numbers, columns=["Path", "Mask", "Width", "Height", "Objects", "Background"]).set_index("Path")
data_map.to_csv(datapath_raw + "/zenodo/data_map.csv")



In [None]:
# Display the per-image statistics table (indexed by image path).
# A boolean filter such as data_map["Objects"] < 2000 can be used to inspect a subset.
data_map

In [None]:
# Report every record whose "Objects" value is above 2000 (treated as an outlier).
outliers = (entry for entry in numbers if entry["Objects"] > 2000)
for entry in outliers:
    print("Valeurs aberrantes", entry["Objects"], "?", entry["Path"])

In [None]:

# Records above this "Objects" value are reported as outliers and left out of the plot.
max_objects = 2000

num_obj = data_map["Objects"]
print(num_obj[num_obj > max_objects])
# "<=" so that a value of exactly max_objects is kept rather than dropped by
# both the outlier report and the histogram.
num_obj = num_obj[num_obj <= max_objects]
print(len(num_obj), sum(num_obj), set(num_obj))

# Histogram of the per-file object count, on a log2 scale.
plt.figure()
ax = plt.subplot(2, 1, 1)
# log2(0) is -inf, which hist() cannot bin, so files with no object are skipped.
ax.hist(np.log2(num_obj[num_obj > 0].astype(float)), bins=100)
ax.set_title("Number of segmented objects per file")
ax.set_xlabel("Number of objects (log2)")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Number of distinct (width, height) combinations among the masks.
print(len({(n["Width"], n["Height"]) for n in numbers}))

# Pixel count of every mask.
area = data_map["Width"] * data_map["Height"]
print("Too big:", area[area > 10 ** 7].count())

# Histogram of the mask areas, on a log2 scale.
plt.figure()
ax = plt.subplot(2, 1, 1)
ax.hist(np.log2(area), bins=100)
ax.set_title("Size of mask files")  # was "Size of of mask files"
ax.set_xlabel("Size of file (log2)")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Share of each mask that is not background: 1 - Background / Width / Height.
background_share = data_map["Background"] / data_map["Width"] / data_map["Height"]
density = 1 - background_share

# Histogram of that share over all files.
plt.figure()
ax = plt.subplot(2, 1, 1)
ax.hist(density, bins=100)
ax.set_title("Density of segmented objects per file")
ax.set_xlabel("Density of objects")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Radius of a disc whose area is the average share of the mask taken by one
# object: sqrt(density / Objects / pi).
radius = (density / data_map["Objects"] / np.pi) ** .5

# Histogram of that radius over all files.
plt.figure()
ax = plt.subplot(2, 1, 1)
# A mask with zero objects gives 0 / 0 = NaN, which hist() cannot bin; only
# finite values are plotted (`radius` itself is left untouched).
ax.hist(radius[np.isfinite(radius)], bins=100)
ax.set_title("Average radius of segmented objects per file")
# NOTE(review): the plotted values are fractions, not multiplied by 100 - confirm the "%" unit.
ax.set_xlabel("Radius of objects (in %)")
ax.set_ylabel("Count")
plt.show()