In [None]:
import json
import os
import logging
import girder_client
import large_image

import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
from imageio import imread

from skimage import measure
from skimage import segmentation
import numpy as np
import matplotlib.image as mpimg
from PIL import Image

# Need to pull things like API key from a .env file
from dotenv import load_dotenv
load_dotenv(dotenv_path=os.path.join('..', '.env'))


In [None]:
from histomicstk.annotations_and_masks.annotation_and_mask_utils import (
    parse_slide_annotations_into_table,
    get_image_from_htk_response # Given a girder response, get np array image
) 

from histomicstk.annotations_and_masks.masks_to_annotations_handler import (
    get_annotation_documents_from_contours # Converts a contour table into a set of annotation docs for uploading
)

In [None]:
# Constants

# Directories
LOGGING_DIR = os.path.join('..', 'logs')
DATA_DIR = os.path.join('..', 'data')
LABEL_DIR = os.path.join('..', 'data', 'roi_labels')
# Cache for the per-class binary masks and boundary files derived from the labelmaps
INTERIM_DIR = os.path.join(DATA_DIR, 'roi_labels_binary')

# Server Info (read from the environment, populated by the .env file loaded above)
APIURL = os.getenv('APIURL')
APIKEY = os.getenv('APIKEY')
COLLECTION_NAME = os.getenv('COLLECTION_NAME')
FOLDER_NAME = os.getenv('FOLDER_NAME')

# Files
ROI_DATA_PATH = os.path.join(DATA_DIR, 'notes_on_data.xlsx')
GTCODE_PATH = os.path.join(DATA_DIR, 'gt_codes.csv')
LOG_PATH = os.path.join(LOGGING_DIR, 'histomics_upload_rois.log')

# Constant values
# Labelmaps are downsampled by this factor before object extraction
RESIZE_FRACTION = 0.1

# Verification
# The required inputs are checked BEFORE any directory is created: INTERIM_DIR
# lives inside DATA_DIR, so creating it first would silently create DATA_DIR
# and hide a missing-data problem.
if not os.path.exists(DATA_DIR):
    raise FileNotFoundError(f"Data directory does not exist at {DATA_DIR}, please create and populate with the labelmaps, ROI-to-WSI spreadsheet, and GT Codes file. Aborting!")

if not os.path.exists(LABEL_DIR):
    raise FileNotFoundError(f"Label directory does not exist at {LABEL_DIR}, please download the ground truth labelmaps (pngs) from Box. Aborting!")

if not os.path.exists(ROI_DATA_PATH):
    raise FileNotFoundError(f"ROI file does not exist at {ROI_DATA_PATH}, please download the ROI-to-WSI dataset from Box. Aborting!")

if not os.path.exists(GTCODE_PATH):
    raise FileNotFoundError(f"GT Codes file does not exist at {GTCODE_PATH}, please create the GT Codes spreadsheet or download from Box. Aborting!")

# Output directories are created on demand (exist_ok keeps the cell re-runnable)
os.makedirs(LOGGING_DIR, exist_ok=True)
os.makedirs(INTERIM_DIR, exist_ok=True)

In [None]:
# set up logging -- experimental
# Root logger writes DEBUG and above to LOG_PATH; filemode='w' truncates the
# log file each time this configuration takes effect.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                    datefmt='%m-%d %H:%M',
                    filename=LOG_PATH,
                    filemode='w')

# Set up logging to console / notebook output
# Only INFO and above is echoed to the notebook; DEBUG stays in the file.
# NOTE(review): re-running this cell attaches another StreamHandler to the root
# logger, so every message is then echoed once per run of the cell.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

In [None]:
def extract_object_boundaries(img_bin):
    '''Extract the boundary of each connected object in a binary image.

    Parameters
    ----------
    img_bin : 2-D array
        Binary mask; non-zero pixels are foreground.

    Returns
    -------
    (obj_centroids, obj_bounds) : tuple of lists
        obj_centroids holds one `regionprops` centroid per labelled object.
        obj_bounds holds one (N, 2) float array of contour vertices per object,
        in the order returned by Matplotlib (x, y), expressed in the coordinate
        frame of `img_bin` (the padding used internally is removed again).
        An object for which no contour could be traced is skipped, so
        obj_bounds may be shorter than obj_centroids.
    '''
    pad = 1

    img_labels = measure.label(img_bin)
    img_props = measure.regionprops(img_labels)

    obj_centroids = []
    obj_bounds = []

    # Trace contours on a private figure instead of the global pyplot state so
    # that repeated calls neither pile artists onto the current figure nor
    # leave a figure open once we are done.
    fig, ax = plt.subplots()
    try:
        # Iterate over the labelled regions directly; slicing np.unique()[1:]
        # would drop the only object of a mask that contains no background.
        for img_prop in img_props:
            obj_centroids.append(img_prop.centroid)

            # Grab a binary image containing only the current object
            this_obj = img_labels == img_prop.label

            # Pad by 1 -- need to operate correctly at the boundaries
            this_obj = np.pad(this_obj, pad, 'constant', constant_values=0)

            cnt = ax.contour(this_obj)
            # allsegs[level][segment] is available across Matplotlib versions,
            # unlike the deprecated ContourSet.collections attribute.
            segments = cnt.allsegs[0] if len(cnt.allsegs) > 0 else []
            ax.cla()

            if len(segments) == 0:
                continue

            # Undo the padding so vertices refer to the original image frame
            pts = np.asarray(segments[0], dtype=float) - pad
            obj_bounds.append(pts)
    finally:
        plt.close(fig)

    return (obj_centroids, obj_bounds)

In [None]:
# Load the ROI key sheet, keep only rows that name a slide and have a top/left
# position, and discard the bookkeeping columns that are not used downstream.
roi_df = (
    pd.read_excel(ROI_DATA_PATH, sheet_name='region_of_interest key')
    .dropna(subset=['corresponding_wsi_number', 'top', 'left'])
    .drop(columns=['roi_number_modified', 'notes', 'wsi_uploaded_histomics'])
)

In [None]:
# Preview the first 20 rows of the cleaned ROI table
roi_df.head(20)

In [None]:
## DEBUGGING
# NOTE(review): leftover debugging filter -- from here on only the ROIs of this
# single slide are processed. Remove or skip this cell to handle every row of
# the spreadsheet.
roi_df = roi_df.loc[roi_df.corresponding_wsi_number=='OCC-02-0008-01Z-01-O01']

In [None]:
# Load the ground truth class table and index it by group name (the 'group'
# column is kept as well) so colors can be looked up with .loc[group, 'color'].
GTCodes_df = pd.read_csv(GTCODE_PATH)
GTCodes_df = GTCodes_df.set_index('group', drop=False)

In [None]:
# Preview the ground truth class table
GTCodes_df.head()

In [None]:
# Authenticate with the server
# APIURL and APIKEY come from the environment (see the constants cell).
# NOTE(review): both are None when the .env file is missing or incomplete --
# confirm how girder_client behaves in that case.
gc = girder_client.GirderClient(apiUrl=APIURL)
gc.authenticate(apiKey=APIKEY)

In [None]:
# Starting at the "collection" level, walk through the structure to find the target sample id number
# that corresponds to the sample name given above.
# We assume that there are no nested folders -- it goes collection / folder_list / item_list
# If this changes in future, we'll need to update the script
collection_list = gc.listCollection()
collection_id = None
for collection in collection_list:
    if collection['name'] == COLLECTION_NAME:
        collection_id = collection['_id']

# Fail with an explicit message instead of a NameError further down
if collection_id is None:
    raise LookupError(f"Collection '{COLLECTION_NAME}' was not found on the server; check COLLECTION_NAME in the .env file.")

folder_list = gc.listFolder(collection_id, parentFolderType='collection')
folder_id = None
for folder in folder_list:
    if folder['name'] == FOLDER_NAME:
        folder_id = folder['_id']

if folder_id is None:
    raise LookupError(f"Folder '{FOLDER_NAME}' was not found in collection '{COLLECTION_NAME}'; check FOLDER_NAME in the .env file.")

# Materialise the listing so it can be inspected again after this cell
item_list = list(gc.listItem(folder_id))

# Create a list of all item ids that match this folder and their associated names
# (parallel lists: sample_list[i] is the name of the item with id id_list[i])
id_list = [item['_id'] for item in item_list]
sample_list = [item['name'] for item in item_list]

In [None]:
def get_histomics_img_from_bb(gc, sample_id, roi_left, roi_right, roi_top, roi_bottom):
    """Fetch a rectangular region of a slide from the server as an image array.

    Parameters
    ----------
    gc : authenticated girder client used to issue the request
    sample_id : id of the slide item on the server
    roi_left, roi_right, roi_top, roi_bottom : bounds of the region, passed
        through unchanged as the `left`/`right`/`top`/`bottom` query parameters

    Returns
    -------
    The image decoded from the response by `get_image_from_htk_response`.
    """
    # Construct the REST API query string that we'll use to pull the image ROI
    getStr = (
        f"item/{sample_id}/tiles/region?"
        f"left={roi_left}&"
        f"right={roi_right}&"
        f"top={roi_top}&"
        f"bottom={roi_bottom}"
    )

    # Get the image ROI from histomics server (raw response, not parsed as JSON)
    resp = gc.get(getStr, jsonResp=False)
    img_roi = get_image_from_htk_response(resp)

    # The original computed the image but never handed it back to the caller
    return img_roi

In [None]:
def gt_annotation_exists(slide_annotations):
    """Report whether a slide already carries a ground truth annotation.

    Every entry of `slide_annotations` must provide `entry['annotation']['name']`.
    Returns True when at least one of those names starts with 'groundtruth_',
    otherwise False (including for an empty response).
    """
    # Collect every name first so a malformed entry is reported regardless of
    # where it sits in the response.
    names = [entry['annotation']['name'] for entry in slide_annotations]

    return any(name.startswith('groundtruth_') for name in names)

In [None]:
# Show the ROI rows that the upload loop below will iterate over.
# NOTE(review): this renders the whole frame; consider .head() for large tables.
roi_df

In [None]:
# Pull out the list of rois associated with each valid row in the spreadsheet.
#
# For every ROI row this cell:
#   1. resolves the slide's item id on the server (slides not found are skipped),
#   2. skips slides that already carry a 'groundtruth_' annotation,
#   3. makes sure the per-class binary masks and boundary files are cached in INTERIM_DIR,
#   4. converts the cached boundaries into a HistomicsTK contours table, and
#   5. posts the resulting annotation documents to the server.

# Column layout expected for the contours table
CONTOUR_COLUMNS = [
    'group', 'color', 'ymin', 'ymax', 'xmin', 'xmax', 'has_holes',
    'touches_edge-top', 'touches_edge-left', 'touches_edge-bottom',
    'touches_edge-right', 'coords_x', 'coords_y',
]

for sample in roi_df.itertuples():

    sample_name = sample.corresponding_wsi_number + '.tiff'
    # str() so a non-string cell cannot break the path concatenations below
    roi_name = str(sample.roi_number_orig)

    # Set up logger for this sample
    this_log = logging.getLogger(f'{sample_name}')

    this_log.info(f"Beginning Processing")

    # Grab the id number corresponding to the current sample_name
    if sample_name not in sample_list:
        this_log.warning(f"Slide {sample_name} was not found on the server. Skipping.")
        continue
    sample_id = id_list[sample_list.index(sample_name)]

    # Check to see if the current image has any annotations that already exist
    slide_annotations = gc.get('/annotation/item/' + sample_id)
    if len(slide_annotations) > 0:
        this_log.debug(f"Found {len(slide_annotations)} annotations")

        # If so, check to see whether this set of annotations includes a "ground truth"
        if gt_annotation_exists(slide_annotations):
            this_log.warning(f"Found existing ground truth, so skipping.")
            continue

    # (For visual verification, get_histomics_img_from_bb can pull the ROI image
    # from the server using sample.left / sample.top / sample.width / sample.height.)

    # Find the path to the corresponding ROI label, check that there is one
    label_path = os.path.join(LABEL_DIR, roi_name+'.png')
    if not os.path.exists(label_path):
        this_log.warning(f"Labelmap {roi_name} does not exist at {label_path}, please check. Skipping.")
        continue
    else:
        this_log.info(f"Found labelmap at {label_path}")

    # If we've found a labelmap, and we haven't found an existing ground truth, then process the ROI to obtain our ground truth

    # First, do we need to extract the binary labelmaps? If so, flag it.
    # We do the flagging first (this is fast) so we only load the labelmap once (slow) if ANY of the labelmaps are absent.
    # A group counts as cached only when BOTH its mask and its boundary file
    # exist: the boundary file is what gets loaded further down.
    extract_bin_flags = {}
    create_bin_flag = False
    for gt_group in GTCodes_df['group']:
        bin_path = os.path.join(INTERIM_DIR, roi_name+'_'+gt_group+'.png')
        obj_path = os.path.join(INTERIM_DIR, roi_name+'_'+gt_group+'_bounds.npz')

        extract_bin_flags[gt_group] = not (os.path.exists(bin_path) and os.path.exists(obj_path))
        if extract_bin_flags[gt_group]:
            create_bin_flag = True

    # If we have to extract ANY of the ground truth maps, load the image
    if create_bin_flag:
        this_log.info(f"Beginning label extraction for {label_path}")

        # The context manager releases the file handle; convert('RGB') guarantees
        # the three channels indexed below even for palette / grayscale PNGs.
        with Image.open(label_path) as label_img:
            w, h = label_img.size

            # Resize the image to make analysis easier / faster
            # (NEAREST keeps the exact label colors -- no interpolated values)
            label_small = label_img.convert('RGB').resize(
                (int(w*RESIZE_FRACTION), int(h*RESIZE_FRACTION)), Image.NEAREST)
        img = np.uint8(np.array(label_small))

        # Separate the image labelmap into its channels
        img_red = img[:, :, 0]
        img_green = img[:, :, 1]
        img_blue = img[:, :, 2]

        # Cycle through the groups, and pull out binary images as necessary
        for gt_group in GTCodes_df['group']:
            if not extract_bin_flags[gt_group]:
                continue

            # Parse the color of this group, stored as a string of the form 'rgb(r,g,b)'
            label_rgb = GTCodes_df.loc[gt_group, 'color']
            rgb_color = label_rgb.split('rgb(')[1][:-1].split(',')
            img_bin = (img_red == int(rgb_color[0])) & (img_green == int(rgb_color[1])) & (img_blue == int(rgb_color[2]))

            # Save this binary image
            bin_path = os.path.join(INTERIM_DIR, roi_name+'_'+gt_group+'.png')
            mpimg.imsave(bin_path, img_bin)

            # Run the extraction of tumor boundaries -- this takes awhile
            this_log.info(f"Beginning extraction of object boundaries for {gt_group}")
            obj_centroids, obj_bounds = extract_object_boundaries(img_bin)

            # Contours have different vertex counts, so pack them into a 1-D
            # object array explicitly; recent NumPy refuses to build an array
            # from a ragged list implicitly.
            bounds_packed = np.empty(len(obj_bounds), dtype=object)
            for bound_idx, bound_pts in enumerate(obj_bounds):
                bounds_packed[bound_idx] = bound_pts

            obj_path = os.path.join(INTERIM_DIR, roi_name+'_'+gt_group+'_bounds.npz')

            # Save the object centroids and boundaries
            # Use ** syntax with a dictionary to unpack keys as variable names
            # See: https://stackoverflow.com/questions/26427666/use-variables-as-key-names-when-using-numpy-savez
            np.savez_compressed(obj_path, **{
                gt_group+'_centroids': obj_centroids,
                gt_group+'_bounds': bounds_packed})

    # Begin conversion to Histomics
    # Now that we've extracted the boundaries, load up the npz and adjust the coordinates
    contour_rows = []
    for gt_group in GTCodes_df['group']:
        obj_path = os.path.join(INTERIM_DIR, roi_name+'_'+gt_group+'_bounds.npz')

        # allow_pickle is required for the object array of contours; these files
        # are written by this notebook -- do not point this at untrusted files.
        # The key is derived from the group name so every group in the GT codes
        # table is handled (no stale boundaries carried over from another group).
        with np.load(obj_path, allow_pickle=True) as gt_data_load:
            obj_bounds = gt_data_load[gt_group+'_bounds']

        for obj in obj_bounds:
            obj = np.asarray(obj, dtype=float)
            if obj.ndim != 2 or obj.shape[0] == 0:
                continue

            # Scale from the downsampled labelmap back to full-resolution pixels
            obj_x = obj[:, 0] / RESIZE_FRACTION
            obj_y = obj[:, 1] / RESIZE_FRACTION

            contour_rows.append({
                'group': gt_group,
                'color': GTCodes_df.loc[gt_group, 'color'],
                # Bounding box in the same (full-resolution) units as the coordinates
                'ymin': int(np.min(obj_y)),
                'ymax': int(np.max(obj_y)),
                'xmin': int(np.min(obj_x)),
                'xmax': int(np.max(obj_x)),
                'has_holes': 0,
                'touches_edge-top': 0,
                'touches_edge-left': 0,
                'touches_edge-bottom': 0,
                'touches_edge-right': 0,
                # Keep every third vertex to thin out the polygon
                'coords_x': ','.join([str(int(x)) for x in obj_x[::3]]),
                'coords_y': ','.join([str(int(y)) for y in obj_y[::3]]),
            })

    # Put it all into a dataframe
    contours_df = pd.DataFrame(contour_rows, columns=CONTOUR_COLUMNS)

    if contours_df.empty:
        this_log.warning(f"No objects found in {label_path}, nothing to post. Skipping.")
        continue

    # Create the annotation "document", aka the JSON object that will be pushed to the server
    # Here's where we set up the offsets (top and left coords) plus the (non-color) display properties
    annprops = {
        'X_OFFSET': int(sample.left),
        'Y_OFFSET': int(sample.top),
        'opacity': 0.2,
        'lineWidth': 4.0,
    }

    annotation_docs = get_annotation_documents_from_contours(
        contours_df.copy(),
        separate_docs_by_group=True,
        annots_per_doc=10,
        docnamePrefix='groundtruth',
        annprops=annprops,
        verbose=False,
        monitorPrefix=sample_name + ": annotation docs")

    this_log.info(f"Posting Annotation")

    # Post the annotation documents you created to the server
    for annotation_doc in annotation_docs:
        resp = gc.post(
            "/annotation?itemId=" + sample_id, json=annotation_doc)
        

In [None]:
# Tabulate the annotations fetched for the last slide visited by the upload loop.
# NOTE(review): `slide_annotations` was fetched before that slide's new documents
# were posted, so this table does not show what was uploaded in this run --
# re-fetch it first if the goal is to verify the upload.
annotation_table = parse_slide_annotations_into_table(slide_annotations)
annotation_table.head()

In [None]:
# If you want, display the binary images here to ensure you've got the right classes
# The masks are read back from the cache in INTERIM_DIR for the ROI most recently
# visited by the upload loop (`sample`), one panel per ground truth group.
gt_groups = list(GTCodes_df['group'])

if gt_groups:
    fig, axes = plt.subplots(1, len(gt_groups), figsize=(10, 10), squeeze=False)

    for ax, gt_group in zip(axes[0], gt_groups):
        bin_path = os.path.join(INTERIM_DIR, str(sample.roi_number_orig)+'_'+gt_group+'.png')
        ax.set_title(gt_group)

        if os.path.exists(bin_path):
            ax.imshow(imread(bin_path), cmap=plt.cm.gray)
        else:
            # Nothing cached for this group (e.g. the ROI was skipped)
            ax.text(0.5, 0.5, 'no cached mask', ha='center', va='center')
            ax.set_axis_off()

    plt.show()

In [None]:
# Construct the dataframe for the object annotations you're going to work with
# This structure follows the HistomicsTK info here: https://digitalslidearchive.github.io/HistomicsTK/examples/masks_to_annotations_handler.html

def build_contours_table(bounds_by_group, color_by_group, resize_fraction=1.0, stride=3):
    """Build a HistomicsTK-style contours table from per-group object boundaries.

    Parameters
    ----------
    bounds_by_group : dict
        Maps a group name to an iterable of (N, 2) vertex arrays (x, y).
    color_by_group : dict
        Maps a group name to the color string stored in the 'color' column.
    resize_fraction : float
        Factor the source image was downsampled by; vertices are divided by it
        to get back to full-resolution pixels.
    stride : int
        Keep every `stride`-th vertex when writing the coordinate strings.

    Returns
    -------
    pandas.DataFrame with one row per object and the columns group, color,
    ymin, ymax, xmin, xmax, has_holes, touches_edge-*, coords_x, coords_y.
    """
    columns = [
        'group', 'color', 'ymin', 'ymax', 'xmin', 'xmax', 'has_holes',
        'touches_edge-top', 'touches_edge-left', 'touches_edge-bottom',
        'touches_edge-right', 'coords_x', 'coords_y',
    ]

    rows = []
    for gt_group, bounds in bounds_by_group.items():
        for obj in bounds:
            pts = np.asarray(obj, dtype=float)
            if pts.ndim != 2 or pts.shape[0] == 0:
                continue

            obj_x = pts[:, 0] / resize_fraction
            obj_y = pts[:, 1] / resize_fraction

            rows.append({
                'group': gt_group,
                'color': color_by_group[gt_group],
                'ymin': int(np.min(obj_y)),
                'ymax': int(np.max(obj_y)),
                'xmin': int(np.min(obj_x)),
                'xmax': int(np.max(obj_x)),
                'has_holes': 0,
                'touches_edge-top': 0,
                'touches_edge-left': 0,
                'touches_edge-bottom': 0,
                'touches_edge-right': 0,
                'coords_x': ','.join([str(int(x)) for x in obj_x[::stride]]),
                'coords_y': ','.join([str(int(y)) for y in obj_y[::stride]]),
            })

    return pd.DataFrame(rows, columns=columns)


# Load the cached boundaries of every ground truth group for the ROI most
# recently visited by the upload loop (`sample`).
# allow_pickle is needed for the stored object arrays; these files are written
# by this notebook -- do not point this at untrusted files.
bounds_by_group = {}
for gt_group in GTCodes_df['group']:
    obj_path = os.path.join(INTERIM_DIR, str(sample.roi_number_orig)+'_'+gt_group+'_bounds.npz')
    with np.load(obj_path, allow_pickle=True) as gt_data_load:
        bounds_by_group[gt_group] = gt_data_load[gt_group+'_bounds']

# Put it all into a dataframe
contours_df = build_contours_table(
    bounds_by_group,
    {gt_group: GTCodes_df.loc[gt_group, 'color'] for gt_group in bounds_by_group},
    resize_fraction=RESIZE_FRACTION)

In [None]:
# Preview the contours table (one row per object)
contours_df.head()

In [None]:
# Create the annotation "document", aka the JSON object that will be pushed to the server
# Here's where we set up the offsets (top and left coords) plus the (non-color) display properties
# The offsets and slide name come from the ROI row most recently visited by the
# upload loop (`sample` / `sample_name`), the same values the loop itself uses.

annprops = {
    'X_OFFSET': int(sample.left),
    'Y_OFFSET': int(sample.top),
    'opacity': 0.2,
    'lineWidth': 4.0,
}

annotation_docs = get_annotation_documents_from_contours(
    contours_df.copy(), 
    separate_docs_by_group=True, 
    annots_per_doc=10,
    docnamePrefix='groundtruth', 
    annprops=annprops,
    verbose=True, 
    monitorPrefix=sample_name + ": annotation docs")

In [None]:
# Inspect the annotation documents that were generated
annotation_docs

In [None]:
# Post the annotation documents you created to the server
# The upload loop above already posts the documents of every slide it processes,
# so check the server first: posting again would leave duplicate ground truth
# annotations on the slide.
existing_annotations = gc.get('/annotation/item/' + sample_id)

if gt_annotation_exists(existing_annotations):
    logging.getLogger(sample_name).warning(
        "Ground truth annotations already exist on the server, not posting again.")
else:
    for annotation_doc in annotation_docs:
        resp = gc.post(
            "/annotation?itemId=" + sample_id, json=annotation_doc)