In [2]:
# Enable autoreload (mode 2) so edits to imported project modules are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Model Assisted Labeling Workflow

In [3]:
# imports
from pandas import read_csv
from colorama import Fore, Style
from glob import glob
from shutil import rmtree
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt
from geopandas import GeoDataFrame
from IPython.display import clear_output
from time import sleep
from tqdm.notebook import tqdm
from math import ceil

from os import makedirs, remove
from os.path import join, isfile, isdir

from nft_helpers.utils import (
    load_yaml, get_filename, im_to_txt_path, delete_file, imread, Timer
)
from nft_helpers import compile_model_results, update_roi_tile_labels
from nft_helpers.yolov5 import predict
from nft_helpers.yolov5.utils import (
    read_yolo_label, non_max_suppression, remove_contained_boxes, draw_boxes
)
from nft_helpers.box_and_contours import corners_to_polygon, line_to_xys
from nft_helpers.girder_dsa import login, get_annotations_documents

from ipywidgets import (
    interactive, HBox, VBox, HTML, Button, IntProgress, ToggleButtons, Text,
    Layout
)

# Setup: configuration, dataset paths and the label lookup table.
cf = load_yaml()
np.set_printoptions(suppress=True)

dataset_dir = join(cf.datadir, 'datasets/model-assisted-labeling')
# CSV tracking the state of every ROI across labeling iterations.
df_fp = join(dataset_dir, 'model-assisted-labeling.csv')
tile_df = read_csv(join(dataset_dir, 'tiles.csv'))

# Map each label name to its position in cf.labels.
label_map = dict((name, idx) for idx, name in enumerate(cf.labels))

# Directory holding the models of this workflow.
models_dir = join(cf.datadir, 'models/model-assisted-labeling')
makedirs(models_dir, exist_ok=True)

# Weights used to start the model-assisted workflow (before any iteration
# has produced its own models).
starting_weights = join(
    cf.datadir,
    'models/4-models-consensus-n2/4-models-consensus-n2/weights/best.pt'
)

# Parameters of interest.
device = None
img_size = 1280
conf_thr, iou_thr, contained_thr = 0.25, 0.4, 0.7

# Fraction of all ROIs to clean up in each iteration.
# TODO: switch to 10%.
perc_per_iteration = 0.05

In [4]:
# Authenticate the girder client against the DSA API endpoint, using the
# user / password entries of the loaded configuration.
dsa_api_url = join(cf.dsaURL, 'api/v1')
gc = login(dsa_api_url, username=cf.user, password=cf.password)

Login or email: jvizcar
Password for jvizcar: ········


## Step 1) Predict Labels + Select ROIs
1. Choose latest model to predict on ROIs not yet manually checked.
2. Merge predictions to update ROI labels (add average confidence of predictions)
3. Identify next set of ROIs to clean up and push labels as DSA annotations

In [None]:
# Load the ROI tracking table, or create it from rois.csv on the first run.
if not isfile(df_fp):
    # Starting model assisted labeling: begin from the plain ROI list.
    df = read_csv(join(dataset_dir, 'rois.csv')).sort_values(by='fp')
    num_rois = len(df)

    # Bookkeeping columns and their starting value for every ROI.
    # iteration == -1 marks an ROI not yet assigned to any iteration.
    initial_values = (
        ('iteration', -1),
        ('checked', False),
        ('mean_conf', 1.),
        ('labels_updated', False),
    )

    for column, initial in initial_values:
        df[column] = [initial] * num_rois

    # 1-based ROI number, later used to name the DSA annotation documents.
    df['roi_index'] = list(range(1, num_rois + 1))
    df.to_csv(df_fp, index=False)
else:
    # The workflow was already started: continue from the saved state.
    df = read_csv(df_fp)
    
# Choose the weights to predict with, based on the latest labeling iteration.
latest_iteration = df.iteration.max()

if latest_iteration < 1:
    # No iteration has been assigned yet: use the starting weights.
    weights = starting_weights
    latest_iteration = 0
else:
    # Use the models directory defined in the setup cell rather than
    # rebuilding the same path here.
    model_results = compile_model_results(models_dir)

    # Keep only the overall ('all' labels) test-roi results of the models
    # belonging to the latest iteration.
    model_results = model_results[
        (model_results.model == f'iteration{latest_iteration}')
        & (model_results.dataset == 'test-roi')
        & (model_results.label == 'all')
    ]

    # NOTE(review): presumably three models are trained per iteration, one
    # result row each - confirm before changing this number.
    if len(model_results) != 3:
        raise Exception(
            f'Models for current iteration ({latest_iteration}) not trained!'
        )

    # Get the best model for this iteration (highest mAP50-95).
    model_results = model_results.sort_values(by='mAP50-95', ascending=False)

    weights = model_results.iloc[0].weights
    
# Remove every *.cache file found under the dataset's texts directory.
for stale_cache_fp in glob(join(dataset_dir, 'texts/*.cache')):
    delete_file(stale_cache_fp)

# Predict with the selected weights on every ROI that is still unassigned
# (iteration == -1), then rebuild that ROI's label file from its tiles.
N = len(df[df.iteration == -1])
roi_box_columns = ['label', 'x1', 'y1', 'x2', 'y2', 'conf', 'geometry']

n = 1
for i, r in df.iterrows():
    if r.iteration != -1:
        continue

    print(Fore.BLUE, Style.BRIGHT, f'Processing ROI {n} of {N}',
          Style.RESET_ALL)
    roi_dir = join(dataset_dir, 'tiles', get_filename(r.fp))

    # Start from a clean label directory so old predictions do not linger.
    tile_labels_dir = join(roi_dir, 'labels')
    if isdir(tile_labels_dir):
        rmtree(tile_labels_dir)

    predict(
        join(roi_dir, 'images'),
        roi_dir,
        weights,
        device=device,
        conf_thr=conf_thr,
        iou_thr=iou_thr,
        im_size=img_size,
    )

    # Collect the predicted boxes of all tiles of this ROI.
    tile_boxes = []

    for _, tile_r in tile_df[tile_df.roi_fp == r.fp].iterrows():
        label_fp = im_to_txt_path(tile_r.fp)

        if not isfile(label_fp):
            continue

        # The prediction file is rewritten below keeping only the first
        # five values of each box, i.e. as a plain label file.
        tile_lines = []

        for box in read_yolo_label(label_fp):
            label, xc, yc, bw, bh = box[:5]

            tile_lines.append(
                f'{label:.0f} {xc:.6f} {yc:.6f} {bw:.6f} {bh:.6f}'
            )

            # Scale the box by img_size and offset it by the tile's x / y.
            cx = (xc * img_size) + tile_r.x
            cy = (yc * img_size) + tile_r.y
            half_w = (bw * img_size) / 2
            half_h = (bh * img_size) / 2

            x1, y1 = int(cx - half_w), int(cy - half_h)
            x2, y2 = int(cx + half_w), int(cy + half_h)

            # box[5] is stored in the 'conf' column.
            tile_boxes.append([
                int(label), x1, y1, x2, y2, box[5],
                corners_to_polygon(x1, y1, x2, y2)
            ])

        with open(label_fp, 'w') as fh:
            fh.write('\n'.join(tile_lines).strip())

    roi_label_fp = im_to_txt_path(r.fp)

    # The previous ROI label is always discarded; it is only rewritten
    # when there are predicted boxes.
    delete_file(roi_label_fp)

    if not len(tile_boxes):
        # Nothing predicted for this ROI.
        df.loc[i, 'mean_conf'] = 1.
    else:
        roi_boxes = GeoDataFrame(tile_boxes, columns=roi_box_columns)

        # The mean confidence is taken over all boxes, before merging.
        df.loc[i, 'mean_conf'] = roi_boxes.conf.mean()

        # Merge the tile boxes into the ROI boxes.
        roi_boxes = non_max_suppression(roi_boxes, iou_thr)
        roi_boxes = remove_contained_boxes(roi_boxes, contained_thr)

        roi_lines = []

        for _, box_r in roi_boxes.iterrows():
            # Normalize the corners by the ROI width / height.
            nx1, ny1 = box_r.x1 / r.w, box_r.y1 / r.h
            nx2, ny2 = box_r.x2 / r.w, box_r.y2 / r.h

            xc, yc = (nx1 + nx2) / 2, (ny1 + ny2) / 2
            bw, bh = nx2 - nx1, ny2 - ny1

            roi_lines.append(
                f'{box_r.label} {xc:.6f} {yc:.6f} {bw:.6f} {bh:.6f}'
            )

        # Save the new ROI label.
        with open(roi_label_fp, 'w') as fh:
            fh.write('\n'.join(roi_lines).strip())

    n += 1

    clear_output()
    
# Assign the next set of ROIs for cleanup.
# After sorting, unassigned ROIs (iteration == -1) come first, ordered by
# ascending mean confidence and then by file path.
df = df.sort_values(by=['iteration', 'mean_conf', 'fp']).reset_index(drop=True)
num_per_iter = ceil(perc_per_iteration * len(df))
new_iteration = latest_iteration + 1

# Only ROIs that are still unassigned may join the new iteration. Selecting
# the first `num_per_iter` rows by position would overwrite the iteration of
# already assigned ROIs once fewer than `num_per_iter` unassigned ROIs remain.
unassigned_index = df.index[df.iteration == -1]
df.loc[unassigned_index[:num_per_iter], 'iteration'] = new_iteration

# Update the csv.
df.to_csv(df_fp, index=False)

# Push the labels of the ROIs in the new iteration to the DSA, one
# annotation document per ROI.
print(Fore.BLUE, Style.BRIGHT, 'Pushing new annotations.', Style.RESET_ALL)
current_df = df[df.iteration == new_iteration]

for _, r in tqdm(current_df.iterrows(), total=len(current_df)):
    label_fp = im_to_txt_path(r.fp)

    # An ROI without a label file still gets an (empty) document.
    els = []

    if isfile(label_fp):
        boxes = read_yolo_label(
            label_fp, im_shape=(r.w, r.h), shift=(-r.x, -r.y)
        )

        for box in boxes:
            label, xc, yc, bw, bh = box[:5].astype(int)
            label_name = cf.labels[label]

            # One rectangle element per box; values are cast to plain int.
            rectangle = {
                'center': [int(xc), int(yc), 0],
                'group': label_name,
                'height': int(bh),
                'width': int(bw),
                'label': {'value': label_name},
                'lineColor': cf.label_colors[label],
                'lineWidth': 3,
                'type': 'rectangle'
            }
            els.append(rectangle)

    document = {
        'name': f'iteration{new_iteration}_ROI{r.roi_index}',
        'description': '',
        'elements': els
    }
    gc.post(f'/annotation?itemId={r.wsi_id}', json=document)

## Step 2) Manual Cleanup of ROIs
Use the interactive widget below to clean up ROIs.
* Clean them up in order.
* Track which ROIs are cleaned.
* Allow timing.

In [5]:
# Interactive: manually clean the next iteration of ROIs.
def timer_func(bn):
    """Toggle the timer when the timer button is clicked.

    A button showing the play icon starts the timer and turns into a red
    stop button; otherwise the timer is stopped and the button goes back to
    a green play button. The timer status text is updated to match.

    Args:
        bn: The timer button that was clicked.

    """
    starting = bn.icon == 'solid play'

    if starting:
        timer.start()
        new_icon, new_color = 'solid stop', 'red'
        status = '<h4>Timer running...</h4>'
    else:
        timer.stop()
        new_icon, new_color = 'solid play', 'green'
        status = '<h4>Timer stoped / paused.</h4>'

    bn.icon = new_icon
    bn.style.button_color = new_color
    timer_desc.value = status
        
        
def _show_current_roi():
    """Refresh the progress label and the links for the ROI at progress.value.

    Reads the row at position ``progress.value - 1`` of ``df`` and points the
    ROI title, HistomicsUI link and Pearce App link widgets at it.

    """
    progress.description = f'ROI {progress.value} of {n}:'

    r = df.iloc[progress.value - 1]

    # Both viewers take the ROI bounds as left, top, right, bottom.
    bounds = f'{r.x}%2C{r.y}%2C{r.x+r.w}%2C{r.y+r.h}'

    dsa_url = join(
        cf.dsaURL,
        f'histomics#?image={r.wsi_id}&bounds={bounds}%2C0'
    )
    pearce_url = (
        'https://pearcetm.github.io/osd-paperjs-annotation/demo/'
        'yoloreviewer/app.html#dsa=https://computablebrain.emory'
        f'.edu&image={r.wsi_id}&bounds={bounds}'
    )

    roi_hyperlink.value = f'<h3>ROI {r.roi_index}:    </h3>'
    hu_hyperlink.value = (
        f'<h3><a href="{dsa_url}" target="_blank">'
        'HistomicsUI</a></h3>'
    )
    p_hyperlink.value = (
        f'<h3><a href="{pearce_url}" style="color: #29e807" target="_blank">'
        'Pearce App</a></h3>'
    )


def next_roi(bn):
    """Mark the current ROI as checked and move to the next one.

    The csv is saved on every click. On the last ROI the timer is stopped
    (if running) and a completion message is printed instead.

    Args:
        bn: The button that was clicked (unused).

    """
    df.loc[progress.value-1, 'checked'] = True
    df.to_csv(df_fp, index=False)

    if progress.value == n:
        if timer.running:
            timer.stop()

        # Finished!
        clear_output()
        print(Fore.GREEN, Style.BRIGHT, 'Complete, time to train new models!',
              Style.RESET_ALL)
        return

    prev_bn.disabled = False

    # Start timer if it was off!
    if timer_bn.icon == 'solid play':
        timer_func(timer_bn)

    progress.value += 1
    _show_current_roi()

    # Change the button if this is the last.
    if progress.value == n:
        next_bn.description = 'Finish'
        next_bn.icon = 'duotone flag checkered'
        next_bn.style.button_color = '#00FF09'


def prev_roi(bn):
    """Go back to the previous ROI and mark it as unchecked again.

    Args:
        bn: The 'Previous' button that was clicked; it is disabled once the
            first ROI is reached.

    """
    if progress.value <= 1:
        # Already at the first ROI: df.loc[-1] below would append a new row.
        bn.disabled = True
        return

    df.loc[progress.value-2, 'checked'] = False
    df.to_csv(df_fp, index=False)

    progress.value -= 1
    _show_current_roi()

    if progress.value == 1:
        bn.disabled = True

    # Leaving the last ROI: turn the 'Finish' button back into 'Next'.
    if next_bn.description == 'Finish':
        next_bn.description = 'Next'
        next_bn.style.button_color = '#88F78C'
        next_bn.icon = 'forward'
            

# Build and show the cleanup UI for the latest iteration.
# Read the csv.
if not isfile(df_fp):
    raise Exception('ROI csv file does not exist, please run the first cell.')

df = read_csv(df_fp)

# Sort so the ROIs of the latest iteration come first (positions 0..n-1),
# ordered by ascending mean confidence and then by file path.
df = df.sort_values(by=['iteration', 'mean_conf', 'fp'],
                    ascending=[False, True, True]).reset_index(drop=True)

iteration = df.iteration.max()

indices = df[df.iteration == iteration].index
n = len(indices)

# Rows of the latest iteration that have not been checked yet.
index = df[(df.iteration == iteration) & (~df.checked)].index

if len(index):
    timer_dir = join(dataset_dir, 'timer-logs')
    makedirs(timer_dir, exist_ok=True)

    # One timer log file per iteration.
    timer = Timer(join(timer_dir, f'{iteration}.txt'))

    # Resume from the first unchecked ROI.
    index = index[0]

    # Widgets
    timer_desc = HTML(
        value=f'<h4>Timer not started.</h4>', description=''
    )
    timer_bn = Button(
        description='', style={'font_weight': 'bold'},
        icon='solid play'
    )
    timer_bn.style.button_color = 'green'
    timer_bn.on_click(timer_func)

    # 'description_width' is the style trait that sizes the description;
    # the previous key 'solid description_width' is not a valid trait name.
    progress = IntProgress(
        value=index+1, min=1, max=len(indices),
        description=f'ROI {index+1} of {n}:',
        style={'description_width': 'initial'}
    )

    next_bn = Button(description='Next', style={'font_weight': 'bold'},
                     icon='forward')
    next_bn.style.button_color = '#88F78C'
    next_bn.on_click(next_roi)

    # If you are at the last ROI then set the button appropriately.
    if index == n-1:
        next_bn.description = 'Finish'
        next_bn.icon = 'duotone flag checkered'
        next_bn.style.button_color = '#00FF09'

    prev_bn = Button(description='Previous', style={'font_weight': 'bold'},
                     icon='backward')
    prev_bn.style.button_color = '#FF9100'
    prev_bn.on_click(prev_roi)

    # There is nothing before the first ROI.
    if index == 0:
        prev_bn.disabled = True

    r = df.iloc[index]
    dsa_url = join(cf.dsaURL, f'histomics#?image={r.wsi_id}&bounds={r.x}%2C{r.y}'
               f'%2C{r.x+r.w}%2C{r.y+r.h}%2C0')

    # URL to pearce app.
    pearce_url = 'https://pearcetm.github.io/osd-paperjs-annotation/demo/' + \
                 'yoloreviewer/app.html#dsa=https://computablebrain.emory' + \
                 f'.edu&image={r.wsi_id}&bounds={r.x}%2C{r.y}%2C{r.x+r.w}' + \
                 f'%2C{r.y+r.h}'

    roi_hyperlink = HTML(
        value=f'<h3>ROI {r.roi_index}:    </h3>', description='')

    hu_hyperlink = HTML(
        value=f'<h3><a href="{dsa_url}" target="_blank">'
                    'HistomicsUI</a></h3>', description='')
    p_hyperlink = HTML(
        value=f'<h3><a href="{pearce_url}" style="color: #29e807" target="_blank">'
                    'Pearce App</a></h3>', description='')

    # Layout: timer row, progress bar, navigation row, links row.
    ui = VBox([
        HBox([timer_bn, timer_desc]),
        progress,
        HBox([prev_bn, next_bn]),
        HBox([roi_hyperlink, hu_hyperlink, p_hyperlink])
    ])
    display(ui)
else:
    print(
        Fore.GREEN, Style.BRIGHT,
        'All ROIs in current iteration checked, train new models.',
        Style.RESET_ALL
    )

[32m [1m All ROIs in current iteration checked, train new models. [0m


## Step 3) Update Labels

In [None]:
# Get ROI labels from DSA and update tile labels.
df = read_csv(df_fp)
tile_df = read_csv(join(dataset_dir, 'tiles.csv'))

iteration = df.iteration.max()
df_iteration = df[df.iteration == iteration]

# Labels are only pulled once every ROI of the iteration has been checked.
if not df_iteration.checked.all():
    raise Exception('Not all ROIs have been checked, please use interactive.')

print(Fore.BLUE, Style.BRIGHT, 'Updating ROI & Tile Labels', Style.RESET_ALL)

for i, r in tqdm(df_iteration.iterrows(), total=len(df_iteration)):
    # Get annotation document.
    doc_name = f'iteration{iteration}_ROI{r.roi_index}'

    label_lines = []

    for doc in get_annotations_documents(gc, r.wsi_id, docs=[doc_name]):
        for el in doc['annotation']['elements']:
            # Elements may have no 'group' key at all; those (and groups
            # that are not a known label) are skipped instead of raising
            # a KeyError.
            group = el.get('group')

            if group not in label_map:
                continue

            # Shift the center by the ROI's x / y and normalize center and
            # size by the ROI width / height.
            xc, yc = el['center'][:2]
            xc = (xc - r.x) / r.w
            yc = (yc - r.y) / r.h
            bw, bh = el['width'] / r.w, el['height'] / r.h

            label_lines.append(
                f'{label_map[group]} {xc:.6f} {yc:.6f} {bw:.6f} {bh:.6f}'
            )

    label_fp = im_to_txt_path(r.fp)

    if label_lines:
        with open(label_fp, 'w') as fh:
            fh.write('\n'.join(label_lines))
    elif isfile(label_fp):
        # No more labels delete
        delete_file(label_fp)

    update_roi_tile_labels(tile_df[tile_df.roi_fp == r.fp], label_fp,
                           (r.w, r.h), 0.5)

    df.loc[i, 'labels_updated'] = True

df.to_csv(df_fp, index=False)

In [None]:
# Report the total logged time of each iteration: every timer log holds one
# integer per line, and their sum is treated as a number of seconds.
timer_log_fps = glob(join(dataset_dir, 'timer-logs/*.txt'))
timer_log_fps.sort()

for log_fp in timer_log_fps:
    with open(log_fp, 'r') as fh:
        total_seconds = sum(int(entry.strip()) for entry in fh.readlines())

    total_minutes = int(total_seconds / 60)
    hours = int(total_minutes / 60)
    minutes = total_minutes % 60

    # The log file name (without directory) identifies the iteration.
    iteration = get_filename(log_fp)

    print(f'(Iteration {iteration}) Time taken: {hours} hour(s) & {minutes}'
          ' minutes.')