# Results: Extra Analysis
Analyses that are not included in the other results notebooks.

In [None]:
# Imports
from pandas import read_csv, DataFrame, read_excel
from tqdm.notebook import tqdm
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt
from ipywidgets import interact, Dropdown, IntSlider
import random
import seaborn as sns
from ipywidgets import interact, Dropdown

from os import makedirs
from os.path import join, splitext, isfile

from nft_helpers.utils import (
    load_yaml, imread, im_to_txt_path, get_filename, imwrite
)
from nft_helpers.girder_dsa import login, get_tile_metadata
from nft_helpers.yolov5.utils import read_yolo_label
from nft_helpers.roi_utils import read_roi_txt_file, line_to_xys

# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Project configuration, loaded with the helper's default arguments.
cf = load_yaml()

# Create a directory to save files to.
save_dir = join(cf.datadir, 'results/extras')
makedirs(save_dir, exist_ok=True)

In [None]:
# Authenticate client.
# The API URL is joined with an explicit '/' rather than os.path.join, which
# inserts the OS path separator and is therefore not portable for URLs.
gc = login(
    cf.dsaURL.rstrip('/') + '/api/v1',
    username=cf.user,
    password=cf.password,
)

## Cohort Information

In [None]:
# Create the table as a dataframe.
def _count_with_percent(count, total):
    """Format a count as 'count (xx.xx%)' of total, or '-' when count is 0."""
    if not count:
        return '-'

    return f'{count} ({count / total * 100:.2f}%)'


cases_df = read_csv('csvs/cases.csv').fillna('')  # case metadata

# One dataframe of cases per cohort.
cohorts = ['Inference-Cohort-1', 'Inference-Cohort-2', 'External-Cohort']
cohort_dfs = {name: cases_df[cases_df.cohort == name] for name in cohorts}

# The table is built as a list of rows; the first row holds the display name
# of each cohort, in the same order as `cohorts`.
cohorts_df = [
    ['', '', 'Emory-Train', 'Emory-Test', 'UC Davis'],
    ['Demographics', '', '', '', ''],
]

# Demographics: total number of cases with the male / female split.
row = ['', 'Number of cases (M/F)']

for cohort in cohorts:
    counts = cohort_dfs[cohort].sex.value_counts()
    m, f = counts.get('male', 0), counts.get('female', 0)
    row.append(f'{counts.sum()} ({m}/{f})')

cohorts_df.append(row)

# Demographics: mean age at death; ages recorded as '90+' are counted as 90.
row = ['', 'Average age at death (standard deviation)']

for cohort in cohorts:
    age_at_death = cohort_dfs[cohort].age_at_death.replace('90+', 90)
    age_at_death = age_at_death.astype(int)
    row.append(f'{age_at_death.mean():.2f} ({age_at_death.std():.2f})')

cohorts_df.append(row)

# Demographics: one row per race / ethnicity.
cohorts_df.append(['', 'Race/Ethnicity:', '', '', ''])

for race in (
    'Caucasian', 'Black / African American', 'Hispanic', 'Asian', 'unknown'
):
    row = ['', f'  {race}']

    for cohort in cohorts:
        counts = cohort_dfs[cohort].race.value_counts()
        row.append(_count_with_percent(counts.get(race, 0), counts.sum()))

    cohorts_df.append(row)

# Braak stage: one row per stage, labelled with Roman numerals.
cohorts_df.append(['Braak NFT Stage', '', '', '', ''])

stage_map = {
    '0': '0',
    '1': 'I',
    '1-2': 'I-II',
    '2': 'II',
    '3': 'III',
    '4': 'IV',
    '5': 'V',
    '6': 'VI',
}

for stage, rstage in stage_map.items():
    row = ['', rstage]

    for cohort in cohorts:
        stages = cohort_dfs[cohort].Braak_stage.value_counts()
        count = int(stages.get(stage, 0))
        row.append(_count_with_percent(count, stages.sum()))

    cohorts_df.append(row)

# Antibody distribution, counted per WSI rather than per case.
cohorts_df.append(['Tau Antibody (WSI counts)', '', '', '', ''])

ab_df = read_csv('csvs/Tau antibody for Emory WSIs.csv')

for ab in ['PHF-1', 'AT8', 'CP13', 'Accurate']:
    row = ['', ab]

    for cohort in ['Emory-Train', 'Emory-Holdout']:
        ab_cohort = ab_df[ab_df.Cohort == cohort]
        ab_count = len(ab_cohort[ab_cohort['Antibody Info'] == ab])
        row.append(_count_with_percent(ab_count, len(ab_cohort)))

    # NOTE(review): the third cohort column is a hard-coded literal, not
    # computed from data - confirm the count whenever the cohort changes.
    row.append('92 (100%)' if ab == 'AT8' else '-')

    cohorts_df.append(row)

# Build into dataframe - save to file.
cohorts_df = DataFrame(cohorts_df, columns=['', '', 'Cohorts', '', ''])
cohorts_df.to_csv(join(save_dir, 'Cohort Information.csv'), index=False)
cohorts_df

## Example Pre-NFT & iNFT Annotations 

In [None]:
# Examples of Pre-NFT / iNFT
def rgb_to_rgba(img: np.ndarray, alpha: int = 255) -> np.ndarray:
    """Convert an RGB image to an RGBA image by adding an alpha channel.

    The three input channels are copied unchanged and in the same order; a
    constant fourth channel is appended.

    Args:
        img: Three-channel image of shape (height, width, 3).
        alpha: Alpha value to add to all pixels. 0 is transparent and 255 is
            non-transparent.

    Returns:
        New array of shape (height, width, 4) with the same dtype as `img`.

    Raises:
        ValueError: If `img` is not a three-channel image.

    """
    if img.ndim != 3 or img.shape[2] != 3:
        raise ValueError(
            f'Expected an image of shape (height, width, 3), got {img.shape}.'
        )

    # Constant alpha plane, matching the image dtype so stacking does not
    # promote the colour channels to another type.
    a_channel = np.full(img.shape[:2], alpha, dtype=img.dtype)

    # dstack allocates a new contiguous array; the input is left untouched.
    return np.dstack((img, a_channel))


# Load the annotations and keep only the rows whose annotator is 'expert1'.
ann_df = read_csv('csvs/annotations.csv')
ann_df = ann_df.loc[ann_df.annotator.eq('expert1')]


def select_nft_type(nft_type):
    """Show a slider over the annotations of the selected NFT class.

    Args:
        nft_type: Annotation label to display ('Pre-NFT' or 'iNFT' in the
            dropdown below).

    """
    # Box colour (with alpha) depends on the class: one colour for iNFT and
    # another for every other label.
    if nft_type == 'iNFT':
        color = (255, 0, 0, 255)
    else:
        color = (0, 0, 255, 255)

    nft_ann_df = ann_df[ann_df.label == nft_type]

    def show_image(i):
        """Plot the i-th annotation of the selected class and save the plot."""
        ann = nft_ann_df.iloc[i]

        # Read the image with an alpha channel.
        img = rgb_to_rgba(imread(ann.im_path))

        # Shift the box by the image's left / top offsets before drawing it.
        box = line_to_xys(ann.box_coords) - [ann.im_left, ann.im_top]
        img = cv.rectangle(img, box[0, :], box[1, :], color, 2)

        # Zoom in by cropping a fixed 200 x 200 window (rows / columns
        # 150-350) - presumably the centre of the image; TODO confirm size.
        img = img[150:350, 150:350, :]

        plt.figure(figsize=(5, 5))
        plt.imshow(img)
        plt.axis('off')

        # The file name only depends on the class, so every newly selected
        # image overwrites the previously saved one.
        out_fp = join(save_dir, f'{nft_type} sample.png')
        plt.savefig(out_fp, bbox_inches='tight', dpi=300)
        plt.show()

    slider = IntSlider(min=0, max=len(nft_ann_df) - 1, continuous_update=False)
    _ = interact(show_image, i=slider)


_ = interact(select_nft_type, nft_type=Dropdown(options=['Pre-NFT', 'iNFT']))

## Supplementary File 1: WSI Information.
Include more detail on each case by providing info on all WSIs.

In [None]:
# Compile WSI metadata into a single dataframe / table.
save_fp = 'csvs/Supplementary File 1.csv'

if isfile(save_fp):
    # A previously compiled table exists - reuse it instead of rebuilding.
    wsi_metadata = read_csv(save_fp)
else:
    # Only WSIs from these three cohorts are included (the annotated cohort
    # is excluded - same as the inference cohort 1).
    wsis = read_csv('csvs/wsis.csv').fillna('')
    cohorts = ['Inference-Cohort-1', 'Inference-Cohort-2', 'External-Cohort']
    wsis = wsis[wsis.cohort.isin(cohorts)]

    # Marla info on antibodies for Emory.
    marla_ab_df = read_excel('csvs/Emory-antibody-info-MG.xlsx').fillna('')

    # Case metadata, keyed by case ID for direct lookup.
    cases = read_csv('csvs/cases.csv').fillna('')
    cases = {r.case: r for _, r in cases.iterrows()}

    # Read antibody info.
    # ab_df = read_csv('csvs/wsis-antibody-info.csv').fillna('')
    # ab_map = {r.Filename: r.Antibody for _, r in ab_df.iterrows() if r.Antibody in \
    #           ('PHF-1', 'AT8', 'CP13', 'ACC')}

    wsi_metadata = []

    for _, r in tqdm(wsis.iterrows(), total=len(wsis)):
        # Look for antibody: prefer the 'Antibody Info' column and fall back
        # to 'Antibody' when it is blank.
        ab_info = marla_ab_df[marla_ab_df.Filename == r.wsi_name]

        if len(ab_info):
            ab_info = ab_info.iloc[0]

            if ab_info['Antibody Info']:
                ab = ab_info['Antibody Info']
            else:
                ab = ab_info.Antibody
        elif r.cohort == 'External-Cohort':
            # External cohort WSIs are not in the spreadsheet; AT8 is used.
            ab = 'AT8'
        else:
            raise Exception(f'Antibody info missing for {r.wsi_name}')

        # Display name of the cohort.
        if r.cohort == 'Inference-Cohort-1':
            cohort = 'Emory (Train)'
        elif r.cohort == 'Inference-Cohort-2':
            cohort = 'Emory (Test)'
        else:
            cohort = 'UC Davis'

        ts = get_tile_metadata(gc, r.wsi_id)
        sizeX, sizeY = ts['sizeX'], ts['sizeY']
        mag = ts['magnification']

        # The case record must be looked up BEFORE its fields are read.
        # Reading them first raised NameError on the first WSI and used the
        # previous WSI's case for every following one.
        case = cases[r.case]

        # Blank values (from fillna('')) are kept blank, others become ints.
        abc = int(case.ABC) if case.ABC != '' else ''
        thal = int(case.Thal) if case.Thal != '' else ''

        wsi_metadata.append([
            cohort, r.case, r.wsi_name, r.region, mag,
            # Column header is 'Height x width', so sizeY (height) goes first.
            f'{sizeY} x {sizeX}',
            splitext(r.wsi_name)[-1][1:].upper(), case.Braak_stage, thal,
            abc, case.age_at_death, case.race, case.sex, case.Clinical_Dx,
            # mm_x / mm_y are multiplied by 1000 to report microns.
            case.Primary_NP_Dx, ab, ts['mm_x'] * 1000, ts['mm_y'] * 1000
        ])

    wsi_metadata = DataFrame(
        wsi_metadata,
        columns=[
            'cohort', 'Case ID', 'WSI Name', 'Brain Region',
            'Scanned Magnification', 'Height x width in pixels', 'File Type',
            'Braak NFT Stage', 'Thal phase', 'ABC score', 'Age at Death',
            'Race/Ethnicity', 'Sex', 'Clinical Diagnosis',
            'Primary Neuropathology Diagnosis', 'Tau Antibody',
            'microns / pixel (horizontal)', 'microns / pixel (vertical)'
        ]
    )

    # Report 'likely PHF-1' entries as 'PHF-1'.
    wsi_metadata['Tau Antibody'] = wsi_metadata['Tau Antibody'].replace(
        {'likely PHF-1': 'PHF-1'}
    )

    # Cache the table so the DSA server is not queried again on re-runs.
    wsi_metadata.to_csv(save_fp, index=False)
    
print('Sample data:')
display(wsi_metadata.sample(n=5))

# Report the resolution for each cohort.
print(
    '\nAverages (with standard deviations) of resolutions by cohort in '
    'microns per pixel:'
)
for cohort in wsi_metadata.cohort.unique():
    cohort_rows = wsi_metadata[wsi_metadata.cohort == cohort]

    # x is the horizontal axis and y the vertical one. These two columns were
    # previously swapped, so the report printed "y x x" under an "x x y" label.
    um_x = cohort_rows['microns / pixel (horizontal)']
    um_y = cohort_rows['microns / pixel (vertical)']

    print(f'   Cohort {cohort}: {um_x.mean():.2f} x {um_y.mean():.2f} '
          f'(± {um_x.std():.2f} x {um_y.std():.2f})')
    
# Report for all and also for the cohort
print('\nAntibodies in Emory Cohorts:')

# Every group is a list, so no str -> list patching is needed. The loop
# variable is deliberately not named `cohorts`, so the notebook-level cohort
# list defined in other cells is not overwritten.
for cohort_group in (
    ['Emory (Train)', 'Emory (Test)'], ['Emory (Train)'], ['Emory (Test)']
):
    in_group = wsi_metadata.cohort.isin(cohort_group)
    counts = wsi_metadata[in_group]['Tau Antibody'].value_counts()

    print(f'  Cohorts: {cohort_group}')

    for k, v in counts.items():
        print(f'    {k} (n={v})')
    print(f'  Total N={counts.sum()}\n')

## Average Size of ROIs.

In [None]:
# Read the ROI data and subset it to the ROIs of interest.
rois_df = read_csv('csvs/rois.csv')

# Annotated-cohort ROIs of group 'ROIv3', from any annotator.
# NOTE(review): an earlier filter (annotator 'expert1', group 'ROIv2') was
# assigned to the same variable and immediately overwritten by this one, so
# it never had any effect and was removed. Confirm 'ROIv3' is the intended
# subset.
iaa_df = rois_df[
    (rois_df.cohort == 'Annotated-Cohort')
    & (rois_df.roi_group == 'ROIv3')
]

# size_arr = []
# sizes = ''

# for _, r in tqdm(iaa_df.iterrows():
#     tile_metadata = get_tile_metadata(gc, r.wsi_id)
    
#     # Convert the ROI width and height to millemters
#     w, h = r.roi_width, r.roi_height
    
#     # Get the scale factor
#     w = w * tile_metadata['mm_x'] * 1000 # to microns
#     h = h * tile_metadata['mm_y'] * 1000
    
#     w, h = sorted([w, h])
    
#     size_arr.append([w, h])
#     sizes += f'{w:.0f}x{h:.0f}\n'
    
# size_arr = np.array(size_arr)
# w, h = np.mean(size_arr, axis=0)

# print(f'Average size of ROI in microns: {w:.0f} x {h:.0f} microns')

## Tiling Process

In [None]:
# Read ROI info.
def _save_and_show(img, filename, figsize=None):
    """Plot img without axes, save it in save_dir, then display it.

    A new figure is only created when figsize is given; otherwise the image
    is drawn on the current figure.
    """
    if figsize is not None:
        plt.figure(figsize=figsize)

    plt.imshow(img)
    plt.axis('off')
    plt.savefig(join(save_dir, filename), bbox_inches='tight', dpi=300)
    plt.show()


tiles_df = read_csv('/workspace/data/datasets/annotator-datasets/tiles.csv')

# Use the ROI of the first tile row and keep only the tiles of that ROI.
roi_info = tiles_df.iloc[0]
roi_fp = roi_info.roi_fp
tiles_df = tiles_df[tiles_df.roi_fp == roi_fp]

# Read the ROI image and draw its labelled boxes; the box colour depends on
# whether the label is truthy.
roi_img = imread(roi_fp)

for lb, x1, y1, x2, y2 in read_roi_txt_file(im_to_txt_path(roi_fp)):
    box_color = (255, 0, 0) if lb else (0, 0, 255)
    roi_img = cv.rectangle(roi_img, (x1, y1), (x2, y2), box_color, 10)

_save_and_show(roi_img, 'sample-roi-with-labels.png', figsize=(5, 10))

# Draw overlapping grids: 1280 px squares whose corners are 960 px apart.
grid_xs = np.arange(0, roi_img.shape[1], 960)
grid_ys = np.arange(0, roi_img.shape[0], 960)

for x in grid_xs:
    for y in grid_ys:
        roi_img = cv.rectangle(
            roi_img, (x, y), (x + 1280, y + 1280), (0, 0, 0), 10
        )

_save_and_show(roi_img, 'sample-roi-with-grids.png', figsize=(5, 10))

# Save one sampled tile image with labels; the fixed random_state makes the
# same tile get picked on every run.
tile_info = tiles_df.sample(n=1, random_state=64).iloc[0]
tile_img = imread(tile_info.fp)

tile_boxes = read_yolo_label(
    im_to_txt_path(tile_info.fp), im_shape=(1280, 1280), convert=True
)

for box in tile_boxes:
    lb, x1, y1, x2, y2 = box.astype(int)
    box_color = (255, 0, 0) if lb else (0, 0, 255)
    tile_img = cv.rectangle(tile_img, (x1, y1), (x2, y2), box_color, 10)

_save_and_show(tile_img, 'sample-tile-with-labels.png')

## Consensus Labeling
Choose a single ROI from the consensus labeling set. Draw the labels created from different *n* consensus values.

In [None]:
# ROI info - choose a single ROI to show that has NFTs and is not rotated.
dataset_dir = join(cf.datadir, 'datasets/model-assisted-labeling')
rois_df = read_csv(join(dataset_dir, 'rois.csv'))

rois_df = rois_df[rois_df.wsi_id == '638147727f8a5e686a53837b']

if rois_df.empty:
    raise ValueError('No ROIs found for WSI 638147727f8a5e686a53837b.')

# `roi_meta` was used below without ever being assigned, which raises
# NameError on a fresh kernel.
# NOTE(review): the first ROI of the selected WSI is used - confirm this is
# the ROI intended for the figure (has NFTs, not rotated).
roi_meta = rois_df.iloc[0]

raw_img = imread(roi_meta.fp)
h, w = raw_img.shape[:2]

# Label files share the ROI image's file name, with a .txt extension.
fn = get_filename(roi_meta.fp) + '.txt'

# Get the large image metadata to draw scale bar.
ts = get_tile_metadata(gc, roi_meta.wsi_id)

# Get the pixels for 200 microns.
px = int(200 / (ts['mm_x'] * 1000))

# Draw scale bar: a horizontal line of `px` pixels ending at the right edge,
# 100 pixels above the bottom. Drawn on a copy so raw_img stays unmarked.
roi_img = cv.line(raw_img.copy(), (w-px, h-100), (w, h-100), (0, 0, 0), 30)

# imwrite(join(save_dir, 'sample-roi.png'), roi_img)

# Draw the prediction of each expert / novice model on this ROI.
# NOTE: saving is commented out below, so each model_roi is currently built
# and then discarded.
for model in ('expert1', 'expert2', 'expert3', 'expert4', 'expert5', 'novice1',
              'novice2', 'novice3'):
    # Read the prediction file.
    model_roi = raw_img.copy()
    
    for box in read_yolo_label(
        join(dataset_dir, 'rois/predictions', model, fn), 
        im_shape=(w, h), 
        convert=True
    ):
        label, x1, y1, x2, y2 = box[:5].astype(int)
        
        # Each prediction is marked by a circle at the centre of its box.
        xc, yc = int((x2 + x1) / 2), int((y2 + y1) / 2)
        
        color = (255, 0, 0) if label else (0, 0, 255)
        
        model_roi = cv.circle(model_roi, (xc, yc), 100, color, 40)

#     imwrite(join(save_dir, f'sample-roi-{model}-predictions.png'), model_roi)

In [None]:
# Draw consensus images - 1, 4, and 8
pad = 75  # half the side length of the zoomed crop, in pixels
cls = 1   # class label of the detection to zoom in on
i = 2     # index of that detection within its class

for model in ('1', '4', '8'):
    model_roi = roi_img.copy()
    model_box = raw_img.copy()

    label_fp = join(dataset_dir, 'rois/consensus', model, fn)
    boxes = read_yolo_label(label_fp, im_shape=(w, h), convert=True)

    # Box centres grouped by class label.
    box_dict = {0: [], 1: []}

    for box in boxes:
        label, x1, y1, x2, y2 = box[:5].astype(int)
        xc, yc = int((x2 + x1) / 2), int((y2 + y1) / 2)
        box_dict[label].append((xc, yc))

        color = (255, 0, 0) if label else (0, 0, 255)

        # Circles go on the scale-bar image, boxes on the raw image.
        model_roi = cv.circle(model_roi, (xc, yc), 100, color, 40)
        model_box = cv.rectangle(model_box, (x1, y1), (x2, y2), color, 3)
#     imwrite(
#         join(save_dir, f'sample-roi-consensus-{model}-predictions.png'), 
#         model_roi
#     )

    # Nothing to zoom in on when this consensus level has too few detections
    # of the selected class.
    if i >= len(box_dict[cls]):
        continue

    xc, yc = box_dict[cls][i]

    # Crop a (2 * pad) square centred on the detection, shifted as needed so
    # that it stays inside the image.
    x1 = min(max(xc - pad, 0), w - pad * 2)
    y1 = min(max(yc - pad, 0), h - pad * 2)
    x2, y2 = x1 + pad * 2, y1 + pad * 2

    img = model_box[y1:y2, x1:x2]

    plt.imshow(img)
    plt.title(f'Consensus n={model}', fontsize=16)
    plt.axis('off')
    plt.show()

#     imwrite(join(save_dir, f'sample-roi-iNFT-consensus-{model}.png'), img)

## Emory NFT Predictions by Antibody (AT8 vs PHF1)

In [None]:
# Read the imaging features for each case, keeping only the rows whose
# dataset is 'train' or 'Emory test'.
fts_fp = join(cf.datadir, 'results/wsi-inference/inference-features.csv')
fts_df = read_csv(fts_fp)
fts_df = fts_df[fts_df.dataset.isin(('train', 'Emory test'))]

# Create a map to know for each case which regions are stained with AT8 or PHF1.
ab_df = read_csv('csvs/Tau antibody for Emory WSIs.csv')

# Add region to each WSI, looked up by file name in the WSI table.
wsis_df = read_csv('csvs/wsis.csv')

for i, filename in ab_df.Filename.items():
    matches = wsis_df[wsis_df.wsi_name == filename]
    ab_df.loc[i, 'region'] = matches.iloc[0].region

# Left and right hippocampus are reported together as a single region.
ab_df = ab_df.replace(
    dict.fromkeys(('Right hippocampus', 'Left hippocampus'), 'Hippocampus')
)


def plot_ab_by_regions(region):
    """Plot Pre-NFT and iNFT densities by Braak stage, split by antibody.

    One box plot is drawn per NFT class, side by side and with a shared
    y-axis range. Only cases whose WSI for the region has antibody 'AT8' or
    'PHF-1' are plotted; cases without an antibody record for the region are
    skipped.

    Args:
        region: Brain region name. It selects the rows of `ab_df` and the
            '<class> density (<region>)' columns of `fts_df`.

    """
    # Antibody records of this region only (invariant across cases).
    region_ab_df = ab_df[ab_df.region == region]

    plot_df = []

    for _, r in fts_df.iterrows():
        # Get the antibody for this region and case.
        case_ab = region_ab_df[region_ab_df.Case == r.case]

        if case_ab.empty:
            # No antibody record for this case in this region; indexing the
            # empty selection would raise IndexError.
            continue

        ab = case_ab.iloc[0]['Antibody Info']

        if ab not in ('AT8', 'PHF-1'):
            continue

        # One row per NFT class.
        for label in ('Pre-NFT', 'iNFT'):
            plot_df.append([
                r.dataset, r.case, r.stage, r.age, r.sex, label, ab,
                r[f'{label} density ({region})']
            ])

    plot_df = DataFrame(
        plot_df,
        columns=[
            'dataset', 'case', 'stage', 'age_at_death', 'sex', 'label',
            'antibody', 'density'
        ]
    )

    if plot_df.empty:
        # The y-axis limit below cannot be computed from an empty table.
        print(f'No AT8 / PHF-1 cases to plot for region: {region}')
        return

    # For this region plot two subplots - one for each NFT class.
    fig = plt.figure(figsize=(12, 4))
    y_max = plot_df['density'].max()

    for i, cls in enumerate(('Pre-NFT', 'iNFT')):
        df = plot_df[plot_df.label == cls]

        fig.add_subplot(1, 2, i+1)

        ax = plt.gca()
        sns.boxplot(data=df, y='density', x='stage', hue='antibody', ax=ax)

        # Format figure.
        plt.xlabel('Braak Stage', fontsize=16, fontweight='bold')
        plt.ylabel('Density (Object / tissue area)', fontsize=16,
                   fontweight='bold')
        plt.xticks(fontweight='bold', fontsize=12)
        plt.yticks(fontweight='bold', fontsize=12)
        plt.title(cls, fontsize=16, fontweight='bold', y=1.15)
        plt.legend(ncol=3, fontsize=14, bbox_to_anchor=(0.55, 1.15),
                   loc='upper center')
        # Same y range on both subplots so they are directly comparable.
        plt.ylim([0, y_max])
        ax.tick_params(axis='both', which='both', direction='out', length=10,
                       width=2)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.spines['bottom'].set_linewidth(2)
        ax.spines['left'].set_linewidth(2)

    # `ax` is the last subplot here; only its legend is removed, so the
    # figure keeps the single legend of the first subplot.
    ax.get_legend().remove()
    plt.show()


_ = interact(
    plot_ab_by_regions,
    region=Dropdown(options=[
        'Hippocampus', 'Amygdala', 'Temporal cortex', 'Occipital cortex'
    ])
)