# Data Visualizations

In [1]:
%load_ext autoreload
%autoreload 2

import base64
import copy
import itertools
from typing import Union
from xml.dom import minidom

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import umap  # This notebook requires the extras dependencies (cf. README)
from sklearn.decomposition import PCA

from htc.settings import settings
from htc.tissue_atlas.settings_atlas import settings_atlas
from htc.tivita.DatasetSettings import DatasetSettings
from htc.utils.helper_functions import median_table
from htc.utils.visualization import add_figcaption, compress_html

In [2]:
dataset_name = "HeiPorSPECTRAL"
dsettings = DatasetSettings(settings.data_dirs[dataset_name])
df = median_table(dataset_name=dataset_name, annotation_name="all")

# Apply the subject mapping of the dataset to the subject and image names. The replacement is regex-based so that
# mapping keys which only occur as part of a longer name (e.g. inside the image name) are replaced as well
# We do not replace in-place so that the table returned by median_table() is never mutated
subject_mapping = dsettings["subject_mapping"]
df = df.replace({"subject_name": subject_mapping, "image_name": subject_mapping}, regex=True)

target_dir = settings.results_dir / "open_data"
target_dir.mkdir(parents=True, exist_ok=True)

renaming = copy.deepcopy(settings_atlas.labels_paper_renaming)
renaming["kidney_with_Gerotas_fascia"] = "kidney with<br>Gerota's fascia"


def add_symbol_border(svg: str, border_width: int = 20, border_padding: int = 40) -> str:
    """
    Create a variant of an organ symbol which has a black and white border around its circle.

    The first `<circle>` element of the SVG is enlarged and gets a black stroke. A copy of the unmodified circle with
    a white stroke is appended to the same parent node (i.e. it is listed after the enlarged circle).

    Args:
        svg: Content of the SVG file. The `<svg>` node must have a `viewBox` attribute with four integer values and
            the first `<circle>` element must have an integer radius.
        border_width: Stroke width of the black border.
        border_padding: Additional horizontal space for the view box (half of it is added on each side).

    Returns: The modified SVG document as XML string.
    """
    dom = minidom.parseString(svg)

    # Make room for the border
    svg_node = dom.getElementsByTagName("svg")[0]
    minx, miny, width, height = (int(v) for v in svg_node.getAttribute("viewBox").split())
    svg_node.setAttribute("viewBox", f"{minx - border_padding // 2} {miny} {width + border_padding} {height}")

    circle = dom.getElementsByTagName("circle")[0]
    # cloneNode() copies only the element itself (copy.deepcopy() would duplicate the complete document since every
    # node references its parent)
    circle_inner = circle.cloneNode(True)
    circle.parentNode.appendChild(circle_inner)

    circle.setAttribute("r", str(int(circle.getAttribute("r")) + 12))
    circle.setAttribute("stroke", "black")
    circle.setAttribute("stroke-width", str(border_width))

    circle_inner.setAttribute("stroke", "white")
    circle_inner.setAttribute("stroke-width", "15")

    return dom.toxml()


# Load the organ symbol images
label_images = {}
label_images_border = {}
# Prefix that plotly will want when using the string as source
data_uri_prefix = "data:image/svg+xml;base64,"
for label_name in df["label_name"].unique():
    label_number = dsettings["label_ordering"][label_name]

    svg_path = settings.data_dirs[dataset_name] / "extra_label_symbols" / f"Cat_{label_number}_{label_name}.svg"
    # The file only needs to be open for reading the content
    with svg_path.open("r") as f:
        svg = f.read()

    svg_border = add_symbol_border(svg)

    label_images[label_name] = data_uri_prefix + base64.b64encode(svg.encode()).decode()
    label_images_border[label_name] = data_uri_prefix + base64.b64encode(svg_border.encode()).decode()

## PCA

In [3]:
# Opacity of the data points in the embedding plots (the legend entries are added separately without opacity)
opacity = 0.4


def plot_embeddings(df: pd.DataFrame, size_symbols: Union[float, None]) -> go.Figure:
    """
    Create a scatter plot of a two-dimensional embedding where the points are colored by their label.

    Args:
        df: Table with the embedding coordinates in the columns `x` and `y`. The columns `label_name`, `image_name`,
            `annotation_name`, `situs`, `angle` and `repetition` are required as well (color and hover information).
        size_symbols: Size of the organ symbols in data coordinates. For each label, the symbol (with border) is
            centered at the mean position of all points of this label. If None, no symbols are added.

    Returns: The figure with semi-transparent data points and one opaque legend entry per label.
    """
    label_names = df["label_name"].unique()

    fig = px.scatter(
        df,
        x="x",
        y="y",
        color="label_name",
        hover_data=["image_name", "annotation_name", "situs", "angle", "repetition"],
        color_discrete_map=settings_atlas.label_colors,
        labels={"label_name": "label"},
    )

    if size_symbols is not None:
        # The symbol positions must be computed from the table passed to this function (and not from a global
        # variable) so that the symbols always match the plotted points
        for label in label_names:
            df_s = df[df["label_name"] == label]
            fig.add_layout_image(
                source=label_images_border[label],
                xref="x",
                yref="y",
                xanchor="center",
                yanchor="middle",
                x=df_s["x"].mean(),
                y=df_s["y"].mean(),
                sizex=size_symbols,
                sizey=size_symbols,
                layer="above",
            )

    marker_size = 7
    marker_line = dict(width=0.5, color="DarkSlateGrey")
    fig.for_each_trace(lambda t: t.update(name=renaming.get(t.name, t.name)))
    fig.update_layout(width=1000, height=800, template="plotly_white", margin=dict(t=10, b=5, l=5), font_size=15)
    fig.update_traces(marker_size=marker_size, marker_line=marker_line)
    fig.update_traces(opacity=opacity, showlegend=False)

    # Add invisible points to style the legend (basically to remove the opacity)
    for name in label_names:
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="markers",
                marker_size=marker_size,
                marker_line=marker_line,
                marker_color=settings_atlas.label_colors[name],
                name=renaming.get(name, name),
                legendgroup=name,  # Same group as the real points so that the legend selection still works
            )
        )

    return fig


def add_opacity_slider(fig: go.Figure, initial_opacity: Union[float, None] = None) -> go.Figure:
    """
    Add a slider to the figure which changes the opacity of all traces in steps of 0.05.

    Args:
        fig: The figure which receives the slider (modified in-place).
        initial_opacity: Opacity value at which the slider handle is positioned initially (the closest available
            step is used). This should be the opacity with which the traces are drawn since the slider does not
            change any trace until it is moved. If None, the global `opacity` is used.

    Returns: The same figure object which was passed in.
    """
    if initial_opacity is None:
        initial_opacity = opacity

    # Create and add slider
    opacity_range = np.arange(0, 1 + 0.05, 0.05)
    steps = [{"method": "restyle", "args": [{"opacity": o}], "label": f"{o:.02f}"} for o in opacity_range]

    opacity_slider = {
        # Closest step instead of an exact lookup since the range values are subject to floating point errors
        "active": int(np.abs(opacity_range - initial_opacity).argmin()),
        "currentvalue": {"prefix": "Opacity: "},
        "steps": steps,
    }
    fig.update_layout(sliders=[opacity_slider])

    return fig


df_full = df.copy()
spectra = np.stack(df_full["median_normalized_spectrum"])

# Global PCA (fitted on the spectra of all labels)
pca = PCA(n_components=2, random_state=0)
pca.fit(spectra)
spectra_pca = pca.transform(spectra)
df_full["x"] = spectra_pca[:, 0]
df_full["y"] = spectra_pca[:, 1]

# The explained variance is reported in the axis titles and in the caption
pc1_ratio, pc2_ratio = pca.explained_variance_ratio_
axis_style = dict(title_standoff=0, showticklabels=True)

# Static figure with the organ symbols
fig = plot_embeddings(df_full, size_symbols=0.0045)
fig.update_xaxes(title=f"PC1 ({pc1_ratio:.2f})", **axis_style)
fig.update_yaxes(title=f"PC2 ({pc2_ratio:.2f})", **axis_style)
fig.write_image(target_dir / "Figure_04_PCA.pdf")

# The interactive figure does not work well with the legend symbols since they become super large when zooming in
# Hence, we just leave them out (the label name can also be retrieved by hovering over the points)
fig = plot_embeddings(df_full, size_symbols=None)
fig.update_xaxes(title=f"PC1 ({pc1_ratio:.2f})", **axis_style)
fig.update_yaxes(title=f"PC2 ({pc2_ratio:.2f})", **axis_style)
add_opacity_slider(fig)

html = add_figcaption(
    fig.to_html(full_html=False, include_plotlyjs=True, div_id="PCA_labels"),
    title="Visualization of spectral similarity",
    # The numbers in the caption are taken from the fitted PCA so that they cannot diverge from the plotted data
    caption=(
        "<strong>Figure 4 | Visualization of spectral similarity</strong> with PCA as a linear multi-dimensionality"
        f" reduction tool with a cumulative explained variance of {pc1_ratio + pc2_ratio:.2f} ({pc1_ratio:.2f} for x;"
        f" {pc2_ratio:.2f} for y). Each point represents the median spectrum across one annotation of one organ in one"
        " image of one pig. The organ symbols are placed at the centroid of the corresponding organ distribution."
    ),
)
compress_html(target_dir / "Figure_04_PCA.html", html)

## PCA stratified by label

In [4]:
# Collect the PCA coordinates together with the meta information label by label (the resulting table is hence
# grouped by label in the order of first appearance)
column_names = [
    "pca1",
    "pca2",
    "pca1_explained",
    "pca2_explained",
    "label_name",
    "annotation_name",
    "subject_name",
    "situs",
    "angle",
    "repetition",
]
rows = {name: [] for name in column_names}
labels = df["label_name"].unique()
for label in labels:
    df_label = df[df["label_name"] == label]
    spectra = np.stack(df_label["median_normalized_spectrum"])
    n_samples = len(spectra)

    # We use the global PCA here. Alternatively, a PCA per organ could be fitted at this point, e.g. via
    # PCA(n_components=2, whiten=True, random_state=1337).fit(spectra)
    spectra_pca = pca.transform(spectra)

    rows["pca1"].extend(spectra_pca[:, 0].tolist())
    rows["pca2"].extend(spectra_pca[:, 1].tolist())
    rows["pca1_explained"].extend([pca.explained_variance_ratio_[0]] * n_samples)
    rows["pca2_explained"].extend([pca.explained_variance_ratio_[1]] * n_samples)
    rows["label_name"].extend([label] * n_samples)

    # Meta information which is copied as is
    for column in ("annotation_name", "subject_name"):
        rows[column].extend(df_label[column].tolist())

    # Numeric meta information is stored as string of the integer value
    for column in ("situs", "angle", "repetition"):
        rows[column].extend(df_label[column].astype(int).astype(str).tolist())

df_pca = pd.DataFrame(rows)
df_pca.head()

Unnamed: 0,pca1,pca2,pca1_explained,pca2_explained,label_name,annotation_name,subject_name,situs,angle,repetition
0,0.001079,0.001739,0.442632,0.386527,stomach,polygon#annotator1,P01,1,0,1
1,0.00017,0.001817,0.442632,0.386527,stomach,polygon#annotator2,P01,1,0,1
2,-9.3e-05,0.001583,0.442632,0.386527,stomach,polygon#annotator3,P01,1,0,1
3,0.000821,0.001574,0.442632,0.386527,stomach,polygon#annotator1,P01,1,0,2
4,-0.000129,0.00188,0.442632,0.386527,stomach,polygon#annotator2,P01,1,0,2


In [5]:
cols = ["subject_name", "annotation_name", "situs", "angle", "repetition"]
# All ordered pairs of two different attributes (first entry for the color, second entry for the symbol)
combinations = list(itertools.permutations(cols, 2))
combinations

[('subject_name', 'annotation_name'),
 ('subject_name', 'situs'),
 ('subject_name', 'angle'),
 ('subject_name', 'repetition'),
 ('annotation_name', 'subject_name'),
 ('annotation_name', 'situs'),
 ('annotation_name', 'angle'),
 ('annotation_name', 'repetition'),
 ('situs', 'subject_name'),
 ('situs', 'annotation_name'),
 ('situs', 'angle'),
 ('situs', 'repetition'),
 ('angle', 'subject_name'),
 ('angle', 'annotation_name'),
 ('angle', 'situs'),
 ('angle', 'repetition'),
 ('repetition', 'subject_name'),
 ('repetition', 'annotation_name'),
 ('repetition', 'situs'),
 ('repetition', 'angle')]

In [6]:
def create_figure(color_name: str, symbol_name: str) -> go.Figure:
    """
    Create a faceted scatter plot of the PCA coordinates in `df_pca` with one facet per label.

    The facets are arranged in a grid with 4 columns. The organ symbol of each label is placed next to its facet at
    hard-coded paper positions which are available for 4 columns and up to 5 rows.

    Note: All traces of the returned figure are hidden (`visible=False`). The caller has to make them visible, e.g.
    via `fig.update_traces(visible=True)`.

    Args:
        color_name: Column of `df_pca` which determines the color of the points.
        symbol_name: Column of `df_pca` which determines the marker symbol of the points.

    Returns: The figure with semi-transparent data points and one opaque legend entry per color/symbol pair.
    """
    color_mapping = dict(zip(sorted(df_pca[color_name].unique()), px.colors.qualitative.Alphabet))
    possible_symbols = [
        "cross",
        "circle",
        "triangle-right",
        "star",
        "diamond-tall",
        "square",
        "circle-cross",
        "triangle-left",
        "triangle-up",
        "triangle-down",
        "diamond-cross",
    ]
    symbol_mapping = dict(zip(sorted(df_pca[symbol_name].unique()), possible_symbols))

    n_cols = 4
    # Round up so that a partially filled last row is counted as well (floor division would drop it and shift all
    # row indices below)
    n_rows = -(-len(labels) // n_cols)
    fig = px.scatter(
        df_pca,
        x="pca1",
        y="pca2",
        color=color_name,
        symbol=symbol_name,
        facet_col="label_name",
        facet_col_wrap=n_cols,
        facet_row_spacing=0.04,  # Exactly 0 does not work (then the setting will be ignored)
        facet_col_spacing=0.02,
        # Does also affect in which order the points are plotted
        category_orders={"subject_name": sorted(df["subject_name"].unique())},
        color_discrete_map=color_mapping,
        symbol_map=symbol_mapping,
        labels={"subject_name": "pig", "annotation_name": "annotation"},
    )
    marker_size = 7
    marker_line = dict(width=0.5, color="DarkSlateGrey")
    fig.update_traces(marker_size=marker_size, marker_opacity=0.4, marker_line=marker_line, showlegend=False)

    # Add invisible points to style the legend (basically to remove the opacity)
    for name_color, color in color_mapping.items():
        for name_symbol, symbol in symbol_mapping.items():
            fig.add_trace(
                go.Scatter(
                    x=[None],
                    y=[None],
                    mode="markers",
                    marker_size=marker_size,
                    marker_line=marker_line,
                    marker_color=color,
                    marker_symbol=symbol,
                    name=f"{name_color}, {name_symbol}",
                    legendgroup=f"{name_color}, {name_symbol}",
                )
            )

    # Positions of the organ symbols in paper coordinates (one value per column and one value per row)
    x_symbol_positions = [0.21, 0.465, 0.72, 0.975]
    y_symbol_positions = [0.195, 0.405, 0.614, 0.821, 1.03]

    for i, label in enumerate(labels):
        # The rows are counted from the bottom, i.e. the first label is in the top row which has the index n_rows
        row = n_rows - (i // n_cols)
        col = i % n_cols + 1
        df_l = df_pca.query("label_name == @label")
        # .item() ensures that there is exactly one explained variance value per label
        xe = df_l["pca1_explained"].unique().item()
        ye = df_l["pca2_explained"].unique().item()

        # Axis titles and tick labels only for the bottom row and the left column
        if row == 1:
            fig.update_xaxes(title=f"PC1 ({xe:.2f})", title_standoff=0, showticklabels=True, row=row, col=col)
        else:
            fig.update_xaxes(showticklabels=False, row=row, col=col)
        if col == 1:
            fig.update_yaxes(title=f"PC2 ({ye:.2f})", title_standoff=0, showticklabels=True, row=row, col=col)
        else:
            fig.update_yaxes(showticklabels=False, row=row, col=col)

        fig.add_layout_image(
            source=label_images[label],
            xref="paper",
            yref="paper",
            x=x_symbol_positions[col - 1],
            y=y_symbol_positions[row - 1],
            sizex=0.04,
            sizey=0.04,
            layer="above",
        )

    # Replace the default facet titles (label_name=...) by the bold paper name of the label
    for annotation in fig.layout.annotations:
        label = annotation.text.removeprefix("label_name=")

        # We have enough space for the Gerota's fascia name
        label = settings_atlas.labels_paper_renaming.get(label, label)
        annotation.text = "<b>" + label + "</b>"

    fig.update_layout(height=1200, width=1400, template="plotly_white", font_size=14)
    fig.update_traces(visible=False)

    return fig


# Combination which is shown when the page is opened (must match the pre-selected options of the dropdown menus in
# the HTML page)
default_selection = ("subject_name", "situs")

figs_html = ""  # HTML code of all figures (one div per combination)
js_init = ""  # JavaScript which sets the initial visibility of the figure divs
js = ""  # JavaScript for the head of the page
for i, (color_name, symbol_name) in enumerate(combinations):
    fig = create_figure(color_name, symbol_name)
    div_id = f"{color_name}#{symbol_name}"
    if (color_name, symbol_name) == default_selection:
        # Only the traces of the default figure are visible from the start (the other figures are made visible on
        # selection)
        fig.update_traces(visible=True)
        js_init += f"document.getElementById('{div_id}').style.display = 'block';"
        js += f"window.previous_color = '{color_name}';"
        js += f"window.previous_symbol = '{symbol_name}';"
    else:
        js_init += f"document.getElementById('{div_id}').style.display = 'none';"

    # All figures end up in the same page so it is sufficient if the first figure loads plotly.js (otherwise, the
    # library would be loaded and executed once per figure)
    include_plotlyjs = "cdn" if i == 0 else False
    figs_html += fig.to_html(include_plotlyjs=include_plotlyjs, full_html=False, div_id=div_id)

# Client-side logic of the interactive figure:
# - selectionChanged(): makes the traces of the figure for the currently selected color/symbol combination visible,
#   shows its div and hides the div of the previously shown figure
# - colorChanged()/symbolChanged(): disable the newly selected attribute in the other dropdown menu (and re-enable the
#   previously selected one) so that the same attribute cannot be selected for color and symbol at the same time
# - progress bar: every 500 ms, the number of figure divs which already exist is set in relation to the total number
#   of figures (the %i placeholder is filled with len(combinations)). As soon as the document is loaded, the progress
#   bar is removed and the plot is shown
js += """
function selectionChanged() {
    const color = document.getElementById("color_name").value;
    const symbol = document.getElementById("symbol_name").value;
    const name = color + "#" + symbol;
    const previous_name = previous_color + "#" + previous_symbol;

    Plotly.restyle(document.getElementById(name), {visible: true});
    document.getElementById(name).style.display ='block';
    document.getElementById(previous_name).style.display ='none';

    window.previous_color = color;
    window.previous_symbol = symbol;
}
function colorChanged(selection) {
    let symbol_previous_partner = document.querySelector("#symbol_name > option[value='" + window.previous_color + "']");
    let symbol_partner = document.querySelector("#symbol_name > option[value='" + selection.value + "']");
    symbol_previous_partner.removeAttribute("disabled");
    symbol_partner.setAttribute("disabled", "disabled");

    selectionChanged();
}
function symbolChanged(selection) {
    let color_previous_partner = document.querySelector("#color_name > option[value='" + window.previous_symbol + "']");
    let color_partner = document.querySelector("#color_name > option[value='" + selection.value + "']");
    color_previous_partner.removeAttribute("disabled");
    color_partner.setAttribute("disabled", "disabled");

    selectionChanged();
}

// Progress bar
window.progressCheck = window.setInterval(function(){
    const n_divs = document.querySelectorAll("#graphs > div").length
    document.getElementById("progress_graphs").value = n_divs / %i;
}, 500);
document.addEventListener('DOMContentLoaded', function() {
    document.getElementById("progress").remove();
    clearInterval(window.progressCheck);
    document.getElementById("plot").style.display = "block";
});
""" % len(combinations)

# Styles for the page: the plot container is hidden initially (it is shown by the JavaScript code once the document is
# loaded) and the figure caption is centered below the graphs
css = """
#plot {
    display: none;
}
figure {
    /* Same font as plotly */
    font-family: "Open Sans", verdana, arial, sans-serif;
    width: min-content;
    margin-bottom: 25px;
}
figure > figcaption {
    margin-left: 15px;
    margin-right: 15px;
}
figcaption {
    text-align: center;
    margin-top: 10px;
}
"""

# Assemble the final page: the script and the styles go into the head, the body contains the progress bar, the two
# dropdown menus (color and symbol attribute) and the divs of all figures. The pre-selected options of the dropdown
# menus correspond to the figure which is shown initially (color: subject_name, symbol: situs) and the option which is
# selected in one menu is disabled in the other menu. The script with the initial visibility of the figure divs
# (js_init) is placed after the figures
compress_html(
    target_dir / "Figure_06_PCA_single_organs.html",
    f"""
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <title>Median Spectra Visualizations</title>
    <script>{js}</script>
    <style>{css}</style>
  </head>
  <body>
    <div id="progress">
        <label for="progress_graphs">Loading. Please wait...</label>
        <progress id="progress_graphs" max="1" value="0"></progress>
    </div>

    <div id="plot">
        <label for="color_name">Choose a color attribute:</label>
        <select name="color_name" id="color_name" onchange="colorChanged(this)">
            <option value="subject_name" selected="selected">pig</option>
            <option value="annotation_name">annotation</option>
            <option value="situs" disabled="disabled">situs</option>
            <option value="angle">angle</option>
            <option value="repetition">repetition</option>
        </select>

        <label for="symbol_name">Choose a symbol attribute:</label>
        <select name="symbol_name" id="symbol_name" onchange="symbolChanged(this)">
            <option value="subject_name" disabled="disabled">pig</option>
            <option value="annotation_name">annotation</option>
            <option value="situs" selected="selected">situs</option>
            <option value="angle">angle</option>
            <option value="repetition">repetition</option>
        </select>

        <figure>
            <div id="graphs">{figs_html}</div>
            <figcaption><strong>Figure 6 | Visualization of spectral similarity</strong> with PCA stratified by organ; each point represents the median spectrum across one annotation of one organ in one image of one pig. Different shapes represent different situs. Different colors represent 11 different pigs.</figcaption>
        </figure>
        <script>{js_init}</script>
    </div>
  </body>
</html>
""",
)

In [7]:
# Static version of the default combination (create_figure() returns hidden traces so they must be shown first)
fig = create_figure(color_name="subject_name", symbol_name="situs").update_traces(visible=True)
fig.write_image(target_dir / "Figure_06_PCA_single_organs.pdf")

## UMAP

In [8]:
df_full = df.copy()
spectra = np.stack(df_full["median_normalized_spectrum"])

# Unsupervised UMAP embedding of the median spectra
reducer = umap.UMAP(min_dist=0.8, spread=1, n_neighbors=50, random_state=0)
spectra_umap = reducer.fit_transform(spectra)
df_full["x"] = spectra_umap[:, 0]
df_full["y"] = spectra_umap[:, 1]

# Static figure with the organ symbols
fig = (
    plot_embeddings(df_full, size_symbols=2.5)
    .update_xaxes(title="UMAP1", title_standoff=0, showticklabels=True)
    .update_yaxes(title="UMAP2", title_standoff=0, showticklabels=True)
)
fig.write_image(target_dir / "Figure_05_UMAP.pdf")

# Interactive figure without the organ symbols but with an opacity slider
fig = add_opacity_slider(
    plot_embeddings(df_full, size_symbols=None)
    .update_xaxes(title="UMAP1", title_standoff=0, showticklabels=True)
    .update_yaxes(title="UMAP2", title_standoff=0, showticklabels=True)
)

figure_html = fig.to_html(full_html=False, include_plotlyjs=True, div_id="UMAP")
html = add_figcaption(
    figure_html,
    title="Visualization of spectral similarity with UMAP",
    caption=(
        "<strong>Figure 5 | Visualization of spectral similarity with UMAP as a non-linear multi-dimensionality"
        " reduction tool.</strong> Each point represents the median spectrum across one annotation of one organ in one"
        " image of one pig of one annotation. The organ symbols are placed at the centroid of the corresponding organ"
        " distribution."
    ),
)
compress_html(target_dir / "Figure_05_UMAP.html", html)

In [9]:
df_full = df.copy()
spectra = np.stack(df_full["median_normalized_spectrum"])

# Supervised UMAP embedding: the label indices are passed as targets to the embedding
reducer = umap.UMAP(min_dist=0.8, spread=1, n_neighbors=50, random_state=0)
spectra_umap = reducer.fit_transform(spectra, y=df_full["label_index"])
df_full["x"] = spectra_umap[:, 0]
df_full["y"] = spectra_umap[:, 1]

# Static figure with the organ symbols
fig = (
    plot_embeddings(df_full, size_symbols=2.5)
    .update_xaxes(title="UMAP1", title_standoff=0, showticklabels=True)
    .update_yaxes(title="UMAP2", title_standoff=0, showticklabels=True)
)
fig.write_image(target_dir / "Supplemental_Figure_09_UMAP_supervised.pdf")

# Interactive figure without the organ symbols but with an opacity slider
fig = add_opacity_slider(
    plot_embeddings(df_full, size_symbols=None)
    .update_xaxes(title="UMAP1", title_standoff=0, showticklabels=True)
    .update_yaxes(title="UMAP2", title_standoff=0, showticklabels=True)
)

figure_html = fig.to_html(full_html=False, include_plotlyjs=True, div_id="UMAP_supervised")
html = add_figcaption(
    figure_html,
    title="Visualization of spectral similarity with supervised UMAP",
    caption=(
        "<strong>Supplemental Figure 9 | Visualization of spectral similarity with supervised UMAP as a non-linear"
        " multi-dimensionality reduction tool.</strong> Organs are used as labels for the embedding. Each point"
        " represents the median spectrum across one annotation of one organ in one image of one pig of one annotation."
        " The organ symbols are placed at the centroid of the corresponding organ distribution."
    ),
)
compress_html(target_dir / "Supplemental_Figure_09_UMAP_supervised.html", html)