In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly as pltly
import plotly.express as px
from plotly.subplots import make_subplots

from pathlib import Path

%load_ext blackcellmagic

In [None]:
# Prefix to visualizations folder
# NOTE(review): the leading "???" looks like a placeholder for a machine-specific root
# directory — replace it with a real path before running. Every read_csv / to_csv /
# write_image call in this notebook builds its path from this prefix.
viz_prefix = "???/deconvolution_benchmarking/visualizations"

In [None]:
# Patient IDs assigned to the training split
train_p_ids = [
    "CID3586",
    "CID3941",
    "CID3963",
    "CID44041",
    "CID4530N",
    "CID3838",
    "CID3946",
    "CID4040",
    "CID4461",
    "CID44991",
    "CID45171",
    "CID4535",
    "CID3948",
    "CID4398",
    "CID4463",
    "CID4495",
    "CID4513",
    "CID4465",
]
# Patient IDs assigned to the test split
test_p_ids = [
    "CID4067",
    "CID4290A",
    "CID4471",
    "CID3921",
    "CID4066",
    "CID4523",
    "CID44971",
    "CID4515",
]

# One single-column frame per split, each row tagged with its split label
train_df = pd.DataFrame({"patient_id": train_p_ids}).assign(**{"train/test": "train"})
test_df = pd.DataFrame({"patient_id": test_p_ids}).assign(**{"train/test": "test"})

# Stack both splits (training rows first) into a lookup table keyed by patient ID
train_test_df = pd.concat([train_df, test_df], axis=0).set_index("patient_id")

In [None]:
# Load the colour palette table (the file is tab-separated despite its .csv extension)
colour_pallete_df = pd.read_csv(
    Path(viz_prefix) / "data" / "Whole_miniatlas_colour_pallete.csv", sep="\t"
)

In [None]:
# Load the lineage table (comma-separated); later cells read its celltype_major,
# celltype_minor, celltype_subset and celltype_subset_short columns
lineages_df = pd.read_csv(
    Path(viz_prefix) / "data" / "Whole_miniatlas_immune_normal_lineages.csv",
    sep=",",
)

In [None]:
# Load the per-cell metadata, using the first CSV column as the row index.
# The row labelled "TYPE" only describes the data type of each column, so it is
# dropped straight away instead of being removed in place afterwards.
meta_df = pd.read_csv(
    Path(viz_prefix) / "data" / "Whole_miniatlas_meta.csv", index_col=0
).drop(index="TYPE")

In [None]:
# Patient -> subtype lookup: keep each distinct (Patient, subtype) pair once and
# key the table by patient ID (the per-cell row labels are discarded)
subtype_df = meta_df[["Patient", "subtype"]].drop_duplicates().set_index("Patient")

## Save cell counts by lineages

In [None]:
# Major cell types: number of cells per patient (rows) and major cell type (columns).
# The counts column is named explicitly because value_counts().reset_index() labels
# it 0 on pandas < 2.0 but "count" on pandas >= 2.0, so pivoting on values=0 only
# works on old pandas releases.
major_ctype_counts_df = (
    meta_df[["Patient", "celltype_major"]]
    .value_counts()
    .reset_index(name="count")
    .pivot(index="Patient", columns="celltype_major", values="count")
    .fillna(0)  # patient / cell-type combinations with no cells
    .sort_index()
)

In [None]:
# Minor cell types: number of cells per patient (rows) and minor cell type (columns).
# The counts column is named explicitly because value_counts().reset_index() labels
# it 0 on pandas < 2.0 but "count" on pandas >= 2.0.
minor_ctype_counts_df = (
    meta_df[["Patient", "celltype_minor"]]
    .value_counts()
    .reset_index(name="count")
    .pivot(index="Patient", columns="celltype_minor", values="count")
    .fillna(0)  # patient / cell-type combinations with no cells
)

# Keep only minor types that are not already a column of the major-level table,
# then normalise the "Cycling_Myeloid" label and order rows by patient ID.
# Chaining (instead of in-place edits on a column subset) avoids mutating a slice.
minor_ctype_counts_df = (
    minor_ctype_counts_df.loc[
        :, ~minor_ctype_counts_df.columns.isin(major_ctype_counts_df.columns)
    ]
    .rename(columns={"Cycling_Myeloid": "Cycling Myeloid"})
    .sort_index()
)

In [None]:
# Subset cell types: number of cells per patient (rows) and subset cell type (columns).
# The counts column is named explicitly because value_counts().reset_index() labels
# it 0 on pandas < 2.0 but "count" on pandas >= 2.0.
subset_ctype_counts_df = (
    meta_df[["Patient", "celltype_subset"]]
    .value_counts()
    .reset_index(name="count")
    .pivot(index="Patient", columns="celltype_subset", values="count")
    .fillna(0)  # patient / cell-type combinations with no cells
    # Replace raw subset labels with the short names listed in lineages_df
    .rename(
        columns=dict(
            zip(lineages_df["celltype_subset"], lineages_df["celltype_subset_short"])
        )
    )
)

# Keep only subset types that are not already a column of the major- or
# minor-level tables, then order rows by patient ID
subset_ctype_counts_df = subset_ctype_counts_df[
    [
        ctype
        for ctype in subset_ctype_counts_df.columns
        if ctype not in major_ctype_counts_df.columns
        and ctype not in minor_ctype_counts_df.columns
    ]
].sort_index()

In [None]:
# Integrated cell types across lineage levels.
# Final layout: rows are (major, minor, subset) cell types, columns are
# (train/test, subtype, patient) tuples turned into a MultiIndex.
integrated_ctype_counts_df = (
    meta_df[["Patient", "celltype_subset"]]
    .value_counts()
    # Name the counts column explicitly: it is labelled 0 on pandas < 2.0 but
    # "count" on pandas >= 2.0, so pivoting on values=0 is version-dependent
    .reset_index(name="count")
    .pivot(index="Patient", columns="celltype_subset", values="count")
    .fillna(0)  # patient / cell-type combinations with no cells
    # Replace raw subset labels with the short names listed in lineages_df
    .rename(
        columns=dict(
            zip(lineages_df["celltype_subset"], lineages_df["celltype_subset_short"])
        )
    )
)

# Extract unique combinations of major/minor/subset cell types.
# Only the (sorted) index of the value counts is needed, so it is converted to a
# frame directly instead of dropping a counts column whose label depends on the
# pandas version.
lineage_mapping_df = (
    lineages_df[["celltype_major", "celltype_minor", "celltype_subset_short"]]
    .value_counts()
    .sort_index()
    .index.to_frame(index=False)
    .rename(
        columns={
            "celltype_major": "major",
            "celltype_minor": "minor",
            "celltype_subset_short": "subset",
        }
    )
)

# Convert "major" column to Categorical so cell types appear in a specific order
lineage_mapping_df["major"] = pd.Categorical(
    lineage_mapping_df["major"],
    categories=[
        "Cancer Epithelial",
        "Normal Epithelial",
        "T-cells",
        "B-cells",
        "Myeloid",
        "Endothelial",
        "CAFs",
        "PVL",
        "Plasmablasts",
    ],
    ordered=True,
)

# Attach train/test and subtype labels to every patient and sort patients
# in descending order of (train/test, subtype, patient ID)
patient_level_counts_df = (
    integrated_ctype_counts_df.merge(train_test_df, left_index=True, right_index=True)
    .merge(subtype_df, left_index=True, right_index=True)
    # Pin the index name so reset_index() always produces a column called
    # "index", whatever name the merges leave on the joined index
    .rename_axis("index")
    .reset_index()
    .set_index(["train/test", "subtype", "index"])
    .sort_index(ascending=False)
)

# Transpose so cell types become rows. The 3-level patient columns are flattened
# to plain tuples first, because pandas 2.x refuses to merge two frames whose
# columns have a different number of levels.
celltype_by_patient_df = patient_level_counts_df.T
celltype_by_patient_df.columns = patient_level_counts_df.index.to_flat_index()

# Look up the major/minor lineage of every subset and index rows by full lineage
integrated_ctype_counts_df = (
    celltype_by_patient_df.merge(
        lineage_mapping_df.set_index(["subset"]), left_index=True, right_index=True
    )
    .rename_axis("index")
    .reset_index()
    .set_index(["major", "minor", "index"])
    .sort_index()
)

# Turn the tuple column labels back into a MultiIndex
integrated_ctype_counts_df.columns = pd.MultiIndex.from_tuples(
    integrated_ctype_counts_df.columns.to_list()
)

## [Fig] Plot major cell types by patient

In [None]:
major_meta_df = meta_df[["Patient", "celltype_major", "subtype"]]

# Column order of the major cell types in the per-patient table
major_ctype_order = [
    "Cancer Epithelial",
    "Normal Epithelial",
    "T-cells",
    "B-cells",
    "Myeloid",
    "Endothelial",
    "CAFs",
    "PVL",
    "Plasmablasts",
]

patient_hybrid_meta_df = (
    # Count cells per (patient, major cell type) ...
    major_meta_df.groupby(["Patient"])["celltype_major"]
    .value_counts(normalize=False)
    # ... spread the cell-type level into columns, 0 where a patient has none
    .unstack(level=-1)
    .fillna(0)
    .loc[:, major_ctype_order]
    # Add each patient's subtype as a categorical with a fixed category order
    .merge(subtype_df, left_index=True, right_index=True)
    .assign(
        subtype=lambda counts: pd.Categorical(
            counts["subtype"], categories=["HER2+", "TNBC", "ER+"]
        )
    )
    # Add the train/test label
    .merge(train_test_df, left_index=True, right_index=True)
    # Order by subtype category, then "train" before "test" within a subtype
    .sort_values(["subtype", "train/test"], ascending=[True, False])
    # Move patient IDs out of the index into a regular "Patient" column
    .reset_index()
    .rename(columns={"index": "Patient"})
)

In [None]:
# Reshape to long format: every cell-type column becomes a (Cell Type, Cell Counts)
# row, while the patient descriptors are repeated on each row
long_format_labels = {"variable": "Cell Type", "value": "Cell Counts"}
pivot_patient_hybrid_meta_df = patient_hybrid_meta_df.melt(
    id_vars=["train/test", "Patient", "subtype"]
).rename(columns=long_format_labels)

In [None]:
# Preview the per-patient major cell-type counts with their split label
# (ascending sort on "train/test" lists test patients before train patients)
(
    patient_hybrid_meta_df.set_index("Patient")
    .drop(columns="subtype")
    .sort_values("train/test")
)

In [None]:
# Total number of cells per major cell type, summed over all patients,
# stored as a one-column "counts" table tagged with its lineage level
major_ctype_counts = (
    patient_hybrid_meta_df.set_index("Patient")
    .drop(columns=["subtype", "train/test"])
    .sum(axis=0)
    .to_frame(name="counts")
    .assign(level="major")
)

In [None]:
# Write the long-format table behind supplementary figure 1a as a TSV file
pivot_patient_hybrid_meta_df.to_csv(
    Path(viz_prefix) / "source_data" / "supp_figure_1a.tsv", sep="\t"
)

In [None]:
# Colour of every cell type, taken from the palette table
celltype_colours = dict(
    zip(colour_pallete_df["all_celltype"], colour_pallete_df["fill"])
)

# Bars of cell counts per patient, coloured by cell type, one facet per split
fig = px.bar(
    pivot_patient_hybrid_meta_df,
    x="Patient",
    y="Cell Counts",
    color="Cell Type",
    color_discrete_map=celltype_colours,
    facet_col="train/test",
    category_orders={"train/test": ["train", "test"]},
)

# Make the lines around the bars very thin
fig.update_traces(marker_line_width=0.01)

# Reverse data order so Cancer is placed at the top of each bar chart
fig.data = tuple(reversed(fig.data))
fig.layout.legend.traceorder = "reversed"

# Tick and axis-line styling shared by the x and y axes
shared_axis_style = dict(
    ticks="outside",
    ticklen=2,
    tickwidth=0.5,
    tickfont_size=7,
    linecolor="black",
    linewidth=0.5,
    title_standoff=5,
)

# Update axes
fig.update_xaxes(
    showticklabels=True,
    title=dict(text="Patient", font_size=8),
    tickangle=45,
    matches=None,
    **shared_axis_style,
)
# showticklabels is deliberately left unset on the y axes: plotly then only
# displays tick labels on the first subplot
fig.update_yaxes(
    showgrid=True,
    gridwidth=0.5,
    gridcolor="lightgray",
    range=[0, 40000],
    title_font_size=8,
    **shared_axis_style,
)

# Resize domain of x axes so training subplot takes up 70% of the plot
fig.update_xaxes(domain=[0.0, 0.69], col=1)
fig.update_xaxes(domain=[0.71, 1], col=2)

# Stack the bars and apply the overall layout (legend is configured but hidden)
fig.update_layout(
    margin=dict(t=0, l=0, r=0, b=0),  # Tight margin
    barmode="stack",
    legend=dict(
        title_font_size=7,
        title="Cell type",
        font_size=10,
        orientation="h",
        yanchor="bottom",
        y=-1.5,
        xanchor="center",
        x=0.5,
    ),
    showlegend=False,
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(size=8, color="black"),
)

# Facet titles: keep only the text after "=" and centre them over each subplot
fig.for_each_annotation(
    lambda facet_title: facet_title.update(text=facet_title.text.rpartition("=")[2])
)
for facet_title, x_centre in zip(fig.layout.annotations, (0.35, 0.86)):
    facet_title.update(x=x_centre, y=0.965)

# Save into svg
fig.write_image(
    Path(viz_prefix) / "figures" / "supp_figures" / "supp_fig_1a.svg",
    width=315,
    height=200,
    scale=5,
)

## [Fig] Plot normal epithelial lineages by patient

In [None]:
# Build a hybrid cell-type label for every cell:
#   - the minor cell type for cells whose major type is Normal Epithelial
#   - the major cell type for everything else
major_normal_ctypes = ["Normal Epithelial"]
is_normal_epithelial = meta_df["celltype_major"].isin(major_normal_ctypes)

normal_subset_df = meta_df.loc[is_normal_epithelial, ["celltype_minor"]].rename(
    columns={"celltype_minor": "celltype_hybrid"}
)
non_normal_subset_df = meta_df.loc[~is_normal_epithelial, ["celltype_major"]].rename(
    columns={"celltype_major": "celltype_hybrid"}
)

# Stack both groups and attach the hybrid label to the metadata by row label
hybrid_celltype_df = pd.concat([normal_subset_df, non_normal_subset_df], axis=0)
normal_lineage_hybrid_meta_df = meta_df.merge(
    hybrid_celltype_df, left_index=True, right_index=True
)

In [None]:
# Column order of the hybrid cell types (Normal Epithelial split into minor types)
normal_hybrid_ctype_order = [
    "Cancer Epithelial",
    "Luminal Progenitors",
    "Mature Luminal",
    "Myoepithelial",
    "T-cells",
    "B-cells",
    "Myeloid",
    "Endothelial",
    "CAFs",
    "PVL",
    "Plasmablasts",
]

patient_hybrid_meta_df = (
    # Count cells per (patient, hybrid cell type) ...
    normal_lineage_hybrid_meta_df[["Patient", "celltype_hybrid", "subtype"]]
    .groupby(["Patient"])["celltype_hybrid"]
    .value_counts(normalize=False)
    # ... spread the cell-type level into columns, 0 where a patient has none
    .unstack(level=-1)
    .fillna(0)
    .loc[:, normal_hybrid_ctype_order]
    # Replace any values <10 by 0 as we dropped these cell types for each patient
    .mask(lambda counts: counts < 10, 0)
    # Add each patient's subtype as a categorical with a fixed category order
    .merge(subtype_df, left_index=True, right_index=True)
    .assign(
        subtype=lambda counts: pd.Categorical(
            counts["subtype"], categories=["HER2+", "TNBC", "ER+"]
        )
    )
    # Add the train/test label
    .merge(train_test_df, left_index=True, right_index=True)
    # Order by subtype category, then "train" before "test" within a subtype
    .sort_values(["subtype", "train/test"], ascending=[True, False])
    # Move patient IDs out of the index into a regular "Patient" column
    .reset_index()
    .rename(columns={"index": "Patient"})
)

In [None]:
# Reshape to long format: every hybrid cell-type column becomes a
# (Cell Type, Cell Counts) row, with the patient descriptors repeated per row
long_format_labels = {"variable": "Cell Type", "value": "Cell Counts"}
pivot_patient_hybrid_meta_df = patient_hybrid_meta_df.melt(
    id_vars=["train/test", "Patient", "subtype"]
).rename(columns=long_format_labels)

In [None]:
# Write the long-format table behind supplementary figure 1c as a TSV file
pivot_patient_hybrid_meta_df.to_csv(
    Path(viz_prefix) / "source_data" / "supp_figure_1c.tsv", sep="\t"
)

In [None]:
# Colour of every cell type, taken from the palette table
celltype_colours = dict(
    zip(colour_pallete_df["all_celltype"], colour_pallete_df["fill"])
)

# Bars of cell counts per patient, coloured by cell type, one facet per split
fig = px.bar(
    pivot_patient_hybrid_meta_df,
    x="Patient",
    y="Cell Counts",
    color="Cell Type",
    color_discrete_map=celltype_colours,
    facet_col="train/test",
    category_orders={"train/test": ["train", "test"]},
)

# Make the lines around the bars very thin
fig.update_traces(marker_line_width=0.01)

# Reverse data order so Cancer is placed at the top of each bar chart
fig.data = tuple(reversed(fig.data))
fig.layout.legend.traceorder = "reversed"

# Tick and axis-line styling shared by the x and y axes
shared_axis_style = dict(
    ticks="outside",
    ticklen=2,
    tickwidth=0.5,
    tickfont_size=7,
    linecolor="black",
    linewidth=0.5,
    title_standoff=5,
)

# Update axes
fig.update_xaxes(
    showticklabels=True,
    title=dict(text="Patient", font_size=9),
    tickangle=45,
    matches=None,
    **shared_axis_style,
)
# showticklabels is deliberately left unset on the y axes: plotly then only
# displays tick labels on the first subplot
fig.update_yaxes(
    showgrid=True,
    gridwidth=0.5,
    gridcolor="lightgray",
    range=[0, 50000],
    dtick=10000,
    title_font_size=8,
    **shared_axis_style,
)

# Resize domain of x axes so training subplot takes up 70% of the plot
fig.update_xaxes(domain=[0.0, 0.69], col=1)
fig.update_xaxes(domain=[0.71, 1], col=2)

# Stack the bars and apply the overall layout (horizontal legend below the plot)
fig.update_layout(
    margin=dict(t=0, l=0, r=0, b=0),  # Tight margin
    barmode="stack",
    legend=dict(
        title_font_size=7,
        title="Cell type",
        font_size=10,
        orientation="h",
        yanchor="bottom",
        y=-0.8,
        xanchor="center",
        x=0.5,
    ),
    showlegend=True,
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(size=8, color="black"),
)

# Facet titles: keep only the text after "=" and centre them over each subplot
fig.for_each_annotation(
    lambda facet_title: facet_title.update(text=facet_title.text.rpartition("=")[2])
)
for facet_title, x_centre in zip(fig.layout.annotations, (0.35, 0.86)):
    facet_title.update(x=x_centre, y=0.965)

# Save into svg
fig.write_image(
    Path(viz_prefix) / "figures" / "supp_figures" / "supp_fig_1c.svg",
    width=315,
    height=250,
    scale=5,
)

## [Fig] Plot immune lineages by patient

In [None]:
# Build a hybrid cell-type label for every cell:
#   - the short subset name for B-cells, T-cells and Myeloid cells
#   - the major cell type for everything else
major_immune_ctypes = ["B-cells", "T-cells", "Myeloid"]
is_immune = meta_df["celltype_major"].isin(major_immune_ctypes)

# Raw subset label -> short annotated name, one row per distinct pair
subset_to_short_df = lineages_df[
    ["celltype_subset", "celltype_subset_short"]
].drop_duplicates()

# Immune cells: look up the short name of each cell's subset (left join, so cells
# whose subset is missing from lineages_df get a missing label), then restore the
# per-cell "NAME" row labels
immune_subset_df = (
    meta_df.loc[is_immune, ["celltype_subset"]]
    .reset_index()
    .merge(subset_to_short_df, on="celltype_subset", how="left")
    .set_index("NAME")[["celltype_subset_short"]]
    .rename(columns={"celltype_subset_short": "celltype_hybrid"})
)

# Every other cell keeps its major cell type
non_immune_subset_df = meta_df.loc[~is_immune, ["celltype_major"]].rename(
    columns={"celltype_major": "celltype_hybrid"}
)

# Stack both groups and attach the hybrid label to the metadata by row label
hybrid_celltype_df = pd.concat([immune_subset_df, non_immune_subset_df], axis=0)
subset_hybrid_meta_df = meta_df.merge(
    hybrid_celltype_df, left_index=True, right_index=True
)

In [None]:
# Column order of the hybrid cell types (immune lineages split into subsets).
# "Myeloid:cDC2/CD1C" used to be listed twice; selecting a label twice duplicates
# the column, so its counts appeared twice in the melted table and were stacked
# twice in the bar chart. The first occurrence is kept so the order is unchanged.
# NOTE(review): the second entry may have been meant to be a different myeloid
# subset — confirm against the lineage table.
immune_hybrid_ctype_order = [
    "Cancer Epithelial",
    "Normal Epithelial",
    "Effector Memory T Cells",
    "T-regs",
    "Tfh",
    "Naive/central Memory T Cells",
    "T-cells:IFNG",
    "Chemokine-expressing T Cells",
    "IFN-I Signature T Cells",
    "T-cells:LAG3",
    "NK cells",
    "NKT cells",
    "Cycling T-cells",
    "B cells Memory",
    "B cells Naive",
    "LAM1*",
    "LAM2*",
    "M2-like Macrophage:EGR1",
    "Myeloid:cDC2/CD1C",
    "Mono:IL1B",
    "Mono:FCGR3A",
    "Myeloid:pDC/IRF7",
    "Myeloid:cDC1/CLEC9A",
    "Cycling Myeloid",
    "Endothelial",
    "CAFs",
    "PVL",
    "Plasmablasts",
]
# Guard against the same label being selected more than once again
assert len(immune_hybrid_ctype_order) == len(
    set(immune_hybrid_ctype_order)
), "duplicate cell type in column order"

# First create a multiindex table of cell counts per type
patient_hybrid_meta_df = (
    subset_hybrid_meta_df[["Patient", "celltype_hybrid", "subtype"]]
    .groupby(["Patient"])["celltype_hybrid"]
    .value_counts(normalize=False)
)

# Unstack to turn the 2nd index into column; missing combinations become 0
patient_hybrid_meta_df = patient_hybrid_meta_df.unstack(level=-1).fillna(0)

# Rearrange column orders
patient_hybrid_meta_df = patient_hybrid_meta_df[immune_hybrid_ctype_order]

# Replace any values <10 by 0 as we dropped these cell types for each patient
patient_hybrid_meta_df = patient_hybrid_meta_df.mask(patient_hybrid_meta_df < 10, 0)

# Merge with subtype DataFrame and fix the subtype category order
patient_hybrid_meta_df = patient_hybrid_meta_df.merge(
    subtype_df, left_index=True, right_index=True
)
patient_hybrid_meta_df["subtype"] = pd.Categorical(
    patient_hybrid_meta_df["subtype"], categories=["HER2+", "TNBC", "ER+"]
)

# Retrieve train/test info
patient_hybrid_meta_df = patient_hybrid_meta_df.merge(
    train_test_df, left_index=True, right_index=True
)

# Sort patients by subtype category, then "train" before "test" within a subtype
patient_hybrid_meta_df = patient_hybrid_meta_df.sort_values(
    ["subtype", "train/test"], ascending=[True, False]
)

# Free patient ids from being the index
patient_hybrid_meta_df = patient_hybrid_meta_df.reset_index().rename(
    columns={"index": "Patient"}
)

In [None]:
# Reshape to long format: every hybrid cell-type column becomes a
# (Cell Type, Cell Counts) row, with the patient descriptors repeated per row
long_format_labels = {"variable": "Cell Type", "value": "Cell Counts"}
pivot_patient_hybrid_meta_df = patient_hybrid_meta_df.melt(
    id_vars=["train/test", "Patient", "subtype"]
).rename(columns=long_format_labels)

In [None]:
# Write the long-format table behind supplementary figure 1e as a TSV file
pivot_patient_hybrid_meta_df.to_csv(
    Path(viz_prefix) / "source_data" / "supp_figure_1e.tsv", sep="\t"
)

In [None]:
# Colour of every cell type, taken from the palette table
celltype_colours = dict(
    zip(colour_pallete_df["all_celltype"], colour_pallete_df["fill"])
)

# Bars of cell counts per patient, coloured by cell type, one facet per split
fig = px.bar(
    pivot_patient_hybrid_meta_df,
    x="Patient",
    y="Cell Counts",
    color="Cell Type",
    color_discrete_map=celltype_colours,
    facet_col="train/test",
    category_orders={"train/test": ["train", "test"]},
)

# Make the lines around the bars very thin
fig.update_traces(marker_line_width=0.01)

# Reverse data order so Cancer is placed at the top of each bar chart
fig.data = tuple(reversed(fig.data))
fig.layout.legend.traceorder = "reversed"

# Tick and axis-line styling shared by the x and y axes
shared_axis_style = dict(
    ticks="outside",
    ticklen=2,
    tickwidth=0.5,
    tickfont_size=7,
    linecolor="black",
    linewidth=0.5,
    title_standoff=5,
)

# Update axes
fig.update_xaxes(
    showticklabels=True,
    title=dict(text="Patient", font_size=9),
    tickangle=45,
    matches=None,
    **shared_axis_style,
)
# showticklabels is deliberately left unset on the y axes: plotly then only
# displays tick labels on the first subplot
fig.update_yaxes(
    showgrid=True,
    gridwidth=0.5,
    gridcolor="lightgray",
    range=[0, 100000],
    dtick=10000,
    title_font_size=8,
    **shared_axis_style,
)

# Resize domain of x axes so training subplot takes up 70% of the plot
fig.update_xaxes(domain=[0.0, 0.69], col=1)
fig.update_xaxes(domain=[0.71, 1], col=2)

# Stack the bars and apply the overall layout (legend is configured but hidden)
fig.update_layout(
    margin=dict(t=10, l=0, r=0, b=0),  # Tight margin
    barmode="stack",
    legend=dict(
        title_font_size=7,
        title="Cell type",
        font_size=10,
        orientation="h",
        yanchor="bottom",
        y=-1.5,
        xanchor="center",
        x=0.5,
    ),
    showlegend=False,  # Switch width to 900 when displaying legend
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(size=8, color="black"),
)

# Facet titles: keep only the text after "=" and centre them over each subplot
fig.for_each_annotation(
    lambda facet_title: facet_title.update(text=facet_title.text.rpartition("=")[2])
)
for facet_title, x_centre in zip(fig.layout.annotations, (0.35, 0.86)):
    facet_title.update(x=x_centre, y=1)

# Save into svg
fig.write_image(
    Path(viz_prefix) / "figures" / "supp_figures" / "supp_fig_1e.svg",
    width=315,
    height=325,
    scale=5,
)