## Plot number of cells per type per patient after SMOTE

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly as pltly
import plotly.express as px
from plotly.subplots import make_subplots

from pathlib import Path

%load_ext blackcellmagic

In [None]:
# Root folder of the visualizations; every data, source-data and figure path
# built with Path(viz_prefix) below is relative to it
viz_prefix = "???/deconvolution_benchmarking/visualizations"

In [None]:
# Patient IDs assigned to the training split
train_p_ids = [
    "CID3586",
    "CID3941",
    "CID3963",
    "CID44041",
    "CID4530N",
    "CID3838",
    "CID3946",
    "CID4040",
    "CID4461",
    "CID44991",
    "CID45171",
    "CID4535",
    "CID3948",
    "CID4398",
    "CID4463",
    "CID4495",
    "CID4513",
    "CID4465",
]
# Patient IDs assigned to the test split
test_p_ids = [
    "CID4067",
    "CID4290A",
    "CID4471",
    "CID3921",
    "CID4066",
    "CID4523",
    "CID44971",
    "CID4515",
]

# One row per patient, labelled with the split it belongs to
train_df = pd.DataFrame({"patient_id": train_p_ids, "train/test": "train"})
test_df = pd.DataFrame({"patient_id": test_p_ids, "train/test": "test"})

# Lookup table indexed by patient id (training patients first, then test patients)
train_test_df = pd.concat([train_df, test_df], axis=0).set_index("patient_id")

In [None]:
# Load the colour palette table (tab-separated); the plots below read its
# "all_celltype" and "fill" columns
colour_pallete_df = pd.read_csv(
    Path(viz_prefix) / "data/Whole_miniatlas_colour_pallete.csv",
    sep="\t",
)

In [None]:
# Load the lineage table (comma-separated); the hybrid-label cells below read its
# "celltype_subset" and "celltype_subset_short" columns
lineages_df = pd.read_csv(
    Path(viz_prefix) / "data/Whole_miniatlas_immune_normal_lineages.csv",
    sep=",",
)

In [None]:
# Load the per-cell metadata, using the first column as the row index
meta_df = pd.read_csv(
    Path(viz_prefix) / "data/Whole_miniatlas_meta.csv",
    index_col=0,
)

# The row labelled "TYPE" only describes the data type of each column; drop it
meta_df = meta_df.drop(index=["TYPE"])

In [None]:
# Build a lookup of patient -> subtype: keep the unique (Patient, subtype) pairs,
# discard the original "NAME" row index and index the result by patient
subtype_df = (
    meta_df.loc[:, ["Patient", "subtype"]]
    .drop_duplicates()
    .reset_index()
    .drop(columns=["NAME"])
    .set_index("Patient")
)

## [Fig] Major cell types

In [None]:
major_meta_df = meta_df[["Patient", "celltype_major", "subtype"]]

# Count cells per (patient, major cell type), spread the cell types into columns
# and give combinations that never occur a count of 0
patient_hybrid_meta_df = (
    major_meta_df.groupby(["Patient"])["celltype_major"]
    .value_counts(normalize=False)
    .unstack(level=-1)
    .fillna(0)
)

# Fix the column order (this is also the order in which the bars are stacked)
patient_hybrid_meta_df = patient_hybrid_meta_df[
    [
        "Cancer Epithelial",
        "Normal Epithelial",
        "T-cells",
        "B-cells",
        "Myeloid",
        "Endothelial",
        "CAFs",
        "PVL",
        "Plasmablasts",
    ]
]

# Replace 1 by 0 as we dropped cell types with only 1 cell
patient_hybrid_meta_df = patient_hybrid_meta_df.replace({1: 0})

# Number of cells per type after SMOTE: every cell type that is still present
# (count > 0) is set to the patient's largest count, absent types stay at 0.
# Done with a vectorised mask instead of mutating the rows yielded by iterrows().
smote_counts_df = patient_hybrid_meta_df.mask(
    patient_hybrid_meta_df > 0, patient_hybrid_meta_df.max(axis=1), axis=0
)

# Attach each patient's subtype and give the subtypes an explicit order
smote_counts_df = smote_counts_df.merge(subtype_df, left_index=True, right_index=True)
smote_counts_df["subtype"] = pd.Categorical(
    smote_counts_df["subtype"], categories=["HER2+", "TNBC", "ER+"]
)

# Attach the train/test label (inner join on patient id)
smote_counts_df = smote_counts_df.merge(
    train_test_df, left_index=True, right_index=True
)

# Sort patients by subtype, with "train" before "test" inside each subtype
smote_counts_df = smote_counts_df.sort_values(
    ["subtype", "train/test"], ascending=[True, False]
)

# Move patient ids out of the index into an explicit "Patient" column
smote_counts_df = smote_counts_df.rename_axis("Patient").reset_index()

In [None]:
# Reshape to long format: one row per (patient, cell type) with its count,
# keeping split, patient and subtype as identifier columns
pivot_smote_counts_df = smote_counts_df.melt(
    id_vars=["train/test", "Patient", "subtype"]
).rename(columns={"variable": "Cell Type", "value": "Cell Counts"})

In [None]:
# Write the long-format table behind supp. figure 1b as a tab-separated file
pivot_smote_counts_df.to_csv(
    Path(viz_prefix) / "source_data/supp_figure_1b.tsv",
    sep="\t",
)

In [None]:
# Fill colour of every cell type, taken from the palette table
celltype_fill_map = dict(
    zip(colour_pallete_df["all_celltype"], colour_pallete_df["fill"])
)

# Bars of cell counts per patient coloured by cell type, one facet per split
fig = px.bar(
    pivot_smote_counts_df,
    x="Patient",
    y="Cell Counts",
    color="Cell Type",
    color_discrete_map=celltype_fill_map,
    facet_col="train/test",
    category_orders={"train/test": ["train", "test"]},
)

# Keep the outline around each bar segment very thin
fig.update_traces(marker_line_width=0.01)

# Reverse trace and legend order so Cancer is placed at the top of each bar
fig.data = tuple(reversed(fig.data))
fig.update_layout(legend_traceorder="reversed")

# Styling shared by the x and y axes
shared_axis_style = dict(
    ticks="outside",
    tickwidth=0.5,
    tickfont_size=7,
    linecolor="black",
    linewidth=0.5,
    title_standoff=5,
)
fig.update_xaxes(
    showticklabels=True,
    title_text="Patient",
    title_font_size=9,
    tickangle=45,
    ticklen=2,
    matches=None,  # un-link the x axes of the two facets
    **shared_axis_style,
)
# showticklabels is deliberately not set for the y axes: left alone, plotly only
# displays tick labels on the first subplot
fig.update_yaxes(
    showgrid=True,
    gridwidth=0.5,
    gridcolor="lightgray",
    ticklen=2,
    range=[0, 40000],
    title_font_size=8,
    **shared_axis_style,
)

# Resize the x domains so the training subplot takes up ~70% of the width
fig.update_xaxes(domain=[0.0, 0.69], col=1)
fig.update_xaxes(domain=[0.71, 1], col=2)

# Stack the bars and strip margins, legend and background
fig.update_layout(
    margin=dict(t=0, l=0, r=0, b=0),  # Tight margin
    barmode="stack",
    legend=dict(font_size=8, title_font_size=9),
    showlegend=False,
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(size=8, color="black"),
)

# Facet titles: keep only the text after "=" and reposition them over each facet
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
for facet_idx, x_pos in enumerate([0.35, 0.86]):
    fig.layout.annotations[facet_idx].update(x=x_pos, y=0.965)

# Save into svg
fig.write_image(
    (Path(viz_prefix) / "figures/supp_figures/supp_fig_1b").with_suffix(".svg"),
    width=315,
    height=200,
    scale=6,
)

## [Fig] Normal Epithelial lineages

In [None]:
# Build a hybrid cell-type label for every cell:
#   - Normal Epithelial cells are labelled with their minor cell type
#   - all other cells keep their major cell type
major_normal_ctypes = ["Normal Epithelial"]
is_normal_lineage = meta_df["celltype_major"].isin(major_normal_ctypes)

normal_subset_df = (
    meta_df.loc[is_normal_lineage, "celltype_minor"].rename("celltype_hybrid").to_frame()
)
non_normal_subset_df = (
    meta_df.loc[~is_normal_lineage, "celltype_major"]
    .rename("celltype_hybrid")
    .to_frame()
)

# Stack both parts and attach the hybrid label to the metadata by row index
hybrid_celltype_df = pd.concat([normal_subset_df, non_normal_subset_df], axis=0)
normal_lineage_hybrid_meta_df = pd.merge(
    meta_df, hybrid_celltype_df, left_index=True, right_index=True
)

In [None]:
# Count cells per (patient, hybrid cell type), spread the cell types into columns
# and give combinations that never occur a count of 0
patient_hybrid_meta_df = (
    normal_lineage_hybrid_meta_df[["Patient", "celltype_hybrid", "subtype"]]
    .groupby(["Patient"])["celltype_hybrid"]
    .value_counts(normalize=False)
    .unstack(level=-1)
    .fillna(0)
)

# Fix the column order (this is also the order in which the bars are stacked)
patient_hybrid_meta_df = patient_hybrid_meta_df[
    [
        "Cancer Epithelial",
        "Luminal Progenitors",
        "Mature Luminal",
        "Myoepithelial",
        "T-cells",
        "B-cells",
        "Myeloid",
        "Endothelial",
        "CAFs",
        "PVL",
        "Plasmablasts",
    ]
]

# Replace any values <10 by 0 as we dropped these cell types for each patient
patient_hybrid_meta_df = patient_hybrid_meta_df.mask(patient_hybrid_meta_df < 10, 0)

# Number of cells per type after SMOTE: every cell type that is still present
# (count > 0) is set to the patient's largest count, absent types stay at 0.
# Done with a vectorised mask instead of mutating the rows yielded by iterrows().
smote_counts_df = patient_hybrid_meta_df.mask(
    patient_hybrid_meta_df > 0, patient_hybrid_meta_df.max(axis=1), axis=0
)

# Attach each patient's subtype and give the subtypes an explicit order
smote_counts_df = smote_counts_df.merge(subtype_df, left_index=True, right_index=True)
smote_counts_df["subtype"] = pd.Categorical(
    smote_counts_df["subtype"], categories=["HER2+", "TNBC", "ER+"]
)

# Attach the train/test label (inner join on patient id)
smote_counts_df = smote_counts_df.merge(
    train_test_df, left_index=True, right_index=True
)

# Sort patients by subtype, with "train" before "test" inside each subtype
smote_counts_df = smote_counts_df.sort_values(
    ["subtype", "train/test"], ascending=[True, False]
)

# Move patient ids out of the index into an explicit "Patient" column
smote_counts_df = smote_counts_df.rename_axis("Patient").reset_index()

In [None]:
# Reshape to long format: one row per (patient, cell type) with its count,
# keeping split, patient and subtype as identifier columns
pivot_smote_counts_df = smote_counts_df.melt(
    id_vars=["train/test", "Patient", "subtype"]
).rename(columns={"variable": "Cell Type", "value": "Cell Counts"})

In [None]:
# Write the long-format table behind supp. figure 1d as a tab-separated file
pivot_smote_counts_df.to_csv(
    Path(viz_prefix) / "source_data/supp_figure_1d.tsv",
    sep="\t",
)

In [None]:
# Fill colour of every cell type, taken from the palette table
celltype_fill_map = dict(
    zip(colour_pallete_df["all_celltype"], colour_pallete_df["fill"])
)

# Bars of cell counts per patient coloured by cell type, one facet per split
fig = px.bar(
    pivot_smote_counts_df,
    x="Patient",
    y="Cell Counts",
    color="Cell Type",
    color_discrete_map=celltype_fill_map,
    facet_col="train/test",
    category_orders={"train/test": ["train", "test"]},
)

# Keep the outline around each bar segment very thin
fig.update_traces(marker_line_width=0.01)

# Reverse trace and legend order so Cancer is placed at the top of each bar
fig.data = tuple(reversed(fig.data))
fig.update_layout(legend_traceorder="reversed")

# Styling shared by the x and y axes
shared_axis_style = dict(
    ticks="outside",
    tickwidth=0.5,
    tickfont_size=7,
    linecolor="black",
    linewidth=0.5,
    title_standoff=5,
)
fig.update_xaxes(
    showticklabels=True,
    title_text="Patient",
    title_font_size=9,
    tickangle=45,
    ticklen=3,
    matches=None,  # un-link the x axes of the two facets
    **shared_axis_style,
)
# showticklabels is deliberately not set for the y axes: left alone, plotly only
# displays tick labels on the first subplot
fig.update_yaxes(
    showgrid=True,
    gridwidth=0.5,
    gridcolor="lightgray",
    ticklen=2,
    range=[0, 50000],
    dtick=10000,
    title_font_size=8,
    **shared_axis_style,
)

# Resize the x domains so the training subplot takes up ~70% of the width
fig.update_xaxes(domain=[0.0, 0.69], col=1)
fig.update_xaxes(domain=[0.71, 1], col=2)

# Stack the bars and strip margins, legend and background
fig.update_layout(
    margin=dict(t=0, l=0, r=0, b=0),  # Tight margin
    barmode="stack",
    legend=dict(font_size=8, title_font_size=9),
    showlegend=False,
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(size=8, color="black"),
)

# Facet titles: keep only the text after "=" and reposition them over each facet
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
for facet_idx, x_pos in enumerate([0.35, 0.86]):
    fig.layout.annotations[facet_idx].update(x=x_pos, y=0.965)

# Save into svg
fig.write_image(
    (Path(viz_prefix) / "figures/supp_figures/supp_fig_1d").with_suffix(".svg"),
    width=315,
    height=250,
    scale=5,
)

## [Fig] Subset immune lineages

In [None]:
# Column (and bar stacking) order for the immune-lineage figure.
# Every name must appear exactly once: this list is used to select DataFrame
# columns, so a repeated name would select that column twice and its counts would
# be stacked twice in the plot. "Myeloid:cDC2/CD1C" used to be listed twice; its
# first position is kept.
subset_immune_ctypes = [
    "Cancer Epithelial",
    "Normal Epithelial",
    "Effector Memory T Cells",
    "T-regs",
    "Tfh",
    "Naive/central Memory T Cells",
    "T-cells:IFNG",
    "Chemokine-expressing T Cells",
    "IFN-I Signature T Cells",
    "T-cells:LAG3",
    "T-cells:GZMK",
    "NK cells",
    "NKT cells",
    "Cycling T-cells",
    "B cells Memory",
    "B cells Naive",
    "LAM1*",
    "LAM2*",
    "M2-like Macrophage:EGR1",
    "Myeloid:cDC2/CD1C",
    "M2-like Macrophage",
    "Mono:IL1B",
    "Mono:FCGR3A",
    "Myeloid:pDC/IRF7",
    "Myeloid:cDC1/CLEC9A",
    "Myeloid:DC/LAMP3",
    "Cycling Myeloid",
    "Endothelial",
    "CAFs",
    "PVL",
    "Plasmablasts",
]

In [None]:
# Build a hybrid cell-type label for every cell:
#   - B-cells, T-cells and Myeloid cells are labelled with their subset cell type
#   - all other cells keep their major cell type
major_immune_ctypes = ["B-cells", "T-cells", "Myeloid"]
is_immune_lineage = meta_df["celltype_major"].isin(major_immune_ctypes)

immune_subset_df = (
    meta_df.loc[is_immune_lineage, "celltype_subset"]
    .rename("celltype_hybrid")
    .to_frame()
)
non_immune_subset_df = (
    meta_df.loc[~is_immune_lineage, "celltype_major"]
    .rename("celltype_hybrid")
    .to_frame()
)

# Stack both parts and attach the hybrid label to the metadata by row index
hybrid_celltype_df = pd.concat([immune_subset_df, non_immune_subset_df], axis=0)
subset_hybrid_meta_df = pd.merge(
    meta_df, hybrid_celltype_df, left_index=True, right_index=True
)

# Lookup from hybrid label to its annotated (short) name, taken from lineages_df.
# The three normal epithelial lineages are collapsed into "Normal Epithelial"
# before duplicates are removed.
annotated_lookup_df = (
    lineages_df[["celltype_subset", "celltype_subset_short"]]
    .rename(
        columns={
            "celltype_subset": "celltype_hybrid",
            "celltype_subset_short": "annotated_celltype_hybrid",
        }
    )
    .replace(
        {
            "Luminal Progenitors": "Normal Epithelial",
            "Mature Luminal": "Normal Epithelial",
            "Myoepithelial": "Normal Epithelial",
        }
    )
    .drop_duplicates()
)

# Inner join: cells whose hybrid label is missing from the lookup are dropped
subset_hybrid_meta_df = subset_hybrid_meta_df.merge(
    annotated_lookup_df,
    on="celltype_hybrid",
    how="inner",
)

In [None]:
# Count cells per (patient, annotated hybrid cell type), spread the cell types
# into columns and give combinations that never occur a count of 0
patient_hybrid_meta_df = (
    subset_hybrid_meta_df[["Patient", "annotated_celltype_hybrid", "subtype"]]
    .groupby(["Patient"])["annotated_celltype_hybrid"]
    .value_counts(normalize=False)
    .unstack(level=-1)
    .fillna(0)
)

# Fix the column order. Repeated names are removed first (keeping the first
# occurrence) so no cell type is selected, and later stacked, twice.
patient_hybrid_meta_df = patient_hybrid_meta_df[
    list(dict.fromkeys(subset_immune_ctypes))
]

# Replace any values <10 by 0 as we dropped these cell types for each patient
patient_hybrid_meta_df = patient_hybrid_meta_df.mask(patient_hybrid_meta_df < 10, 0)

# Number of cells per type after SMOTE: every cell type that is still present
# (count > 0) is set to the patient's largest count, absent types stay at 0.
# Done with a vectorised mask instead of mutating the rows yielded by iterrows().
smote_counts_df = patient_hybrid_meta_df.mask(
    patient_hybrid_meta_df > 0, patient_hybrid_meta_df.max(axis=1), axis=0
)
# Disabled export, kept for reference (the taxonomy section reads this path):
# smote_counts_df.to_csv("figures/data/smote_immune_lineages.csv", sep="\t")

# Attach each patient's subtype and give the subtypes an explicit order
smote_counts_df = smote_counts_df.merge(subtype_df, left_index=True, right_index=True)
smote_counts_df["subtype"] = pd.Categorical(
    smote_counts_df["subtype"], categories=["HER2+", "TNBC", "ER+"]
)

# Attach the train/test label (inner join on patient id)
smote_counts_df = smote_counts_df.merge(
    train_test_df, left_index=True, right_index=True
)

# Sort patients by subtype, with "train" before "test" inside each subtype
smote_counts_df = smote_counts_df.sort_values(
    ["subtype", "train/test"], ascending=[True, False]
)

# Move patient ids out of the index into an explicit "Patient" column
smote_counts_df = smote_counts_df.rename_axis("Patient").reset_index()

In [None]:
# Reshape to long format: one row per (patient, cell type) with its count,
# keeping split, patient and subtype as identifier columns
pivot_smote_counts_df = smote_counts_df.melt(
    id_vars=["train/test", "Patient", "subtype"]
).rename(columns={"variable": "Cell Type", "value": "Cell Counts"})

In [None]:
# Write the long-format table behind supp. figure 1f as a tab-separated file
pivot_smote_counts_df.to_csv(
    Path(viz_prefix) / "source_data/supp_figure_1f.tsv",
    sep="\t",
)

In [None]:
# Fill colour of every cell type, taken from the palette table
celltype_fill_map = dict(
    zip(colour_pallete_df["all_celltype"], colour_pallete_df["fill"])
)

# Bars of cell counts per patient coloured by cell type, one facet per split
fig = px.bar(
    pivot_smote_counts_df,
    x="Patient",
    y="Cell Counts",
    color="Cell Type",
    color_discrete_map=celltype_fill_map,
    facet_col="train/test",
    category_orders={"train/test": ["train", "test"]},
)

# Keep the outline around each bar segment very thin
fig.update_traces(marker_line_width=0.01)

# Reverse trace and legend order so Cancer is placed at the top of each bar
fig.data = tuple(reversed(fig.data))
fig.update_layout(legend_traceorder="reversed")

# Styling shared by the x and y axes
shared_axis_style = dict(
    ticks="outside",
    tickwidth=0.5,
    tickfont_size=7,
    linecolor="black",
    linewidth=0.5,
    title_standoff=5,
)
fig.update_xaxes(
    showticklabels=True,
    title_text="Patient",
    title_font_size=9,
    tickangle=45,
    ticklen=2,
    matches=None,  # un-link the x axes of the two facets
    **shared_axis_style,
)
# showticklabels is deliberately not set for the y axes: left alone, plotly only
# displays tick labels on the first subplot
fig.update_yaxes(
    showgrid=True,
    gridwidth=0.5,
    gridcolor="lightgray",
    ticklen=2,
    range=[0, 100000],
    dtick=10000,
    title_font_size=8,
    **shared_axis_style,
)

# Resize the x domains so the training subplot takes up ~70% of the width
fig.update_xaxes(domain=[0.0, 0.69], col=1)
fig.update_xaxes(domain=[0.71, 1], col=2)

# Stack the bars and strip margins, legend and background
fig.update_layout(
    margin=dict(t=10, l=0, r=0, b=0),  # Tight margin
    barmode="stack",
    legend=dict(font_size=7, title_font_size=10, orientation="h"),
    showlegend=False,
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(size=8, color="black"),
)

# Facet titles: keep only the text after "=" and reposition them over each facet
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
for facet_idx, x_pos in enumerate([0.35, 0.86]):
    fig.layout.annotations[facet_idx].update(x=x_pos, y=1)

# Save into svg
fig.write_image(
    (Path(viz_prefix) / "figures/supp_figures/supp_fig_1f").with_suffix(".svg"),
    width=315,
    height=325,
    scale=5,
)

## Immune and Normal lineages

In [None]:
# Build a hybrid cell-type label for every cell:
#   - B-cells, T-cells, Myeloid and Normal Epithelial cells are labelled with
#     their subset cell type
#   - all other cells keep their major cell type
major_immune_normal_ctypes = ["B-cells", "T-cells", "Myeloid", "Normal Epithelial"]
is_immune_or_normal = meta_df["celltype_major"].isin(major_immune_normal_ctypes)

immune_normal_subset_df = (
    meta_df.loc[is_immune_or_normal, "celltype_subset"]
    .rename("celltype_hybrid")
    .to_frame()
)
non_immune_normal_major_df = (
    meta_df.loc[~is_immune_or_normal, "celltype_major"]
    .rename("celltype_hybrid")
    .to_frame()
)

# Stack both parts and attach the hybrid label to the metadata by row index
hybrid_celltype_df = pd.concat(
    [immune_normal_subset_df, non_immune_normal_major_df], axis=0
)
subset_hybrid_meta_df = pd.merge(
    meta_df, hybrid_celltype_df, left_index=True, right_index=True
)

# Lookup from hybrid label to its annotated (short) name, taken from lineages_df
annotated_lookup_df = lineages_df[["celltype_subset", "celltype_subset_short"]].rename(
    columns={
        "celltype_subset": "celltype_hybrid",
        "celltype_subset_short": "annotated_celltype_hybrid",
    }
)

# Inner join: cells whose hybrid label is missing from the lookup are dropped
subset_hybrid_meta_df = subset_hybrid_meta_df.merge(
    annotated_lookup_df,
    on="celltype_hybrid",
    how="inner",
)

In [None]:
# Count cells per (patient, annotated hybrid cell type), spread the cell types
# into columns and give combinations that never occur a count of 0
patient_hybrid_meta_df = (
    subset_hybrid_meta_df[["Patient", "annotated_celltype_hybrid", "subtype"]]
    .groupby(["Patient"])["annotated_celltype_hybrid"]
    .value_counts(normalize=False)
    .unstack(level=-1)
    .fillna(0)
)

# Fix the column order (this is also the order in which the bars are stacked).
# Every name appears exactly once: "Myeloid:cDC2/CD1C" used to be listed twice,
# which selected that column twice and double-counted it downstream.
patient_hybrid_meta_df = patient_hybrid_meta_df[
    [
        "Cancer Epithelial",
        "Luminal Progenitors",
        "Mature Luminal",
        "Myoepithelial",
        "Effector Memory T Cells",
        "T-regs",
        "Tfh",
        "Naive/central Memory T Cells",
        "T-cells:IFNG",
        "Chemokine-expressing T Cells",
        "IFN-I Signature T Cells",
        "T-cells:LAG3",
        "T-cells:GZMK",
        "NK cells",
        "NKT cells",
        "Cycling T-cells",
        "B cells Memory",
        "B cells Naive",
        "LAM1*",
        "LAM2*",
        "M2-like Macrophage:EGR1",
        "Myeloid:cDC2/CD1C",
        "M2-like Macrophage",
        "Mono:IL1B",
        "Mono:FCGR3A",
        "Myeloid:pDC/IRF7",
        "Myeloid:cDC1/CLEC9A",
        "Myeloid:DC/LAMP3",
        "Cycling Myeloid",
        "Endothelial",
        "CAFs",
        "PVL",
        "Plasmablasts",
    ]
]

# Replace any values <10 by 0 as we dropped these cell types for each patient
patient_hybrid_meta_df = patient_hybrid_meta_df.mask(patient_hybrid_meta_df < 10, 0)

# Number of cells per type after SMOTE: every cell type that is still present
# (count > 0) is set to the patient's largest count, absent types stay at 0.
# Done with a vectorised mask instead of mutating the rows yielded by iterrows().
smote_counts_df = patient_hybrid_meta_df.mask(
    patient_hybrid_meta_df > 0, patient_hybrid_meta_df.max(axis=1), axis=0
)
# Disabled export, kept for reference.
# NOTE(review): variable name and path look copied from the major-type section.
# smote_df.to_csv("figures/data/smote_major.csv", sep="\t")

# Attach each patient's subtype and give the subtypes an explicit order
smote_counts_df = smote_counts_df.merge(subtype_df, left_index=True, right_index=True)
smote_counts_df["subtype"] = pd.Categorical(
    smote_counts_df["subtype"], categories=["HER2+", "TNBC", "ER+"]
)

# Attach the train/test label (inner join on patient id)
smote_counts_df = smote_counts_df.merge(
    train_test_df, left_index=True, right_index=True
)

# Sort patients by subtype, with "train" before "test" inside each subtype
smote_counts_df = smote_counts_df.sort_values(
    ["subtype", "train/test"], ascending=[True, False]
)

# Move patient ids out of the index into an explicit "Patient" column
smote_counts_df = smote_counts_df.rename_axis("Patient").reset_index()

In [None]:
# Reshape the immune + normal lineage counts to long format for plotting.
# This section has no melt cell of its own, so plotting `pivot_smote_counts_df`
# here would silently reuse the long table of the previous (immune-only) figure.
immune_normal_pivot_df = smote_counts_df.melt(
    id_vars=["train/test", "Patient", "subtype"],
    var_name="Cell Type",
    value_name="Cell Counts",
)

# Bars of cell counts per patient, coloured by cell type using the palette table
fig = px.bar(
    immune_normal_pivot_df,
    x="Patient",
    y="Cell Counts",
    color="Cell Type",
    color_discrete_map={
        row["all_celltype"]: row["fill"] for index, row in colour_pallete_df.iterrows()
    },
)

# Reverse data order so Cancer is placed at the top of each bar chart
fig.data = fig.data[::-1]
fig.layout.legend.traceorder = "reversed"

# Stack the bars and style axes, legend and background
fig.update_layout(
    margin=dict(t=0, l=0, r=0, b=0),  # Tight margin
    barmode="stack",
    xaxis=dict(
        showticklabels=True,
        title=dict(text="Patient", font_size=8),
        tickangle=45,
        ticks="outside",
        linecolor="black",
    ),
    legend=dict(font_size=8, title_font_size=8),
    showlegend=False,
    yaxis=dict(
        showgrid=True,
        gridcolor="lightgray",
        showticklabels=True,
        title=dict(text="Cell count", font_size=8),
        range=[0, 95000],
        linecolor="black",
    ),
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(size=8),
)

# Save as png (path is relative to the working directory, not to viz_prefix)
fig.write_image(
    "figures/cell_counts_by_patient/smote_normal_immune_lineages_by_patients.png",
    width=650,
    height=250,
    scale=6,
)

## Plot taxonomy of major lineages

In [None]:
# Load the melted lineage table (tab-separated, first column as index); the
# taxonomy cells below merge it on "cell_type" after dropping its "counts" column
lineages_melted_df = pd.read_csv(
    "params/Whole_miniatlas_lineages_melted.csv",
    sep="\t",
    index_col=0,
)

In [None]:
def plot_taxonomy(
    taxonomy_df: pd.DataFrame,
    output_path: str,
    plot_html: bool = False,
    plot_png: bool = True,
) -> None:
    """Draw a sunburst chart of a cell-type taxonomy and save it to disk.

    Args:
        taxonomy_df: Table with one row per sunburst sector. The columns
            "cell_type" (labels), "parent" (parents), "lineage" (hover text),
            "counts" (values) and "fill" (sector colours) are read.
        output_path: Output location; its suffix is replaced by ".html" and/or
            ".png" depending on the flags below.
        plot_html: When True, write an HTML file (written with auto_open=True).
        plot_png: When True, write a PNG image at scale 6.
    """
    # Sunburst trace: sectors keep the row order of taxonomy_df (sort=False)
    sunburst = go.Sunburst(
        labels=taxonomy_df["cell_type"],
        parents=taxonomy_df["parent"],
        hovertext=taxonomy_df["lineage"],
        values=taxonomy_df["counts"],
        branchvalues="total",
        marker=dict(
            colors=taxonomy_df["fill"],
            line=dict(color="black", width=1),
        ),
        root=dict(color="white"),
        leaf=dict(opacity=1),
        sort=False,
    )
    fig = go.Figure(sunburst)

    # Tight margins, centred title slot, transparent background, large font
    fig.update_layout(
        margin=dict(t=0, l=0, r=0, b=0),
        title=dict(y=0.99, x=0.5, xanchor="center", yanchor="top"),
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(size=20),
    )

    # Write the requested output formats next to each other
    target = Path(output_path)
    if plot_html:
        fig.write_html(target.with_suffix(".html"), auto_open=True)
    if plot_png:
        fig.write_image(target.with_suffix(".png"), scale=6)

#### Major

In [None]:
# Major cell types: load the per-patient SMOTE counts (tab-separated, first
# column as index)
major_smote_counts_df = pd.read_csv(
    "figures/data/smote_major.csv", sep="\t", index_col=0
)

# Order in which the major cell types appear in the sunburst
major_ctype_order = [
    "T-cells",
    "Myeloid",
    "B-cells",
    "Endothelial",
    "CAFs",
    "PVL",
    "Plasmablasts",
    "Cancer Epithelial",
    "Normal Epithelial",
]

# Sum the counts over all rows, giving one (cell_type, counts) row per cell type
major_smote_counts_df = (
    major_smote_counts_df[major_ctype_order]
    .sum(axis=0)
    .rename_axis("cell_type")
    .reset_index(name="counts")
)

# Attach the palette columns; cell types missing from the palette are dropped
major_smote_counts_df = pd.merge(
    major_smote_counts_df,
    colour_pallete_df.rename(columns={"all_celltype": "cell_type"}),
    on="cell_type",
    how="inner",
)

# All major cell types hang off one dummy parent labelled with a single space
major_smote_counts_df["parent"] = " "

In [None]:
# Sunburst of the major cell types (PNG only, the default of plot_taxonomy)
plot_taxonomy(
    taxonomy_df=major_smote_counts_df,
    output_path="figures/smoted_major",
)

#### Normal lineages

In [None]:
# Normal lineages: load the per-patient SMOTE counts (tab-separated, first
# column as index)
normal_lineage_smote_counts_df = pd.read_csv(
    "figures/cell_counts_by_patient/smote_normal_lineages.csv", index_col=0, sep="\t"
)

# Sum the counts over all rows, giving one (cell_type, counts) row per cell type
normal_lineage_smote_counts_df = (
    normal_lineage_smote_counts_df.sum(axis=0)
    .to_frame()
    .reset_index()
    .rename(columns={"index": "cell_type", 0: "counts"})
)

# Attach taxonomy info (parent, lineage, colours) from the melted lineage table
normal_lineage_smote_counts_df = normal_lineage_smote_counts_df.merge(
    lineages_melted_df.drop(["counts"], axis=1),
    on="cell_type",
    how="inner",
)

# Add one row per parent of the SMOTE-d minor cell types, holding the summed
# counts of its children. DataFrame.append was removed in pandas 2.0, so the row
# is added with pd.concat; values are given in the table's column order.
for parent in ["Normal Epithelial"]:
    counts = normal_lineage_smote_counts_df.loc[
        normal_lineage_smote_counts_df["parent"] == parent, "counts"
    ].sum()

    parent_row_df = pd.DataFrame(
        [[parent, counts, None, None, None, None]],
        columns=normal_lineage_smote_counts_df.columns,
    )
    normal_lineage_smote_counts_df = pd.concat(
        [normal_lineage_smote_counts_df, parent_row_df], ignore_index=True
    )

# Redo the lineage merge so the newly added parent rows also get their taxonomy
# and colour columns
normal_lineage_smote_counts_df = normal_lineage_smote_counts_df.drop(
    ["fill", "line", "lineage", "parent"], axis=1
).merge(
    lineages_melted_df.drop(["counts"], axis=1),
    on="cell_type",
    how="inner",
)

# Re-arrange major cell types in a specific order
major_df = (
    normal_lineage_smote_counts_df[normal_lineage_smote_counts_df["lineage"] == "major"]
    .set_index(["cell_type"])
    .reindex(
        [
            "T-cells",
            "Myeloid",
            "B-cells",
            "Endothelial",
            "CAFs",
            "PVL",
            "Plasmablasts",
            "Cancer Epithelial",
            "Normal Epithelial",
        ]
    )
).reset_index()

# Major rows first (in the order above), followed by all remaining rows
non_major_df = normal_lineage_smote_counts_df[
    normal_lineage_smote_counts_df["lineage"] != "major"
]
normal_lineage_smote_counts_df = pd.concat([major_df, non_major_df], axis=0)

In [None]:
# Sunburst of the normal epithelial lineages (PNG only, the default)
plot_taxonomy(
    output_path="figures/smoted_normal_lineages",
    taxonomy_df=normal_lineage_smote_counts_df,
)

#### Immune lineages

In [None]:
def _append_row(frame: pd.DataFrame, values: list) -> pd.DataFrame:
    """Return *frame* with one extra row holding *values* (given in column order).

    Replacement for DataFrame.append, which was removed in pandas 2.0.
    """
    new_row_df = pd.DataFrame([values], columns=frame.columns)
    return pd.concat([frame, new_row_df], ignore_index=True)


# Immune lineages: load the per-patient SMOTE counts (tab-separated, first
# column as index)
immune_lineage_smote_counts_df = pd.read_csv(
    "figures/data/smote_immune_lineages.csv", index_col=0, sep="\t"
)

# Sum the counts over all rows, giving one (cell_type, counts) row per cell type
immune_lineage_smote_counts_df = (
    immune_lineage_smote_counts_df.sum(axis=0)
    .to_frame()
    .reset_index()
    .rename(columns={"index": "cell_type", 0: "counts"})
)

# Attach taxonomy info (parent, lineage, colours) from the melted lineage table
immune_lineage_smote_counts_df = immune_lineage_smote_counts_df.merge(
    lineages_melted_df.drop(["counts"], axis=1),
    on="cell_type",
    how="inner",
)

# Add one row per minor cell type that is the parent of SMOTE-d subset cell
# types, holding the summed counts of its children. Each new row names its own
# major parent so the next loop can roll the counts up one more level.
minor_to_major_parent = {
    "DCs": "Myeloid",
    "Macrophage": "Myeloid",
    "Monocyte": "Myeloid",
    "T cells CD4+": "T-cells",
    "T cells CD8+": "T-cells",
}
for parent, major_parent in minor_to_major_parent.items():
    counts = immune_lineage_smote_counts_df.loc[
        immune_lineage_smote_counts_df["parent"] == parent, "counts"
    ].sum()
    immune_lineage_smote_counts_df = _append_row(
        immune_lineage_smote_counts_df,
        [parent, counts, major_parent, "minor", None, None],
    )

# Add one row per major cell type that is the parent of SMOTE-d minor cell types
# (including the rows added just above), holding the summed counts
for parent in ["T-cells", "B-cells", "Myeloid"]:
    counts = immune_lineage_smote_counts_df.loc[
        immune_lineage_smote_counts_df["parent"] == parent, "counts"
    ].sum()
    immune_lineage_smote_counts_df = _append_row(
        immune_lineage_smote_counts_df,
        [parent, counts, None, None, None, None],
    )

# Redo the lineage merge so the newly added parent rows also get their taxonomy
# and colour columns
immune_lineage_smote_counts_df = immune_lineage_smote_counts_df.drop(
    ["fill", "line", "lineage", "parent"], axis=1
).merge(
    lineages_melted_df.drop(["counts"], axis=1),
    on="cell_type",
    how="inner",
)

# Re-arrange major cell types in a specific order
major_df = (
    immune_lineage_smote_counts_df[immune_lineage_smote_counts_df["lineage"] == "major"]
    .set_index(["cell_type"])
    .reindex(
        [
            "T-cells",
            "Myeloid",
            "B-cells",
            "Endothelial",
            "CAFs",
            "PVL",
            "Plasmablasts",
            "Cancer Epithelial",
            "Normal Epithelial",
        ]
    )
).reset_index()

# Major rows first (in the order above), followed by all remaining rows
non_major_df = immune_lineage_smote_counts_df[
    immune_lineage_smote_counts_df["lineage"] != "major"
]
immune_lineage_smote_counts_df = pd.concat([major_df, non_major_df], axis=0)

# Sort by lineage so major appears first, then minor, then subset
immune_lineage_smote_counts_df = immune_lineage_smote_counts_df.sort_values(
    ["lineage", "parent"]
)

In [None]:
# Sunburst of the immune lineages (PNG only, the default)
plot_taxonomy(
    output_path="figures/smoted_immune_lineages",
    taxonomy_df=immune_lineage_smote_counts_df,
)