In [None]:
import os
import pandas as pd
import numpy as np
import anndata
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["font.family"] = "Arial"
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
from scipy.stats import zscore

from pyInfinityFlow.InfinityFlow_Utilities import marker_finder
from pyInfinityFlow.InfinityFlow_Utilities import pearson_corr_df_to_df
from pyInfinityFlow.Plotting_Utilities import assign_rainbow_colors_to_groups
from pyInfinityFlow.Plotting_Utilities import blue_black_yellow_cmap


# Work from the analysis directory: the relative "output/..." paths used when
# saving figures and tables later in this notebook resolve against it.
# NOTE(review): absolute path on an external volume — adjust before running
# on another machine.
os.chdir("/Volumes/Kyle_T7_2/grimes_lab/analysis/"\
    "2024_03_02_redo_cluster_order_for_plots/")  

In [None]:
# Prepare the CITE-seq data
path_to_cite_data = "/Volumes/Kyle_T7_2/grimes_lab/data/"\
    "2021_11_mouse_optimized_cite_seq/processed_files/"

cite_adata = anndata.read_h5ad(os.path.join(
    path_to_cite_data,
    "cite_seq_adata_rna_combined_soupx_0_15_with_order_and_lvl4_annotations.h5ad"))

# Normalise every cell to counts per ten thousand (CPTT) and store
# log2(CPTT + 1) back into X.
# Densify explicitly first so the arithmetic is ordinary NumPy broadcasting
# and does not depend on how sparse-by-dense division/addition behaves.
rna_counts = cite_adata.X
rna_counts = rna_counts.toarray() if hasattr(rna_counts, "toarray") \
    else np.asarray(rna_counts)
counts_per_cell = rna_counts.sum(axis=1, keepdims=True)  # one total per cell (row)
cite_adata.X = np.log2((10000 * (rna_counts / counts_per_cell)) + 1)

# Drop the temporaries so the dense raw matrix is not kept alive in the kernel
del rna_counts, counts_per_cell


In [None]:
### Run markerFinder
## Gene selection
# Keep genes that are above 0 log2 cptt in more than MIN_CELLS_EXPRESSING cells
# (strictly greater than, matching the comparison below)
MIN_CELLS_EXPRESSING = 25

# Work on a dense array whether X is stored sparse or dense
expr_matrix = cite_adata.X
expr_matrix = expr_matrix.toarray() if hasattr(expr_matrix, "toarray") \
    else np.asarray(expr_matrix)

sel_genes = pd.Series(
    (expr_matrix > 0).sum(axis=0),
    index=cite_adata.var.index.values)

sel_genes = sel_genes[sel_genes > MIN_CELLS_EXPRESSING]

# Exclude mitochondrial genes
sel_genes = sel_genes.loc[
    ~sel_genes.index.str.startswith("mt-")]

## markerFinder
# Cells x selected-genes expression table, with one lvl4 label per cell
sel_expr = cite_adata[:, sel_genes.index.values].X
sel_expr = sel_expr.toarray() if hasattr(sel_expr, "toarray") \
    else np.asarray(sel_expr)

r_df, p_df = marker_finder(
    pd.DataFrame(
        sel_expr,
        index=cite_adata.obs.index.values,
        columns=sel_genes.index.values),
    cite_adata.obs['lvl4'].values)

# Release the dense helper copies; only sel_genes, r_df and p_df are used later
del expr_matrix, sel_expr

In [None]:
# Select top 25 (allow repeats) markers for each cluster
# "Allow repeats": a gene may be kept for more than one cluster.
N_TOP_MARKERS = 25

# Long format: one row per (cluster, gene) pair with its markerFinder r value
marker_df = r_df.unstack().reset_index()
marker_df.columns = ["cluster", "gene", "r"]
marker_df = marker_df.sort_values(by="r", ascending=False)

saved_markers = []
for tmp_cluster in r_df.columns.values:
    seg_marker_df = marker_df.loc[marker_df["cluster"] == tmp_cluster]
    # .copy() so the new column is written to an independent frame rather
    # than a slice of marker_df (avoids SettingWithCopyWarning)
    seg_marker_df = seg_marker_df.head(N_TOP_MARKERS).copy()
    # Size the rank from the rows actually kept, so fewer than
    # N_TOP_MARKERS candidates does not raise a length mismatch
    seg_marker_df["rank"] = np.arange(1, len(seg_marker_df) + 1)
    saved_markers.append(seg_marker_df)

saved_markers = pd.concat(saved_markers)

In [None]:
# Load the cluster annotation table and derive two lookups keyed by lvl4 label:
# plotting order (rows without an Order are dropped) and colour.
path_cite_data = "/Volumes/Kyle_T7_2/grimes_lab/data/"\
    "2021_11_mouse_optimized_cite_seq/"
cluster_anno = pd.read_csv(
    os.path.join(
        path_cite_data,
        "processed_files/cluster_anno_r6_r7_lvl4_and_order.csv"))

lvl4_labels = cluster_anno["lvl4"].values

map_lvl4_to_order = pd.Series(cluster_anno["Order"].values, index=lvl4_labels)
map_lvl4_to_order = map_lvl4_to_order.dropna().astype(int)

map_lvl4_to_color = pd.Series(cluster_anno["Color"].values, index=lvl4_labels)

map_lvl4_to_order

In [None]:
# Attach each cluster's plotting order, then sort markers by cluster order and,
# within a cluster, by descending r.
# Series.map is the lookup operation here; .replace leaves the resulting dtype
# to its value-replacement downcasting rules.
saved_markers["cluster_order"] = saved_markers["cluster"].map(
    map_lvl4_to_order.to_dict()).astype(int).values
saved_markers = saved_markers.sort_values(by=["cluster_order", "r"],
    ascending=[True, False])
saved_markers

In [None]:
# Calculate centroids: mean expression of every marker gene within each
# lvl4 cluster.
unique_marker_genes = saved_markers["gene"].unique()

# Densify if the subset is stored sparse; otherwise take it as an array
marker_expr = cite_adata[:, unique_marker_genes].X
marker_expr = marker_expr.toarray() if hasattr(marker_expr, "toarray") \
    else np.asarray(marker_expr)

cite_centroids = pd.DataFrame(
    marker_expr,
    index=cite_adata.obs.index.values,
    columns=unique_marker_genes)

cite_centroids["CLUSTER"] = cite_adata.obs["lvl4"].values
# Pass the aggregation by name: handing pandas the np.mean callable triggers
# a FutureWarning in recent pandas releases.
cite_centroids = pd.pivot_table(cite_centroids, index="CLUSTER",
    aggfunc="mean")
cite_centroids

In [None]:
# Correlate individual cells to centroids
# Both inputs are passed as genes x observations, with genes in the same
# order (unique_marker_genes).
cell_expr = cite_adata[:, unique_marker_genes].X
# Densify if the subset is stored sparse; otherwise take it as an array
cell_expr = cell_expr.toarray() if hasattr(cell_expr, "toarray") \
    else np.asarray(cell_expr)

c2c_rdf = pearson_corr_df_to_df(
    pd.DataFrame(
        cell_expr,
        index=cite_adata.obs.index.values,
        columns=unique_marker_genes).T,
    cite_centroids[unique_marker_genes].T)

In [None]:
# Sample top 25 correlating cells for each cluster
N_TOP_CELLS = 25

sampled_cells = []
for tmp_cluster in map_lvl4_to_order.index.values:
    seg_cells = cite_adata.obs.loc[
        cite_adata.obs["lvl4"] == tmp_cluster]
    # Rank this cluster's own cells by correlation to the cluster centroid
    tmp_rank_cells = c2c_rdf.loc[
        seg_cells.index.values,
        tmp_cluster].sort_values(ascending=False)
    tmp_sel_cells = pd.DataFrame(tmp_rank_cells.head(N_TOP_CELLS))
    tmp_sel_cells.columns = ['r']
    # Size the rank from the rows actually kept: clusters with fewer than
    # N_TOP_CELLS cells would otherwise raise a length-mismatch error
    tmp_sel_cells["rank"] = np.arange(1, len(tmp_sel_cells) + 1)
    tmp_sel_cells["cluster"] = tmp_cluster
    sampled_cells.append(tmp_sel_cells)

sampled_cells = pd.concat(sampled_cells)
sampled_cells["sample"] = cite_adata.obs.loc[
    sampled_cells.index.values,
    "sample"].values

# Sample identifiers -> tick-mark row label
map_sample_to_tick_name = {
    "AS_CITE_CD127": "CD127+",
    "AS_CITE_HSC": "HSC/MPP",
    "AS_CITE_Multilin1": "MultiLin",
    "AS_CITE_Kit": "CD117+",
    "AS_3CITE_Kit": "CD117+",
    "AS_CITE_Multilin2": "MultiLin",
    "AS_3CITE_Kitx2": "CD117+"}
# Row order of the tick-mark panel
map_tick_name_to_order = pd.Series({
    "HSC/MPP": 1,
    "MultiLin": 2,
    "CD127+": 3,
    "CD117+": 4}).sort_values()
sampled_cells["tick_label"] = sampled_cells["sample"].replace(
    map_sample_to_tick_name).values

# One-hot matrix (tick label x cell). reindex keeps all four label rows even
# if none of the sampled cells carries a given label (plain .loc would raise
# KeyError); dtype=int keeps the indicator numeric across pandas versions.
plot_tick_marks = pd.get_dummies(sampled_cells["tick_label"], dtype=int).reindex(
    columns=map_tick_name_to_order.index.values,
    fill_value=0).loc[
    sampled_cells.index.values].T
sampled_cells

In [None]:
# Make the RNA data-to-plot dataframe (marker genes x sampled cells)
rna_values = cite_adata[
    sampled_cells.index.values,
    saved_markers["gene"].values].X
# Densify if the subset is stored sparse; otherwise take it as an array
rna_values = rna_values.toarray() if hasattr(rna_values, "toarray") \
    else np.asarray(rna_values)

rna_to_plot = pd.DataFrame(
    rna_values,
    index=sampled_cells.index.values,
    columns=saved_markers["gene"].values).T

# Z-score each gene (row) across the sampled cells. The frame is rebuilt from
# the raw array so the result is a labelled DataFrame regardless of what
# container zscore returns; later cells call .fillna() and .to_csv() on it.
rna_to_plot = pd.DataFrame(
    zscore(rna_to_plot.to_numpy(), axis=1),
    index=rna_to_plot.index,
    columns=rna_to_plot.columns)
rna_to_plot

In [None]:
# One-row frame (columns = sampled cells) holding each cell's cluster order
# code; it feeds the colour strip at the top of the combined heatmap.
# Series.map performs the label -> integer lookup directly, instead of leaving
# the result dtype to .replace's value-replacement rules.
cluster_colors = pd.DataFrame(
    sampled_cells["cluster"].map(map_lvl4_to_order.to_dict())).T
cluster_colors

In [None]:
# Preview the non-missing cluster colours, in annotation-file row order
map_lvl4_to_color[map_lvl4_to_color.notna()].values

In [None]:
# Load the ADT table; its first column becomes the row index
adt_values_path = os.path.join(
    path_cite_data,
    "processed_files/totalvi_denoised_adt_values_optimized_cite.txt")
adt_data = pd.read_table(adt_values_path, index_col=0)
def fix_adt_cell_names(tmp_cell: str) -> str:
    """Rewrite an ADT cell name as ``<sample>.<barcode>``.

    Everything from the last ``-`` onwards is discarded; the remainder is
    split at its last ``_`` into a sample part (left) and a barcode part
    (right), which are re-joined with a ``.``.

    Parameters
    ----------
    tmp_cell : str
        Cell name as found in the ADT table index.

    Returns
    -------
    str
        The reformatted cell name.
    """
    # Text before the final "-" (empty when the name contains no "-")
    stem = tmp_cell.rpartition("-")[0]
    # Split once at the final "_": left part is the sample, right the barcode
    sample_part, _, barcode_part = stem.rpartition("_")
    return f"{sample_part}.{barcode_part}"

# Rename the ADT rows with fix_adt_cell_names
adt_data.index = list(map(fix_adt_cell_names, adt_data.index.values))
# Keep every ADT column except those whose name ends in "Ctrl"
adt_features = [
    column_name
    for column_name in adt_data.columns.values
    if not column_name.endswith("Ctrl")]

In [None]:
## markerFinder
# Run markerFinder on the ADT features of the sampled cells, grouped by the
# cluster each cell was sampled for.
adt_sampled = adt_data.loc[sampled_cells.index.values, adt_features]
adt_r_df, adt_p_df = marker_finder(adt_sampled, sampled_cells["cluster"])

In [None]:
# Assign every ADT marker to the single cluster where its r is highest, then
# order markers by cluster plotting order and, within a cluster, descending r.
adt_markers = adt_r_df.unstack().reset_index()
adt_markers.columns = ["cluster", "marker", "r"]

# Clusters that are not allowed to claim ADT markers
excluded_clusters = ["HSC-Mac-2", "HSC-Mac-1", "Mac-Nr1h3"]
adt_markers = adt_markers.loc[
    ~adt_markers["cluster"].isin(excluded_clusters).values]

adt_markers = adt_markers.sort_values(by="r", ascending=False)
# After the descending sort, the first occurrence of a marker is its best
# cluster; .copy() so the column added below is written to an independent
# frame rather than a slice.
adt_markers = adt_markers.loc[~adt_markers["marker"].duplicated()].copy()

# Label -> order lookup via Series.map, the same way the RNA markers get
# their cluster order.
adt_markers["cluster_order"] = adt_markers["cluster"].map(
    map_lvl4_to_order.to_dict()).values
adt_markers = adt_markers.sort_values(by=["cluster_order", "r"],
    ascending=[True, False])
adt_markers

In [None]:
# ADT values for the sampled cells, arranged as markers (rows) x cells (columns)
adt_selected = adt_data.loc[
    sampled_cells.index.values,
    adt_markers["marker"].values].T

# Z-score each marker (row) across the sampled cells. The frame is rebuilt
# from the raw array so the result is a labelled DataFrame regardless of what
# container zscore returns; later cells call .fillna() and .to_csv() on it.
adt_to_plot = pd.DataFrame(
    zscore(adt_selected.to_numpy(), axis=1),
    index=adt_selected.index,
    columns=adt_selected.columns)

In [None]:
# Combined figure, top to bottom: cluster colour strip, RNA marker heatmap,
# sample tick marks, ADT marker heatmap.
# savefig does not create missing folders, so make sure the target exists.
os.makedirs("output", exist_ok=True)

plt.close("all")
fig = plt.figure(constrained_layout=True, figsize=(19.2, 12))
# 140-row grid; the row slices below set the relative height of each panel
grid = fig.add_gridspec(140, 1)
ax1 = fig.add_subplot(grid[:3, 0])
ax2 = fig.add_subplot(grid[3:90, 0])
ax3 = fig.add_subplot(grid[90:104, 0])
ax4 = fig.add_subplot(grid[104:, 0])

# Cluster colour strip
# NOTE(review): colours are listed in annotation-file row order while the
# strip values are the Order codes; the two only line up if the annotation
# rows are sorted by Order — confirm against the CSV.
tmp_cmap = matplotlib.colors.ListedColormap(map_lvl4_to_color.dropna().values)
heat1 = sns.heatmap(
    cluster_colors,
    cmap=tmp_cmap,
    xticklabels=False,
    yticklabels=False,
    cbar=False,
    ax=ax1)

# RNA data to plot (missing z-scores drawn as 0)
heat2 = sns.heatmap(
    rna_to_plot.fillna(0),
    vmin=-3,
    vmax=3,
    cmap=blue_black_yellow_cmap,
    xticklabels=False,
    yticklabels=False,
    cbar=False,
    rasterized=True,
    ax=ax2)

# Sample tick mark plot
heat3 = sns.heatmap(plot_tick_marks,
    cmap="Greys",
    yticklabels=True,
    xticklabels=False,
    cbar=False,
    rasterized=True,
    ax=ax3)

icefire_cmap = sns.color_palette("icefire", as_cmap=True)
# ADT data to plot (missing z-scores drawn as 0)
heat4 = sns.heatmap(
    adt_to_plot.fillna(0),
    vmin=-3,
    vmax=3,
    cmap=icefire_cmap,
    xticklabels=False,
    yticklabels=False,
    cbar=False,
    rasterized=True,
    ax=ax4)

fig.savefig("output/rna_tick_adt_heatmap_combo_plot.pdf", dpi=600)

In [None]:
# Export the plotted matrices (missing values written as 0, row labels under
# a "UID" header) ...
for plotted_values, values_csv in (
        (adt_to_plot, "output/rna_tick_adt_heatmap_combo_plot_adt_values.csv"),
        (rna_to_plot, "output/rna_tick_adt_heatmap_combo_plot_rna_values.csv")):
    plotted_values.fillna(0).to_csv(values_csv, index=True, index_label="UID")

# ... and the ordered marker tables (row index not written)
for marker_table, markers_csv in (
        (adt_markers, "output/rna_tick_adt_heatmap_combo_plot_adt_markers_ordered.csv"),
        (saved_markers, "output/rna_tick_adt_heatmap_combo_plot_rna_markers_ordered.csv")):
    marker_table.to_csv(markers_csv, index=False)

In [None]:
# Stand-alone horizontal colour bar for the RNA heatmap: same colormap
# (blue_black_yellow_cmap, imported in the first cell) and the same -3..3
# limits as the heatmap panel.
# savefig does not create missing folders, so make sure the target exists.
os.makedirs("output", exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 1), layout='constrained')

norm = matplotlib.colors.Normalize(vmin=-3, vmax=3)

fig.colorbar(matplotlib.cm.ScalarMappable(norm=norm, cmap=blue_black_yellow_cmap),
             cax=ax, orientation='horizontal', label='Gene Expression')

fig.savefig("output/rna_heatmap_colorbar.pdf")

In [None]:
# fig, ax = plt.subplots(figsize=(12, 1), layout='constrained')

# plasma_cmap = matplotlib.cm.plasma
# norm = matplotlib.colors.Normalize(vmin=-3, vmax=3)

# fig.colorbar(matplotlib.cm.ScalarMappable(norm=norm, cmap=plasma_cmap),
#              cax=ax, orientation='horizontal', label='ADT Expression')

# plt.savefig("output/adt_heatmap_colorbar.pdf")

In [None]:
# Stand-alone horizontal colour bar for the ADT heatmap: icefire colormap
# with the same -3..3 limits as the heatmap panel.
fig, ax = plt.subplots(figsize=(12, 1), layout='constrained')

icefire_cmap = sns.color_palette("icefire", as_cmap=True)
norm = matplotlib.colors.Normalize(vmin=-3, vmax=3)
adt_scale = matplotlib.cm.ScalarMappable(norm=norm, cmap=icefire_cmap)

fig.colorbar(
    adt_scale,
    cax=ax,
    orientation='horizontal',
    label='ADT Expression')

fig.savefig("output/adt_heatmap_colorbar.pdf")