In [1]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
import anndata
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "Arial"
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
import gc

from pyInfinityFlow.InfinityFlow_Utilities import pearson_corr_df_to_df
from pyInfinityFlow.InfinityFlow_Utilities import marker_finder

os.chdir("/media/kyle_storage/kyle_ferchen/grimes_lab_main/analysis/2023_06_12_tea_seq_atac_processing/")

path_cite_data = "/media/kyle_storage/kyle_ferchen/grimes_lab_main/data/"\
    "2021_11_mouse_optimized_cite_seq/processed_files/"

In [80]:
new_cluster_order = pd.read_csv("/media/kyle_storage/kyle_ferchen/"\
    "grimes_lab_main/data/2021_11_mouse_optimized_cite_seq/processed_files/"\
    "r7_cluster_order_kf_2024_01.csv")
map_r7_to_lv3 = pd.Series(\
    new_cluster_order["Level 3"].values,
    index=new_cluster_order["Cluster"].values)
map_lvl3_to_order = pd.Series(\
    new_cluster_order["Order"].values,
    index=new_cluster_order["Level 3"].values)
map_r7_to_replace_dash = pd.Series(\
    new_cluster_order["Cluster"].values,
    index=[i.replace("-", "_") for i in new_cluster_order["Cluster"].values])
map_new_cluster_to_order = pd.Series(\
    new_cluster_order["Order"].values,
    index=new_cluster_order["Level 3"].values)
new_cluster_order

Unnamed: 0,Cluster,Group,Old_order,CITE-to-TEA,Order,Level 1,Level 2,Level 3,Level Kairavee
0,LT-HSC_Mllt3,HSCP,1,LT-HSC_Mllt3,1.0,HSPC,HSC,qHSC,HSC
1,ST-HSC,HSCP,2,ST-HSC,2.0,HSPC,HSC,aHSC,HSC
2,MPP4-Hlf,HSCP,3,MPP4-Hlf,3.0,HSPC,MPP4,HSC-Ly,HSPC
3,MPP5-Egr1,HSCP,8,MPP5-Egr1,4.0,HSPC,MPP5,MPP5-IER,MPP5-IER
4,MPP5-Flt3,HSCP,7,MPP5-Flt3,5.0,HSPC,MPP5,MPP5 Ly-I,HSPC
...,...,...,...,...,...,...,...,...,...
83,ILC2,ILC,84,ILC2,84.0,ILC,ILC,ILC2,ILC
84,Bcl11b+_preETP_Cd3d,ILC,85,Bcl11b+_preETP_Cd3d,85.0,T cell,preETP,pre-ILC1-ILC3-NKP,preETP
85,Bcl11b+_preETP_Tdrd5,ILC,88,Bcl11b+_preETP_Tdrd5,86.0,T cell,preETP,pre-NKP,preETP
86,ILC1-ILC3-NKP,ILC,87,ILC1-ILC3-NKP,87.0,ILC,ILC,ILC1-ILC3-NKP,ILC


In [3]:
# Seqlet annotation
seqlets = pd.read_feather("output/chrombpnet/modisco_merged_results/fold_0/"\
    "redo_extract_seqlets/all_seqlit_hits_above_modisco_min_anno.fea")

# Seqlets
dp_scores = pd.read_feather("output/chrombpnet/modisco_merged_results/fold_0/"\
    "redo_extract_seqlets/all_seqlit_hits_above_modisco_min_dp_scores.fea")
dp_scores = dp_scores.set_index("index").astype(np.float32)
dp_scores = dp_scores.rename(map_r7_to_replace_dash.to_dict(), axis=1)
dp_scores = dp_scores.rename(map_r7_to_lv3.to_dict(), axis=1)

In [4]:
# Load CITE-seq data (SoupX 0.15 corrected)
path_to_cite_data = "/media/kyle_storage/kyle_ferchen/grimes_lab_main/data/"\
    "2021_11_mouse_optimized_cite_seq/processed_files/"

cite_adata = anndata.read_h5ad(os.path.join(\
    path_to_cite_data,
    "cite_seq_adata_rna_combined_SoupX_0_15_with_R7_clusters.h5ad"))

cite_adata.X = np.log2((10000 * (cite_adata.X.T / \
    cite_adata.X.sum(axis=1).T).T) + 1)

cite_adata.obs["lvl3"] = cite_adata.obs["R7"].replace(\
    map_r7_to_lv3.to_dict()).values
cite_adata.obs["cluster_order"] = cite_adata.obs["lvl3"].replace(\
    map_lvl3_to_order.to_dict()).values

cite_adata = cite_adata[\
    cite_adata.obs.loc[cite_adata.obs["lvl3"] != "MEP-UNK"].sort_values(\
        by="cluster_order").index.values,
    :]

cite_df = pd.DataFrame(\
    cite_adata.X,
    index=cite_adata.obs.index.values,
    columns=cite_adata.var.index.values)

cite_rna = cite_df.copy()
cite_rna["cluster"] = cite_adata.obs.loc[\
    cite_rna.index.values, "lvl3"].values
cite_rna = pd.pivot_table(cite_rna, index="cluster", aggfunc=np.mean)

In [7]:
r_df_all, f_df_all = marker_finder(\
    cite_df, 
    cite_adata.obs.loc[cite_df.index.values, "lvl3"].values)

In [16]:
tea_cells = cite_adata.obs.loc[\
    cite_adata.obs["lvl3"].isin(\
        dp_scores.columns.values).values, "lvl3"].index.values

r_df_tea, p_df_tea = marker_finder(\
    cite_df.loc[tea_cells],
    cite_adata.obs.loc[tea_cells, "lvl3"].values)

In [46]:
markers_all = r_df_all.copy().dropna().unstack().reset_index()
markers_all.columns = ["cluster", "gene", "r"]
markers_all = markers_all.sort_values(by="r", ascending=False)
markers_all = markers_all.loc[~markers_all["gene"].str.startswith("Rps")]
markers_all = markers_all.loc[~markers_all["gene"].str.startswith("Rpl")]
markers_all = markers_all.loc[~markers_all["gene"].str.startswith("mt-")]
markers_all = markers_all.loc[~markers_all["gene"].str.startswith("Gm")]
markers_all = markers_all.loc[~markers_all["gene"].duplicated()]
markers_all.index = list(range(markers_all.shape[0]))
markers_all["total_rank"] = list(range(1, 1 + markers_all.shape[0]))
track_cluster_rank = {}
save_cluster_ranks = []
for i, row in markers_all.iterrows():
    if row["cluster"] in track_cluster_rank:
        track_cluster_rank[row["cluster"]] += 1
        save_cluster_ranks.append(track_cluster_rank[row["cluster"]])
    else:
        track_cluster_rank[row["cluster"]] = 1
        save_cluster_ranks.append(track_cluster_rank[row["cluster"]])

markers_all["cluster_rank"] = save_cluster_ranks


In [60]:
markers_tea = r_df_tea.copy().dropna().unstack().reset_index()
markers_tea.columns = ["cluster", "gene", "r"]
markers_tea = markers_tea.sort_values(by="r", ascending=False)
markers_tea = markers_tea.loc[~markers_tea["gene"].str.startswith("Rps")]
markers_tea = markers_tea.loc[~markers_tea["gene"].str.startswith("Rpl")]
markers_tea = markers_tea.loc[~markers_tea["gene"].str.startswith("mt-")]
markers_tea = markers_tea.loc[~markers_tea["gene"].str.startswith("Gm")]
markers_tea = markers_tea.loc[~markers_tea["gene"].duplicated()]
markers_tea.index = list(range(markers_tea.shape[0]))
markers_tea["total_rank"] = list(range(1, 1 + markers_tea.shape[0]))
track_cluster_rank = {}
save_cluster_ranks = []
for i, row in markers_tea.iterrows():
    if row["cluster"] in track_cluster_rank:
        track_cluster_rank[row["cluster"]] += 1
        save_cluster_ranks.append(track_cluster_rank[row["cluster"]])
    else:
        track_cluster_rank[row["cluster"]] = 1
        save_cluster_ranks.append(track_cluster_rank[row["cluster"]])

markers_tea["cluster_rank"] = save_cluster_ranks

In [84]:
selected_markers_all = markers_all.loc[\
    (markers_all["cluster_rank"] <= 50) & \
    (markers_all["r"] > 0.2)]
selected_markers_all["cluster_order"] = selected_markers_all[\
    "cluster"].replace(map_new_cluster_to_order.to_dict()).values
selected_markers_all = selected_markers_all.sort_values(by=["cluster_order", 
    "cluster_rank"])
selected_markers_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_markers_all["cluster_order"] = selected_markers_all[\


Unnamed: 0,cluster,gene,r,total_rank,cluster_rank,cluster_order
442,qHSC,Meg3,0.399439,443,1,1.0
623,qHSC,Mllt3,0.354087,624,2,1.0
697,qHSC,Pdzk1ip1,0.341198,698,3,1.0
791,qHSC,Mecom,0.323858,792,4,1.0
810,qHSC,Cdkn1c,0.321055,811,5,1.0
...,...,...,...,...,...,...
1529,ILC1-ILC3-NKP,Camk2n1,0.240471,1530,46,87.0
1532,ILC1-ILC3-NKP,Sytl3,0.240333,1533,47,87.0
1582,ILC1-ILC3-NKP,Phactr3,0.236301,1583,48,87.0
1663,ILC1-ILC3-NKP,Traf1,0.230627,1664,49,87.0


In [85]:
selected_markers_tea = markers_tea.loc[\
    (markers_tea["cluster_rank"] <= 50) & \
    (markers_tea["r"] > 0.2)]
selected_markers_tea["cluster_order"] = selected_markers_tea[\
    "cluster"].replace(map_new_cluster_to_order.to_dict()).values
selected_markers_tea = selected_markers_tea.sort_values(by=["cluster_order", 
    "cluster_rank"])
selected_markers_tea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_markers_tea["cluster_order"] = selected_markers_tea[\


Unnamed: 0,cluster,gene,r,total_rank,cluster_rank,cluster_order
136,qHSC,Mllt3,0.421411,137,1,1.0
165,qHSC,Meg3,0.401930,166,2,1.0
207,qHSC,Sult1a1,0.375976,208,3,1.0
243,qHSC,Pdzk1ip1,0.358501,244,4,1.0
282,qHSC,Tgm2,0.339631,283,5,1.0
...,...,...,...,...,...,...
55,CLP1-b,Dntt,0.525318,56,1,61.0
531,CLP1-b,Cox6a2,0.273894,532,2,61.0
634,CLP1-b,Satb1,0.257661,635,3,61.0
921,CLP1-b,Wfdc17,0.224228,922,4,61.0


In [87]:
# Output the two considered marker gene lists
selected_markers_all.to_csv("output/chrombpnet/modisco_merged_results/fold_0/"\
    "redo_extract_seqlets/WITH_ALL_GENES/"\
    "marker_genes_all_clusters_n_50_corr_0_2.csv",
    header=True, index=False)

selected_markers_tea.to_csv("output/chrombpnet/modisco_merged_results/fold_0/"\
    "redo_extract_seqlets/WITH_ALL_GENES/"\
    "marker_genes_tea_seq_cbpnet_clusters_n_50_corr_0_2.csv",
    header=True, index=False)