In [1]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
import re
from gimmemotifs.motif import Motif,read_motifs

from pyInfinityFlow.InfinityFlow_Utilities import pearson_corr_df_to_df

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "Arial"
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

# Switch the working directory to the analysis folder; the relative
# "output/..." paths used in later cells are resolved against it.
os.chdir("/media/kyle_storage/kyle_ferchen/grimes_lab_main/analysis/"\
    "2023_06_12_tea_seq_atac_processing/")

INFO:matplotlib.font_manager:Failed to extract font properties from /usr/share/fonts/truetype/noto/NotoColorEmoji.ttf: In FT2Font: Can not load face (unknown file format; error code 0x2)


In [2]:
# Load the combined CITE-seq RNA AnnData and normalize it in place
path_cite_data = "/media/kyle_storage/kyle_ferchen/grimes_lab_main/data/"\
    "2021_11_mouse_optimized_cite_seq/processed_files/"

adata_cite = sc.read(
    os.path.join(path_cite_data, "cite_seq_adata_rna_combined.h5ad"))
print("Computing CPTT normalized scRNA-seq from CITE-seq...")
# Divide every row by its row sum, scale by 10,000, then take log2(x + 1)
cite_counts_per_cell = adata_cite.X.sum(axis=1)
adata_cite.X = np.log2(
    (10000 * (adata_cite.X.T / cite_counts_per_cell.T).T) + 1)

# Per-cell annotations, indexed by barcode
cite_cell_anno = pd.read_csv(
    os.path.join(path_cite_data, "cite_seq_cell_annotations.csv"))
cite_cell_anno.index = cite_cell_anno["Cell_Barcode"].values

# Pseudobulk: average the normalized expression over the cells of each
# "sctri_cite" cluster, restricted to barcodes present in both tables
print("Computing RNA centroids...")
shared_cells_cite = np.intersect1d(
    cite_cell_anno.index.values,
    adata_cite.obs.index.values)
cite_cell_anno = cite_cell_anno.loc[shared_cells_cite]

cite_rna = {}
for tmp_cluster in cite_cell_anno["sctri_cite"].unique():
    print(f"\t{tmp_cluster}...")
    tmp_in_cluster = cite_cell_anno["sctri_cite"] == tmp_cluster
    tmp_barcodes = cite_cell_anno.loc[tmp_in_cluster].index.values
    # mean(axis=0) may come back 2-D; flatten to one value per gene
    tmp_centroid = np.asarray(
        adata_cite[tmp_barcodes].X.mean(axis=0)).reshape(-1)
    cite_rna[tmp_cluster] = pd.Series(
        tmp_centroid,
        index=adata_cite.var.index.values)

# Genes x clusters table, without the "Unknown" cluster
cite_rna = pd.DataFrame(cite_rna)
cite_rna = cite_rna.drop(columns="Unknown")
# map_r7_names = pd.read_csv(os.path.join(\
#     path_cite_data, 
#     "map_r7-v1_to_r7-v2_names.csv"))
# map_r7_names_v1_to_v2 = pd.Series(\
#     map_r7_names["R7_V2"].values,
#     index = map_r7_names["R7_V1"].values)

# cite_rna = cite_rna.rename(map_r7_names_v1_to_v2, axis=1)
# cite_rna.columns = [i.replace("-", "_") for i in cite_rna.columns.values]

Computing CPTT normalized scRNA-seq from CITE-seq...
Computing RNA centroids...
	ERP4--Ex...
	MultiLin-2_ML-c9--RNA...
	ERP2_Kit-c14--RNA...
	ERP2--Ex...
	Ebf1+_proB_CD127-c2--RNA...
	proNeu-1_1-4--ADT...
	Unknown...
	preNeu-2_1-8--ADT...
	cMoP_ML-c15--RNA...
	immNeu_Kit-c3--RNA...
	MP--Ex...
	proNeu-2_Kit-c4--RNA...
	Ebf1+_proB--Ex...
	ML_ERP1_ML-c4--RNA...
	MDP--Ex...
	preNeu-3_Kit-c2--RNA...
	ERP4_Kit-c13--RNA...
	preNeu-3_2-3--ADT...
	immNeu_2-5--ADT...
	preNeu-1_Kit-c5--RNA...
	immNeu_Kit-c1--RNA...
	cMoP--Ex...
	neoHPC_Myc--Ex...
	Bcl11b+_preETP--Ex...
	ML_Multi-Lin-2_HSC-c7--RNA...
	Ebf1+_proB_CD127-c8--RNA...
	IG2-proNeu1_ML-c14--RNA...
	ML_Mast--Ex...
	ML_cell_cycle_ML-c5--RNA...
	HSC-HPC-Cenpf_HSC-c12--RNA...
	MultiLin-2_ERP_ML-c10--RNA...
	ML_MDP_ML-c13--RNA...
	proNeu-1--Ex...
	SiglecH-Ly6C-pre-DC--Ex...
	HSC-HPC-Mki67--Ex...
	Pro-B_CD127-c3--RNA...
	precursor_B_cell_5-2--ADT...
	CLP1_CD127-c5--RNA...
	ML_MultiLin-1_ML-c8--RNA...
	ETP-A-0-Ccl4--Ex...
	ETP-CC-4-Ung--Ex...
	e

In [3]:
# CIS-BP2 mouse reference locations
path_cisbp = "/media/kyle_storage/kyle_ferchen/grimes_lab_main/reference/"\
    "cisbp2/Mus_musculus_2020_06_01_11_53_pm/"
path_pwms = os.path.join(path_cisbp, "pwms_all_motifs/")
path_cisbp_anno = os.path.join(path_cisbp, "TF_Information_all_motifs.txt")

# TF annotation table; rows with a "." placeholder Motif_ID are discarded
cisbp_anno = pd.read_table(path_cisbp_anno)
cisbp_anno = cisbp_anno.loc[lambda anno: anno["Motif_ID"] != "."]

# Motifs themselves, plus their string names
cisbp2_motifs = read_motifs(
    os.path.join(path_cisbp, "mouse_cisbp2_all_motif_ids_ppm_format.motif"))
cisbp2_names = np.array(list(map(str, cisbp2_motifs)))

# Keep only annotation rows whose Motif_ID matches the first two
# "_"-separated fields of some loaded motif name
cisbp2_motif_ids = ["_".join(name.split("_")[:2]) for name in cisbp2_names]
cisbp_anno = cisbp_anno.loc[cisbp_anno["Motif_ID"].isin(cisbp2_motif_ids)]

# Unique (Motif_ID, TF_Name) pairs
map_id_to_tf = cisbp_anno[["Motif_ID", "TF_Name"]].drop_duplicates()

In [4]:
# Directory holding one correlation CSV per MoDISco pattern
path_to_cisbp_correlations = "output/chrombpnet/modisco_merged_results/fold_0/"\
    "correlate_modisco_to_cisbp2_motifs/"

# Captures the "<cluster>__pattern_<n>" prefix of a correlation file name
modisco_pattern_re = re.compile(r'(\w+__pattern_[0-9]+)_\w+.csv')

# Non-hidden CSV files, keyed by the pattern name parsed from the file name
tmp_corr_file_names = [
    fname for fname in os.listdir(path_to_cisbp_correlations)
    if fname.endswith("csv") and not fname.startswith(".")]
corr_files = pd.Series(
    tmp_corr_file_names,
    index=[modisco_pattern_re.findall(fname)[0]
           for fname in tmp_corr_file_names])

# Load every table first
pat_corrs = {
    tmp_pat: pd.read_csv(os.path.join(
        path_to_cisbp_correlations, corr_files[tmp_pat]))
    for tmp_pat in corr_files.index.values}

# Then split each "motif" entry at its last underscore: the trailing piece
# goes to "motif_pattern", the leading piece becomes the new "motif" index
for tmp_pat in pat_corrs:
    tmp_motif_parts = [
        name.rpartition("_") for name in pat_corrs[tmp_pat]["motif"].values]
    pat_corrs[tmp_pat] = pat_corrs[tmp_pat].assign(
        motif_pattern=[parts[2] for parts in tmp_motif_parts],
        motif=[parts[0] for parts in tmp_motif_parts]).set_index("motif")

In [None]:
# new_cluster_order = pd.read_csv("/media/kyle_storage/kyle_ferchen/"\
#     "grimes_lab_main/data/2021_11_mouse_optimized_cite_seq/processed_files/"\
#     "r7_cluster_order_kf_2024_01.csv")
# map_r7_to_lv3 = pd.Series(\
#     new_cluster_order["Level 3"].values,
#     index=new_cluster_order["Cluster"].values)
# map_lvl3_to_order = pd.Series(\
#     new_cluster_order["Order"].values,
#     index=new_cluster_order["Level 3"].values)
# map_r7_to_replace_dash = pd.Series(\
#     new_cluster_order["Cluster"].values,
#     index=[i.replace("-", "_") for i in new_cluster_order["Cluster"].values])
# map_new_cluster_to_order = pd.Series(\
#     new_cluster_order["Order"].values,
#     index=new_cluster_order["Level 3"].values)
# new_cluster_order

In [5]:
# Cluster annotation table with R6 / R7 / lvl4 names and display order
cluster_anno = pd.read_csv("/media/kyle_storage/kyle_ferchen/grimes_lab_main/"\
    "data/2021_11_mouse_optimized_cite_seq/processed_files/"\
    "cluster_anno_r6_r7_lvl4_and_order.csv")

# R7 name -> lvl4 name
map_r7_to_lvl4 = pd.Series(
    data=cluster_anno["lvl4"].values,
    index=cluster_anno["R7"].values)

# lvl4 name -> order (entries without an order are dropped)
map_lvl4_to_order = pd.Series(
    data=cluster_anno["Order"].values,
    index=cluster_anno["lvl4"].values).dropna()

# R7 name with "-" replaced by "_" -> original R7 name
map_r7_to_replace_dash = pd.Series(
    data=cluster_anno["R7"].values,
    index=[r7_name.replace("-", "_") for r7_name in cluster_anno["R7"].values])

# R6 name -> lvl4 name
map_r6_to_lvl4 = pd.Series(
    data=cluster_anno["lvl4"].values,
    index=cluster_anno["R6"].values)

cluster_anno


Unnamed: 0,Order,R6,R7,lvl4,Color,Group
0,1.0,neoHSC-Mlt3_HSC-c5--RNA,LT-HSC_Mllt3,qHSC,#040485,HSPC
1,2.0,neoHSC-Mlt3--Ex,ST-HSC,aHSC,#043075,HSPC
2,3.0,MPP4-Flt3_HSC-c3--RNA,MPP4-Hlf,HSC-Ly,#0b3d8c,HSPC
3,4.0,ML_Multi-Lin-2_HSC-c4--RNA,MPP5-Egr1,MPP5-IER,#11489e,HSPC
4,5.0,ML_Multi-Lin-2_HSC-c7--RNA,MPP5-Flt3,MPP5 Ly-I,#1852ad,HSPC
...,...,...,...,...,...,...
85,83.0,ILCP_12-21--ADT,ILCP-Tox2,ILCP-b,#76c6f5,ILC/NkP
86,84.0,ILC2_12-2--ADT,ILC2,ILC2,#5dbdf5,ILC/NkP
87,85.0,Bcl11b+_preETP_CD127-c6--RNA,Bcl11b+_preETP_Cd3d,ILC1_3+NKP,#3db6fc,ILC/NkP
88,86.0,Bcl11b+_preETP--Ex,Bcl11b+_preETP_Tdrd5,pre-aceNKP,#1cacff,ILC/NkP


In [6]:
# Both seqlet tables live in the same directory
path_to_seqlets = "output/chrombpnet/modisco_merged_results/fold_0/"\
    "redo_extract_seqlets/"

# Seqlet annotation
seqlets = pd.read_feather(
    path_to_seqlets + "all_seqlit_hits_above_modisco_min_anno.fea")

# Seqlet scores: index by the "index" column, cast to float32, then relabel
# the columns in two steps (underscore names -> R7 names -> lvl4 names)
dp_scores = (
    pd.read_feather(
        path_to_seqlets + "all_seqlit_hits_above_modisco_min_dp_scores.fea")
    .set_index("index")
    .astype(np.float32)
    .rename(columns=map_r7_to_replace_dash.to_dict())
    .rename(columns=map_r7_to_lvl4.to_dict()))

In [7]:
# Relabel the cite_rna columns with their lvl4 annotation; columns without
# an entry in the R6 -> lvl4 map keep their current name
cite_rna = cite_rna.rename(columns=map_r6_to_lvl4.to_dict())
cite_rna

Unnamed: 0,ErP-5,ML-MDP,ErP-3,ErP-2,preB-kappa,proNeu-1b,preNeu-2,cMoP-a,immNeu-1,MP,...,MPP1-MKP,CHILP,ILC2,EILP,MPP3-IER,MEP,HSC-Ly,MPP5-IER,MPP4-TCA,HSC-Mac-2
Xkr4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Gm1992,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Gm37381,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Rp1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000992,0.000000,0.000859,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Rp1.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC168977.1,0.000908,0.000000,0.000177,0.000676,0.001885,0.000388,0.000000,0.001044,0.001121,0.000000,...,0.001185,0.000000,0.000000,0.000000,0.000000,0.000000,0.002227,0.001502,0.002648,0.000000
PISD,0.130867,0.487255,0.218956,0.291408,0.645974,1.015920,0.562707,0.496978,0.203916,0.693699,...,0.852288,0.973890,0.763850,0.577835,1.094302,0.977848,1.630157,1.226136,1.565246,1.271523
DHRSX,0.406684,0.332551,0.404806,0.390642,0.155894,0.376706,0.215978,0.418623,0.081948,0.433763,...,0.219337,0.315286,0.215548,0.202145,0.183721,0.200725,0.171802,0.203645,0.174204,0.206222
Vmn2r122,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [8]:
# Significant gene <-> seqlet connections
gene_to_seqlets = pd.read_csv("output/chrombpnet/modisco_merged_results/"\
    "fold_0/redo_extract_seqlets/"\
    "correlate_seqlets_to_gene_expression_all_genes/sig_conns.csv")

# Attach the seqlet annotation row for each connection, side by side;
# the annotation index is reset so the two frames line up positionally
tmp_seqlet_anno = seqlets.loc[
    gene_to_seqlets["seqlet_idx"].values].reset_index(drop=True)
gene_to_seqlets = pd.concat([gene_to_seqlets, tmp_seqlet_anno], axis=1)

# Discard negative patterns and shorten "__pos_pattern_" to "__pattern_"
tmp_is_neg_pattern = gene_to_seqlets["pattern"].str.contains("__neg_pattern_")
gene_to_seqlets = gene_to_seqlets.loc[~tmp_is_neg_pattern].assign(
    pattern=lambda conns: conns["pattern"].str.replace(
        "__pos_pattern_", "__pattern_"))
gene_to_seqlets

Unnamed: 0,Gene,seqlet_idx,r,chr,start,end,peak,score,pos,strand,pattern,in_modisco,seq_name,dp_score
2,Mrpl15,2458694,0.547196,chr1,4540571,4540601,chr1:4540111-4541111,8.932149,461,-,ERP1__pattern_0,True,chr1:4540571-4540601,0.481389
3,Mrpl15,2807710,0.547052,chr1,4540571,4540601,chr1:4540111-4541111,8.829016,461,-,ERP2__pattern_0,True,chr1:4540571-4540601,0.405852
4,Mrpl15,3763246,0.547004,chr1,4540571,4540601,chr1:4540111-4541111,9.052924,461,+,HSCP_ERP1__pattern_1,False,chr1:4540571-4540601,0.083670
5,Mrpl15,9202943,0.547779,chr1,4540571,4540601,chr1:4540111-4541111,8.155797,461,+,MEP__pattern_3,False,chr1:4540571-4540601,0.157136
6,Mrpl15,9906506,0.547904,chr1,4540570,4540600,chr1:4540111-4541111,8.989227,460,+,MKP__pattern_3,False,chr1:4540570-4540600,0.196055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30813220,Hccs,14808102,0.618932,chrX,169062729,169062759,chrX:169062299-169063299,7.126749,431,+,pre_MultiLin_1__pattern_11,False,chrX:169062729-169062759,0.142939
30813221,Hccs,14808105,0.434179,chrX,169094790,169094820,chrX:169094308-169095308,10.263774,483,-,pre_MultiLin_1__pattern_4,False,chrX:169094790-169094820,0.406595
30813222,Hccs,15314097,0.441781,chrX,169094790,169094820,chrX:169094308-169095308,9.768803,483,+,pre_MultiLin_2__pattern_5,False,chrX:169094790-169094820,0.165522
30813223,Hccs,15796481,0.441943,chrX,169094786,169094816,chrX:169094308-169095308,10.079901,479,-,proNeu_1__pattern_7,False,chrX:169094786-169094816,0.222236


In [16]:
### TF x gene Pearson correlation matrix from the pseudobulk CITE-seq table
## Restricted to clusters that have modisco results: take the cluster prefix
## of every pattern key, then translate underscore name -> R7 -> lvl4
tmp_modisco_clusters = np.unique(
    [pattern_key.split("__")[0] for pattern_key in pat_corrs])
clusters_to_use = [
    map_r7_to_lvl4[map_r7_to_replace_dash[cluster_name]]
    for cluster_name in tmp_modisco_clusters]

# TFs that are annotated in CIS-BP2 and present in the expression table
shared_tfs = np.intersect1d(
    map_id_to_tf["TF_Name"].unique(),
    cite_rna.index.values)

# Both inputs are transposed so that rows are clusters
tmp_tf_expression = cite_rna.loc[shared_tfs, clusters_to_use].T
tmp_gene_expression = cite_rna.loc[
    gene_to_seqlets["Gene"].unique(), clusters_to_use].T
tf2gene = pearson_corr_df_to_df(tmp_tf_expression, tmp_gene_expression).dropna()

tf2gene

Unnamed: 0,Mrpl15,Lypla1,Tcea1,Atp6v1h,4732440D04Rik,Rb1cc1,St18,Pcmtd1,Rrs1,Adhfe1,...,Tceanc,Tmsb4x,Tlr8,Tlr7,Prps2,Msl3,Arhgap6,Gm15261,Amelx,Hccs
AC168977.1,-0.207345,0.214908,0.310641,0.447806,0.100249,0.101964,-0.096922,0.254469,-0.264164,-0.008615,...,0.118304,0.223562,-0.190263,0.525339,0.252252,0.071206,0.532635,0.484260,0.096185,0.271224
Ahctf1,0.351291,-0.283189,0.721642,-0.395829,0.042972,-0.182254,-0.124383,-0.044344,0.245067,-0.047850,...,0.129361,-0.714817,-0.255081,-0.235827,-0.117408,0.580536,0.132948,0.077102,0.121676,0.456680
Ahr,-0.740351,-0.110981,-0.349024,0.424515,0.227347,0.653538,0.412167,0.820725,-0.752553,0.123100,...,0.510348,0.125170,-0.225358,0.090179,-0.244987,-0.515346,0.368094,0.170119,-0.042461,-0.491934
Aire,-0.663790,0.029738,-0.197959,0.366057,0.538739,0.709377,0.207697,0.789649,-0.731934,-0.235657,...,0.640141,-0.040788,-0.218868,0.102753,-0.377064,-0.353798,0.356167,0.160371,-0.075698,-0.337738
Alx1,-0.138140,0.128608,0.109710,-0.015732,-0.026644,0.148997,-0.102979,0.141999,-0.171809,-0.118312,...,0.260712,-0.007479,-0.064929,-0.067750,-0.030915,0.259122,0.122319,0.134667,0.762830,0.038794
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zscan20,-0.739673,0.189856,-0.152174,0.428758,0.495468,0.751963,0.489203,0.829072,-0.798257,-0.025336,...,0.796856,0.099543,-0.190500,0.121783,-0.270472,-0.274467,0.466787,0.231449,0.197603,-0.317767
Zscan26,-0.806257,-0.116374,-0.428609,0.402260,0.430397,0.836742,0.543816,0.901533,-0.828382,-0.029879,...,0.711038,0.006284,-0.157231,0.022427,-0.498710,-0.531891,0.301573,0.103166,-0.093738,-0.532424
Zscan29,-0.705735,0.153951,-0.426196,0.534282,0.415066,0.759371,0.145234,0.714268,-0.626279,-0.002207,...,0.509478,0.241187,-0.185488,-0.156000,-0.562612,-0.432589,0.207287,0.076315,-0.175622,-0.352948
Zscan4d,0.000730,0.228454,-0.148824,0.068063,0.229488,0.124778,0.482092,-0.087063,-0.078548,-0.118312,...,0.048984,0.049833,-0.064929,0.054471,0.126177,0.197613,-0.056416,-0.018415,-0.046150,0.067401


In [17]:
# Build one TF -> gene connection table per cluster.
#
# For each cluster (column of dp_scores):
#   1. zero out seqlet scores below dpscore_threshold and expression values
#      below gex_threshold,
#   2. load the link tables of every TF expressed above gex_threshold,
#   3. score each link as seqlet score * target expression * TF expression,
#   4. keep links with a positive score and write them to
#      "<path_to_cluster_links>/<cluster>_conns.fea".
dpscore_threshold = 0.05
gex_threshold = 0.1
# Not used in this cell; kept because later cells may rely on them.
gexscores_max = cite_rna.max(axis=1)
dp_scores_max = dp_scores.max(axis=1)

path_to_tf_links = "output/GRN/all_genes/tf_links_observed/"
path_to_cluster_links = "output/GRN/all_genes/cluster_specific_tf_links/"
os.makedirs(path_to_cluster_links, exist_ok=True)

# Link files are read back below as "<TF>_links.fea", so recover the TF name
# by stripping that exact suffix. This keeps TF names containing "_" intact
# and ignores any other file in the directory.
tf_link_suffix = "_links.fea"
list_tf_link_files = os.listdir(path_to_tf_links)
list_tfs = [i[:-len(tf_link_suffix)] for i in list_tf_link_files \
    if i.endswith(tf_link_suffix)]

cols_to_save = ["Gene", "seqlet_idx", "TF_con", "con_score"]
for tmp_cluster in dp_scores.columns.values:
    print(f"{tmp_cluster}...")
    tmp_dp_scores = dp_scores[tmp_cluster].copy()
    tmp_dp_scores.loc[tmp_dp_scores < dpscore_threshold] = 0
    tmp_gexscores = cite_rna[tmp_cluster].copy()
    # Use the named threshold here as well (was a hard-coded 0.1)
    tmp_gexscores.loc[tmp_gexscores < gex_threshold] = 0
    tmp_check_tfs = tmp_gexscores.loc[list_tfs]
    tmp_check_tfs = tmp_check_tfs.loc[tmp_check_tfs > gex_threshold]
    print(f"\t{tmp_cluster} has {len(tmp_check_tfs)} TFs expressed...")
    if len(tmp_check_tfs) == 0:
        # pd.concat raises on an empty list; there is nothing to write
        print(f"\tNo expressed TFs for {tmp_cluster}; skipping.")
        continue

    # Load up the expressed TF connections
    tmp_conns = []
    for tmp_tf in tmp_check_tfs.index.values:
        tmp_conns.append(pd.read_feather(os.path.join(\
            path_to_tf_links, f"{tmp_tf}{tf_link_suffix}")))

    tmp_conns = pd.concat(tmp_conns)
    n_conns_pre_filter = tmp_conns.shape[0]

    # Drop target genes whose name starts with "Rps" or "Rpl" in one pass
    tmp_is_ribo = tmp_conns["Gene"].str.startswith("Rps") | \
        tmp_conns["Gene"].str.startswith("Rpl")
    tmp_conns = tmp_conns.loc[~tmp_is_ribo]

    # Thresholded values are zero, so any sub-threshold factor zeroes the link
    tmp_conns["con_score"] = (\
        tmp_dp_scores.loc[tmp_conns["seqlet_idx"].values] * \
        tmp_gexscores.loc[tmp_conns["Gene"].values].values * \
        tmp_gexscores.loc[tmp_conns["TF_con"].values].values).values
    tmp_conns = tmp_conns.loc[tmp_conns["con_score"] > 0]
    tmp_conns = tmp_conns[cols_to_save]

    # The file name must be an f-string: previously only the first half of an
    # implicitly concatenated literal had the f prefix, so every cluster was
    # written to the same file, literally named "{tmp_cluster}_conns.fea".
    tmp_conns.reset_index(drop=True).to_feather(os.path.join(\
        path_to_cluster_links, f"{tmp_cluster}_conns.fea"))
    print(f"\t{n_conns_pre_filter} base connections.")
    print(f"\t{tmp_conns.shape[0]} active.")


BMCP...
	BMCP has 255 TFs expressed...
	62807750 base connections.
	29264261 active.
MP...
	MP has 247 TFs expressed...
	60587046 base connections.
	24724140 active.
CLP1-b...
	CLP1-b has 273 TFs expressed...
	68696044 base connections.
	34460676 active.
MPP1-MKP...
	MPP1-MKP has 286 TFs expressed...
	68971323 base connections.
	40591885 active.
MPP1-G1...
	MPP1-G1 has 298 TFs expressed...
	68320530 base connections.
	40920269 active.
ErP-1...
	ErP-1 has 215 TFs expressed...
	45856367 base connections.
	13753497 active.
ErP-2...
	ErP-2 has 212 TFs expressed...
	43484408 base connections.
	13084367 active.
ErP-HSC...
	ErP-HSC has 250 TFs expressed...
	59449782 base connections.
	23252803 active.
MPP2-G2...
	MPP2-G2 has 304 TFs expressed...
	71977057 base connections.
	47039174 active.
MPP2-M...
	MPP2-M has 293 TFs expressed...
	68982438 base connections.
	42764571 active.
MPP2-S...
	MPP2-S has 296 TFs expressed...
	71476009 base connections.
	42500052 active.
MkP-HSC...
	MkP-HSC has 264