In [1]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
import re
from gimmemotifs.motif import Motif,read_motifs

from pyInfinityFlow.InfinityFlow_Utilities import pearson_corr_df_to_df

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "Arial"
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

os.chdir("/media/kyle_storage/kyle_ferchen/grimes_lab_main/analysis/"\
    "2023_06_12_tea_seq_atac_processing/")

INFO:matplotlib.font_manager:Failed to extract font properties from /usr/share/fonts/truetype/noto/NotoColorEmoji.ttf: In FT2Font: Can not load face (unknown file format; error code 0x2)


In [2]:
# Read in the CITE-seq RNA counts
path_cite_data = "/media/kyle_storage/kyle_ferchen/grimes_lab_main/data/"\
    "2021_11_mouse_optimized_cite_seq/processed_files/"

adata_cite = sc.read(os.path.join(\
    path_cite_data, 
    "cite_seq_adata_rna_combined.h5ad"))
print("Computing CPTT normalized scRNA-seq from CITE-seq...")
adata_cite.X = np.log2((10000 * (adata_cite.X.T / \
    adata_cite.X.sum(axis=1).T).T) + 1)
cite_cell_anno = pd.read_csv(os.path.join(\
    path_cite_data, 
    "cite_seq_cell_annotations.csv"))
cite_cell_anno.index = cite_cell_anno["Cell_Barcode"].values

# Make pseudobulk RNA counts from cite clusters
print("Computing RNA centroids...")
shared_cells_cite = np.intersect1d(cite_cell_anno.index.values, 
    adata_cite.obs.index.values)
cite_cell_anno = cite_cell_anno.loc[shared_cells_cite]
cite_rna = {}
for tmp_cluster in cite_cell_anno["sctri_cite"].unique():
    print("\t{}...".format(tmp_cluster))
    tmp_barcodes = cite_cell_anno.loc[\
        cite_cell_anno["sctri_cite"] == tmp_cluster].index.values
    cite_rna[tmp_cluster] = pd.Series(np.asarray(\
            adata_cite[tmp_barcodes].X.mean(axis=0)).reshape(-1),
            index=adata_cite.var.index.values)

cite_rna = pd.DataFrame(cite_rna)
cite_rna = cite_rna.drop("Unknown", axis=1)
map_r7_names = pd.read_csv(os.path.join(\
    path_cite_data, 
    "map_r7-v1_to_r7-v2_names.csv"))
map_r7_names_v1_to_v2 = pd.Series(\
    map_r7_names["R7_V2"].values,
    index = map_r7_names["R7_V1"].values)

cite_rna = cite_rna.rename(map_r7_names_v1_to_v2, axis=1)
cite_rna.columns = [i.replace("-", "_") for i in cite_rna.columns.values]

Computing CPTT normalized scRNA-seq from CITE-seq...
Computing RNA centroids...
	ERP4--Ex...
	MultiLin-2_ML-c9--RNA...
	ERP2_Kit-c14--RNA...
	ERP2--Ex...
	Ebf1+_proB_CD127-c2--RNA...
	proNeu-1_1-4--ADT...
	Unknown...
	preNeu-2_1-8--ADT...
	cMoP_ML-c15--RNA...
	immNeu_Kit-c3--RNA...
	MP--Ex...
	proNeu-2_Kit-c4--RNA...
	Ebf1+_proB--Ex...
	ML_ERP1_ML-c4--RNA...
	MDP--Ex...
	preNeu-3_Kit-c2--RNA...
	ERP4_Kit-c13--RNA...
	preNeu-3_2-3--ADT...
	immNeu_2-5--ADT...
	preNeu-1_Kit-c5--RNA...
	immNeu_Kit-c1--RNA...
	cMoP--Ex...
	neoHPC_Myc--Ex...
	Bcl11b+_preETP--Ex...
	ML_Multi-Lin-2_HSC-c7--RNA...
	Ebf1+_proB_CD127-c8--RNA...
	IG2-proNeu1_ML-c14--RNA...
	ML_Mast--Ex...
	ML_cell_cycle_ML-c5--RNA...
	HSC-HPC-Cenpf_HSC-c12--RNA...
	MultiLin-2_ERP_ML-c10--RNA...
	ML_MDP_ML-c13--RNA...
	proNeu-1--Ex...
	SiglecH-Ly6C-pre-DC--Ex...
	HSC-HPC-Mki67--Ex...
	Pro-B_CD127-c3--RNA...
	precursor_B_cell_5-2--ADT...
	CLP1_CD127-c5--RNA...
	ML_MultiLin-1_ML-c8--RNA...
	ETP-A-0-Ccl4--Ex...
	ETP-CC-4-Ung--Ex...
	e

In [3]:
# Read in the motifs
path_cisbp = "/media/kyle_storage/kyle_ferchen/grimes_lab_main/reference/"\
    "cisbp2/Mus_musculus_2020_06_01_11_53_pm/"
path_pwms = os.path.join(path_cisbp, "pwms_all_motifs/")
path_cisbp_anno = os.path.join(path_cisbp, "TF_Information_all_motifs.txt")

cisbp_anno = pd.read_table(path_cisbp_anno)
cisbp_anno = cisbp_anno.loc[cisbp_anno["Motif_ID"] != "."]

cisbp2_motifs = read_motifs(os.path.join(path_cisbp, "mouse_cisbp2_all_motif_ids_ppm_format.motif"))
cisbp2_names = np.array([str(x) for x in cisbp2_motifs])

# Filter cisbp2 annotation to only motifs that have a real entry
cisbp_anno = cisbp_anno.loc[cisbp_anno["Motif_ID"].isin(["_".join(i.split("_")[:2]) for i in cisbp2_names])]

map_id_to_tf = cisbp_anno[["Motif_ID", "TF_Name"]].drop_duplicates()

In [4]:
path_to_cisbp_correlations = "output/chrombpnet/modisco_merged_results/fold_0/"\
    "correlate_modisco_to_cisbp2_motifs/"

corr_files = [i for i in os.listdir(path_to_cisbp_correlations) if \
    i.endswith("csv") and not i.startswith(".")]

modisco_pattern_re = re.compile(r'(\w+__pattern_[0-9]+)_\w+.csv')

corr_files = pd.Series(\
    corr_files, 
    index=[modisco_pattern_re.findall(i)[0] for i in corr_files])

pat_corrs = {}
for tmp_pat in corr_files.index.values:
    pat_corrs[tmp_pat] = pd.read_csv(os.path.join(\
        path_to_cisbp_correlations,
        corr_files[tmp_pat]))

for tmp_pat in pat_corrs:
    pat_corrs[tmp_pat].loc[:,"motif_pattern"] = [i.split("_")[-1] \
        for i in pat_corrs[tmp_pat]["motif"].values]
    pat_corrs[tmp_pat].loc[:,"motif"] = ["_".join(i.split("_")[:-1]) \
        for i in pat_corrs[tmp_pat]["motif"].values]
    pat_corrs[tmp_pat] = pat_corrs[tmp_pat].set_index("motif")

In [5]:
pat_corrs["BMCP__pattern_25"].sort_values(by="r", ascending=False)

Unnamed: 0_level_0,pattern,offset,strand,r,motif_pattern
motif,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M08127_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,13,-,0.937167,rnAGATAAGrA
M09123_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,13,-,0.930855,nnAGATAAGA
M09121_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,13,-,0.928374,nsAGATAAGrn
M08126_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,12,-,0.925646,nnsAGATAAGr
M09120_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,13,-,0.915643,nnAGATAAGn
...,...,...,...,...,...
M02935_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,17,+,0.095082,nCCCCCCCAC
M00139_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,19,+,0.093277,CnwCCCCCmn
M08978_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,10,+,0.091463,nnCnnGCCCCGCCCCCnnnn
M00143_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,18,+,0.089067,nCCCCCCnnn


In [6]:

unique_motifs = []
for tmp_pat in pat_corrs:
    unique_motifs += list(pat_corrs[tmp_pat].index.values)

unique_motifs = np.unique(unique_motifs)
mo_similarities = pd.DataFrame(0, index=list(pat_corrs.keys()), 
    columns=unique_motifs)
for tmp_pat in pat_corrs:
    mo_similarities.loc[\
        tmp_pat, 
        pat_corrs[tmp_pat].index.values] = pat_corrs[tmp_pat]["r"].values
    
mo_similarities

Unnamed: 0,M00111_2.00,M00112_2.00,M00113_2.00,M00114_2.00,M00115_2.00,M00116_2.00,M00117_2.00,M00118_2.00,M00119_2.00,M00120_2.00,...,M09648_2.00,M09649_2.00,M09650_2.00,M09651_2.00,M09652_2.00,M09653_2.00,M10469_2.00,M10570_2.00,M10767_2.00,M11049_2.00
HSCP_HPC_Cenpf__pattern_25,0.202201,0.240010,0.211908,0.257100,0.258435,0.185724,0.291062,0.311243,0.226841,0.318314,...,0.232542,0.352796,0.453523,0.349308,0.263264,0.258160,0.285762,0.302035,0.221077,0.186552
LT_HSC_Mllt3__pattern_8,0.413618,0.382552,0.361313,0.392718,0.077250,0.152473,0.322053,0.353005,0.282305,0.410082,...,0.317835,0.386771,0.383392,0.268356,0.385213,0.389015,0.250820,0.152642,0.192357,0.124400
MPP5_Flt3__pattern_1,0.497018,0.434836,0.437839,0.434691,0.164640,0.092760,0.417630,0.494317,0.322522,0.448245,...,0.353817,0.478664,0.291022,0.285284,0.273864,0.265764,0.337265,0.212127,0.137601,0.157886
BMCP__pattern_0,0.483937,0.384676,0.436416,0.427572,0.047236,0.086255,0.166355,0.294221,0.280555,0.169208,...,0.283571,0.358853,0.306188,0.127854,0.183160,0.161740,0.309193,0.101758,0.048325,0.090653
BMCP__pattern_10,0.273003,0.279414,0.300172,0.268997,0.259189,0.185422,0.643987,0.699773,0.900607,0.678878,...,0.524756,0.557052,0.309297,0.301051,0.226679,0.253312,0.326794,0.287368,0.248150,0.331538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HSCP_HPC_Cenpf__pattern_20,0.341943,0.322229,0.332862,0.376382,0.157247,0.163326,0.333682,0.318763,0.217711,0.368670,...,0.271803,0.360738,0.490038,0.245841,0.298795,0.309190,0.322025,0.209153,0.173870,0.207671
HSCP_HPC_Cenpf__pattern_21,0.288247,0.334039,0.281859,0.309338,0.317346,0.175181,0.413132,0.375163,0.217356,0.402259,...,0.357111,0.392018,0.281211,0.328510,0.498873,0.491448,0.369715,0.403050,0.187095,0.163493
HSCP_HPC_Cenpf__pattern_22,0.549329,0.612334,0.555310,0.599159,0.325205,0.169126,0.289043,0.365376,0.393517,0.340932,...,0.258803,0.247206,0.327207,0.296509,0.507785,0.529500,0.455119,0.419917,0.228951,0.227655
HSCP_HPC_Cenpf__pattern_23,0.306296,0.267893,0.258015,0.282877,0.231764,0.409715,0.371785,0.399047,0.225301,0.361275,...,0.231620,0.372803,0.337738,0.285202,0.347221,0.326198,0.239051,0.349767,0.202582,0.273310


Check Irf8

In [7]:
map_id_to_tf.loc[map_id_to_tf["TF_Name"] == "Irf8"]

Unnamed: 0,Motif_ID,TF_Name
10795,M09246_2.00,Irf8


In [8]:
mo_similarities["M09246_2.00"].sort_values()

ML_cell_cycle__pattern_14       0.054666
ERP2__pattern_14                0.060635
HSCP_ERP1__pattern_12           0.061567
IG2_MP__pattern_11              0.064072
HSCP_HPC_Cenpf__pattern_15      0.068479
                                  ...   
MDP_Irf8__pattern_14            0.911989
MDP_Cpa3__pattern_17            0.937892
CD127_MP__pattern_9             0.976982
MDP_Irf8__pattern_12            0.977764
MultiLin_2_F13a1__pattern_11    0.978887
Name: M09246_2.00, Length: 1030, dtype: float64

Pu.1

In [9]:
map_id_to_tf.loc[map_id_to_tf["TF_Name"] == "Spi1"]

Unnamed: 0,Motif_ID,TF_Name
2248,M00155_2.00,Spi1
2249,M01475_2.00,Spi1
2250,M08027_2.00,Spi1
2251,M09065_2.00,Spi1
2252,M09538_2.00,Spi1


In [10]:
mo_similarities["M08027_2.00"].sort_values()

HSCP_ERP1__pattern_12          0.074018
MPP4_Nkx2_3__pattern_18        0.083438
MDP_Irf8__pattern_17           0.086101
ERP2__pattern_14               0.086308
pre_MultiLin_2__pattern_15     0.090617
                                 ...   
MPP4_Nkx2_3__pattern_1         0.960101
BMCP__pattern_2                0.964984
MDP_Cpa3__pattern_0            0.966150
CD127_MP__pattern_0            0.967661
MultiLin_2_F13a1__pattern_0    0.969131
Name: M08027_2.00, Length: 1030, dtype: float64

Myc

In [11]:
map_id_to_tf.loc[map_id_to_tf["TF_Name"] == "Myc"]

Unnamed: 0,Motif_ID,TF_Name
236,M08751_2.00,Myc
237,M09467_2.00,Myc


In [12]:
mo_similarities["M08751_2.00"].sort_values()

ST_HSC__pattern_24               -0.008677
HSCP_HPC_Hist1h2af__pattern_20   -0.005072
pre_MultiLin_1__pattern_13       -0.002683
LT_HSC_Mllt3__pattern_18         -0.000109
MPP5_Egr1__pattern_16             0.001345
                                    ...   
IG2_proNeu1__pattern_10           0.766721
CD127_MP__pattern_12              0.767995
eHSC_Pcna__pattern_11             0.772246
LT_HSC_Mllt3__pattern_11          0.773401
BMCP__pattern_10                  0.774147
Name: M08751_2.00, Length: 1030, dtype: float64

In [13]:
pat_corrs["BMCP__pattern_10"].sort_values(by="r", ascending=False)

Unnamed: 0_level_0,pattern,offset,strand,r,motif_pattern
motif,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M08756_2.00,BMCP__pattern_10_nnnnnnnnCACGTGACnnnnnnnnnnnnnn,12,-,0.961201,nnGTCACGTGrCs
M08753_2.00,BMCP__pattern_10_nnnnnnnnCACGTGACnnnnnnnnnnnnnn,13,+,0.957774,CACGTGACy
M08777_2.00,BMCP__pattern_10_nnnnnnnnCACGTGACnnnnnnnnnnnnnn,12,-,0.957249,nGGTCACGTGnnsnnnnnn
M01751_2.00,BMCP__pattern_10_nnnnnnnnCACGTGACnnnnnnnnnnnnnn,13,+,0.951297,nCACGTGACn
M05833_2.00,BMCP__pattern_10_nnnnnnnnCACGTGACnnnnnnnnnnnnnn,14,+,0.946457,CACGTGAy
...,...,...,...,...,...
M08277_2.00,BMCP__pattern_10_nnnnnnnnCACGTGACnnnnnnnnnnnnnn,20,-,0.042748,TTTTyyTnTTTTTTynyTTyn
M09255_2.00,BMCP__pattern_10_nnnnnnnnCACGTGACnnnnnnnnnnnnnn,13,+,0.040652,nnCyAAAAATAGmn
M08325_2.00,BMCP__pattern_10_nnnnnnnnCACGTGACnnnnnnnnnnnnnn,20,-,0.035841,TTTTTTTTT
M09252_2.00,BMCP__pattern_10_nnnnnnnnCACGTGACnnnnnnnnnnnnnn,14,+,0.031166,nkCTAAAAATAGmn


In [14]:
pat_corrs["BMCP__pattern_10"].loc["M08751_2.00"]

pattern          BMCP__pattern_10_nnnnnnnnCACGTGACnnnnnnnnnnnnnn
offset                                                        13
strand                                                         +
r                                                       0.774147
motif_pattern                                       GnsCACGTGGnn
Name: M08751_2.00, dtype: object

Gfi1

In [15]:
map_id_to_tf.loc[map_id_to_tf["TF_Name"] == "Gfi1"]

Unnamed: 0,Motif_ID,TF_Name
871,M08984_2.00,Gfi1


In [16]:
mo_similarities["M08984_2.00"].sort_values()

MPP5_Flt3__pattern_23             0.074109
HSCP_HPC_Hist1h2af__pattern_19    0.079016
IG2_MP__pattern_11                0.084049
MDP_Cpa3__pattern_8               0.087545
ML_cell_cycle__pattern_14         0.088592
                                    ...   
MPP5_Egr1__pattern_5              0.622141
ERP2__pattern_3                   0.623317
MPP4_Hlf__pattern_9               0.628484
LT_HSC_Mllt3__pattern_3           0.630087
ERP1__pattern_7                   0.631999
Name: M08984_2.00, Length: 1030, dtype: float64

In [17]:
pat_corrs["ERP1__pattern_7"].loc["M08984_2.00"]

pattern          ERP1__pattern_7_nnnnnnnnnnnyTGTGGTTTnnnnnnnnnn
offset                                                       10
strand                                                        -
r                                                      0.631999
motif_pattern                                        AAATCAswGC
Name: M08984_2.00, dtype: object

In [18]:
gene_to_seqlets = pd.read_feather("output/chrombpnet/modisco_merged_results/"\
    "fold_0/redo_extract_seqlets/correlate_seqlets_to_gene_expression/"\
    "seqlet_to_gene_conns_filtered_to_marker_genes_n25.fea")
gene_to_seqlets = gene_to_seqlets.loc[\
    ~gene_to_seqlets["pattern"].str.contains("__neg_pattern_")]
gene_to_seqlets["pattern"] = gene_to_seqlets["pattern"].str.replace(\
    "__pos_pattern_", "__pattern_")
gene_to_seqlets

Unnamed: 0,Gene,seqlet_idx,r,index,chr,start,end,peak,score,pos,strand,pattern,in_modisco,seq_name,dp_score,Cisbp2_TF,Cisbp2_TF_Family
0,Ube2w,999,0.431932,999,chr1,14995472,14995502,chr1:14995049-14996049,10.999075,424,-,BMCP__pattern_2,False,chr1:14995472-14995502,0.207860,Spi1,Ets
1,Ube2w,1010,0.807602,1010,chr1,15176376,15176406,chr1:15176006-15177006,6.002641,371,+,BMCP__pattern_36,False,chr1:15176376-15176406,0.502043,Cebpg,bZIP
2,Ube2w,1011,0.803144,1011,chr1,15176378,15176408,chr1:15176006-15177006,9.193361,373,-,BMCP__pattern_3,True,chr1:15176378-15176408,0.545012,Cebpa,bZIP
3,Ube2w,1012,0.798769,1012,chr1,15176380,15176410,chr1:15176006-15177006,5.916835,375,-,BMCP__pattern_32,False,chr1:15176380-15176410,0.437181,Cebpa,bZIP
4,Ube2w,1013,0.490000,1013,chr1,15176388,15176418,chr1:15176006-15177006,9.033985,383,-,BMCP__pattern_7,False,chr1:15176388-15176418,0.423640,Runx1,Runt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1365889,Prdx4,16096769,0.454015,16096769,chrX,155338453,155338483,chrX:155337965-155338965,8.410910,489,+,ST_HSC__pattern_2,False,chrX:155338453-155338483,0.201752,Sp3,C2H2 ZF
1365890,Prdx4,16096770,0.404356,16096770,chrX,155338455,155338485,chrX:155337965-155338965,13.204884,491,-,ST_HSC__pattern_21,False,chrX:155338455-155338485,0.582122,Zbtb33,C2H2 ZF
1365891,Prdx4,16096771,0.406877,16096771,chrX,155338461,155338491,chrX:155337965-155338965,13.178372,497,+,ST_HSC__pattern_21,False,chrX:155338461-155338491,0.573688,Zbtb33,C2H2 ZF
1365892,Prdx4,16096772,0.488150,16096772,chrX,155338462,155338492,chrX:155337965-155338965,8.482800,498,+,ST_HSC__pattern_17,False,chrX:155338462-155338492,0.367858,Zfp523,C2H2 ZF


In [19]:
# path_to_gene_to_seqlet_anno = "output/chrombpnet/modisco_merged_results/"\
#     "fold_0/redo_extract_seqlets/"\
#     "gene_to_seqlet_connections_within_tads_Cisbp2_annotations.csv"

# gene_to_seqlets = pd.read_csv(path_to_gene_to_seqlet_anno)
# gene_to_seqlets = gene_to_seqlets.loc[\
#     ~gene_to_seqlets["pattern"].str.contains("__neg_pattern_")]
# gene_to_seqlets["pattern"] = gene_to_seqlets["pattern"].str.replace(\
#     "__pos_pattern_", "__pattern_")
# gene_to_seqlets

In [20]:
### Build TF to gene correlation matrix (pseudobulk CITE-seq CPTT)
## Only use clusters with modisco results
clusters_to_use = np.unique([i.split("__")[0] for i in list(pat_corrs.keys())])

shared_tfs = np.intersect1d(\
    map_id_to_tf["TF_Name"].unique(), 
    cite_rna.index.values)

tf2gene = pearson_corr_df_to_df(\
    cite_rna.loc[shared_tfs, clusters_to_use].T,
    cite_rna.loc[gene_to_seqlets["Gene"].unique(), clusters_to_use].T).dropna()

tf2gene


Unnamed: 0,Ube2w,Crispld1,Mcm3,Dst,Rnf149,Il1r1,Il1rl1,Gm973,Idh1,Acadl,...,Gria3,Bgn,Pola1,Maged1,F630028O10Rik,Dlg3,Ogt,Tspan6,Gprasp2,Prdx4
AC168977.1,-0.193185,0.315213,-0.008454,0.312728,0.159270,0.516580,-0.360217,0.116773,-0.335185,-0.284729,...,-0.218522,0.193926,0.027786,0.285680,-0.311218,0.400731,0.184981,0.561066,0.295366,0.256068
Ahctf1,-0.650111,-0.044839,0.034612,-0.024560,-0.398138,-0.172991,0.396183,-0.068390,-0.531872,0.378324,...,-0.474163,0.048396,0.180060,-0.078183,-0.469761,0.148153,-0.167159,0.207479,-0.027726,0.598227
Ahr,-0.097477,0.818891,-0.497163,0.872236,0.310685,0.377527,-0.373789,0.908719,-0.276114,-0.649331,...,-0.131852,0.821058,-0.483357,0.923598,-0.351779,0.331204,0.713398,0.505114,0.892425,-0.022740
Aire,-0.248137,0.846966,-0.717430,0.759419,0.388685,0.582389,-0.302047,0.617476,-0.283178,-0.585586,...,-0.183318,0.594610,-0.627083,0.705915,-0.380456,0.237666,0.748480,0.591179,0.705487,-0.058928
Alx1,-0.137000,0.061733,-0.159770,0.018128,-0.261601,0.075535,0.094295,0.010506,-0.093776,0.050735,...,-0.110423,0.072619,0.011939,-0.038177,-0.111148,0.114383,-0.016737,0.142182,0.029882,0.231045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zscan20,-0.207697,0.852965,-0.664307,0.845913,0.316484,0.502062,-0.437200,0.708045,-0.297726,-0.667199,...,-0.135971,0.772622,-0.517071,0.777572,-0.394345,0.429235,0.707177,0.643539,0.823552,0.109566
Zscan26,-0.124648,0.907696,-0.786915,0.930708,0.379809,0.390734,-0.290537,0.919952,-0.208815,-0.680915,...,-0.070266,0.833168,-0.717706,0.906550,-0.332398,0.215372,0.819926,0.476693,0.907875,-0.124793
Zscan29,0.114868,0.656484,-0.702329,0.683454,0.180597,0.133504,-0.194870,0.655993,0.080600,-0.520130,...,-0.054218,0.577103,-0.628954,0.635729,0.005846,0.152926,0.598976,0.232314,0.685331,-0.349913
Zscan4d,0.191993,-0.108852,-0.109522,-0.121262,0.127829,0.005741,-0.017532,-0.104682,0.131251,-0.008955,...,0.098490,-0.116394,0.014860,-0.119084,0.083230,-0.101966,-0.026273,-0.101074,-0.132362,-0.073572


In [21]:
mo_sim_threshold = 0.75
tf2gene_threshold = 0.4

conn_results = {}
for tmp_gene in gene_to_seqlets["Gene"].unique():
    print(f"Working on {tmp_gene}...")
    # Filter to seqlet connections for the given gene
    seg_gene2seqlets = gene_to_seqlets.loc[gene_to_seqlets["Gene"] == tmp_gene]
    # Pull out the unique patterns connected to that gene
    seg_pats = seg_gene2seqlets["pattern"].unique()
    # Pull out the motifs that match the patterns above given threshold
    seg_mo_sims = mo_similarities.loc[seg_pats]
    seg_mo = seg_mo_sims.columns.values[seg_mo_sims.max(axis=0) > mo_sim_threshold]

    # Pull out the TFs with a PWM that match that pattern
    seg_tf = map_id_to_tf.loc[\
        map_id_to_tf["Motif_ID"].isin(seg_mo), "TF_Name"].unique()
    seg_tf = np.intersect1d(seg_tf, tf2gene.index.values)

    # Identify the TFs that correlate to the gene
    seg_tf2gene = tf2gene.loc[seg_tf, tmp_gene]
    seg_tf2gene = seg_tf2gene[seg_tf2gene > tf2gene_threshold]
    seg_tf = seg_tf2gene.index.values

    # Pull out the IDs for those TFs
    seg_id2tf = map_id_to_tf.loc[map_id_to_tf["TF_Name"].isin(seg_tf)]
    seg_id2tf = seg_id2tf.loc[seg_id2tf["Motif_ID"].isin(seg_mo)]

    n_con = len(seg_id2tf['TF_Name'].unique())
    print(f"\tFound {n_con} TF-gene connections...")
    if n_con > 0:
        # Get the motif to TF names
        seg_mo_sims = seg_mo_sims[seg_id2tf["Motif_ID"].unique()]
        seg_mo_sims = seg_mo_sims.loc[seg_mo_sims.max(axis=1) > mo_sim_threshold]
        seg_mo_sims = seg_mo_sims.unstack().reset_index()
        seg_mo_sims.columns = ["Motif", "Pattern", "Motif_r"]
        seg_mo_sims = seg_mo_sims.loc[seg_mo_sims["Motif_r"] > mo_sim_threshold]



        new_conns = []
        for tmp_tf in seg_id2tf["TF_Name"].unique():
            # Reselect motif ids that map to the indicated TF
            tmp_mo_ids = seg_id2tf.loc[\
                seg_id2tf["TF_Name"] == tmp_tf, "Motif_ID"].values
            # Reselect patterns that map to those motif ids above
            tmp_pats = seg_mo_sims.loc[\
                seg_mo_sims["Motif"].isin(tmp_mo_ids), "Pattern"].unique()
            # Select the connections that match those patterns
            tmp_new_conns = seg_gene2seqlets.loc[\
                seg_gene2seqlets["pattern"].isin(tmp_pats)].copy()
            # Label the connection with the TF name
            tmp_new_conns["TF_CON"] = tmp_tf
            new_conns.append(tmp_new_conns)

        new_conns = pd.concat(new_conns)

    else:
        seg_mo_sims = None
        new_conns = None

    conn_results[tmp_gene] = {\
        "new_conns": new_conns,
        "seg_mo_sims": seg_mo_sims,
        "seg_id2tf": seg_id2tf,
        "seg_tf2gene": seg_tf2gene}

Working on Ube2w...
	Found 14 TF-gene connections...
Working on Crispld1...
	Found 55 TF-gene connections...
Working on Mcm3...
	Found 1 TF-gene connections...
Working on Dst...
	Found 83 TF-gene connections...
Working on Rnf149...
	Found 11 TF-gene connections...
Working on Il1r1...
	Found 39 TF-gene connections...
Working on Il1rl1...
	Found 9 TF-gene connections...
Working on Gm973...
	Found 103 TF-gene connections...
Working on Idh1...
	Found 6 TF-gene connections...
Working on Acadl...
	Found 8 TF-gene connections...
Working on Ikzf2...
	Found 13 TF-gene connections...
Working on Cxcr2...
	Found 8 TF-gene connections...
Working on Serpine2...
	Found 3 TF-gene connections...
Working on Dock10...
	Found 5 TF-gene connections...
Working on Arl4c...
	Found 92 TF-gene connections...
Working on Otos...
	Found 80 TF-gene connections...
Working on Gpc1...
	Found 14 TF-gene connections...
Working on Zcchc2...
	Found 78 TF-gene connections...
Working on Bcl2...
	Found 9 TF-gene connections.

In [22]:
conn_results["Clec12a"]["new_conns"]["TF_CON"].value_counts()

Cebpb    516
Cebpa    503
Nfil3    477
Cebpe    434
Spi1     134
Irf8      72
Jdp2      57
Xbp1      22
Atf6b      2
Atf6       2
Mlx        1
Usf1       1
Tfec       1
Usf2       1
Name: TF_CON, dtype: int64

In [23]:
conn_results["Cd55"]["new_conns"]["TF_CON"].value_counts()

Sp3        284
Sp1        265
Klf3       228
Sp4        155
Maz        150
Patz1      121
Klf1       121
Klf8       114
Gata2      110
Gata1      110
Tal1       110
Srebf1      64
Tfe3        64
Arnt        61
Mycn        61
Bhlhe41     61
Zbtb33      35
Atf1        27
Crem        26
Crebl2      24
Nfe2        23
Bach1       23
Nfe2l2      23
Pbx3        22
Gfi1b       22
Pbx4        22
Nfyb        22
Gmeb1       14
Rfx2         8
Tbx2         3
Name: TF_CON, dtype: int64

In [24]:
conn_results["Cebpe"]["new_conns"]["TF_CON"].value_counts()

Cebpa    353
Cebpb    347
Cebpe    314
Jdp2      59
Xbp1      28
Atf6b      7
Atf6       7
Spi1       2
Mlx        1
Tfec       1
Name: TF_CON, dtype: int64

In [25]:
conn_results["Cd55"]["seg_tf2gene"].sort_values(ascending=False)

Gfi1b      0.878581
Gata1      0.837461
Tal1       0.827702
Gata2      0.795273
Tfe3       0.728333
Pbx4       0.699658
Rfx2       0.610257
Klf1       0.594871
Nfe2l2     0.576854
Maz        0.553370
Sp4        0.516170
Patz1      0.512400
Tbx2       0.495987
Sp1        0.495594
Klf8       0.495121
Atf1       0.494811
Gmeb1      0.491863
Nfe2       0.481047
Pbx3       0.467066
Nfyb       0.465362
Sp3        0.459121
Crebl2     0.443504
Zbtb33     0.430716
Klf3       0.428576
Bhlhe41    0.417026
Srebf1     0.415674
Mycn       0.408364
Bach1      0.404901
Arnt       0.404558
Crem       0.403207
Name: Cd55, dtype: float64

In [26]:
conn_results["Irf8"]["seg_tf2gene"].sort_values(ascending=False)

Irf8      1.000000
Nfil3     0.808144
Hes6      0.781232
Atf6b     0.723927
Atf6      0.720608
Hesx1     0.718806
Usf1      0.679093
Spi1      0.631385
Usf2      0.494897
Cebpa     0.489089
Tfec      0.466902
Sp9       0.458169
Prrxl1    0.440107
Name: Irf8, dtype: float64

In [27]:
print("\n".join(conn_results["Irf8"]["seg_tf2gene"].sort_values(ascending=False).index.values))

Irf8
Nfil3
Hes6
Atf6b
Atf6
Hesx1
Usf1
Spi1
Usf2
Cebpa
Tfec
Sp9
Prrxl1


In [28]:
conn_results.keys()

dict_keys(['Ube2w', 'Crispld1', 'Mcm3', 'Dst', 'Rnf149', 'Il1r1', 'Il1rl1', 'Gm973', 'Idh1', 'Acadl', 'Ikzf2', 'Cxcr2', 'Serpine2', 'Dock10', 'Arl4c', 'Otos', 'Gpc1', 'Zcchc2', 'Bcl2', 'Dbi', 'Cd55', 'Ctse', 'Slc45a3', 'Btg2', 'Ptpn7', 'Aspm', 'Uchl5', 'Rgs2', 'Rgs1', 'Fam129a', 'Edem3', 'Ier5', '4930523C07Rik', 'Prdx6', 'Sell', 'Pbx1', 'Pcp4l1', 'Fcer1g', 'Cd48', 'Slamf1', 'Spta1', 'Fh1', 'Exo1', 'H3f3a', 'Cenpf', 'Ptpn14', 'Dtl', 'Nek2', 'Hsd11b1', 'G0s2', 'Cd34', 'Mical1', 'Cisd1', 'Ggt5', 'Mif', 'Itgb2', 'Prss57', 'Ptbp1', 'Plppr3', 'Ndufs7', 'Rps15', 'Mob3a', 'Lsm7', 'Sppl2b', 'Tle6', 'D10Wsu102e', 'Bpifc', 'Timp3', 'Gnptab', 'Gas2l3', 'Lta4h', 'Gm15915', 'Socs2', 'Dusp6', 'E2f7', 'Osbpl8', 'Nap1l1', 'Phlda1', 'Tspan8', 'Lyz2', 'Rap1b', 'Hmga2', 'Ctdsp2', 'Cdk4', 'Pa2g4', 'Cd63', 'Tespa1', 'Patz1', 'Nefh', 'H2afv', 'Meis1', 'Cep68', 'Snrnp25', 'Btnl9', 'Mgat1', 'Tbc1d9b', 'Ltc4s', 'Uqcrq', 'Gm2a', 'Obscn', 'Shmt1', 'Mmgt2', 'Rpl26', 'Kdm6b', 'Trp53', 'Atp1b2', 'Cd68', 'Nlgn2', 'Mg

In [29]:
conn_results["Cd55"]["new_conns"].loc[conn_results["Cd55"]["new_conns"]["TF_CON"] == "Gata1"].sort_values(by="start")

Unnamed: 0,Gene,seqlet_idx,r,index,chr,start,end,peak,score,pos,strand,pattern,in_modisco,seq_name,dp_score,Cisbp2_TF,Cisbp2_TF_Family,TF_CON
23521,Cd55,12747852,0.616601,12747852,chr1,130462237,130462267,chr1:130462037-130463037,8.967998,201,+,MultiLin_1_MEP__pattern_4,False,chr1:130462237-130462267,0.292962,Trps1,GATA,Gata1
23366,Cd55,9914898,0.646602,9914898,chr1,130508677,130508707,chr1:130508232-130509232,10.065369,446,+,MKP__pattern_3,True,chr1:130508677-130508707,0.602899,Trps1,GATA,Gata1
23113,Cd55,4965645,0.648227,4965645,chr1,130508678,130508708,chr1:130508232-130509232,9.349007,447,+,HSCP_HPC_Hist1h2af__pattern_5,False,chr1:130508678-130508708,0.225130,Gata1,GATA,Gata1
23175,Cd55,6134095,0.647968,6134095,chr1,130508678,130508708,chr1:130508232-130509232,9.272098,447,+,HSCP_MKP__pattern_3,False,chr1:130508678-130508708,0.355339,Trps1,GATA,Gata1
23141,Cd55,5601305,0.646755,5601305,chr1,130508678,130508708,chr1:130508232-130509232,8.848876,447,-,HSCP_HPC_Tk1__pattern_8,False,chr1:130508678-130508708,0.408941,Gata1,GATA,Gata1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23568,Cd55,12748099,0.737403,12748099,chr1,131074762,131074792,chr1:131074102-131075102,8.566227,661,+,MultiLin_1_MEP__pattern_4,False,chr1:131074762-131074792,0.129768,Trps1,GATA,Gata1
23037,Cd55,2834413,0.783729,2834413,chr1,131087544,131087574,chr1:131086598-131087598,7.887138,947,-,ERP2__pattern_0,False,chr1:131087544-131087574,0.217571,Gata3,GATA,Gata1
23365,Cd55,9223717,0.784526,9223717,chr1,131087544,131087574,chr1:131086598-131087598,8.353361,947,+,MEP__pattern_3,False,chr1:131087544-131087574,0.131946,Tal1,bHLH,Gata1
22918,Cd55,1466857,0.785324,1466857,chr1,131087544,131087574,chr1:131086598-131087598,8.147350,947,+,eHSC__pattern_4,False,chr1:131087544-131087574,0.159680,Gata6,GATA,Gata1


In [30]:
# Filter out connections for the same TF within a given distance threshold
cut_distance_threshold = 10
tmp_save_conns = []
for tmp_gene in conn_results:
    if type(conn_results[tmp_gene]["new_conns"]) != type(None):
        for tmp_tf in conn_results[tmp_gene]["new_conns"]["TF_CON"].unique():
            tmp_conns = conn_results[tmp_gene]["new_conns"].copy()
            tmp_conns = tmp_conns.loc[tmp_conns["TF_CON"] == tmp_tf]
            tmp_conns["midpoint"] = tmp_conns[["start", "end"]].mean(axis=1)
            tmp_conns = tmp_conns[["dp_score", "midpoint"]].sort_values(\
                by="dp_score", ascending=False)
            saved_indices = []
            while tmp_conns.shape[0] > 0:
                saved_indices.append(tmp_conns.index.values[0])
                tmp_conns = tmp_conns.loc[\
                    (tmp_conns["midpoint"] - tmp_conns["midpoint"].iloc[0]).abs() > \
                        cut_distance_threshold]

            tmp_save_conns.append(conn_results[tmp_gene]["new_conns"].loc[\
                saved_indices])

In [31]:
unique_conns = pd.concat(tmp_save_conns)

In [32]:
(unique_conns["Gene"] + ":" + unique_conns["TF_CON"]).value_counts()

Smim14:Spi1      3956
Smim14:Etv6      3938
Smim14:Erg       3861
Smim14:Ets1      3861
Smim14:Fli1      3701
                 ... 
Chd7:Thap11         1
Cmtm8:Atf4          1
Cmtm8:Nfe2l1        1
Tox:Gata6           1
Rnf122:Zfp212       1
Length: 21961, dtype: int64

In [33]:
unique_conns.loc[unique_conns["TF_CON"] == "Cebpe", "Gene"].value_counts().head(50)

Hvcn1            931
Lta4h            629
Tor1a            427
Spcs2            410
Hk2              292
Ms4a3            276
Clec5a           262
Ern1             251
Pgam1            250
Tex2             235
Cox5a            232
Mical1           228
Tyrobp           219
Fndc3b           214
Gsr              201
Sdf2l1           199
Dmkn             197
Clec12a          194
Pdia4            183
1700020L24Rik    180
Adgrg3           177
Nfam1            169
Papss2           168
Slpi             159
Ube2w            156
Fdps             149
Cebpe            148
Gmppb            147
Aprt             143
Pfkfb3           139
Erp29            138
Nt5dc2           132
Mogs             128
Txndc17          126
Ift57            125
Ly6c2            123
Gda              121
Mgl2             121
Fgl2             121
Trem3            117
Tmem165          115
Lman2            106
Prss57           106
Igsf6            105
Fam49b           101
Pdia6             99
Pdia3             98
Abcd2        

In [34]:
unique_conns

Unnamed: 0,Gene,seqlet_idx,r,index,chr,start,end,peak,score,pos,strand,pattern,in_modisco,seq_name,dp_score,Cisbp2_TF,Cisbp2_TF_Family,TF_CON
215,Ube2w,2459399,0.627987,2459399,chr1,16220342,16220372,chr1:16219841-16220841,5.955649,502,-,ERP1__pattern_5,False,chr1:16220342-16220372,0.300545,Atf6b,bZIP,Mlx
215,Ube2w,2459399,0.627987,2459399,chr1,16220342,16220372,chr1:16219841-16220841,5.955649,502,-,ERP1__pattern_5,False,chr1:16220342-16220372,0.300545,Atf6b,bZIP,Usf1
215,Ube2w,2459399,0.627987,2459399,chr1,16220342,16220372,chr1:16219841-16220841,5.955649,502,-,ERP1__pattern_5,False,chr1:16220342-16220372,0.300545,Atf6b,bZIP,Atf6b
215,Ube2w,2459399,0.627987,2459399,chr1,16220342,16220372,chr1:16219841-16220841,5.955649,502,-,ERP1__pattern_5,False,chr1:16220342-16220372,0.300545,Atf6b,bZIP,Xbp1
215,Ube2w,2459399,0.627987,2459399,chr1,16220342,16220372,chr1:16219841-16220841,5.955649,502,-,ERP1__pattern_5,False,chr1:16220342-16220372,0.300545,Atf6b,bZIP,Atf6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1365828,Prdx4,13899421,0.498424,13899421,chrX,155195322,155195352,chrX:155194641-155195641,8.504064,682,+,MultiLin_2_F13a1__pattern_15,False,chrX:155195322-155195352,0.103848,Taf1,Unknown,Taf1
1365830,Prdx4,13899472,0.441628,13899472,chrX,155338404,155338434,chrX:155337965-155338965,5.269765,440,+,MultiLin_2_F13a1__pattern_15,False,chrX:155338404-155338434,0.081610,Taf1,Unknown,Taf1
1365785,Prdx4,12017455,0.457291,12017455,chrX,155338451,155338481,chrX:155337965-155338965,4.196037,487,-,MPP5_Egr1__pattern_3,False,chrX:155338451-155338481,0.711442,Nrf1,Unknown,Nrf1
1365499,Prdx4,3762026,0.541284,3762026,chrX,155338432,155338462,chrX:155337965-155338965,5.529413,468,+,ERP2__pattern_6,False,chrX:155338432-155338462,0.136637,Nrf1,Unknown,Nrf1


In [35]:
# Read in the dotproduct scores for chrombpnet factors
dpscores = pd.read_feather("output/chrombpnet/modisco_merged_results/fold_0/"\
    "redo_extract_seqlets/all_seqlit_hits_above_modisco_min_dp_scores.fea")
dpscores = dpscores.set_index("index")
dpscores

Unnamed: 0_level_0,BMCP,CD127_MP,CLP1_Rrm2,eHSC,eHSC_Pcna,ERP1,ERP2,HSCP_ERP1,HSCP_HPC_Cenpf,HSCP_HPC_Hist1h2af,...,MPP5_Egr1,MPP5_Flt3,MultiLin_1,MultiLin_1_MEP,MultiLin_2_F13a1,MultiLin_2_Ms4a3,pre_MultiLin_1,pre_MultiLin_2,proNeu_1,ST_HSC
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.318700,0.182511,0.072353,-0.032767,0.016769,-0.010280,-0.011614,-0.018222,-0.001942,0.103659,...,0.006640,0.014308,0.193462,0.080617,0.056081,0.215750,0.040248,0.071136,0.142187,-0.013152
1,0.309506,0.181138,0.072640,-0.033765,0.004125,-0.009126,-0.010140,-0.018198,-0.003931,0.091665,...,-0.013929,0.012258,0.192524,0.086213,0.043606,0.212994,0.037983,0.068489,0.150639,-0.025311
2,0.153212,0.162171,0.122635,0.164242,0.111058,0.076880,0.040337,0.076493,0.186548,0.236789,...,0.085075,0.124055,0.407299,0.136561,0.053496,0.203368,0.149513,0.106462,0.122166,0.099793
3,0.096635,0.089786,0.069505,0.070600,0.069142,0.054534,0.084802,0.076695,0.084515,0.077108,...,0.083484,0.069578,0.078721,0.056535,0.049417,0.062896,0.050087,0.068793,0.041139,0.081893
4,0.092790,0.087732,0.059760,0.061709,0.055632,0.050725,0.081025,0.075760,0.073507,0.070626,...,0.076456,0.071330,0.076957,0.050349,0.039781,0.065570,0.048880,0.066689,0.034787,0.078672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16097284,0.226564,0.257879,0.167280,0.189565,0.590309,0.371887,0.173323,0.296042,0.369818,0.313356,...,0.223087,0.374202,0.429426,0.248314,0.296760,0.207561,0.360683,0.287468,0.295474,0.413159
16097285,0.211593,0.270772,0.173492,0.190040,0.619284,0.372037,0.177751,0.317392,0.407950,0.333013,...,0.225168,0.362093,0.412769,0.268819,0.312048,0.206281,0.344867,0.277470,0.297095,0.442239
16097286,0.157064,0.160043,0.098415,0.402791,0.357010,0.225733,0.281937,0.334802,0.853461,0.294814,...,0.455269,0.400698,0.307154,0.291346,0.118884,0.161782,0.435491,0.293549,0.050509,0.512000
16097287,1.418154,1.215912,1.336111,2.568458,2.638613,1.873825,1.706186,1.909492,3.397888,2.753970,...,2.027387,2.788655,1.795369,1.130344,1.609973,1.572978,1.808873,1.640395,0.898614,2.852918


In [36]:
dpscores_max = dpscores.max(axis=1)

In [37]:
gexscores_max = cite_rna.max(axis=1)

In [38]:
dpscores.columns.values

array(['BMCP', 'CD127_MP', 'CLP1_Rrm2', 'eHSC', 'eHSC_Pcna', 'ERP1',
       'ERP2', 'HSCP_ERP1', 'HSCP_HPC_Cenpf', 'HSCP_HPC_Hist1h2af',
       'HSCP_HPC_Tk1', 'HSCP_MKP', 'IG2_MP', 'IG2_proNeu1',
       'LT_HSC_Mllt3', 'MDP_Cpa3', 'MDP_Irf8', 'MEP', 'MKP',
       'ML_cell_cycle', 'MPP4_Hlf', 'MPP4_Nkx2_3', 'MPP5_Egr1',
       'MPP5_Flt3', 'MultiLin_1', 'MultiLin_1_MEP', 'MultiLin_2_F13a1',
       'MultiLin_2_Ms4a3', 'pre_MultiLin_1', 'pre_MultiLin_2', 'proNeu_1',
       'ST_HSC'], dtype=object)

In [39]:
cite_rna.columns.values

array(['ERP4_Bcl2l15', 'MultiLin_2_F13a1', 'ERP2_Arhgdig', 'ERP2',
       'Ebf1+ proB_Smtnl2', 'proNeu_1_ADT', 'preNeu_Ebf1', 'cMoP_S100a4',
       'immNeu_1', 'CD127_MP', 'proNeu_2', 'Ebf1+ proB_Uhrf1', 'ERP1',
       'MDP_Irf8', 'preNeu_3', 'ERP4_Alas2', 'preNeu_3_ADT', 'immNeu_2',
       'preNeu_1', 'immNeu_3', 'cMoP_Mki67', 'pre_MultiLin_2',
       'Bcl11b+_preETP_Tdrd5', 'MPP5_Flt3', 'Ebf1+ proB_Mxd3',
       'IG2_proNeu1', 'cKit_Mast', 'ML_cell_cycle', 'HSCP_HPC_Cenpf',
       'MultiLin_2_Ms4a3', 'MDP_Cpa3', 'proNeu_1', 'pre_cDC1_Xcr1',
       'HSCP_HPC_Tk1', 'Pro_B', 'pre_B_Fcer2a', 'CLP1_Hist1h1c',
       'MultiLin_1_Elane', 'ETP_A_0', 'ETP_CC_4', 'eHSC_Pcna',
       'MPP4_Ccr9', 'IG2_MP', 'Baso', 'B_cell progenitor', 'HSCP_ERP1',
       'plasmaDC', 'preNeu_3_Mac_C1qa', 'DN4_DP_trans_Hist1h3c',
       'MultiLin_1_MEP', 'ILC1_ILC3_NKP', 'pre_cDC1_Egfl8', 'MEP_UNK',
       'MKP', 'HSCP_MKP', 'pre_cDC2', 'pDC_2', 'MultiLin_1', 'CLP1_Rrm2',
       'BMCP', 'alphaLP', 'CLP2', 'Eosino

In [48]:
dpscore_threshold = 0.05
gex_threshold = 0.1

cluster_gexscores = {}
cluster_conns = {}
for tmp_cluster in dpscores.columns.values:
    print(f"{tmp_cluster}...")
    tmp_dpscores = dpscores[tmp_cluster].copy()
    tmp_dpscores.loc[tmp_dpscores < dpscore_threshold] = 0
    tmp_dpscores = tmp_dpscores / dpscores_max.loc[tmp_dpscores.index.values].abs()
    tmp_gexscores = cite_rna[tmp_cluster].copy()
    tmp_gexscores.loc[tmp_gexscores < 0.1] = 0
    tmp_gexscores = tmp_gexscores / gexscores_max.loc[tmp_gexscores.index.values]
    tmp_conns = unique_conns.copy()
    tmp_conns = tmp_conns.loc[~tmp_conns["Gene"].str.startswith("Rps")]
    tmp_conns = tmp_conns.loc[~tmp_conns["Gene"].str.startswith("Rpl")]
    tmp_conns["con_score"] = (\
        tmp_dpscores.loc[tmp_conns["seqlet_idx"].values] * \
        tmp_gexscores.loc[tmp_conns["Gene"].values].values * \
        tmp_gexscores.loc[tmp_conns["TF_CON"].values].values).values
    tmp_conns = tmp_conns.loc[tmp_conns["con_score"] > 0]
    cluster_conns[tmp_cluster] = tmp_conns
    cluster_gexscores[tmp_cluster] = tmp_gexscores

BMCP...
CD127_MP...
CLP1_Rrm2...
eHSC...
eHSC_Pcna...
ERP1...
ERP2...
HSCP_ERP1...
HSCP_HPC_Cenpf...
HSCP_HPC_Hist1h2af...
HSCP_HPC_Tk1...
HSCP_MKP...
IG2_MP...
IG2_proNeu1...
LT_HSC_Mllt3...
MDP_Cpa3...
MDP_Irf8...
MEP...
MKP...
ML_cell_cycle...
MPP4_Hlf...
MPP4_Nkx2_3...
MPP5_Egr1...
MPP5_Flt3...
MultiLin_1...
MultiLin_1_MEP...
MultiLin_2_F13a1...
MultiLin_2_Ms4a3...
pre_MultiLin_1...
pre_MultiLin_2...
proNeu_1...
ST_HSC...


In [89]:
total_tf_gene_conns = (unique_conns["TF_CON"] + ":" + \
    unique_conns["Gene"]).value_counts()

In [90]:
tf2gene_conn_counts = pd.DataFrame(0, index=total_tf_gene_conns.index.values, columns=list(cluster_conns.keys()))
for tmp_cluster in cluster_conns:
    tmp_vcounts = (cluster_conns[tmp_cluster]["TF_CON"] + ":" + \
        cluster_conns[tmp_cluster]["Gene"]).value_counts()
    tf2gene_conn_counts.loc[\
        tmp_vcounts.index.values, 
        tmp_cluster] = tmp_vcounts.values


tf2gene_conn_counts

Unnamed: 0,BMCP,CD127_MP,CLP1_Rrm2,eHSC,eHSC_Pcna,ERP1,ERP2,HSCP_ERP1,HSCP_HPC_Cenpf,HSCP_HPC_Hist1h2af,...,MPP5_Egr1,MPP5_Flt3,MultiLin_1,MultiLin_1_MEP,MultiLin_2_F13a1,MultiLin_2_Ms4a3,pre_MultiLin_1,pre_MultiLin_2,proNeu_1,ST_HSC
Spi1:Smim14,3324,3749,3726,3563,3658,0,0,1385,3921,3872,...,3799,3801,3811,3334,3720,3823,3797,3685,3554,3565
Etv6:Smim14,3370,3731,3717,3596,3662,1727,0,1453,3915,3887,...,3786,3803,3821,3375,3714,3803,3819,3707,3577,3556
Erg:Smim14,3297,3655,3644,3522,3585,0,0,1431,3838,3811,...,3709,3726,3744,3307,3640,3728,3742,3635,3507,3481
Ets1:Smim14,3299,0,3646,0,0,0,0,0,3838,3812,...,3709,3725,0,0,0,0,3742,0,3526,3481
Fli1:Smim14,3165,3510,3501,3385,3437,0,0,1412,3682,3657,...,3550,3566,3591,3193,3514,3580,3587,3492,3387,3327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Thap11:Chd7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Atf4:Cmtm8,0,0,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
Nfe2l1:Cmtm8,0,0,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
Gata6:Tox,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
total_tf_conns = unique_conns["TF_CON"].value_counts()
tf_conn_counts = pd.DataFrame(0, index=total_tf_conns.index.values, columns=list(cluster_conns.keys()))
for tmp_cluster in cluster_conns:
    tmp_vcounts = cluster_conns[tmp_cluster]["TF_CON"].value_counts()
    tf_conn_counts.loc[\
        tmp_vcounts.index.values, 
        tmp_cluster] = tmp_vcounts.values


tf_conn_counts

Unnamed: 0,BMCP,CD127_MP,CLP1_Rrm2,eHSC,eHSC_Pcna,ERP1,ERP2,HSCP_ERP1,HSCP_HPC_Cenpf,HSCP_HPC_Hist1h2af,...,MPP5_Egr1,MPP5_Flt3,MultiLin_1,MultiLin_1_MEP,MultiLin_2_F13a1,MultiLin_2_Ms4a3,pre_MultiLin_1,pre_MultiLin_2,proNeu_1,ST_HSC
Etv6,52330,36063,80292,81224,100365,16405,8973,22555,101507,86096,...,106264,107929,69588,56694,53468,55496,95343,91495,36105,103163
Erf,47166,30181,75810,77640,97670,14719,9119,20618,97815,81704,...,103756,105009,64678,51501,48243,50012,91684,87745,0,101299
Erg,50337,34443,76894,77895,95710,0,0,21740,97195,82487,...,101443,103058,66554,54616,51189,53056,91166,87597,34770,98490
Elf2,43545,26827,68663,71411,88445,13993,8685,19181,89478,74908,...,93499,94724,58823,48766,43480,45505,82897,80116,27529,91087
Elk1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Foxd4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Elf3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Pax2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Trps1,1,1,1,0,3,0,0,0,0,1,...,2,1,1,0,1,1,3,1,0,0


In [112]:
log2_tf_conn_counts = np.log2(tf_conn_counts + 1)

In [117]:
from pyInfinityFlow.InfinityFlow_Utilities import marker_finder

mfr, mfp = marker_finder(\
    log2_tf_conn_counts.T, 
    log2_tf_conn_counts.columns.values)

mfr = mfr.dropna()
tf_conn_markers = pd.DataFrame({\
        "top_cluster": mfr.idxmax(axis=1),
        "top_corr": mfr.max(axis=1)},
    index=mfr.index.values)

tf_conn_markers

Unnamed: 0,top_cluster,top_corr
Etv6,MPP4_Hlf,0.195127
Erf,MPP4_Hlf,0.092043
Erg,MPP4_Hlf,0.081364
Elf2,MPP4_Hlf,0.200655
Elf4,MPP4_Hlf,0.083809
...,...,...
Zfp422,MPP4_Nkx2_3,0.191234
Max,ERP2,0.493332
Zfp143,MKP,0.300556
Trps1,eHSC_Pcna,0.315945


In [120]:
print(tf_conn_markers.sort_values(by="top_corr", ascending=False).to_string())

                top_cluster  top_corr
Foxd2          LT_HSC_Mllt3  1.000000
Crem                    MEP  1.000000
Etv5              MPP5_Flt3  0.697649
Arntl              MPP4_Hlf  0.697071
Maf                MPP4_Hlf  0.563806
Mitf           LT_HSC_Mllt3  0.560341
Max                    ERP2  0.493332
Bhlhe40        LT_HSC_Mllt3  0.477084
Myc                    ERP1  0.434158
Maff               MPP4_Hlf  0.423409
Atf3               MPP4_Hlf  0.421621
Gata3    HSCP_HPC_Hist1h2af  0.420747
Tfeb               MPP4_Hlf  0.420530
Irf3         MultiLin_1_MEP  0.402424
E2f4                   ERP1  0.389797
Bach2          LT_HSC_Mllt3  0.384452
Cebpe              CD127_MP  0.379528
Six5               MPP4_Hlf  0.378058
Hes1           LT_HSC_Mllt3  0.377568
Egr2               MPP4_Hlf  0.352704
Rfx1            MPP4_Nkx2_3  0.345975
Taf1         HSCP_HPC_Cenpf  0.322370
Foxk2              MDP_Irf8  0.321159
Ctcf     HSCP_HPC_Hist1h2af  0.318081
Trps1             eHSC_Pcna  0.315945
Usf2        

In [100]:
print(tf_conn_counts.var(axis=1).sort_values(ascending=False).to_string())

Ets1       1.583540e+09
Erf        1.224591e+09
Erg        9.378807e+08
Etv6       9.305325e+08
Elf2       7.751248e+08
Klf2       7.561066e+08
Elf4       7.023678e+08
Ets2       6.844390e+08
Etv3       6.439800e+08
Elk4       6.297425e+08
Elk3       5.830859e+08
Sp4        5.825011e+08
Sp1        5.083561e+08
Zbtb17     4.669724e+08
Etv5       3.078177e+08
Klf11      3.076364e+08
Sp3        3.063435e+08
Bcl11a     2.932696e+08
Klf4       2.872646e+08
Klf6       2.710303e+08
Egr1       2.296081e+08
Elf1       1.940645e+08
Klf10      1.922203e+08
Fli1       1.536520e+08
Zfp281     1.070684e+08
Patz1      7.673837e+07
Cebpa      7.112431e+07
Fosl2      6.310317e+07
Atf3       6.303789e+07
Jun        4.245257e+07
Jund       3.844603e+07
Junb       3.679532e+07
Stat2      2.888025e+07
Irf7       2.873331e+07
Klf3       2.860153e+07
Irf1       2.418629e+07
Cebpb      2.181271e+07
Irf2       2.179442e+07
Egr2       2.165552e+07
Gabpa      2.107595e+07
Stat1      1.973524e+07
Bach2      1.954

In [110]:
tf_conn_counts.loc["Cebpe"].sort_values()

BMCP                      0
pre_MultiLin_2            0
pre_MultiLin_1            0
MultiLin_2_Ms4a3          0
MultiLin_2_F13a1          0
MultiLin_1_MEP            0
MultiLin_1                0
MPP5_Flt3                 0
MPP5_Egr1                 0
MPP4_Nkx2_3               0
MPP4_Hlf                  0
MKP                       0
MEP                       0
MDP_Irf8                  0
ST_HSC                    0
HSCP_ERP1                 0
CLP1_Rrm2                 0
eHSC                      0
HSCP_MKP                  0
HSCP_HPC_Tk1              0
HSCP_HPC_Hist1h2af        0
HSCP_HPC_Cenpf            0
LT_HSC_Mllt3              0
ERP2                      0
ERP1                      0
eHSC_Pcna                 0
ML_cell_cycle          8556
MDP_Cpa3               9930
IG2_MP                10589
proNeu_1              11607
IG2_proNeu1           11846
CD127_MP              11974
Name: Cebpe, dtype: int64

In [109]:
print(total_tf_conns.to_string())

Etv6       114263
Erf        111619
Erg        109298
Elf2       100649
Elk1        97861
Elf4        97535
Ets1        96568
Elk4        91283
Elf5        88823
Sp1         87227
Elk3        86297
Klf2        83961
Etv1        82474
Spic        81652
Etv3        81605
Etv5        76856
Zbtb17      74830
Sp4         73277
Sp6         71695
Sp3         65805
Klf6        62213
Ets2        61650
Bcl11a      58391
Elf1        57991
Egr1        53901
Klf10       53256
Klf4        52914
Klf8        52386
Klf11       52243
Klf12       47442
Fli1        45078
Irf4        44423
Zfp281      37661
Patz1       34443
Jun         25357
Klf5        25158
Atf3        24354
Egr4        24174
Jund        23787
Fosl1       23072
Junb        23033
Fosl2       23003
Cebpa       22321
Wt1         21668
Klf3        20289
Nfil3       18499
Fev         17694
Gabpa       16784
Irf2        16625
Irf1        16191
Stat1       15742
Egr2        15728
Stat2       15679
Irf7        14654
Spi1        14546
Bach2     

In [130]:
unique_conns.loc[unique_conns["TF_CON"] == "Cebpe", "Gene"].value_counts()

Hvcn1      931
Lta4h      629
Tor1a      427
Spcs2      410
Hk2        292
          ... 
Capza1      10
G0s2        10
Prom1        5
Noct         2
Pglyrp1      1
Name: Gene, Length: 107, dtype: int64

In [125]:
unique_conns["Gene"].value_counts()

Nlgn2     79629
Apobr     78979
Abcg1     71234
Ndst1     67344
Socs2     64247
          ...  
Tespa1        1
Ifi27         1
Cdkn3         1
Cstf3         1
Mcm3          1
Name: Gene, Length: 724, dtype: int64

In [131]:
cluster_conns["BMCP"].keys()

Index(['Gene', 'seqlet_idx', 'r', 'index', 'chr', 'start', 'end', 'peak',
       'score', 'pos', 'strand', 'pattern', 'in_modisco', 'seq_name',
       'dp_score', 'Cisbp2_TF', 'Cisbp2_TF_Family', 'TF_CON', 'con_score'],
      dtype='object')

In [134]:
pd.DataFrame(cluster_gexscores).to_csv("output/GRN/cluster_gex_scores.csv",
    header=True, index=True, index_label="UID")

In [137]:
for tmp_cluster in cluster_conns:
    cluster_conns[tmp_cluster].reset_index(drop=True).to_feather(\
        f"output/GRN/cluster_grn_connections/{tmp_cluster}_connections.fea")

In [136]:
unique_conns.reset_index(drop=True).to_feather(\
    "output/GRN/all_unique_connections.fea")

In [139]:
unique_conns.loc[unique_conns["Gene"] == "Cpa3"]

Unnamed: 0,Gene,seqlet_idx,r,index,chr,start,end,peak,score,pos,strand,pattern,in_modisco,seq_name,dp_score,Cisbp2_TF,Cisbp2_TF_Family,TF_CON
708170,Cpa3,12619184,0.405944,12619184,chr3,19320115,19320145,chr3:19319628-19320628,8.809033,488,+,MultiLin_1__pattern_2,False,chr3:19320115-19320145,0.702543,Cebpa,bZIP,Cebpa
708177,Cpa3,12619317,0.539525,12619317,chr3,19998516,19998546,chr3:19997865-19998865,9.594018,652,+,MultiLin_1__pattern_2,False,chr3:19998516-19998546,0.50142,Cebpa,bZIP,Cebpa
708223,Cpa3,14128525,0.514315,14128525,chr3,19143483,19143513,chr3:19143159-19144159,9.742616,325,+,MultiLin_2_Ms4a3__pattern_3,False,chr3:19143483-19143513,0.496359,Cebpa,bZIP,Cebpa
707873,Cpa3,984502,0.408167,984502,chr3,19143621,19143651,chr3:19143159-19144159,9.217225,463,+,CD127_MP__pattern_1,False,chr3:19143621-19143651,0.488866,Cebpb,bZIP,Cebpa
707877,Cpa3,984558,0.438608,984558,chr3,19406529,19406559,chr3:19406041-19407041,8.754275,489,+,CD127_MP__pattern_1,False,chr3:19406529-19406559,0.429369,Cebpb,bZIP,Cebpa
708117,Cpa3,10396097,0.405631,10396097,chr3,19438244,19438274,chr3:19437553-19438553,8.616581,692,-,ML_cell_cycle__pattern_2,False,chr3:19438244-19438274,0.418826,Cebpa,bZIP,Cebpa
707851,Cpa3,437501,0.497,437501,chr3,20017741,20017771,chr3:20017183-20018183,7.106138,559,+,BMCP__pattern_32,False,chr3:20017741-20017771,0.400711,Cebpa,bZIP,Cebpa
708225,Cpa3,14128556,0.466855,14128556,chr3,19260829,19260859,chr3:19260269-19261269,9.090455,561,-,MultiLin_2_Ms4a3__pattern_4,False,chr3:19260829-19260859,0.382224,Atf4,bZIP,Cebpa
708120,Cpa3,10396117,0.528565,10396117,chr3,19534765,19534795,chr3:19534592-19535592,9.137907,174,+,ML_cell_cycle__pattern_2,False,chr3:19534765-19534795,0.334548,Cebpa,bZIP,Cebpa
707870,Cpa3,437640,0.629785,437640,chr3,20273089,20273119,chr3:20272289-20273289,9.605664,801,+,BMCP__pattern_9,False,chr3:20273089-20273119,0.274468,Atf4,bZIP,Cebpa
