In [1]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
import re
from gimmemotifs.motif import Motif,read_motifs

from pyInfinityFlow.InfinityFlow_Utilities import pearson_corr_df_to_df

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "Arial"
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

# Make the analysis directory the working directory: the relative "output/..."
# paths used by later cells are resolved against it.
# NOTE(review): hard-coded absolute path; the notebook only runs on a machine
# with this exact mount unless the path is edited.
os.chdir("/media/kyle_storage/kyle_ferchen/grimes_lab_main/analysis/"\
    "2023_06_12_tea_seq_atac_processing/")

INFO:matplotlib.font_manager:Failed to extract font properties from /usr/share/fonts/truetype/noto/NotoColorEmoji.ttf: In FT2Font: Can not load face (unknown file format; error code 0x2)


In [2]:
# Load the CITE-seq RNA AnnData object and the per-cell annotation table
path_cite_data = (
    "/media/kyle_storage/kyle_ferchen/grimes_lab_main/data/"
    "2021_11_mouse_optimized_cite_seq/processed_files/"
)

path_cite_h5ad = os.path.join(path_cite_data, "cite_seq_adata_rna_combined.h5ad")
adata_cite = sc.read(path_cite_h5ad)

print("Computing CPTT normalized scRNA-seq from CITE-seq...")
# Divide every row of X by its row sum, scale to 10,000 and take log2(x + 1)
cite_row_totals = adata_cite.X.sum(axis=1).T
adata_cite.X = np.log2((10000 * (adata_cite.X.T / cite_row_totals).T) + 1)

path_cite_anno = os.path.join(path_cite_data, "cite_seq_cell_annotations.csv")
cite_cell_anno = pd.read_csv(path_cite_anno)
# Index the annotation rows by barcode (the "Cell_Barcode" column is kept)
cite_cell_anno.index = cite_cell_anno["Cell_Barcode"].values

# Build one mean-expression vector (centroid) per "sctri_cite" cluster
print("Computing RNA centroids...")
# Restrict the annotation table to barcodes that are also in the AnnData object
shared_cells_cite = np.intersect1d(
    cite_cell_anno.index.values,
    adata_cite.obs.index.values,
)
cite_cell_anno = cite_cell_anno.loc[shared_cells_cite]

cite_gene_names = adata_cite.var.index.values
cite_rna = {}
for tmp_cluster in cite_cell_anno["sctri_cite"].unique():
    print("\t{}...".format(tmp_cluster))
    in_cluster = (cite_cell_anno["sctri_cite"] == tmp_cluster).values
    tmp_barcodes = cite_cell_anno.index.values[in_cluster]
    # Column-wise mean over the cluster's cells, flattened to 1-D
    tmp_centroid = np.asarray(adata_cite[tmp_barcodes].X.mean(axis=0)).reshape(-1)
    cite_rna[tmp_cluster] = pd.Series(tmp_centroid, index=cite_gene_names)

# Genes x clusters frame; the "Unknown" cluster is discarded
cite_rna = pd.DataFrame(cite_rna).drop(columns="Unknown")

# Lookup from R7 v1 cluster names to R7 v2 cluster names
path_r7_name_map = os.path.join(path_cite_data, "map_r7-v1_to_r7-v2_names.csv")
map_r7_names = pd.read_csv(path_r7_name_map)
map_r7_names_v1_to_v2 = pd.Series(
    data=map_r7_names["R7_V2"].values,
    index=map_r7_names["R7_V1"].values,
)

# Apply the v1 -> v2 names, then swap dashes for underscores in column labels
cite_rna = cite_rna.rename(columns=map_r7_names_v1_to_v2)
cite_rna.columns = [col.replace("-", "_") for col in cite_rna.columns]

Computing CPTT normalized scRNA-seq from CITE-seq...
Computing RNA centroids...
	ERP4--Ex...
	MultiLin-2_ML-c9--RNA...
	ERP2_Kit-c14--RNA...
	ERP2--Ex...
	Ebf1+_proB_CD127-c2--RNA...
	proNeu-1_1-4--ADT...
	Unknown...
	preNeu-2_1-8--ADT...
	cMoP_ML-c15--RNA...
	immNeu_Kit-c3--RNA...
	MP--Ex...
	proNeu-2_Kit-c4--RNA...
	Ebf1+_proB--Ex...
	ML_ERP1_ML-c4--RNA...
	MDP--Ex...
	preNeu-3_Kit-c2--RNA...
	ERP4_Kit-c13--RNA...
	preNeu-3_2-3--ADT...
	immNeu_2-5--ADT...
	preNeu-1_Kit-c5--RNA...
	immNeu_Kit-c1--RNA...
	cMoP--Ex...
	neoHPC_Myc--Ex...
	Bcl11b+_preETP--Ex...
	ML_Multi-Lin-2_HSC-c7--RNA...
	Ebf1+_proB_CD127-c8--RNA...
	IG2-proNeu1_ML-c14--RNA...
	ML_Mast--Ex...
	ML_cell_cycle_ML-c5--RNA...
	HSC-HPC-Cenpf_HSC-c12--RNA...
	MultiLin-2_ERP_ML-c10--RNA...
	ML_MDP_ML-c13--RNA...
	proNeu-1--Ex...
	SiglecH-Ly6C-pre-DC--Ex...
	HSC-HPC-Mki67--Ex...
	Pro-B_CD127-c3--RNA...
	precursor_B_cell_5-2--ADT...
	CLP1_CD127-c5--RNA...
	ML_MultiLin-1_ML-c8--RNA...
	ETP-A-0-Ccl4--Ex...
	ETP-CC-4-Ung--Ex...
	e

In [3]:
# Locate the CisBP2 reference files
path_cisbp = (
    "/media/kyle_storage/kyle_ferchen/grimes_lab_main/reference/"
    "cisbp2/Mus_musculus_2020_06_01_11_53_pm/"
)
path_pwms = os.path.join(path_cisbp, "pwms_all_motifs/")
path_cisbp_anno = os.path.join(path_cisbp, "TF_Information_all_motifs.txt")

# TF annotation table, without the rows whose Motif_ID is the "." placeholder
cisbp_anno = pd.read_table(path_cisbp_anno)
has_motif_id = cisbp_anno["Motif_ID"] != "."
cisbp_anno = cisbp_anno.loc[has_motif_id]

path_cisbp_motif_file = os.path.join(
    path_cisbp, "mouse_cisbp2_all_motif_ids_ppm_format.motif")
cisbp2_motifs = read_motifs(path_cisbp_motif_file)
cisbp2_names = np.array(list(map(str, cisbp2_motifs)))

# Keep only annotation rows whose Motif_ID equals the first two "_"-separated
# fields of some motif name read from the motif file
cisbp2_motif_ids = ["_".join(name.split("_")[:2]) for name in cisbp2_names]
cisbp_anno = cisbp_anno.loc[cisbp_anno["Motif_ID"].isin(cisbp2_motif_ids)]

# Unique (Motif_ID, TF_Name) pairs
map_id_to_tf = cisbp_anno[["Motif_ID", "TF_Name"]].drop_duplicates()

In [4]:
# Load one correlation table per modisco pattern (pattern vs. CisBP2 motifs)
path_to_cisbp_correlations = "output/chrombpnet/modisco_merged_results/fold_0/"\
    "correlate_modisco_to_cisbp2_motifs/"

# Keep real ".csv" files only (the extension check includes the dot) and skip
# hidden files
corr_files = [i for i in os.listdir(path_to_cisbp_correlations) if \
    i.endswith(".csv") and not i.startswith(".")]

# "<name>__pattern_<n>_<suffix>.csv" -> "<name>__pattern_<n>".
# The dot before "csv" is escaped so it only matches a literal ".".
modisco_pattern_re = re.compile(r'(\w+__pattern_[0-9]+)_\w+\.csv')

pattern_keys = []
for tmp_file in corr_files:
    tmp_match = modisco_pattern_re.search(tmp_file)
    if tmp_match is None:
        # Fail with the offending file name instead of a bare IndexError
        raise ValueError(
            f"Unexpected correlation file name (no '__pattern_<n>_' part): "
            f"{tmp_file}")
    pattern_keys.append(tmp_match.group(1))

corr_files = pd.Series(corr_files, index=pattern_keys)
if not corr_files.index.is_unique:
    # Two files for the same pattern would make corr_files[pattern] ambiguous
    dup_keys = corr_files.index[corr_files.index.duplicated()].unique().tolist()
    raise ValueError(
        f"More than one correlation file maps to the same pattern: {dup_keys}")

pat_corrs = {}
for tmp_pat, tmp_file in corr_files.items():
    tmp_corr = pd.read_csv(os.path.join(path_to_cisbp_correlations, tmp_file))
    # Split each "motif" value on its last underscore: the trailing field goes
    # to "motif_pattern", everything before it stays as the "motif" ID
    tmp_parts = [i.split("_") for i in tmp_corr["motif"].values]
    tmp_corr["motif_pattern"] = [parts[-1] for parts in tmp_parts]
    tmp_corr["motif"] = ["_".join(parts[:-1]) for parts in tmp_parts]
    pat_corrs[tmp_pat] = tmp_corr.set_index("motif")

In [5]:
# Inspect the motif matches of a single modisco pattern, highest r first
example_pattern_corrs = pat_corrs["BMCP__pattern_25"]
example_pattern_corrs.sort_values(by="r", ascending=False)

Unnamed: 0_level_0,pattern,offset,strand,r,motif_pattern
motif,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M08127_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,13,-,0.937167,rnAGATAAGrA
M09123_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,13,-,0.930855,nnAGATAAGA
M09121_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,13,-,0.928374,nsAGATAAGrn
M08126_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,12,-,0.925646,nnsAGATAAGr
M09120_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,13,-,0.915643,nnAGATAAGn
...,...,...,...,...,...
M02935_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,17,+,0.095082,nCCCCCCCAC
M00139_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,19,+,0.093277,CnwCCCCCmn
M08978_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,10,+,0.091463,nnCnnGCCCCGCCCCCnnnn
M00143_2.00,BMCP__pattern_25_nnnnnGTnCTTATCTGnnnnnnnnnnnnnn,18,+,0.089067,nCCCCCCnnn


In [6]:

# Build the patterns x motifs similarity matrix from the per-pattern tables.

# Every motif ID that appears in at least one pattern's table
unique_motifs = []
for tmp_pat in pat_corrs:
    unique_motifs += list(pat_corrs[tmp_pat].index.values)

unique_motifs = np.unique(unique_motifs)

# Initialise with float 0.0 (not integer 0): the cells are filled with float
# r values below, and writing floats into int64 columns is a deprecated silent
# upcast in recent pandas. Motifs absent from a pattern's table keep 0.0.
mo_similarities = pd.DataFrame(0.0, index=list(pat_corrs.keys()),
    columns=unique_motifs)
for tmp_pat, tmp_corr in pat_corrs.items():
    mo_similarities.loc[tmp_pat, tmp_corr.index.values] = tmp_corr["r"].values

mo_similarities

Unnamed: 0,M00111_2.00,M00112_2.00,M00113_2.00,M00114_2.00,M00115_2.00,M00116_2.00,M00117_2.00,M00118_2.00,M00119_2.00,M00120_2.00,...,M09648_2.00,M09649_2.00,M09650_2.00,M09651_2.00,M09652_2.00,M09653_2.00,M10469_2.00,M10570_2.00,M10767_2.00,M11049_2.00
HSCP_HPC_Cenpf__pattern_25,0.202201,0.240010,0.211908,0.257100,0.258435,0.185724,0.291062,0.311243,0.226841,0.318314,...,0.232542,0.352796,0.453523,0.349308,0.263264,0.258160,0.285762,0.302035,0.221077,0.186552
LT_HSC_Mllt3__pattern_8,0.413618,0.382552,0.361313,0.392718,0.077250,0.152473,0.322053,0.353005,0.282305,0.410082,...,0.317835,0.386771,0.383392,0.268356,0.385213,0.389015,0.250820,0.152642,0.192357,0.124400
MPP5_Flt3__pattern_1,0.497018,0.434836,0.437839,0.434691,0.164640,0.092760,0.417630,0.494317,0.322522,0.448245,...,0.353817,0.478664,0.291022,0.285284,0.273864,0.265764,0.337265,0.212127,0.137601,0.157886
BMCP__pattern_0,0.483937,0.384676,0.436416,0.427572,0.047236,0.086255,0.166355,0.294221,0.280555,0.169208,...,0.283571,0.358853,0.306188,0.127854,0.183160,0.161740,0.309193,0.101758,0.048325,0.090653
BMCP__pattern_10,0.273003,0.279414,0.300172,0.268997,0.259189,0.185422,0.643987,0.699773,0.900607,0.678878,...,0.524756,0.557052,0.309297,0.301051,0.226679,0.253312,0.326794,0.287368,0.248150,0.331538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HSCP_HPC_Cenpf__pattern_20,0.341943,0.322229,0.332862,0.376382,0.157247,0.163326,0.333682,0.318763,0.217711,0.368670,...,0.271803,0.360738,0.490038,0.245841,0.298795,0.309190,0.322025,0.209153,0.173870,0.207671
HSCP_HPC_Cenpf__pattern_21,0.288247,0.334039,0.281859,0.309338,0.317346,0.175181,0.413132,0.375163,0.217356,0.402259,...,0.357111,0.392018,0.281211,0.328510,0.498873,0.491448,0.369715,0.403050,0.187095,0.163493
HSCP_HPC_Cenpf__pattern_22,0.549329,0.612334,0.555310,0.599159,0.325205,0.169126,0.289043,0.365376,0.393517,0.340932,...,0.258803,0.247206,0.327207,0.296509,0.507785,0.529500,0.455119,0.419917,0.228951,0.227655
HSCP_HPC_Cenpf__pattern_23,0.306296,0.267893,0.258015,0.282877,0.231764,0.409715,0.371785,0.399047,0.225301,0.361275,...,0.231620,0.372803,0.337738,0.285202,0.347221,0.326198,0.239051,0.349767,0.202582,0.273310


In [7]:
# Cluster ordering table and the lookups derived from it
new_cluster_order = pd.read_csv(
    "/media/kyle_storage/kyle_ferchen/"
    "grimes_lab_main/data/2021_11_mouse_optimized_cite_seq/processed_files/"
    "r7_cluster_order_kf_2024_01.csv"
)

r7_cluster_names = new_cluster_order["Cluster"].values
r7_cluster_names_no_dash = [name.replace("-", "_") for name in r7_cluster_names]

# "Cluster" name -> "Level 3" label
map_r7_to_lv3 = pd.Series(
    data=new_cluster_order["Level 3"].values,
    index=r7_cluster_names,
)
# "Level 3" label -> "Order"
map_lvl3_to_order = pd.Series(
    data=new_cluster_order["Order"].values,
    index=new_cluster_order["Level 3"].values,
)
# Underscore-only spelling of a cluster name -> original "Cluster" name
map_r7_to_replace_dash = pd.Series(
    data=r7_cluster_names,
    index=r7_cluster_names_no_dash,
)
# Same content as map_lvl3_to_order ("Level 3" label -> "Order")
map_new_cluster_to_order = pd.Series(
    data=new_cluster_order["Order"].values,
    index=new_cluster_order["Level 3"].values,
)
new_cluster_order

Unnamed: 0,Cluster,Group,Old_order,CITE-to-TEA,Order,Level 1,Level 2,Level 3,Level Kairavee
0,LT-HSC_Mllt3,HSCP,1,LT-HSC_Mllt3,1.0,HSPC,HSC,qHSC,HSC
1,ST-HSC,HSCP,2,ST-HSC,2.0,HSPC,HSC,aHSC,HSC
2,MPP4-Hlf,HSCP,3,MPP4-Hlf,3.0,HSPC,MPP4,HSC-Ly,HSPC
3,MPP5-Egr1,HSCP,8,MPP5-Egr1,4.0,HSPC,MPP5,MPP5-IER,MPP5-IER
4,MPP5-Flt3,HSCP,7,MPP5-Flt3,5.0,HSPC,MPP5,MPP5 Ly-I,HSPC
...,...,...,...,...,...,...,...,...,...
83,ILC2,ILC,84,ILC2,84.0,ILC,ILC,ILC2,ILC
84,Bcl11b+_preETP_Cd3d,ILC,85,Bcl11b+_preETP_Cd3d,85.0,T cell,preETP,pre-ILC1-ILC3-NKP,preETP
85,Bcl11b+_preETP_Tdrd5,ILC,88,Bcl11b+_preETP_Tdrd5,86.0,T cell,preETP,pre-NKP,preETP
86,ILC1-ILC3-NKP,ILC,87,ILC1-ILC3-NKP,87.0,ILC,ILC,ILC1-ILC3-NKP,ILC


In [8]:
path_seqlet_dir = (
    "output/chrombpnet/modisco_merged_results/fold_0/"
    "redo_extract_seqlets/"
)

# Per-seqlet annotation table
seqlets = pd.read_feather(
    path_seqlet_dir + "all_seqlit_hits_above_modisco_min_anno.fea")

# Per-seqlet score table: indexed by its "index" column, cast to float32, and
# with column labels mapped first back to the dashed cluster names and then to
# their "Level 3" labels
dp_scores = (
    pd.read_feather(
        path_seqlet_dir + "all_seqlit_hits_above_modisco_min_dp_scores.fea")
    .set_index("index")
    .astype(np.float32)
    .rename(columns=map_r7_to_replace_dash.to_dict())
    .rename(columns=map_r7_to_lv3.to_dict())
)

In [9]:
# Gene <-> seqlet connections, one row per connection
gene_to_seqlets = pd.read_csv("output/chrombpnet/modisco_merged_results/"\
    "fold_0/redo_extract_seqlets/"\
    "correlate_seqlets_to_gene_expression_all_genes/sig_conns.csv")

# Attach the annotation of each connected seqlet column-wise. The looked-up
# rows are re-indexed 0..n-1 so they line up positionally with the default
# index that read_csv gave gene_to_seqlets.
gene_to_seqlets = pd.concat([\
    gene_to_seqlets,
    seqlets.loc[gene_to_seqlets["seqlet_idx"].values].reset_index(drop=True)],
        axis=1)

# Drop connections to negative patterns. The explicit .copy() makes the result
# an independent frame, so the column assignment below is unambiguous and does
# not trigger SettingWithCopyWarning.
gene_to_seqlets = gene_to_seqlets.loc[\
    ~gene_to_seqlets["pattern"].str.contains("__neg_pattern_", regex=False)]\
    .copy()

# Rename "<name>__pos_pattern_<n>" to "<name>__pattern_<n>" (literal replace)
gene_to_seqlets["pattern"] = gene_to_seqlets["pattern"].str.replace(\
    "__pos_pattern_", "__pattern_", regex=False)
gene_to_seqlets

Unnamed: 0,Gene,seqlet_idx,r,chr,start,end,peak,score,pos,strand,pattern,in_modisco,seq_name,dp_score
2,Mrpl15,2458694,0.547196,chr1,4540571,4540601,chr1:4540111-4541111,8.932149,461,-,ERP1__pattern_0,True,chr1:4540571-4540601,0.481389
3,Mrpl15,2807710,0.547052,chr1,4540571,4540601,chr1:4540111-4541111,8.829016,461,-,ERP2__pattern_0,True,chr1:4540571-4540601,0.405852
4,Mrpl15,3763246,0.547004,chr1,4540571,4540601,chr1:4540111-4541111,9.052924,461,+,HSCP_ERP1__pattern_1,False,chr1:4540571-4540601,0.083670
5,Mrpl15,9202943,0.547779,chr1,4540571,4540601,chr1:4540111-4541111,8.155797,461,+,MEP__pattern_3,False,chr1:4540571-4540601,0.157136
6,Mrpl15,9906506,0.547904,chr1,4540570,4540600,chr1:4540111-4541111,8.989227,460,+,MKP__pattern_3,False,chr1:4540570-4540600,0.196055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30813220,Hccs,14808102,0.618932,chrX,169062729,169062759,chrX:169062299-169063299,7.126749,431,+,pre_MultiLin_1__pattern_11,False,chrX:169062729-169062759,0.142939
30813221,Hccs,14808105,0.434179,chrX,169094790,169094820,chrX:169094308-169095308,10.263774,483,-,pre_MultiLin_1__pattern_4,False,chrX:169094790-169094820,0.406595
30813222,Hccs,15314097,0.441781,chrX,169094790,169094820,chrX:169094308-169095308,9.768803,483,+,pre_MultiLin_2__pattern_5,False,chrX:169094790-169094820,0.165522
30813223,Hccs,15796481,0.441943,chrX,169094786,169094816,chrX:169094308-169095308,10.079901,479,-,proNeu_1__pattern_7,False,chrX:169094786-169094816,0.222236


In [10]:
### TF x gene correlation matrix computed from the pseudobulk CITE-seq values
## Restrict to clusters that have modisco results (text before "__" in each
## pattern key)
pattern_cluster_names = [key.split("__")[0] for key in pat_corrs.keys()]
clusters_to_use = np.unique(pattern_cluster_names)

# TF names known to CisBP2 that also have an expression row
shared_tfs = np.intersect1d(
    map_id_to_tf["TF_Name"].unique(),
    cite_rna.index.values,
)

# Both inputs are transposed to clusters x features before correlating
tf_expression = cite_rna.loc[shared_tfs, clusters_to_use].T
target_expression = cite_rna.loc[
    gene_to_seqlets["Gene"].unique(), clusters_to_use].T

tf2gene = pearson_corr_df_to_df(tf_expression, target_expression).dropna()

tf2gene

Unnamed: 0,Mrpl15,Lypla1,Tcea1,Atp6v1h,4732440D04Rik,Rb1cc1,St18,Pcmtd1,Rrs1,Adhfe1,...,Tceanc,Tmsb4x,Tlr8,Tlr7,Prps2,Msl3,Arhgap6,Gm15261,Amelx,Hccs
AC168977.1,-0.207345,0.214908,0.310641,0.447806,0.100249,0.101964,-0.096922,0.254469,-0.264164,-0.008615,...,0.118304,0.223562,-0.190263,0.525339,0.252252,0.071206,0.532635,0.484260,0.096185,0.271224
Ahctf1,0.351291,-0.283189,0.721642,-0.395829,0.042972,-0.182254,-0.124383,-0.044344,0.245067,-0.047850,...,0.129361,-0.714817,-0.255081,-0.235827,-0.117408,0.580536,0.132948,0.077102,0.121676,0.456680
Ahr,-0.740351,-0.110981,-0.349024,0.424515,0.227347,0.653538,0.412167,0.820725,-0.752553,0.123100,...,0.510348,0.125170,-0.225358,0.090179,-0.244987,-0.515346,0.368094,0.170119,-0.042461,-0.491934
Aire,-0.663790,0.029738,-0.197959,0.366057,0.538739,0.709377,0.207697,0.789649,-0.731934,-0.235657,...,0.640141,-0.040788,-0.218868,0.102753,-0.377064,-0.353798,0.356167,0.160371,-0.075698,-0.337738
Alx1,-0.138140,0.128608,0.109710,-0.015732,-0.026644,0.148997,-0.102979,0.141999,-0.171809,-0.118312,...,0.260712,-0.007479,-0.064929,-0.067750,-0.030915,0.259122,0.122319,0.134667,0.762830,0.038794
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zscan20,-0.739673,0.189856,-0.152174,0.428758,0.495468,0.751963,0.489203,0.829072,-0.798257,-0.025336,...,0.796856,0.099543,-0.190500,0.121783,-0.270472,-0.274467,0.466787,0.231449,0.197603,-0.317767
Zscan26,-0.806257,-0.116374,-0.428609,0.402260,0.430397,0.836742,0.543816,0.901533,-0.828382,-0.029879,...,0.711038,0.006284,-0.157231,0.022427,-0.498710,-0.531891,0.301573,0.103166,-0.093738,-0.532424
Zscan29,-0.705735,0.153951,-0.426196,0.534282,0.415066,0.759371,0.145234,0.714268,-0.626279,-0.002207,...,0.509478,0.241187,-0.185488,-0.156000,-0.562612,-0.432589,0.207287,0.076315,-0.175622,-0.352948
Zscan4d,0.000730,0.228454,-0.148824,0.068063,0.229488,0.124778,0.482092,-0.087063,-0.078548,-0.118312,...,0.048984,0.049833,-0.064929,0.054471,0.126177,0.197613,-0.056416,-0.018415,-0.046150,0.067401


In [13]:
# Spot check: the 25 largest values in the "Il5ra" column, in ascending order
il5ra_tf_corr = tf2gene["Il5ra"]
il5ra_tf_corr.sort_values().tail(25)

Ascl2      0.306576
Runx1      0.312322
Zfp422     0.320923
Hmga2      0.330324
Zfp105     0.333526
Arid5a     0.333959
Tcf7       0.339182
Zbtb7c     0.341304
Batf3      0.358722
Foxn4      0.391349
Ikzf3      0.404335
Myb        0.406089
Foxd3      0.415712
Nr1i3      0.424621
Spi1       0.429388
Foxf2      0.436581
Grhl2      0.491049
Hnf1a      0.497140
Ikzf2      0.556903
Pax6       0.566752
Rxrb       0.572795
Cebpa      0.631248
Tfap2c     0.636350
Prrx2      0.675681
Zscan4d    0.675681
Name: Il5ra, dtype: float64

: 

In [11]:
# mo_sim_threshold = 0.75
# tf2gene_threshold = 0.4

# conn_results = {}
# unique_genes = gene_to_seqlets["Gene"].unique()
# for i, tmp_gene in enumerate(unique_genes):
#     print(f"Working on gene: {i+1}/{len(unique_genes)}...", flush=True, end='\r')
#     # Filter to seqlet connections for the given gene
#     seg_gene2seqlets = gene_to_seqlets.loc[gene_to_seqlets["Gene"] == tmp_gene]
#     # Pull out the unique patterns connected to that gene
#     seg_pats = seg_gene2seqlets["pattern"].unique()
#     # Pull out the motifs that match the patterns above given threshold
#     seg_mo_sims = mo_similarities.loc[seg_pats]
#     seg_mo = seg_mo_sims.columns.values[seg_mo_sims.max(axis=0) > mo_sim_threshold]

#     # Pull out the TFs with a PWM that match that pattern
#     seg_tf = map_id_to_tf.loc[\
#         map_id_to_tf["Motif_ID"].isin(seg_mo), "TF_Name"].unique()
#     seg_tf = np.intersect1d(seg_tf, tf2gene.index.values)

#     # Identify the TFs that correlate to the gene
#     seg_tf2gene = tf2gene.loc[seg_tf, tmp_gene]
#     seg_tf2gene = seg_tf2gene[seg_tf2gene > tf2gene_threshold]
#     seg_tf = seg_tf2gene.index.values

#     # Pull out the IDs for those TFs
#     seg_id2tf = map_id_to_tf.loc[map_id_to_tf["TF_Name"].isin(seg_tf)]
#     seg_id2tf = seg_id2tf.loc[seg_id2tf["Motif_ID"].isin(seg_mo)]

#     n_con = len(seg_id2tf['TF_Name'].unique())
#     # print(f"\tFound {n_con} TF-gene connections...")
#     if n_con > 0:
#         # Get the motif to TF names
#         seg_mo_sims = seg_mo_sims[seg_id2tf["Motif_ID"].unique()]
#         seg_mo_sims = seg_mo_sims.loc[seg_mo_sims.max(axis=1) > mo_sim_threshold]
#         seg_mo_sims = seg_mo_sims.unstack().reset_index()
#         seg_mo_sims.columns = ["Motif", "Pattern", "Motif_r"]
#         seg_mo_sims = seg_mo_sims.loc[seg_mo_sims["Motif_r"] > mo_sim_threshold]



#         new_conns = []
#         for tmp_tf in seg_id2tf["TF_Name"].unique():
#             # Reselect motif ids that map to the indicated TF
#             tmp_mo_ids = seg_id2tf.loc[\
#                 seg_id2tf["TF_Name"] == tmp_tf, "Motif_ID"].values
#             # Reselect patterns that map to those motif ids above
#             tmp_pats = seg_mo_sims.loc[\
#                 seg_mo_sims["Motif"].isin(tmp_mo_ids), "Pattern"].unique()
#             # Select the connections that match those patterns
#             tmp_new_conns = seg_gene2seqlets.loc[\
#                 seg_gene2seqlets["pattern"].isin(tmp_pats)].copy()
#             # Label the connection with the TF name
#             tmp_new_conns["TF_CON"] = tmp_tf
#             new_conns.append(tmp_new_conns)

#         new_conns = pd.concat(new_conns)

#     else:
#         seg_mo_sims = None
#         new_conns = None

#     conn_results[tmp_gene] = {\
#         "new_conns": new_conns,
#         "seg_mo_sims": seg_mo_sims,
#         "seg_id2tf": seg_id2tf,
#         "seg_tf2gene": seg_tf2gene}

In [12]:
# Link TFs to target genes. A gene <-> seqlet connection is attributed to a TF
# when BOTH conditions hold:
#   1. the seqlet's modisco pattern has similarity r > mo_sim_threshold to at
#      least one CisBP2 motif mapped to the TF, and
#   2. the TF-to-gene correlation in tf2gene is > tf2gene_threshold.
mo_sim_threshold = 0.75
tf2gene_threshold = 0.4

# One feather file per TF is written here; create the directory up front so
# the first to_feather call does not fail when it does not exist yet.
path_tf_links = "output/GRN/all_genes/tf_links_observed/"
os.makedirs(path_tf_links, exist_ok=True)

# TFs that have at least one motif and a row in the correlation matrix
list_tfs = map_id_to_tf["TF_Name"][map_id_to_tf["TF_Name"].isin(\
    tf2gene.index.values)].unique()

conn_results = []
for i, tmp_tf in enumerate(list_tfs):
    print(f"Working on TF {i+1}/{len(list_tfs)}...",
        end='\r', flush=True)

    # Get the motif names that Cisbp2 says map to this TF
    tmp_tf_ids = map_id_to_tf.loc[\
        map_id_to_tf["TF_Name"] == tmp_tf, 
        "Motif_ID"].unique()
    # Only motifs that have a column in mo_similarities can be looked up;
    # indexing with an unknown ID would raise KeyError. A TF left with no
    # motifs simply yields no patterns and an empty link table.
    tmp_tf_ids = mo_similarities.columns.intersection(tmp_tf_ids)

    # Get the modisco patterns that match the motif in similarity (correlation)
    tmp_pats = (mo_similarities[tmp_tf_ids] > mo_sim_threshold).sum(axis=1)
    tmp_pats = tmp_pats.loc[tmp_pats > 0].index.values

    # Get the list of genes that correlate to that TF (row looked up once)
    tmp_tf_corr = tf2gene.loc[tmp_tf]
    seg_gene_targets = tf2gene.columns.values[\
        (tmp_tf_corr > tf2gene_threshold).values]

    # Keep connections whose pattern matches the TF and whose gene correlates
    # with it. .copy() gives an independent frame so the column assignments
    # below do not act on a slice of gene_to_seqlets.
    seg_gene_to_seqlets = gene_to_seqlets.loc[\
        gene_to_seqlets["pattern"].isin(tmp_pats) \
        & gene_to_seqlets["Gene"].isin(seg_gene_targets)].copy()

    # Add the relevant TF information for the connection
    seg_gene_to_seqlets["TF_con"] = tmp_tf
    seg_gene_to_seqlets["TF_gene_corr"] = tmp_tf_corr.loc[\
        seg_gene_to_seqlets["Gene"].values].values

    conn_results.append(seg_gene_to_seqlets)
    # Feather needs a default index, hence the reset before writing
    seg_gene_to_seqlets.reset_index(drop=True).to_feather(\
        os.path.join(path_tf_links, f"{tmp_tf}_links.fea"))
    
# Stack the per-TF tables into one frame
conn_results = pd.concat(conn_results)

Working on TF 726/727...

In [13]:
# The TF loop cell above already ends with this concatenation, so on a normal
# top-to-bottom run conn_results is a DataFrame here and pd.concat would raise
# TypeError. Only concatenate when it is still the list of per-TF tables
# (e.g. the loop cell was interrupted before its last line).
if not isinstance(conn_results, pd.DataFrame):
    conn_results = pd.concat(conn_results)

In [14]:
# Pull out the links whose "TF_con" label is "Cebpe"
is_cebpe_link = conn_results["TF_con"] == "Cebpe"
test = conn_results.loc[is_cebpe_link]

In [17]:
# Number of link rows per gene in the Cebpe subset, printed in full
cebpe_gene_counts = test["Gene"].value_counts()
print(cebpe_gene_counts.to_string())

Tmem156          1700
0610040J01Rik    1657
Tmed2            1573
Hvcn1            1544
Orai1            1543
Arpc3            1479
Vps29            1363
Gm43409          1330
Gdpd3            1291
Arl6ip4          1289
Atp2a2           1250
Aldoa            1227
Sephs2           1164
Pgm2             1158
Sec61a1          1061
Lta4h            1047
Pcyox1l          1032
Idh2              999
Fes               999
Tm6sf1            999
Sec11a            969
Txndc11           969
Rpn1              966
Ncstn             901
Hal               896
Fcgr2b            894
Atf6              887
Ufc1              873
Btbd1             860
Rnpep             845
Snx4              837
Tmem205           810
Mpi               810
Fcgr3             806
Fcgr4             781
Cox5a             776
Prkcsh            771
Bace1             770
Cep164            770
Pgs1              764
Gm43569           756
Rab3d             753
Dad1              750
Scamp2            748
Ola1              746
Ccdc84    

: 