In [1]:
import os 
import pandas as pd
import numpy as np
import json
import scanpy as sc

# Work from the analysis directory so that relative paths used later in the
# notebook (e.g. "output/...") resolve against it.
os.chdir(
    "/Volumes/Kyle_T7_2/grimes_lab/analysis/"
    "2024_01_10_multilin_grn_visualization/"
)

In [2]:
# Directory holding the GRN output of the TEA-seq ATAC processing analysis.
path_grn_input = (
    "/Volumes/Kyle_T7_2/grimes_lab/analysis/"
    "2023_06_12_tea_seq_atac_processing/output/GRN/"
)

# Table of connections; later cells read its "pattern", "Gene" and "TF_CON"
# columns.
uconns_file = os.path.join(path_grn_input, "all_unique_connections.fea")
uconns = pd.read_feather(uconns_file)



In [3]:
# Pattern labels have the form "<cluster>__<pattern id>" (e.g.
# "ERP1__pattern_5"): keep the text before the first "__" and reduce it to the
# sorted array of distinct cluster names.
clusters_to_use = np.unique(
    [label.partition("__")[0] for label in uconns["pattern"].values])

In [5]:
# Load the CITE-seq RNA counts
path_cite_data = (
    "/Volumes/Kyle_T7_2/grimes_lab/data/"
    "2021_11_mouse_optimized_cite_seq/processed_files/"
)

adata_cite = sc.read(
    os.path.join(path_cite_data, "cite_seq_adata_rna_combined.h5ad"))

# Scale each cell by its total counts to 10,000, then apply log2(x + 1).
# The normalized matrix replaces adata_cite.X.
print("Computing CPTT normalized scRNA-seq from CITE-seq...")
counts_per_cell = adata_cite.X.sum(axis=1)
adata_cite.X = np.log2(
    (10000 * (adata_cite.X.T / counts_per_cell.T).T) + 1)

# Per-cell annotations, indexed by cell barcode
cite_cell_anno = pd.read_csv(
    os.path.join(path_cite_data, "cite_seq_cell_annotations.csv"))
cite_cell_anno.index = cite_cell_anno["Cell_Barcode"].values

# Build one mean-expression profile (centroid) per "sctri_cite" cluster,
# restricted to barcodes present in both the annotations and the AnnData.
print("Computing RNA centroids...")
shared_cells_cite = np.intersect1d(
    cite_cell_anno.index.values, adata_cite.obs.index.values)
cite_cell_anno = cite_cell_anno.loc[shared_cells_cite]

gene_names = adata_cite.var.index.values
cite_rna = {}
for tmp_cluster in cite_cell_anno["sctri_cite"].unique():
    print("\t{}...".format(tmp_cluster))
    in_cluster = cite_cell_anno["sctri_cite"] == tmp_cluster
    tmp_barcodes = cite_cell_anno.loc[in_cluster].index.values
    # .mean(axis=0) may return a 2-D matrix; flatten it to one value per gene
    cluster_mean = np.asarray(adata_cite[tmp_barcodes].X.mean(axis=0))
    cite_rna[tmp_cluster] = pd.Series(
        cluster_mean.reshape(-1), index=gene_names)

# Genes x clusters table, without the "Unknown" cluster
cite_rna = pd.DataFrame(cite_rna)
cite_rna = cite_rna.drop(columns="Unknown")

# Translate R7 v1 cluster names to their v2 equivalents (columns absent from
# the mapping keep their name), then replace "-" with "_" in every name.
map_r7_names = pd.read_csv(
    os.path.join(path_cite_data, "map_r7-v1_to_r7-v2_names.csv"))
map_r7_names_v1_to_v2 = pd.Series(
    map_r7_names["R7_V2"].values,
    index=map_r7_names["R7_V1"].values)

cite_rna = cite_rna.rename(columns=map_r7_names_v1_to_v2)
cite_rna.columns = [name.replace("-", "_") for name in cite_rna.columns]

Computing CPTT normalized scRNA-seq from CITE-seq...
Computing RNA centroids...
	ERP4--Ex...
	MultiLin-2_ML-c9--RNA...
	ERP2_Kit-c14--RNA...
	ERP2--Ex...
	Ebf1+_proB_CD127-c2--RNA...
	proNeu-1_1-4--ADT...
	Unknown...
	preNeu-2_1-8--ADT...
	cMoP_ML-c15--RNA...
	immNeu_Kit-c3--RNA...
	MP--Ex...
	proNeu-2_Kit-c4--RNA...
	Ebf1+_proB--Ex...
	ML_ERP1_ML-c4--RNA...
	MDP--Ex...
	preNeu-3_Kit-c2--RNA...
	ERP4_Kit-c13--RNA...
	preNeu-3_2-3--ADT...
	immNeu_2-5--ADT...
	preNeu-1_Kit-c5--RNA...
	immNeu_Kit-c1--RNA...
	cMoP--Ex...
	neoHPC_Myc--Ex...
	Bcl11b+_preETP--Ex...
	ML_Multi-Lin-2_HSC-c7--RNA...
	Ebf1+_proB_CD127-c8--RNA...
	IG2-proNeu1_ML-c14--RNA...
	ML_Mast--Ex...
	ML_cell_cycle_ML-c5--RNA...
	HSC-HPC-Cenpf_HSC-c12--RNA...
	MultiLin-2_ERP_ML-c10--RNA...
	ML_MDP_ML-c13--RNA...
	proNeu-1--Ex...
	SiglecH-Ly6C-pre-DC--Ex...
	HSC-HPC-Mki67--Ex...
	Pro-B_CD127-c3--RNA...
	precursor_B_cell_5-2--ADT...
	CLP1_CD127-c5--RNA...
	ML_MultiLin-1_ML-c8--RNA...
	ETP-A-0-Ccl4--Ex...
	ETP-CC-4-Ung--Ex...
	e

In [55]:
# Distinct target genes and distinct TFs referenced by the connection table,
# each in order of first appearance.
unique_genes, unique_tfs = (
    uconns[column].unique() for column in ("Gene", "TF_CON"))

In [61]:
from pyInfinityFlow.InfinityFlow_Utilities import pearson_corr_df_to_df

# Cluster-centroid expression of the TFs and of the target genes, transposed
# so that clusters are rows and TFs / genes are columns.
tf_profiles = cite_rna.loc[unique_tfs, clusters_to_use].T
gene_profiles = cite_rna.loc[unique_genes, clusters_to_use].T

# NOTE(review): presumably returns a TF x gene table of Pearson correlations
# across clusters (it is indexed as tf2gene.loc[TF, Gene] further down) --
# confirm against the pyInfinityFlow documentation.
# Rows containing any NaN are discarded.
tf2gene = pearson_corr_df_to_df(tf_profiles, gene_profiles).dropna()


In [66]:
# Annotate every connection with the correlation between its TF ("TF_CON")
# and its target gene ("Gene"), read from the tf2gene table.
#
# The row/column positions are resolved in bulk and the values gathered with a
# single NumPy fancy-indexing call; the previous iterrows() + per-row .loc
# lookup built a Series for every row of a multi-million-row table.
tf_positions = tf2gene.index.get_indexer(uconns["TF_CON"].to_numpy())
gene_positions = tf2gene.columns.get_indexer(uconns["Gene"].to_numpy())

# get_indexer() reports unknown labels as -1, which would silently select the
# last row/column below. Fail loudly instead, as the .loc lookup did.
unmatched = (tf_positions < 0) | (gene_positions < 0)
if unmatched.any():
    raise KeyError(
        "{} connection(s) reference a TF or gene missing from tf2gene, "
        "e.g. {}".format(
            int(unmatched.sum()),
            uconns.loc[unmatched, ["TF_CON", "Gene"]].iloc[0].tolist()))

uconns["corr"] = tf2gene.to_numpy()[tf_positions, gene_positions]

In [74]:
# A gene is kept when its best TF correlation exceeds this threshold.
# The SIF file written further down encodes the same value in its name
# ("..._corr_0_8.sif"), so keep the two in sync.
MIN_BEST_TF_CORR = 0.8

# Highest TF correlation per target gene: a one-column ("corr") frame indexed
# by gene. groupby().max() replaces pivot_table(aggfunc=np.max), which emits a
# FutureWarning on recent pandas, and drops the sort of the whole table that
# the aggregation never needed. dropna(how="all") mirrors pivot_table's
# default of discarding all-NaN rows.
gene_max_corr = (
    uconns.groupby("Gene")[["corr"]]
    .max()
    .dropna(how="all")
)

passes_filter = gene_max_corr["corr"] > MIN_BEST_TF_CORR
filtered_genes = gene_max_corr.index[passes_filter].values
len(filtered_genes)

463

In [75]:
# Keep only the connections whose target gene passed the correlation filter;
# .copy() makes the result independent of uconns.
keep_connection = uconns["Gene"].isin(filtered_genes)
uconns_filtered = uconns[keep_connection].copy()
uconns_filtered

Unnamed: 0,Gene,seqlet_idx,r,index,chr,start,end,peak,score,pos,strand,pattern,in_modisco,seq_name,dp_score,Cisbp2_TF,Cisbp2_TF_Family,TF_CON,corr
0,Ube2w,2459399,0.627987,2459399,chr1,16220342,16220372,chr1:16219841-16220841,5.955649,502,-,ERP1__pattern_5,False,chr1:16220342-16220372,0.300545,Atf6b,bZIP,Mlx,0.579458
1,Ube2w,2459399,0.627987,2459399,chr1,16220342,16220372,chr1:16219841-16220841,5.955649,502,-,ERP1__pattern_5,False,chr1:16220342-16220372,0.300545,Atf6b,bZIP,Usf1,0.601847
2,Ube2w,2459399,0.627987,2459399,chr1,16220342,16220372,chr1:16219841-16220841,5.955649,502,-,ERP1__pattern_5,False,chr1:16220342-16220372,0.300545,Atf6b,bZIP,Atf6b,0.735982
3,Ube2w,2459399,0.627987,2459399,chr1,16220342,16220372,chr1:16219841-16220841,5.955649,502,-,ERP1__pattern_5,False,chr1:16220342-16220372,0.300545,Atf6b,bZIP,Xbp1,0.872854
4,Ube2w,2459399,0.627987,2459399,chr1,16220342,16220372,chr1:16219841-16220841,5.955649,502,-,ERP1__pattern_5,False,chr1:16220342-16220372,0.300545,Atf6b,bZIP,Atf6,0.766628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3404572,Gprasp2,13340622,0.649025,13340622,chrX,135975335,135975365,chrX:135974847-135975847,5.678154,489,+,MultiLin_1_MEP__pattern_11,False,chrX:135975335-135975365,0.411284,Tfeb,bHLH,Hey1,0.724798
3404573,Gprasp2,13340622,0.649025,13340622,chrX,135975335,135975365,chrX:135974847-135975847,5.678154,489,+,MultiLin_1_MEP__pattern_11,False,chrX:135975335-135975365,0.411284,Tfeb,bHLH,Hes5,0.601851
3404574,Gprasp2,13340622,0.649025,13340622,chrX,135975335,135975365,chrX:135974847-135975847,5.678154,489,+,MultiLin_1_MEP__pattern_11,False,chrX:135975335-135975365,0.411284,Tfeb,bHLH,Arntl,0.505521
3404575,Gprasp2,13340622,0.649025,13340622,chrX,135975335,135975365,chrX:135974847-135975847,5.678154,489,+,MultiLin_1_MEP__pattern_11,False,chrX:135975335-135975365,0.411284,Tfeb,bHLH,Creb3l2,0.631822


In [82]:

# Export the distinct TF -> gene pairs of the filtered connections as a SIF
# file: tab-separated, no header, columns TF / interaction type / gene.
#
# Pairs are de-duplicated directly on the two columns (first occurrence kept,
# so the order matches the previous implementation). The earlier approach
# joined them into "TF:Gene" strings and split them again, which breaks on any
# name containing ":" and is needlessly slow.
sif_filtered = (
    uconns_filtered[["TF_CON", "Gene"]]
    .drop_duplicates()
    .rename(columns={"TF_CON": "TF"})
    .reset_index(drop=True)
)

# Kept for backward compatibility: the "TF:Gene" labels of the unique pairs.
unique_tf_to_gene_filtered = (
    sif_filtered["TF"] + ":" + sif_filtered["Gene"]).to_numpy()

# Every edge gets the same interaction type.
sif_filtered["Type"] = "pd"

# to_csv() does not create missing directories.
os.makedirs("output", exist_ok=True)
sif_filtered[["TF", "Type", "Gene"]].to_csv(
    "output/all_unique_connections_corr_0_8.sif",
    sep="\t", header=False, index=False)

In [9]:

# Export the distinct TF -> gene pairs of all connections as a SIF file:
# tab-separated, no header, columns TF / interaction type / gene.
#
# Pairs are de-duplicated directly on the two columns (first occurrence kept,
# so the order matches the previous implementation). The earlier approach
# joined them into "TF:Gene" strings and split them again, which breaks on any
# name containing ":" and is needlessly slow.
sif = (
    uconns[["TF_CON", "Gene"]]
    .drop_duplicates()
    .rename(columns={"TF_CON": "TF"})
    .reset_index(drop=True)
)

# Kept for backward compatibility: the "TF:Gene" labels of the unique pairs.
unique_tf_to_gene = (sif["TF"] + ":" + sif["Gene"]).to_numpy()

# Every edge gets the same interaction type.
sif["Type"] = "pd"

# to_csv() does not create missing directories.
os.makedirs("output", exist_ok=True)
sif[["TF", "Type", "Gene"]].to_csv(
    "output/all_unique_connections.sif",
    sep="\t", header=False, index=False)