In [1]:
import os
import pandas as pd
import numpy as np
import re
import scanpy as sc
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import seaborn as sns
import matplotlib.pyplot as plt

os.chdir("/data/salomonis2/LabFiles/Kyle/Analysis/2023_06_12_tea_seq_atac_processing/")

In [2]:
## Helper functions
def pearson_corr_df_to_df(df1, df2):
    norm1 = df1 - df1.mean(axis=0)
    norm2 = df2 - df2.mean(axis=0)
    sqsum1 = (norm1**2).sum(axis=0)
    sqsum2 = (norm2**2).sum(axis=0)
    return((norm1.T @ norm2) / np.sqrt(sqsum1.apply(lambda x: x*sqsum2)))


def marker_finder(input_df, groups):
    """
    Function to find pearson correlation coefficient values and p-values for 
    the given data and groups for groups to test. The function will perform a 
    Pearson correlation of the input_df feature values to an "idealized" 
    group specific expression vector, where each observation in a given group
    is set to a value of 1, and the observations in other groups are set to 0.
    Arguments
    ---------
    input_df : pandas.DataFrame
        DataFrame with observations as index and features as columns (Required)
    
    groups : list[str]
        List-like of specified groups corresponding to observations from the 
        input_df. The order of groups should match the order in input_df.index
        (Required)
    Returns
    -------
    tuple (pandas.DataFrame, pandas.DataFrame)
        The first item in the tuple is a pandas.DataFrame containing the pearson
        correlation coefficient values for each marker to the idealized vector
        for each cluster.
        The second item is also a pandas.DataFrame, but contains the p-values 
        for each comparison.
    """
    ideal_vectors = pd.get_dummies(groups)
    ideal_vectors.index = input_df.index.values
    degrees_f = input_df.shape[0] - 2
    r_df = pearson_corr_df_to_df(input_df, ideal_vectors)
    t_df = r_df*np.sqrt(degrees_f) / np.sqrt(1-(r_df**2))
    p_df = t_df.applymap(lambda x: stats.t.sf(abs(x), df=degrees_f)*2)
    return((r_df, p_df))

In [3]:
# Read in the ATAC data
path_atac_data = "output/tea_r7_pseudobulk_from_pmat/"

peaks = pd.read_table(\
    os.path.join(path_atac_data, "r7_tea_cluster_peak_set_with_tss.bed"),
    header=None)
peaks.columns = ["chr", "start", "end", "name", "score", "strand"]

atac_counts = pd.read_csv(\
    os.path.join(path_atac_data, "r7_tea_pbulk_cpm_from_binary_pmat.csv"))

atac_counts


Unnamed: 0,IG2-proNeu1,ST-HSC,HSCP-HPC_Tk1,IG2-MP,MPP5-Egr1,HSCP-ERP1,MPP4-Hlf,MDP-Cpa3,MPP4-Nkx2-3,eHSC,...,immNeu-2,CHILP,alphaLP,immNeu-3,pre-cDC1_Xcr1,cMoP-Mki67,cKit-Mast,CLP2,pre-cDC2,cMoP-S100a4
0,0.406455,0.000000,0.103804,0.284799,0.000000,0.217408,0.00000,0.291983,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.310092,0.000000,0.070026,0.051544,0.000000,0.166050,0.07199,0.489317,0.091988,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.387690,0.041731,0.035438,0.368475,0.058635,0.217408,0.00000,0.343904,0.091988,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.443267,0.000000,0.000000,0.408561,0.058635,0.000000,0.00000,0.238124,0.000000,0.364004,...,0.0,0.0,0.0,0.0,0.0,3.881428,0.0,0.0,0.0,0.0
4,0.443267,0.000000,0.103804,0.485538,0.058635,0.112795,0.00000,0.238124,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757535,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
757536,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
757537,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
757538,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [4]:
# Filter to only the clusters with at least 25 cells
atac_r7_vcounts = pd.read_csv(\
    os.path.join(path_atac_data, "r7_cell_counts_tea_seq_merged_captures.csv"),
    index_col=0, header=None).iloc[:,0]
atac_r7_vcounts.index.name = None

clusters_to_use = atac_r7_vcounts[atac_r7_vcounts > 25].index.values

clusters_to_use

array(['IG2-proNeu1', 'ST-HSC', 'HSCP-HPC_Tk1', 'IG2-MP', 'MPP5-Egr1',
       'HSCP-ERP1', 'MPP4-Hlf', 'MDP-Cpa3', 'MPP4-Nkx2-3', 'eHSC',
       'HSCP-HPC_Hist1h2af', 'pre-MultiLin-1', 'MDP-Irf8', 'MEP',
       'MultiLin-2_F13a1', 'LT-HSC_Mllt3', 'eHSC-Pcna', 'HSCP-HPC_Cenpf',
       'HSCP-MKP', 'MPP5-Flt3', 'MultiLin-1', 'pre-MultiLin-2', 'BMCP',
       'proNeu-1', 'ML-cell-cycle', 'ERP1', 'MultiLin-1_MEP',
       'MultiLin-2_Ms4a3', 'CLP1-Rrm2', 'MKP', 'HSC-Mac_Fcna', 'CD127-MP',
       'ETP-CC-4', 'Baso', 'MultiLin-1_preBMCP', 'ERP2', 'Eosinophils',
       'ML-Mast', 'DN4-DP-trans_Hist1h3c', 'CLP1-Hist1h1c',
       'pre-cDC1_Egfl8', 'ST-HSC-CC-Mac-1', 'proNeu-1-ADT', 'immNeu-1',
       'MPP4-Ccr9', 'Ebf1+ proB_Hmga2', 'ETP-A-0'], dtype=object)

In [5]:
# Read in the CITE-seq RNA counts
path_cite_data = "/data/salomonis2/LabFiles/Kyle/Data/2021_11_mouse_optimized_cite_seq/processed_files/"

adata_cite = sc.read(os.path.join(\
    path_cite_data, 
    "cite_seq_adata_rna_combined.h5ad"))
print("Computing CPTT normalized scRNA-seq from CITE-seq...")
adata_cite.X = np.log2((10000 * (adata_cite.X.T / \
    adata_cite.X.sum(axis=1).T).T) + 1)
cite_cell_anno = pd.read_csv(os.path.join(\
    path_cite_data, 
    "cite_seq_cell_annotations.csv"))
cite_cell_anno.index = cite_cell_anno["Cell_Barcode"].values

cite_cell_anno

Computing CPTT normalized scRNA-seq from CITE-seq...


Unnamed: 0,Cell_Barcode,batch,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,identity,source,sctri_cite
AS_3CITE_Kit_AAACCCAAGAGAACCC-1,AS_3CITE_Kit_AAACCCAAGAGAACCC-1,cite,4595,31300.0,766.0,2.447284,Kit,other,ERP4--Ex
AS_3CITE_Kit_AAACCCAAGGACTAAT-1,AS_3CITE_Kit_AAACCCAAGGACTAAT-1,cite,2852,12667.0,266.0,2.099945,Kit,other,MultiLin-2_ML-c9--RNA
AS_3CITE_Kit_AAACCCACAACCAATC-1,AS_3CITE_Kit_AAACCCACAACCAATC-1,cite,5128,46822.0,1297.0,2.770065,Kit,other,ERP2_Kit-c14--RNA
AS_3CITE_Kit_AAACCCACAAGGCAAC-1,AS_3CITE_Kit_AAACCCACAAGGCAAC-1,cite,3114,38088.0,554.0,1.454526,Kit,other,ERP2--Ex
AS_3CITE_Kit_AAACCCACAGATTTCG-1,AS_3CITE_Kit_AAACCCACAGATTTCG-1,cite,2209,6442.0,160.0,2.483701,Kit,other,Ebf1+_proB_CD127-c2--RNA
...,...,...,...,...,...,...,...,...,...
AS_CITE_CD127_TTTGTTGGTCGTCGGT-1,AS_CITE_CD127_TTTGTTGGTCGTCGGT-1,cite,4356,26022.0,548.0,2.105910,CD127,other,cMoP--Ex
AS_CITE_CD127_TTTGTTGTCAAGGTGG-1,AS_CITE_CD127_TTTGTTGTCAAGGTGG-1,cite,4271,26772.0,287.0,1.072016,CD127,other,MP_ML-c12--RNA
AS_CITE_CD127_TTTGTTGTCCAACTGA-1,AS_CITE_CD127_TTTGTTGTCCAACTGA-1,cite,2816,9932.0,136.0,1.369311,CD127,other,CLP2_CD127-c10--RNA
AS_CITE_CD127_TTTGTTGTCCTTTGAT-1,AS_CITE_CD127_TTTGTTGTCCTTTGAT-1,cite,4817,28690.0,244.0,0.850471,CD127,other,MDP--Ex


In [6]:
# Make pseudobulk RNA counts from cite clusters
print("Computing RNA centroids...")
shared_cells_cite = np.intersect1d(cite_cell_anno.index.values, 
    adata_cite.obs.index.values)
cite_cell_anno = cite_cell_anno.loc[shared_cells_cite]
cite_rna = {}
for tmp_cluster in cite_cell_anno["sctri_cite"].unique():
    print("\t{}...".format(tmp_cluster))
    tmp_barcodes = cite_cell_anno.loc[\
        cite_cell_anno["sctri_cite"] == tmp_cluster].index.values
    cite_rna[tmp_cluster] = pd.Series(np.asarray(\
            adata_cite[tmp_barcodes].X.mean(axis=0)).reshape(-1),
            index=adata_cite.var.index.values)

cite_rna = pd.DataFrame(cite_rna)
cite_rna = cite_rna.drop("Unknown", axis=1)
map_r7_names = pd.read_csv(os.path.join(\
    path_cite_data, 
    "map_r7-v1_to_r7-v2_names.csv"))
map_r7_names_v1_to_v2 = pd.Series(\
    map_r7_names["R7_V2"].values,
    index = map_r7_names["R7_V1"].values)

cite_rna = cite_rna.rename(map_r7_names_v1_to_v2, axis=1)
cite_rna = cite_rna[clusters_to_use]

Computing RNA centroids...
	ERP4--Ex...
	MultiLin-2_ML-c9--RNA...
	ERP2_Kit-c14--RNA...
	ERP2--Ex...
	Ebf1+_proB_CD127-c2--RNA...
	proNeu-1_1-4--ADT...
	Unknown...
	preNeu-2_1-8--ADT...
	cMoP_ML-c15--RNA...
	immNeu_Kit-c3--RNA...
	MP--Ex...
	proNeu-2_Kit-c4--RNA...
	Ebf1+_proB--Ex...
	ML_ERP1_ML-c4--RNA...
	MDP--Ex...
	preNeu-3_Kit-c2--RNA...
	ERP4_Kit-c13--RNA...
	preNeu-3_2-3--ADT...
	immNeu_2-5--ADT...
	preNeu-1_Kit-c5--RNA...
	immNeu_Kit-c1--RNA...
	cMoP--Ex...
	neoHPC_Myc--Ex...
	Bcl11b+_preETP--Ex...
	ML_Multi-Lin-2_HSC-c7--RNA...
	Ebf1+_proB_CD127-c8--RNA...
	IG2-proNeu1_ML-c14--RNA...
	ML_Mast--Ex...
	ML_cell_cycle_ML-c5--RNA...
	HSC-HPC-Cenpf_HSC-c12--RNA...
	MultiLin-2_ERP_ML-c10--RNA...
	ML_MDP_ML-c13--RNA...
	proNeu-1--Ex...
	SiglecH-Ly6C-pre-DC--Ex...
	HSC-HPC-Mki67--Ex...
	Pro-B_CD127-c3--RNA...
	precursor_B_cell_5-2--ADT...
	CLP1_CD127-c5--RNA...
	ML_MultiLin-1_ML-c8--RNA...
	ETP-A-0-Ccl4--Ex...
	ETP-CC-4-Ung--Ex...
	eHSC-Pcna_HSC-c9--RNA...
	MPP4-Flt3_HSC-c6--RNA...
	MP

In [7]:
# Pull out the TSS positions from peaks
tss_loci = peaks.loc[peaks["name"].str.contains("TSS:")].copy()
tss_loci["gene"] = [i.split(":")[-1] for i in tss_loci["name"].values]
tss_loci = tss_loci[["gene", "chr", "start"]]
tss_loci["start"] = tss_loci["start"] + 500
tss_loci

Unnamed: 0,gene,chr,start
114,Xkr4,chr1,3671497
373,Mrpl15,chr1,4785709
382,Lypla1,chr1,4807822
404,Tcea1,chr1,4857813
442,Rgs20,chr1,5018734
...,...,...,...
751317,Uty,chrY,1245690
751334,Ddx3y,chrY,1286628
751345,Usp9y,chrY,1459781
757491,Gm21860,chrY,90755466


In [8]:
# Filter to expressed genes that have TSS annotation
genes_to_use = cite_rna.loc[cite_rna.sum(axis=1) > 0].index.values
genes_to_use = np.intersect1d(\
    genes_to_use,
    tss_loci["gene"].values)
len(genes_to_use)

15573

In [9]:
# Read in the TADs
tads = pd.read_table("input/GSE119347_BMHSC_TADs/GSE119347_BMHSC_TADs_mm10_liftover.bed", 
    header=None)
tads.columns = ["chr", "start", "end", "name", "score"]
tads

Unnamed: 0,chr,start,end,name,score
0,chr10,5881540,7341540,chr10:3005001-4465000,1
1,chr10,5321540,5881540,chr10:4465001-5025000,1
2,chr10,4401540,5321540,chr10:5025001-5945000,1
3,chr10,3961540,4401540,chr10:5945001-6385000,1
4,chr10,3100000,3961540,chr10:6385001-7265000,1
...,...,...,...,...,...
2511,chrX,162907068,164497068,chrX:159345001-160935000,1
2512,chrX,164497068,166137068,chrX:160935001-162575000,1
2513,chrX,166137068,167267068,chrX:162575001-163705000,1
2514,chrX,167267068,168537068,chrX:163705001-164975000,1


In [10]:
## Assign each atac peak a TAD
print("Assigning TADs to ATAC peaks...")
atac_tad_assignments = pd.Series([np.nan] * peaks.shape[0], 
    index=peaks.index.values)
for tmp_chr in tads["chr"].unique():
    print("\tWorking on {}...".format(tmp_chr))
    seg_tads = tads.loc[tads["chr"] == tmp_chr].copy()
    seg_peaks = peaks.loc[peaks["chr"] == tmp_chr].copy()
    for i, tmp_tad in seg_tads.iterrows():
        tmp_hits = ~((tmp_tad["start"] > seg_peaks["end"]) | \
            (tmp_tad["end"] < seg_peaks["start"]))
        tmp_hits = tmp_hits[tmp_hits].index.values
        if len(tmp_hits) > 0:
            atac_tad_assignments.loc[tmp_hits] = i

atac_tad_assignments = atac_tad_assignments[\
    ~atac_tad_assignments.isna()].astype(int)

peaks["tad"] = ""
peaks.loc[atac_tad_assignments.index.values, "tad"] = \
    (tads.loc[atac_tad_assignments.values, "chr"] + ":" + \
    tads.loc[atac_tad_assignments.values, "start"].astype(str) + "-" + \
    tads.loc[atac_tad_assignments.values, "end"].astype(str)).values

Assigning TADs to ATAC peaks...
	Working on chr10...
	Working on chr11...
	Working on chr12...
	Working on chr19...
	Working on chrX...
	Working on chr8...
	Working on chr13...
	Working on chr5...
	Working on chr2...
	Working on chr6...
	Working on chr14...
	Working on chr15...
	Working on chr18...
	Working on chr16...
	Working on chr17...
	Working on chr4...
	Working on chr1...
	Working on chr3...
	Working on chr7...
	Working on chr9...
	Working on chrX_GL456233_random...
	Working on chrY...


In [11]:
# Assign TADs to tss start sites
print("Assigning TADs to start sites of variably expressed genes...")
tss_tad_assignments = pd.Series([np.nan] * tss_loci.shape[0], 
    index=tss_loci.index.values)
for tmp_chr in tads["chr"].unique():
    print("\tWorking on {}...".format(tmp_chr))
    seg_tads = tads.loc[tads["chr"] == tmp_chr].copy()
    seg_starts = tss_loci.loc[tss_loci["chr"] == tmp_chr].copy()
    for i, tmp_tad in seg_tads.iterrows():
        tmp_hits = ~((tmp_tad["start"] > seg_starts["start"]) | \
            (tmp_tad["end"] < seg_starts["start"]))
        tmp_hits = tmp_hits[tmp_hits].index.values
        if len(tmp_hits) > 0:
            tss_tad_assignments.loc[tmp_hits] = i

tss_tad_assignments = tss_tad_assignments[\
    ~tss_tad_assignments.isna()].astype(int)

tss_loci["tad"] = ""
tss_loci.loc[tss_tad_assignments.index.values, "tad"] = \
    (tads.loc[tss_tad_assignments.values, "chr"] + ":" + \
    tads.loc[tss_tad_assignments.values, "start"].astype(str) + "-" + \
    tads.loc[tss_tad_assignments.values, "end"].astype(str)).values

tss_loci.index = tss_loci["gene"]
tss_loci = tss_loci.loc[genes_to_use]

Assigning TADs to start sites of variably expressed genes...
	Working on chr10...
	Working on chr11...
	Working on chr12...
	Working on chr19...
	Working on chrX...
	Working on chr8...
	Working on chr13...
	Working on chr5...
	Working on chr2...
	Working on chr6...
	Working on chr14...
	Working on chr15...
	Working on chr18...
	Working on chr16...
	Working on chr17...
	Working on chr4...
	Working on chr1...
	Working on chr3...
	Working on chr7...
	Working on chr9...
	Working on chrX_GL456233_random...
	Working on chrY...


In [12]:
## Correlate ATAC with gene expression within TADs
print("Finding correlations between peaks and genes in TADs...")
corr_dict = {}
tads_without_peaks = []
for tmp_tad in tss_loci["tad"].unique():
    if tmp_tad != "":
        seg_gene_anno = tss_loci.loc[tss_loci["tad"] == tmp_tad]
        tmp_rna = cite_rna.loc[seg_gene_anno.index.values].T
        tmp_atac = atac_counts.loc[(peaks["tad"] == tmp_tad).values, cite_rna.columns.values].T
        if tmp_atac.shape[1] > 0:
            print("\tWorking on tad {} containing {} genes and {} peaks..."\
                .format(tmp_tad, tmp_rna.shape[1], tmp_atac.shape[1]))
            r_df = pearson_corr_df_to_df(tmp_rna, tmp_atac)
            degrees_f = cite_rna.shape[1] - 2
            t_df = r_df*np.sqrt(degrees_f) / np.sqrt(1-(r_df**2))
            p_df = t_df.applymap(lambda x: stats.t.sf(abs(x), df=degrees_f)*2)
            corr_res = pd.concat([r_df.stack(), p_df.stack()], axis=1).reset_index()
            corr_res.iloc[:,1] = (peaks.loc[corr_res.iloc[:,1].values,"chr"] + ":" + \
                peaks.loc[corr_res.iloc[:,1].values,"start"].astype(str) + "-" + \
                peaks.loc[corr_res.iloc[:,1].values,"end"].astype(str)).values
            corr_res.columns = ["gene", "peak", "r", "p"]
            corr_dict[tmp_tad] = {}
            corr_dict[tmp_tad]["r_df"] = r_df
            corr_dict[tmp_tad]["p_df"] = p_df
            corr_dict[tmp_tad]["sig_conns"] = corr_res.loc[corr_res["p"] < 0.001]
        else:
            tads_without_peaks.append(tmp_tad)

Finding correlations between peaks and genes in TADs...
	Working on tad chr11:51591498-52401498 containing 19 genes and 371 peaks...
	Working on tad chr11:119273686-120403686 containing 22 genes and 608 peaks...
	Working on tad chr11:21574997-23995000 containing 16 genes and 881 peaks...
	Working on tad chr11:69611498-70881498 containing 69 genes and 644 peaks...
	Working on tad chr16:31924914-34144914 containing 37 genes and 1073 peaks...
	Working on tad chr6:71835006-72355006 containing 7 genes and 262 peaks...
	Working on tad chr8:77371101-77561101 containing 2 genes and 87 peaks...
	Working on tad chr2:163439264-163959264 containing 8 genes and 261 peaks...
	Working on tad chr6:108045006-108825006 containing 7 genes and 344 peaks...
	Working on tad chr5:63783761-67253761 containing 31 genes and 1549 peaks...
	Working on tad chr4:101152395-101672395 containing 8 genes and 260 peaks...
	Working on tad chr5:109475981-110195981 containing 15 genes and 212 peaks...
	Working on tad chr12

	Working on tad chr18:37935346-38925346 containing 10 genes and 506 peaks...
	Working on tad chr4:128547756-129137756 containing 11 genes and 276 peaks...
	Working on tad chr19:8650510-9160510 containing 27 genes and 246 peaks...
	Working on tad chr16:96033393-98093835 containing 16 genes and 888 peaks...
	Working on tad chr10:76232255-77952255 containing 21 genes and 797 peaks...
	Working on tad chr6:29715000-30395000 containing 7 genes and 322 peaks...
	Working on tad chr4:136199085-138389085 containing 30 genes and 1111 peaks...
	Working on tad chr11:109203686-110343686 containing 16 genes and 457 peaks...
	Working on tad chr17:29558055-30968055 containing 10 genes and 716 peaks...
	Working on tad chr13:102654916-103134920 containing 4 genes and 185 peaks...
	Working on tad chr9:107552669-108652669 containing 52 genes and 562 peaks...
	Working on tad chr14:122175778-122925778 containing 5 genes and 262 peaks...
	Working on tad chr14:61296163-62476163 containing 11 genes and 443 peak

	Working on tad chr1:118278423-119448423 containing 6 genes and 539 peaks...
	Working on tad chr7:79960114-82656490 containing 39 genes and 1125 peaks...
	Working on tad chr5:143984131-144344131 containing 6 genes and 179 peaks...
	Working on tad chr2:162889264-163439264 containing 7 genes and 271 peaks...
	Working on tad chr6:148176478-148946478 containing 7 genes and 321 peaks...
	Working on tad chr16:13064907-13684907 containing 5 genes and 278 peaks...
	Working on tad chr1:119998423-120618423 containing 4 genes and 281 peaks...
	Working on tad chr4:108682395-109242395 containing 8 genes and 194 peaks...
	Working on tad chr1:169474869-170184869 containing 6 genes and 196 peaks...
	Working on tad chr13:118276193-119495505 containing 6 genes and 300 peaks...
	Working on tad chr5:138729046-139799046 containing 17 genes and 571 peaks...
	Working on tad chr18:37415346-37935346 containing 33 genes and 212 peaks...
	Working on tad chr12:99056790-99906790 containing 5 genes and 397 peaks...

	Working on tad chr7:111981486-114841486 containing 16 genes and 1058 peaks...
	Working on tad chr7:82656490-83556490 containing 2 genes and 235 peaks...
	Working on tad chr10:18255194-20315194 containing 14 genes and 826 peaks...
	Working on tad chr14:24475778-27531814 containing 23 genes and 1299 peaks...
	Working on tad chr11:43391498-43771498 containing 6 genes and 185 peaks...
	Working on tad chr8:27024528-27304528 containing 6 genes and 137 peaks...
	Working on tad chr8:121921100-123571100 containing 40 genes and 894 peaks...
	Working on tad chr7:142749095-143539095 containing 12 genes and 335 peaks...
	Working on tad chr2:90414843-90834843 containing 5 genes and 197 peaks...
	Working on tad chr3:30556078-31126078 containing 9 genes and 250 peaks...
	Working on tad chr4:62293966-63353966 containing 14 genes and 509 peaks...
	Working on tad chr3:107582082-108182082 containing 14 genes and 300 peaks...
	Working on tad chr9:74867193-75647193 containing 11 genes and 353 peaks...
	Wor

	Working on tad chr8:47459646-48149646 containing 8 genes and 273 peaks...
	Working on tad chr2:122029264-122789264 containing 13 genes and 315 peaks...
	Working on tad chr1:130438423-131098423 containing 8 genes and 260 peaks...
	Working on tad chr2:34669480-35189480 containing 10 genes and 248 peaks...
	Working on tad chr17:27018055-28198055 containing 20 genes and 627 peaks...
	Working on tad chr4:53012128-53712128 containing 4 genes and 286 peaks...
	Working on tad chr8:34151946-37121946 containing 10 genes and 1077 peaks...
	Working on tad chr3:105982082-107582082 containing 17 genes and 494 peaks...
	Working on tad chr11:82731498-83301498 containing 17 genes and 247 peaks...
	Working on tad chr5:31532627-32202627 containing 6 genes and 279 peaks...
	Working on tad chr2:128609264-130289264 containing 19 genes and 645 peaks...
	Working on tad chr13:21143131-22083131 containing 33 genes and 330 peaks...
	Working on tad chr7:74350114-75890114 containing 5 genes and 543 peaks...
	Work

	Working on tad chr2:57962589-58692589 containing 6 genes and 208 peaks...
	Working on tad chr2:47959480-48879480 containing 2 genes and 224 peaks...
	Working on tad chr12:85094050-85764050 containing 13 genes and 330 peaks...
	Working on tad chr2:120319264-121469264 containing 29 genes and 509 peaks...
	Working on tad chr9:69987193-71177193 containing 11 genes and 533 peaks...
	Working on tad chr11:101623686-103073686 containing 32 genes and 687 peaks...
	Working on tad chr11:45641498-46731498 containing 11 genes and 443 peaks...
	Working on tad chr14:64596163-66326163 containing 20 genes and 750 peaks...
	Working on tad chr12:81504013-82534013 containing 7 genes and 472 peaks...
	Working on tad chr14:68566943-69156943 containing 2 genes and 115 peaks...
	Working on tad chr8:24524528-25204528 containing 8 genes and 239 peaks...
	Working on tad chr12:80944013-81504013 containing 8 genes and 169 peaks...
	Working on tad chr16:85764755-87314755 containing 1 genes and 271 peaks...
	Workin

	Working on tad chr16:4964907-5234907 containing 6 genes and 146 peaks...
	Working on tad chr15:89594569-90234569 containing 1 genes and 129 peaks...
	Working on tad chr8:21974528-22124528 containing 2 genes and 65 peaks...
	Working on tad chr4:47422128-48482128 containing 7 genes and 340 peaks...
	Working on tad chr3:54291078-54801078 containing 5 genes and 173 peaks...
	Working on tad chr5:135879130-136189130 containing 11 genes and 150 peaks...
	Working on tad chr7:30229981-31189981 containing 48 genes and 487 peaks...
	Working on tad chr6:115814982-116634982 containing 11 genes and 295 peaks...
	Working on tad chr18:65265346-65805346 containing 4 genes and 259 peaks...
	Working on tad chr1:58388156-59678156 containing 20 genes and 565 peaks...
	Working on tad chr9:109862486-111332496 containing 28 genes and 745 peaks...
	Working on tad chr11:120403686-121982542 containing 47 genes and 729 peaks...
	Working on tad chr1:170864869-172374869 containing 48 genes and 690 peaks...
	Workin

	Working on tad chr8:4795000-9205000 containing 2 genes and 942 peaks...
	Working on tad chr18:6035002-6575002 containing 4 genes and 252 peaks...
	Working on tad chr2:43659480-46519480 containing 5 genes and 639 peaks...
	Working on tad chr7:122951486-123521486 containing 6 genes and 252 peaks...
	Working on tad chr10:26275194-27045194 containing 2 genes and 283 peaks...
	Working on tad chr19:41080510-42200510 containing 18 genes and 554 peaks...
	Working on tad chr9:51316895-51916895 containing 1 genes and 107 peaks...
	Working on tad chr2:20803373-21313373 containing 4 genes and 194 peaks...
	Working on tad chr14:33031814-33551814 containing 3 genes and 239 peaks...
	Working on tad chr5:102155981-103305981 containing 1 genes and 274 peaks...
	Working on tad chr18:38925346-40225346 containing 5 genes and 471 peaks...
	Working on tad chr17:67735660-68285660 containing 1 genes and 201 peaks...
	Working on tad chr9:31457415-33277415 containing 7 genes and 685 peaks...
	Working on tad ch

	Working on tad chr12:101896790-102776790 containing 12 genes and 458 peaks...
	Working on tad chr14:13922486-14122486 containing 3 genes and 93 peaks...
	Working on tad chr10:112197944-115167944 containing 2 genes and 473 peaks...
	Working on tad chr13:51639631-53109631 containing 8 genes and 677 peaks...
	Working on tad chr5:130919130-131489130 containing 1 genes and 100 peaks...
	Working on tad chr2:112164843-112784843 containing 7 genes and 224 peaks...
	Working on tad chr6:56705006-57935006 containing 9 genes and 333 peaks...
	Working on tad chrX:99699661-100499661 containing 3 genes and 85 peaks...
	Working on tad chr11:108443686-109203686 containing 2 genes and 188 peaks...
	Working on tad chr9:118025882-118535882 containing 4 genes and 194 peaks...
	Working on tad chr5:116034991-117054991 containing 3 genes and 260 peaks...
	Working on tad chr2:101444843-101654843 containing 3 genes and 63 peaks...
	Working on tad chr10:11365202-13005194 containing 3 genes and 488 peaks...
	Wor

	Working on tad chr5:43403761-43653761 containing 1 genes and 71 peaks...
	Working on tad chr6:124414982-124804982 containing 11 genes and 171 peaks...
	Working on tad chr6:122804982-124414982 containing 13 genes and 404 peaks...
	Working on tad chr17:83255660-83855660 containing 6 genes and 286 peaks...
	Working on tad chr11:67881498-68441498 containing 5 genes and 235 peaks...
	Working on tad chr9:95934581-96204581 containing 6 genes and 114 peaks...
	Working on tad chr2:33569480-34669480 containing 5 genes and 446 peaks...
	Working on tad chr4:94548309-94998309 containing 7 genes and 127 peaks...
	Working on tad chr14:59136163-60086163 containing 9 genes and 302 peaks...
	Working on tad chr10:75512255-76012255 containing 16 genes and 250 peaks...
	Working on tad chr4:100562395-101152395 containing 2 genes and 156 peaks...
	Working on tad chr2:24279480-24829480 containing 2 genes and 244 peaks...
	Working on tad chr6:118574982-119114982 containing 1 genes and 177 peaks...
	Working on

	Working on tad chr12:117366527-118896527 containing 4 genes and 589 peaks...
	Working on tad chr8:106201100-107601100 containing 21 genes and 654 peaks...
	Working on tad chr18:16246491-19866499 containing 1 genes and 835 peaks...
	Working on tad chr10:60272252-61142252 containing 7 genes and 419 peaks...
	Working on tad chr2:178350295-179860295 containing 4 genes and 597 peaks...
	Working on tad chr13:17653166-18033166 containing 5 genes and 131 peaks...
	Working on tad chr5:4935000-5785000 containing 5 genes and 283 peaks...
	Working on tad chr1:131888423-132508423 containing 9 genes and 320 peaks...
	Working on tad chr11:80421498-80891498 containing 4 genes and 199 peaks...
	Working on tad chr4:70173966-70653966 containing 2 genes and 140 peaks...
	Working on tad chr4:109412395-110052395 containing 4 genes and 191 peaks...
	Working on tad chr9:21287277-21293472 containing 2 genes and 5 peaks...
	Working on tad chr9:34417415-35637415 containing 13 genes and 442 peaks...
	Working on 

	Working on tad chr6:45015001-47155001 containing 1 genes and 328 peaks...
	Working on tad chr8:126421100-126981100 containing 5 genes and 293 peaks...
	Working on tad chr11:11994997-16464997 containing 1 genes and 999 peaks...
	Working on tad chr2:64926943-66266943 containing 7 genes and 399 peaks...
	Working on tad chr14:75055193-75765193 containing 4 genes and 246 peaks...
	Working on tad chr3:53011078-53521078 containing 6 genes and 145 peaks...
	Working on tad chr7:121661486-122131486 containing 6 genes and 174 peaks...
	Working on tad chr11:88883686-89073686 containing 4 genes and 85 peaks...
	Working on tad chr10:33895194-34425194 containing 6 genes and 217 peaks...
	Working on tad chr9:79717193-80257193 containing 7 genes and 237 peaks...
	Working on tad chr15:55053445-56073445 containing 8 genes and 395 peaks...
	Working on tad chr3:130092082-130702082 containing 1 genes and 178 peaks...
	Working on tad chr1:44198155-45778155 containing 3 genes and 245 peaks...
	Working on tad

	Working on tad chr13:8825754-9855754 containing 7 genes and 390 peaks...
	Working on tad chr14:99055781-99915781 containing 3 genes and 246 peaks...
	Working on tad chr12:31010135-31430135 containing 3 genes and 143 peaks...
	Working on tad chr14:23565778-24475778 containing 1 genes and 185 peaks...
	Working on tad chr17:69505660-70325660 containing 1 genes and 138 peaks...
	Working on tad chr18:49705346-50195346 containing 4 genes and 193 peaks...
	Working on tad chr6:72605006-73305006 containing 9 genes and 303 peaks...
	Working on tad chr1:45778155-46558155 containing 4 genes and 174 peaks...
	Working on tad chr10:59282252-59882252 containing 4 genes and 250 peaks...
	Working on tad chr5:29698460-30168460 containing 7 genes and 192 peaks...
	Working on tad chr12:44154013-44644013 containing 3 genes and 163 peaks...
	Working on tad chr2:80294843-80754843 containing 4 genes and 171 peaks...
	Working on tad chr2:105444843-106144843 containing 3 genes and 161 peaks...
	Working on tad c

	Working on tad chr13:95495045-95975045 containing 3 genes and 239 peaks...
	Working on tad chrX:95919661-96899661 containing 4 genes and 156 peaks...
	Working on tad chr4:154840891-155370891 containing 11 genes and 270 peaks...
	Working on tad chr11:115173686-115343686 containing 4 genes and 87 peaks...
	Working on tad chr4:3987853-6187853 containing 2 genes and 525 peaks...
	Working on tad chr19:24240510-24890510 containing 6 genes and 295 peaks...
	Working on tad chrX:52891823-53391823 containing 6 genes and 162 peaks...
	Working on tad chr14:62476163-63026163 containing 4 genes and 197 peaks...
	Working on tad chr8:9205000-9935000 containing 1 genes and 179 peaks...
	Working on tad chr2:83574843-83974843 containing 4 genes and 133 peaks...
	Working on tad chr1:94108423-95808423 containing 2 genes and 325 peaks...
	Working on tad chr10:53605194-53785194 containing 1 genes and 63 peaks...
	Working on tad chr5:45443761-46383761 containing 6 genes and 197 peaks...
	Working on tad chr7:

	Working on tad chr4:14477853-15867853 containing 4 genes and 289 peaks...
	Working on tad chr11:99323686-100313686 containing 2 genes and 235 peaks...
	Working on tad chr11:33155000-33695000 containing 4 genes and 177 peaks...
	Working on tad chr11:45061498-45641498 containing 1 genes and 99 peaks...
	Working on tad chr11:58321498-58861498 containing 4 genes and 168 peaks...
	Working on tad chr4:34917751-35397751 containing 3 genes and 156 peaks...
	Working on tad chr4:102172395-103382395 containing 8 genes and 416 peaks...
	Working on tad chr4:103382395-104432395 containing 1 genes and 147 peaks...
	Working on tad chr2:35979480-36309480 containing 6 genes and 153 peaks...
	Working on tad chr2:49749480-50299480 containing 4 genes and 183 peaks...
	Working on tad chr6:31035000-31525000 containing 7 genes and 223 peaks...
	Working on tad chrX:93009661-94189661 containing 5 genes and 321 peaks...
	Working on tad chr7:102476486-104676486 containing 22 genes and 568 peaks...
	Working on ta

	Working on tad chr15:93244569-93594569 containing 5 genes and 176 peaks...
	Working on tad chr6:100575006-101635006 containing 4 genes and 314 peaks...
	Working on tad chr10:45045194-46035194 containing 3 genes and 287 peaks...
	Working on tad chr12:31940135-32310135 containing 3 genes and 161 peaks...
	Working on tad chr13:117336193-118276193 containing 1 genes and 206 peaks...
	Working on tad chr1:91898423-92478423 containing 2 genes and 272 peaks...
	Working on tad chrX:102199661-102689661 containing 2 genes and 66 peaks...
	Working on tad chr12:33440135-35210328 containing 2 genes and 482 peaks...
	Working on tad chr10:30515194-31355194 containing 5 genes and 232 peaks...
	Working on tad chr19:36070510-36830510 containing 3 genes and 320 peaks...
	Working on tad chr19:38390510-38970510 containing 5 genes and 221 peaks...
	Working on tad chr6:3055000-3515000 containing 3 genes and 186 peaks...
	Working on tad chr5:16279182-16899182 containing 1 genes and 184 peaks...
	Working on ta

	Working on tad chr1:4314919-4834919 containing 2 genes and 183 peaks...
	Working on tad chr8:69111101-69231101 containing 2 genes and 57 peaks...
	Working on tad chrX:105099661-105749661 containing 1 genes and 58 peaks...
	Working on tad chr6:93585006-94495006 containing 1 genes and 317 peaks...
	Working on tad chrX:70591825-71351825 containing 2 genes and 93 peaks...
	Working on tad chrX:16397874-16877874 containing 2 genes and 43 peaks...
	Working on tad chr8:125141100-125911100 containing 3 genes and 371 peaks...
	Working on tad chr11:110343686-110503686 containing 1 genes and 69 peaks...
	Working on tad chrX:159407068-160137068 containing 3 genes and 230 peaks...
	Working on tad chr17:80585660-81105660 containing 3 genes and 233 peaks...
	Working on tad chr18:73895346-74215346 containing 2 genes and 132 peaks...
	Working on tad chr8:65131101-66531101 containing 2 genes and 286 peaks...
	Working on tad chr15:25995245-26475245 containing 1 genes and 112 peaks...
	Working on tad chr1

	Working on tad chr17:83855660-84515660 containing 3 genes and 330 peaks...
	Working on tad chr13:110384792-111424792 containing 1 genes and 253 peaks...
	Working on tad chr9:92110162-94724581 containing 5 genes and 574 peaks...
	Working on tad chr4:104932395-106202395 containing 2 genes and 353 peaks...
	Working on tad chr3:82901078-83261078 containing 1 genes and 100 peaks...
	Working on tad chrX:75699661-76309661 containing 1 genes and 68 peaks...
	Working on tad chr8:27304528-27504528 containing 1 genes and 48 peaks...
	Working on tad chr5:51303761-52103761 containing 1 genes and 221 peaks...
	Working on tad chr18:52955346-53725346 containing 3 genes and 297 peaks...
	Working on tad chr17:17928036-21128036 containing 2 genes and 610 peaks...
	Working on tad chr18:42595346-43315346 containing 1 genes and 165 peaks...
	Working on tad chr4:142975097-143445097 containing 2 genes and 191 peaks...
	Working on tad chr18:40225346-42195346 containing 2 genes and 404 peaks...
	Working on tad

	Working on tad chrX:153750457-153990457 containing 1 genes and 39 peaks...
	Working on tad chr17:85635660-86115660 containing 1 genes and 211 peaks...
	Working on tad chrX:41781823-42531823 containing 3 genes and 219 peaks...
	Working on tad chr11:31165000-31645000 containing 1 genes and 167 peaks...
	Working on tad chr5:53713761-54163761 containing 2 genes and 154 peaks...
	Working on tad chr10:9745202-10345202 containing 1 genes and 199 peaks...
	Working on tad chr12:44644013-45284013 containing 1 genes and 205 peaks...
	Working on tad chrX:133730461-134030461 containing 2 genes and 49 peaks...
	Working on tad chr5:118784991-119704991 containing 1 genes and 265 peaks...
	Working on tad chr12:101096790-101896790 containing 1 genes and 149 peaks...
	Working on tad chr18:68485346-69775346 containing 1 genes and 398 peaks...
	Working on tad chr19:55320510-56310510 containing 1 genes and 472 peaks...
	Working on tad chr7:96106490-96446490 containing 1 genes and 94 peaks...
	Working on ta

  result = func(self.values, **kwargs)


	Working on tad chr4:47092128-47422128 containing 1 genes and 139 peaks...
	Working on tad chr14:17382486-18222486 containing 1 genes and 261 peaks...
	Working on tad chr6:12295000-13285000 containing 2 genes and 215 peaks...
	Working on tad chr1:128868423-129358423 containing 1 genes and 90 peaks...
	Working on tad chr9:67027193-67677193 containing 2 genes and 186 peaks...
	Working on tad chr4:66713966-70173966 containing 1 genes and 517 peaks...
	Working on tad chr5:125494630-127184630 containing 1 genes and 323 peaks...
	Working on tad chr5:127704630-128449136 containing 1 genes and 114 peaks...
	Working on tad chr13:89635395-90905395 containing 3 genes and 286 peaks...
	Working on tad chr15:43853454-44353454 containing 1 genes and 108 peaks...
	Working on tad chr10:103412366-105827944 containing 1 genes and 484 peaks...
	Working on tad chr14:122925778-123365778 containing 1 genes and 96 peaks...
	Working on tad chr15:75204570-75694570 containing 2 genes and 151 peaks...
	Working on

In [13]:
## Change name of tads to prefix with 0s so it is easy to navigate
# Build temporary df to store unique tads
unique_tads = list(corr_dict.keys())
tad_re = re.compile(r'(.*):(.*)-(.*)')
unique_tads = pd.DataFrame(\
    [list(tad_re.findall(item)[0]) for item in unique_tads],
    columns=["chr", "start", "end"],
    index=unique_tads)

In [14]:
# Build list of zfilled ints for TAD positions
list_seg_tads = []
for tmp_chr in unique_tads["chr"].unique():
    seg_tads = unique_tads.loc[unique_tads["chr"] == tmp_chr]
    tmp_zfill = seg_tads["end"].apply(len).max()
    seg_tads.loc[:,["start", "end"]] = seg_tads.loc[:,["start", "end"]].applymap(\
        lambda x: x.zfill(tmp_zfill)).values
    list_seg_tads.append(seg_tads)
    
zfilled_tads = pd.concat(list_seg_tads)
tad_to_zfill = zfilled_tads["chr"] + "_" + \
    zfilled_tads["start"] + "_" + zfilled_tads["end"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seg_tads.loc[:,["start", "end"]] = seg_tads.loc[:,["start", "end"]].applymap(\
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seg_tads.loc[:,["start", "end"]] = seg_tads.loc[:,["start", "end"]].applymap(\
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seg_tads.loc[:,["start", "end"]] = seg_tads.loc[:,["start", "end"]].applymap(\
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-d

In [15]:
# Pull out "significant" connections to build a separate dataframe
sig_conns = []
for tmp_tad in corr_dict:
    sig_conns.append(corr_dict[tmp_tad]["sig_conns"].copy())
    
sig_conns = pd.concat(sig_conns)


In [16]:
### Write out the correlations of TAD contained peak to gene connections
path_output = "output/correlate_tea_atac_to_cite_rna_across_r7_clusters/"
path_test_name = "peak_to_gene_correlation_within_tads"

# Make sure directories exist
if not os.path.isdir(path_output):
    os.mkdir(path_output)
    
if not os.path.isdir(os.path.join(path_output, path_test_name)):
    os.mkdir(os.path.join(path_output, path_test_name))
    
if not os.path.isdir(os.path.join(path_output, path_test_name, "significant_connections")):
    os.mkdir(os.path.join(path_output, path_test_name, "significant_connections"))
    
if not os.path.isdir(os.path.join(path_output, path_test_name, "r_values")):
    os.mkdir(os.path.join(path_output, path_test_name, "r_values"))
    
if not os.path.isdir(os.path.join(path_output, path_test_name, "p_values")):
    os.mkdir(os.path.join(path_output, path_test_name, "p_values"))

# Loop through TADs and write out the correlation results
for tmp_tad in unique_tads.index.values:
    corr_dict[tmp_tad]['sig_conns'].to_csv(os.path.join(\
        path_output,
        path_test_name,
        "significant_connections",
        "{}_significat_connections.csv".format(tad_to_zfill[tmp_tad])))
    corr_dict[tmp_tad]['r_df'].to_csv(os.path.join(\
        path_output,
        path_test_name,
        "r_values",
        "{}_r_values.csv".format(tad_to_zfill[tmp_tad])))
    corr_dict[tmp_tad]['p_df'].to_csv(os.path.join(\
        path_output,
        path_test_name,
        "p_values",
        "{}_p_values.csv".format(tad_to_zfill[tmp_tad])))

In [17]:
# # Filter the ATAC down to the regions that have a significant correlation to genes
# seg_atac = atac_counts[cite_rna.columns.values].copy()
# seg_atac.index = peaks["chr"] + ":" + \
#     peaks["start"].astype(str) + "-" + \
#     peaks["end"].astype(str)
# seg_atac = seg_atac.loc[sig_conns["peak"].unique()]

# seg_atac = seg_atac.loc[~pd.Series(seg_atac.index.values).duplicated().values]

In [18]:
# # Calculate correlations
# # All expressed genes vs. all peaks with significant gene correlations
# r_df_tot = pearson_corr_df_to_df(\
#     cite_rna.loc[genes_to_use].T, 
#     seg_atac[cite_rna.columns.values].T)

In [19]:
# # Cluster the genes
# Z = linkage(r_df_tot, 'ward')
# cluster_order = np.array([int(item) for item in dendrogram(Z)["ivl"]])
# cluster_order = pd.Series(list(range(r_df_tot.shape[0])),
#     index=r_df_tot.index.values[cluster_order])
# gene_clusters = pd.Series(fcluster(Z, 4, depth=10), 
#     index=r_df_tot.index.values[cluster_order])

In [20]:
# # Group the peaks using MarkerFinder
# from pyInfinityFlow.InfinityFlow_Utilities import marker_finder

# r_df_tot, p_df = marker_finder(r_df_tot.loc[gene_clusters.index.values], gene_clusters)

# marker_df = pd.DataFrame({\
#         "peak": r_df.index.values,
#         "r": r_df.max(axis=1),
#         "gene_cluster": r_df.idxmax(axis=1)},
#     index=r_df.index.values).sort_values(by=["gene_cluster", "r"], 
#         ascending=[True, False])

In [21]:
# # Save figure of peak to gene correlation heatmap
# sns.heatmap(\
#     r_df_tot.loc[cluster_order.index.values, marker_df["peak"].values].T,
#     cmap="coolwarm", vmax=0.8, vmin=-0.4)

# plt.savefig("output/correlate_atac_peaks_to_gene_expression/"\
#     "heatmap_peak_to_gene_correlations_top_10k_variable_genes.png")

In [None]:
# gene_tad_anno = pd.DataFrame({\
#         "cluster": gene_clusters.loc[cluster_order.index.values],
#         "tad": tss_loci.loc[cluster_order.index.values,"tad"]},
#     index=cluster_order.index.values)
# # gene_tad_anno = gene_tad_anno.loc[gene_tad_anno["tad"] != ""]
# gene_tad_anno["tad_file_name"] = [\
#     tad_to_zfill[item] if item != "" else "" for item in \
#         gene_tad_anno["tad"].values]

# gene_tad_anno.to_csv("output/correlate_tea_atac_to_cite_rna_across_r7_clusters/"\
#     "/genes_to_TADs_with_gene_cluster_annotations.csv", index=True, 
#     header=True, index_label="gene")

In [23]:
# Save annotation files
tss_loci.to_csv("output/correlate_tea_atac_to_cite_rna_across_r7_clusters/gene_loci_annotation.csv",
    header=True, index=True, index_label="id")

peaks.to_csv("output/correlate_tea_atac_to_cite_rna_across_r7_clusters/peak_annotation.csv",
    header=True, index=True, index_label="id")
