In [1]:
# --- Standard library ---
import os
import re

# --- Third-party ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gimmemotifs.motif import Motif, read_motifs

# Three greedy capture groups separated by ":" and "-"
# (presumably "chrom:start-end" region strings -- confirm against later cells).
seq_re = re.compile(r'(.+):(.+)-(.+)')

# Every relative path used below ("output/...") is resolved against this
# project directory.
os.chdir("/Volumes/Kyle_T7_2/grimes_lab/analysis/2023_06_12_tea_seq_atac_processing/")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations of the CIS-BP 2 mouse motif collection and its TF annotation table
path_cisbp = "/Volumes/Kyle_T7_2/Reference/Cisbp2/Mus_musculus_2020_06_01_11_53_pm/"
path_pwms = os.path.join(path_cisbp, "pwms_all_motifs/")
path_cisbp_anno = os.path.join(path_cisbp, "TF_Information_all_motifs.txt")

# Load the annotation table and drop rows whose Motif_ID is "."
cisbp_anno = pd.read_table(path_cisbp_anno)
cisbp_anno = cisbp_anno[cisbp_anno["Motif_ID"].ne(".")]

# Load the motif matrices; str(motif) serves as each motif's name
cisbp2_motifs = read_motifs(
    os.path.join(path_cisbp, "mouse_cisbp2_all_motif_ids_ppm_format.motif"))
cisbp2_names = np.array(list(map(str, cisbp2_motifs)))

# Keep only annotation rows whose Motif_ID matches the first two
# "_"-separated fields of some loaded motif name (i.e. motifs with a real entry)
cisbp_anno = cisbp_anno[cisbp_anno["Motif_ID"].isin(
    {"_".join(name.split("_")[:2]) for name in cisbp2_names})]

In [3]:
# Load the patterns reported by modisco (fold 0, expanded peaks) together
# with their names, taken from str(motif).
path_pat_mo = (
    "output/chrombpnet/modisco_merged_results/fold_0/"
    "modisco_fold_0_expanded_peaks_motif_instances_ppm.motif"
)
pat_mo = read_motifs(path_pat_mo)
pat_mo_names = np.array(list(map(str, pat_mo)))

In [4]:
def _best_alignment(pat_ppm, ref_ppm):
    """Return the best-correlated placement of a pattern against a reference motif.

    The reference is flanked on both sides with ``pat_ppm.shape[0]`` rows of
    0.25, and the pattern is slid across every window that overlaps the
    reference by at least one position. For each window the correlation of the
    deviations from 0.25 is computed for the pattern as given ("+") and for
    the pattern with rows and columns reversed ("-").

    Parameters
    ----------
    pat_ppm : numpy.ndarray
        Pattern matrix, one row per position.
    ref_ppm : numpy.ndarray
        Reference matrix with the same number of columns as ``pat_ppm``.

    Returns
    -------
    tuple of (int, str, float)
        ``(offset, strand, r)`` for the highest correlation. ``offset`` is the
        window start minus ``ref_ppm.shape[0]`` for "+", and the same values
        in reversed order for "-". ``r`` is NaN only when every window has an
        undefined correlation (zero deviation from 0.25).
    """
    pat_len, ref_len = pat_ppm.shape[0], ref_ppm.shape[0]
    flank = 0.25 * np.ones(pat_ppm.shape)
    template = np.concatenate([flank, ref_ppm, flank])

    # Start at 1 and stop before pat_len + ref_len so that no window consists
    # of flanking rows only.
    starts = np.arange(1, pat_len + ref_len)
    window_devs = np.array(
        [template[i:i + pat_len, :] for i in starts]) - 0.25
    pat_devs = pat_ppm - 0.25

    # A window (or pattern) with no deviation from 0.25 gives 0/0; let that
    # become NaN quietly and deal with it explicitly when picking the maximum.
    with np.errstate(divide="ignore", invalid="ignore"):
        denom = np.sqrt(
            (pat_devs ** 2).sum() * (window_devs ** 2).sum(axis=(1, 2)))
        fwrd_corrs = (window_devs * pat_devs).sum(axis=(1, 2)) / denom
        # Reversing rows and columns together is the reverse complement when
        # the columns are in A, C, G, T order.
        rev_corrs = (window_devs * pat_devs[::-1, ::-1]).sum(axis=(1, 2)) / denom

    offsets = starts - ref_len
    all_corrs = np.concatenate([fwrd_corrs, rev_corrs])
    all_offsets = np.concatenate([offsets, offsets[::-1]])

    # NaN never wins; if everything is NaN fall back to the first window.
    best = 0 if np.isnan(all_corrs).all() else int(np.nanargmax(all_corrs))
    strand = "+" if best < len(starts) else "-"
    return int(all_offsets[best]), strand, float(all_corrs[best])


# For every modisco pattern, record the single best alignment against each
# CIS-BP 2 reference motif. The result is one DataFrame per pattern with one
# row per reference motif.
top_corrs = {}
for tmp_pat, tmp_pat_name in zip(pat_mo, pat_mo_names):
    print(f"Working on {tmp_pat_name}...")
    tmp_res = []
    for tmp_ref, tmp_ref_name in zip(cisbp2_motifs, cisbp2_names):
        best_offset, best_strand, best_r = _best_alignment(
            tmp_pat._ppm, tmp_ref._ppm)
        tmp_res.append({
            "pattern": str(tmp_pat_name),
            "motif": str(tmp_ref_name),
            "offset": best_offset,
            "strand": best_strand,
            "r": best_r})

    # Build the frame once from plain records instead of concatenating and
    # sorting a pair of DataFrames for every reference motif.
    top_corrs[tmp_pat_name] = pd.DataFrame(
        tmp_res, columns=["pattern", "motif", "offset", "strand", "r"])


Working on MEP__pattern_0_nnnnnnnnnnnGCCCCGCCCCCnnnnnnnn...
Working on MEP__pattern_1_nnnnnnnnCCAsnAGrGGGCrCynnnnnnn...
Working on MEP__pattern_10_nnnnnnnnnGTCACGTGrynnnnnnnnnnn...
Working on MEP__pattern_11_GCrnnGCmTkCTGGGAnwTGTAGTyynnnn...
Working on MEP__pattern_12_nnnnnnnnnnGCGnnnGCGCGnnsCnnnnn...
Working on MEP__pattern_13_nnwAAAAAAAAAAAAAAAAAAAAAAAAAAA...
Working on MEP__pattern_14_nnnnnnnnnnGnGCGCGCGCnnnnnnnnnn...
Working on MEP__pattern_15_nnnnnnnnnnGsCGCCGCCATCTTGnnnnn...
Working on MEP__pattern_16_nnnnnnnnnnnynGTTGCCATGGnAACnnn...
Working on MEP__pattern_17_nnnnnnnnnnnArnTCTCGCGAGAnyTnnn...
Working on MEP__pattern_18_nnnnnCmGGAAGyCCCrCCCCynnnnnnnn...
Working on MEP__pattern_19_nnnynCCCCnCCCCCCCCCCCCCCCCCCCC...
Working on MEP__pattern_2_nnnnnnnAAAsAGGAAGTnnnnnnnnnnnn...
Working on MEP__pattern_20_nnnnCnnCTTGTGGTynnnnnnGGwAnTGn...
Working on MEP__pattern_21_TCnGGAAAAAAAAAAAAAAAAAAAnAnnnn...
Working on MEP__pattern_22_nnGGGmGCTGTCCATGGTGCTGAAnnnnnn...
Working on MEP__pattern_23_

In [76]:
# Directory that receives one CSV per modisco pattern (relative to the
# working directory set at the top of the notebook).
path_save_results = "output/chrombpnet/modisco_merged_results/fold_0/"\
    "correlate_modisco_to_cisbp2_motifs/"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target exists; a fresh run would otherwise fail on the first write.
os.makedirs(path_save_results, exist_ok=True)

print("Working on pattern:")
for tmp_pat, tmp_pat_corrs in top_corrs.items():
    print(f"\t{tmp_pat}...")
    # File name is the pattern name; the row index is not written.
    tmp_pat_corrs.to_csv(
        os.path.join(path_save_results, f"{tmp_pat}.csv"),
        header=True, index=False)

Working on pattern:
	MEP__pattern_0_nnnnnnnnnnnGCCCCGCCCCCnnnnnnnn...
	MEP__pattern_1_nnnnnnnnCCAsnAGrGGGCrCynnnnnnn...
	MEP__pattern_10_nnnnnnnnnGTCACGTGrynnnnnnnnnnn...
	MEP__pattern_11_GCrnnGCmTkCTGGGAnwTGTAGTyynnnn...
	MEP__pattern_12_nnnnnnnnnnGCGnnnGCGCGnnsCnnnnn...
	MEP__pattern_13_nnwAAAAAAAAAAAAAAAAAAAAAAAAAAA...
	MEP__pattern_14_nnnnnnnnnnGnGCGCGCGCnnnnnnnnnn...
	MEP__pattern_15_nnnnnnnnnnGsCGCCGCCATCTTGnnnnn...
	MEP__pattern_16_nnnnnnnnnnnynGTTGCCATGGnAACnnn...
	MEP__pattern_17_nnnnnnnnnnnArnTCTCGCGAGAnyTnnn...
	MEP__pattern_18_nnnnnCmGGAAGyCCCrCCCCynnnnnnnn...
	MEP__pattern_19_nnnynCCCCnCCCCCCCCCCCCCCCCCCCC...
	MEP__pattern_2_nnnnnnnAAAsAGGAAGTnnnnnnnnnnnn...
	MEP__pattern_20_nnnnCnnCTTGTGGTynnnnnnGGwAnTGn...
	MEP__pattern_21_TCnGGAAAAAAAAAAAAAAAAAAAnAnnnn...
	MEP__pattern_22_nnGGGmGCTGTCCATGGTGCTGAAnnnnnn...
	MEP__pattern_23_nnnryTTCCTGTGGTTwnnnnnnnnnnnnn...
	MEP__pattern_24_sTGGCnnnnnnrGGAAnTGCAnnCwTsnnn...
	MEP__pattern_25_GGTTnGGGTTrkGGTTwGGGTTrkGGTTwG...
	MEP__pattern_