In [1]:
# Libraries used throughout the notebook
import os
import pandas as pd
import numpy as np
import re
# Splits a region string of the form "<chr>:<start>-<end>" into three captured
# groups (all groups are greedy, so the last ":" and the last "-" are the
# separators that get used)
seq_re = re.compile(r'(.+):(.+)-(.+)')
from gimmemotifs.motif import Motif,read_motifs

# Work from the project directory so that the relative "output/..." paths used
# later in the notebook resolve against it
os.chdir("/media/kyle_storage/kyle_ferchen/grimes_lab_main/analysis/"\
    "2023_06_12_tea_seq_atac_processing/")

INFO:matplotlib.font_manager:Failed to extract font properties from /usr/share/fonts/truetype/noto/NotoColorEmoji.ttf: In FT2Font: Can not load face (unknown file format; error code 0x2)


In [2]:
# Location of the mm10 genome FASTA; its index is expected next to it as
# "<fasta>.fai"
path_to_mm10 = "/media/kyle_storage/kyle_ferchen/grimes_lab_main/reference/genomes/mm10/mm10.fa"
# Load the headerless index table, label its five columns and key it by
# sequence name so rows can be looked up per chromosome
fai_mm10 = (
    pd.read_table(f"{path_to_mm10}.fai", header=None)
    .set_axis(["NAME", "LENGTH", "OFFSET", "LINEBASES", "LINEWIDTH"], axis=1)
    .set_index("NAME")
)

In [3]:
### Helper functions

# Define helper function to extract sequences from fasta file
def read_seq_from_fasta(input_bed, fai_annotation, fasta):
    """Fetch the sequence of each interval directly from an indexed FASTA.

    Byte positions are derived from the FASTA index so that only the requested
    stretch of the file is read for every interval.

    Parameters
    ----------
    input_bed : pandas.DataFrame
        Intervals to extract. Only the first three columns are used, by
        position: sequence name, start, end. The start is treated as 1-based
        (it is decremented by one before use), so ``end - start + 1`` bases
        are returned per interval. The frame is not modified.
    fai_annotation : pandas.DataFrame
        FASTA index keyed by sequence name, with "OFFSET", "LINEBASES" and
        "LINEWIDTH" columns.
    fasta : str
        Path to the FASTA file described by ``fai_annotation``.

    Returns
    -------
    list of str
        One sequence per input row, in input order, with line breaks removed.

    Raises
    ------
    KeyError
        If a sequence name is missing from ``fai_annotation``.
    ValueError
        If an interval starts before position 1 or ends before it starts.
        (Previously such rows produced a bogus seek position or a negative
        read size, which silently returned unrelated file content.)
    """
    chroms = input_bed.iloc[:, 0].values
    if len(chroms) == 0:
        return []

    # Correct start values for 0 index
    starts = input_bed.iloc[:, 1].to_numpy(dtype=np.int64) - 1
    ends = input_bed.iloc[:, 2].to_numpy(dtype=np.int64)
    n_bases = ends - starts
    if (starts < 0).any() or (n_bases < 0).any():
        raise ValueError(
            "Every interval needs start >= 1 and end >= start - 1")

    # One vectorised index lookup instead of one .loc call per row and column
    fai_rows = fai_annotation.loc[chroms, ["OFFSET", "LINEBASES", "LINEWIDTH"]]
    seq_offset = fai_rows["OFFSET"].to_numpy(dtype=np.int64)
    line_bases = fai_rows["LINEBASES"].to_numpy(dtype=np.int64)
    line_width = fai_rows["LINEWIDTH"].to_numpy(dtype=np.int64)

    # File position of the first requested base: start of the sequence, plus
    # all complete lines before it, plus the column within its line
    col_in_line = starts % line_bases
    seek_pos = seq_offset + (starts // line_bases) * line_width + col_in_line

    # Number of characters to read: the bases themselves plus the line
    # terminators crossed on the way (none if the interval stays on one line)
    newline_len = line_width - line_bases
    bases_left_on_line = line_bases - col_in_line
    fits_on_line = n_bases < bases_left_on_line
    line_breaks = 1 + (n_bases - bases_left_on_line) // line_bases
    n_chars = np.where(
        fits_on_line, n_bases, n_bases + newline_len * line_breaks)

    # Binary mode: seeking to a computed offset is only well defined on a
    # binary handle (text-mode seek accepts just 0 or values from tell())
    output_seqs = []
    with open(fasta, "rb") as f:
        for pos, count in zip(seek_pos.tolist(), n_chars.tolist()):
            f.seek(pos)
            chunk = f.read(count).decode("ascii")
            output_seqs.append(chunk.replace("\n", "").replace("\r", ""))

    return output_seqs

def get_reverse_complement(seq):
    """Return the reverse complement of a DNA sequence.

    Upper- and lower-case A/C/G/T are complemented with their case kept, and
    N/n map to themselves. Any other character raises ``KeyError``.
    """
    # Pair each supported base with its complement
    complement_of = dict(zip("ACGTacgtnN", "TGCAtgcanN"))
    # Walk the sequence back to front, swapping each base on the way
    return "".join(complement_of[base] for base in seq[::-1])

In [4]:
# Read in the merged modisco motif file (takes < 3 s)
path_modisco_motifs = (
    "/media/kyle_storage/kyle_ferchen/grimes_lab_main/"
    "analysis/2023_06_12_tea_seq_atac_processing/output/chrombpnet/"
    "modisco_merged_results/fold_0/modisco_fold_0_merged_ppm.motif"
)
modisco_motifs = read_motifs(path_modisco_motifs)
# String form of every motif, kept in the same order as modisco_motifs
modisco_names = np.array(list(map(str, modisco_motifs)))
# Captures the cluster label from an id prefix of the form
# "modisco_fold_0_<cluster>_modisco"
cluster_naming_re = re.compile(r'modisco_fold_0_(.+)_modisco')
def get_motif_names_from_cispb_names(tmp_name):
    """Shorten a full motif name to "<cluster>__<pattern>".

    ``tmp_name`` must contain exactly one "__" separator. The part before it
    supplies the cluster label (via ``cluster_naming_re``); the part after it
    supplies the pattern name once its last "_"-separated field is removed.

    Raises ``ValueError`` when the name does not split into exactly two parts
    and ``IndexError`` when the prefix does not match ``cluster_naming_re``.
    """
    id_prefix, pattern_part = tmp_name.split("__")
    cluster_label = cluster_naming_re.findall(id_prefix)[0]
    # Keep everything before the final "_" (empty when there is no "_")
    trimmed_pattern = pattern_part.rpartition("_")[0]
    return "{}__{}".format(cluster_label, trimmed_pattern)

# Short "<cluster>__<pattern>" label for every motif, aligned with modisco_names
modisco_pattern_names = np.array(
    list(map(get_motif_names_from_cispb_names, modisco_names)))

# Row count of each motif's PPM (used further down as the motif length)
modisco_motif_sizes = np.array(
    [motif._ppm.shape[0] for motif in modisco_motifs])

In [5]:
# Annotation table pairing every modisco pattern with its cluster (the text
# before the "__" separator), used to split the hit scanning by cluster
pattern_anno = pd.DataFrame({
    "cluster": [name.partition("__")[0] for name in modisco_pattern_names],
    "pattern": modisco_pattern_names,
})

pattern_anno

Unnamed: 0,cluster,pattern
0,MEP,MEP__pos_pattern_0
1,MEP,MEP__pos_pattern_1
2,MEP,MEP__pos_pattern_10
3,MEP,MEP__pos_pattern_11
4,MEP,MEP__pos_pattern_12
...,...,...
1142,ST_HSC,ST_HSC__pos_pattern_9
1143,ST_HSC,ST_HSC__neg_pattern_0
1144,ST_HSC,ST_HSC__neg_pattern_1
1145,ST_HSC,ST_HSC__neg_pattern_2


In [None]:
# Read in the peaks for which to scan
# path_peak_nominations = "output/"\
#     "correlate_tea_atac_to_cite_rna_across_r7_clusters/"\
#     "peak_to_gene_correlation_within_tads/sig_conns_df.csv"

# !!! THIS HAS TO BE EXACTLY THE SAME PEAKS FILE USED TO CALCULATE THE CONTRIBUTION
# !!! SCORES WITH CHROMBPNET
# NOTE: this cell used to assign ".../input/Intermediate_Mono-1_peaks_no_blacklist.bed"
# first and then immediately overwrite it with the path below. Only the path
# below was ever read, so the dead assignment has been dropped.
path_peak_nominations = (
    "/media/kyle_storage/kyle_ferchen/grimes_lab_main/analysis/"
    "2024_08_05_help_sid_with_seqlet_scanning_and_modisco_overlap/input/"
    "Intermediate_Mono-1_.interpreted_regions.bed")

# Width of the window placed around each peak's start + offset position
PEAK_WIDTH = 500

peaks = pd.read_csv(path_peak_nominations, sep="\t", header=None)
peaks.columns = [
    "chr", "start", "end", "name", "score", "strand",
    "score2", "score3", "score4", "offset"]
# Re-centre every region on start + offset. Integer division keeps the half
# width deterministic (round() uses banker's rounding for odd widths).
peaks["start"] = peaks["start"] + peaks["offset"] - (PEAK_WIDTH // 2)
peaks["end"] = peaks["start"] + PEAK_WIDTH
# Vectorised "<chr>:<start>-<end>" names instead of a row-wise apply
peaks["name"] = (
    peaks["chr"].astype(str) + ":" +
    peaks["start"].astype(str) + "-" +
    peaks["end"].astype(str))
peaks

In [6]:
# Read in the peaks for which to scan
# NOTE: the bed coordinates here should match the scanning done for tfmodisco-lite
# E.g. the width of the tfmodisco-lite search around the summit of the narrow
# peak (start+offset)
path_peak_nominations = (
    "output/"
    "correlate_tea_atac_to_cite_rna_across_r7_clusters/"
    "peak_to_gene_correlation_within_tads/sig_conns_df.csv")

peak_anno = pd.read_csv(path_peak_nominations).rename(
    columns={"peak.1": "name"})
# The third column (by position) holds region strings that are split on ":"
# and "-" into chromosome, start and end
region_strings = peak_anno.iloc[:, 2].values
peak_anno["chr"] = [region.split(":")[0] for region in region_strings]
peak_anno["start"] = [
    int(region.split(":")[-1].split("-")[0]) for region in region_strings]
peak_anno["end"] = [int(region.split("-")[-1]) for region in region_strings]

# One row per peak name (first occurrence kept), ordered by position
peaks = (
    peak_anno[["name", "chr", "start", "end"]]
    .drop_duplicates(subset="name")
    .sort_values(by=["chr", "start"])
)
peaks.index = list(range(peaks.shape[0]))
peaks

Unnamed: 0,name,chr,start,end
0,chr1:4456181-4457181,chr1,4456181,4457181
1,chr1:4540111-4541111,chr1,4540111,4541111
2,chr1:4614190-4615190,chr1,4614190,4615190
3,chr1:4615468-4616468,chr1,4615468,4616468
4,chr1:4621593-4622593,chr1,4621593,4622593
...,...,...,...,...
116373,chrX:169263066-169264066,chrX,169263066,169264066
116374,chrX:169266871-169267871,chrX,169266871,169267871
116375,chrX:169299283-169300283,chrX,169299283,169300283
116376,chrX:169303654-169304654,chrX,169303654,169304654


In [7]:
# # Read in the DNA sequences for the peaks of interest
# # 116,378 peaks - read into memory in about 18 seconds
# seqs = pd.Series(\
#     read_seq_from_fasta(\
#         input_bed = peaks[["chr", "start", "end"]], 
#         fai_annotation = fai_mm10, 
#         fasta = path_to_mm10),
#     index=peaks["name"].values)

In [8]:
# Path of the FASTA file with the peak sequences: the (commented-out) code
# below writes it, and the motif scanning cell reads it
path_tmp_fasta = (
    "output/chrombpnet/modisco_merged_results/fold_0/"
    "redo_extract_seqlets/sig_conns_dynamic_peak_set_fasta/"
    "sig_conns_unique_peaks.fa"
)

# with open(path_tmp_fasta, "w") as tmp_f:
#     for tmp_region in peaks["name"].values:
#         if tmp_region == peaks["name"].values[-1]:
#             tmp_f.write(f">{tmp_region}\n{seqs[tmp_region]}")
#         else:
#             tmp_f.write(f">{tmp_region}\n{seqs[tmp_region]}\n")

In [9]:
# 32 clusters
# ~32 seqlet patterns for each cluster
# 116,378 peaks to search
# Took about 40 minutes
path_save_motif_hits = "output/chrombpnet/modisco_merged_results/fold_0/"\
    "redo_extract_seqlets/cluster_pwm_hits/"

path_seqlet_beds = "output/chrombpnet/modisco_fold_0_seqlet_bed/"

# Run motif scanner
from gimmemotifs.scanner import Scanner
from gimmemotifs.fasta import Fasta

# For every cluster: scan all peak sequences with that cluster's motifs,
# convert the hits to genomic coordinates, flag the hits that coincide with a
# tfmodisco-lite seqlet of the same pattern, and write the table to
# "<path_save_motif_hits>/<cluster>.fea". Clusters that raise are collected in
# failed_clusters instead of stopping the run.
# NOTE(review): the per-cluster body is repeated verbatim in the retry cell
# that follows; consider moving it into one shared function.
failed_clusters = []
for tmp_cluster in pattern_anno["cluster"].unique():
    try:
        print(f"Working on {tmp_cluster}...")
        # Get the motif indices that correspond to the current cluster
        tmp_mo_i = np.where(pattern_anno["cluster"] == tmp_cluster)[0]
        # Filter the motif set (direct indexing; the previous `i in tmp_mo_i`
        # test scanned the index array once per motif)
        tmp_mo = [modisco_motifs[i] for i in tmp_mo_i]
        # Filter the motif names
        tmp_mo_names = modisco_names[tmp_mo_i]
        # Filter the pattern names
        tmp_mo_pattern_names = modisco_pattern_names[tmp_mo_i]
        # Filter the pattern lengths
        tmp_mo_lens = modisco_motif_sizes[tmp_mo_i]

        # Initialize the scanner
        s = Scanner()
        try:
            s.set_genome("mm10")
            s.set_motifs(tmp_mo)
            s.set_threshold(fpr=0.05)

            # One row per hit; the motif index refers to a position in tmp_mo
            scan_results = []
            seqs = Fasta(path_tmp_fasta)
            for i,result in enumerate(s.scan(seqs)):
                seqname = seqs.ids[i]
                for m,matches in enumerate(result):
                    for score, pos, strand in matches:
                        scan_results.append([seqname, m, score, pos, strand])
        finally:
            # Delete scanner to free up processes, also when the scan fails
            del s

        scan_results = pd.DataFrame(\
            scan_results,
            columns=["peak", "motif_index", "score", "pos", "strand"])

        # Peak names are "<chr>:<start>-<end>"; shift the peak start by the
        # hit position to obtain genomic hit coordinates
        tmp_anno = pd.DataFrame(\
            [seq_re.findall(i)[0] for i in scan_results["peak"].values],
            columns=["chr", "start", "end"])
        tmp_anno["start"] = tmp_anno["start"].astype(int).values + \
            scan_results["pos"].astype(int).values - 1
        tmp_anno["end"] = tmp_anno["start"] + \
            tmp_mo_lens[scan_results["motif_index"].values] - 1
        tmp_anno["peak"] = scan_results["peak"].values
        tmp_anno["score"] = scan_results["score"].values
        tmp_anno["pos"] = scan_results["pos"].values
        tmp_anno["strand"] = scan_results["strand"].replace(\
            {-1: "-", 1: "+"}).values
        tmp_anno["pattern"] = tmp_mo_pattern_names[\
            scan_results["motif_index"].values]

        # Adjust for real start
        tmp_anno["start"] = tmp_anno["start"] - 1
        # Build index to find shared loci with modisco results
        tmp_anno.index = (\
            tmp_anno["pattern"].apply(lambda x: x.split("__")[-1]) + "::" +\
            tmp_anno["chr"] + ":" + \
            tmp_anno["start"].astype(str) + "-" + \
            tmp_anno["end"].astype(str)).values

        ### Compare against the tfmodisco-lite instances
        # Read in tfmodisco-lite instances
        modisco = pd.read_table(os.path.join(\
                path_seqlet_beds,
                f"{tmp_cluster}_seqlets.bed"),
            header=None)
        modisco.columns = ["chr", "start", "end", "name", "score", "strand"]
        modisco.index = (modisco["name"] + "::" + modisco["chr"] + ":" + \
            modisco["start"].astype(str) + "-" + \
            modisco["end"].astype(str)).values
        # Remove duplicate instances
        modisco = modisco.loc[\
            ~pd.Series(modisco.index.values).duplicated().values]

        # Find shared loci and flag the matching hits with one membership
        # test (no label-based assignment over a non-unique index)
        shared_loci = modisco.index.values[\
            modisco.index.isin(tmp_anno.index)]
        tmp_anno["in_modisco"] = tmp_anno.index.isin(shared_loci)

        # Write out results
        tmp_anno = tmp_anno.sort_values(by=["chr", "start"])
        tmp_anno.index = list(range(tmp_anno.shape[0]))
        tmp_anno.to_feather(os.path.join(\
            path_save_motif_hits,
            f"{tmp_cluster}.fea"))

    except Exception as e:
        print(f"{tmp_cluster} FAILED!!!\n\nError:\n{e}\n\n:(\n")
        failed_clusters.append(tmp_cluster)

Working on MEP...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on BMCP...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on CD127_MP...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on CLP1_Rrm2...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on eHSC...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on eHSC_Pcna...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on ERP1...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on ERP2...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on HSCP_ERP1...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on HSCP_HPC_Cenpf...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on HSCP_HPC_Hist1h2af...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on HSCP_HPC_Tk1...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on HSCP_MKP...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on IG2_MP...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on IG2_proNeu1...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on LT_HSC_Mllt3...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on MDP_Cpa3...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on MDP_Irf8...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on MKP...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on ML_cell_cycle...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on MPP4_Hlf...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on MPP4_Nkx2_3...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on MPP5_Egr1...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on MPP5_Flt3...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on MultiLin_1_MEP...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on MultiLin_1...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on MultiLin_2_F13a1...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on MultiLin_2_Ms4a3...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on pre_MultiLin_1...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on pre_MultiLin_2...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on proNeu_1...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

Working on ST_HSC...


DEBUG:gimme.scanner:using background: genome /home/kyle/.local/share/genomes/mm10/mm10.fa with size 200


Scanning:   0%|          | 0/116378 [00:00<?, ? sequences/s]

In [10]:
# Retry the clusters collected in failed_clusters. Clusters that fail again are
# gathered in still_failed_clusters (failed_clusters itself is left untouched).
# NOTE(review): this loop body duplicates the main scanning loop of the
# previous cell; consider moving the shared steps into one function.
still_failed_clusters = []
for tmp_cluster in failed_clusters:
    try:
        print(f"Working on {tmp_cluster}...")
        # Get the motif indices that correspond to the current cluster
        tmp_mo_i = np.where(pattern_anno["cluster"] == tmp_cluster)[0]
        # Filter the motif set (direct indexing; the previous `i in tmp_mo_i`
        # test scanned the index array once per motif)
        tmp_mo = [modisco_motifs[i] for i in tmp_mo_i]
        # Filter the motif names
        tmp_mo_names = modisco_names[tmp_mo_i]
        # Filter the pattern names
        tmp_mo_pattern_names = modisco_pattern_names[tmp_mo_i]
        # Filter the pattern lengths
        tmp_mo_lens = modisco_motif_sizes[tmp_mo_i]

        # Initialize the scanner
        s = Scanner()
        try:
            s.set_genome("mm10")
            s.set_motifs(tmp_mo)
            s.set_threshold(fpr=0.05)

            # One row per hit; the motif index refers to a position in tmp_mo
            scan_results = []
            seqs = Fasta(path_tmp_fasta)
            for i,result in enumerate(s.scan(seqs)):
                seqname = seqs.ids[i]
                for m,matches in enumerate(result):
                    for score, pos, strand in matches:
                        scan_results.append([seqname, m, score, pos, strand])
        finally:
            # Delete scanner to free up processes, also when the scan fails
            del s

        scan_results = pd.DataFrame(\
            scan_results,
            columns=["peak", "motif_index", "score", "pos", "strand"])

        # Peak names are "<chr>:<start>-<end>"; shift the peak start by the
        # hit position to obtain genomic hit coordinates
        tmp_anno = pd.DataFrame(\
            [seq_re.findall(i)[0] for i in scan_results["peak"].values],
            columns=["chr", "start", "end"])
        tmp_anno["start"] = tmp_anno["start"].astype(int).values + \
            scan_results["pos"].astype(int).values - 1
        tmp_anno["end"] = tmp_anno["start"] + \
            tmp_mo_lens[scan_results["motif_index"].values] - 1
        tmp_anno["peak"] = scan_results["peak"].values
        tmp_anno["score"] = scan_results["score"].values
        tmp_anno["pos"] = scan_results["pos"].values
        tmp_anno["strand"] = scan_results["strand"].replace(\
            {-1: "-", 1: "+"}).values
        tmp_anno["pattern"] = tmp_mo_pattern_names[\
            scan_results["motif_index"].values]

        # Adjust for real start
        tmp_anno["start"] = tmp_anno["start"] - 1
        # Build index to find shared loci with modisco results
        tmp_anno.index = (\
            tmp_anno["pattern"].apply(lambda x: x.split("__")[-1]) + "::" +\
            tmp_anno["chr"] + ":" + \
            tmp_anno["start"].astype(str) + "-" + \
            tmp_anno["end"].astype(str)).values

        ### Compare against the tfmodisco-lite instances
        # Read in tfmodisco-lite instances
        modisco = pd.read_table(os.path.join(\
                path_seqlet_beds,
                f"{tmp_cluster}_seqlets.bed"),
            header=None)
        modisco.columns = ["chr", "start", "end", "name", "score", "strand"]
        modisco.index = (modisco["name"] + "::" + modisco["chr"] + ":" + \
            modisco["start"].astype(str) + "-" + \
            modisco["end"].astype(str)).values
        # Remove duplicate instances
        modisco = modisco.loc[\
            ~pd.Series(modisco.index.values).duplicated().values]

        # Find shared loci and flag the matching hits with one membership
        # test (no label-based assignment over a non-unique index)
        shared_loci = modisco.index.values[\
            modisco.index.isin(tmp_anno.index)]
        tmp_anno["in_modisco"] = tmp_anno.index.isin(shared_loci)

        # Write out results
        tmp_anno = tmp_anno.sort_values(by=["chr", "start"])
        tmp_anno.index = list(range(tmp_anno.shape[0]))
        tmp_anno.to_feather(os.path.join(\
            path_save_motif_hits,
            f"{tmp_cluster}.fea"))

    except Exception as e:
        print(f"{tmp_cluster} FAILED!!!\n\nError:\n{e}\n\n:(\n")
        # Keep track of clusters that could not be rescued by the retry
        still_failed_clusters.append(tmp_cluster)