In [None]:
import csv
import gzip
import os
import sys
from collections import Counter, OrderedDict

import numpy as np
import modisco
from matplotlib import pyplot as plt
import modisco.visualization
from modisco.visualization import viz_sequence
import h5py
import hdf5plugin

print(sys.version)

In [None]:
# Root directory of the ChromBPNet outputs.
# NOTE: `dir` shadows the builtin of the same name; the name is kept so that
# any other cell referring to it keeps working.
dir = "chrombpnet_out/"
# os.path.join avoids the doubled separator ("chrombpnet_out//Ependymal/...")
# that plain concatenation with a leading "/" produced.
modisco_path = os.path.join(dir, "Ependymal", "Ependymal_modisco.h5")

In [None]:
# interpret enhancer with ependymal model
n = 1000  # upper bound on the number of sequences read from the scores file

# Choose which sequences get reverse-complemented. The fixed seed keeps the
# selection identical from run to run.
take_rc = (np.random.RandomState(1234).uniform(size=n) > 0.5)

# Note that the sequences can be of variable lengths, so the scores are kept
# as lists of per-sequence arrays.
# Reversing both axes flips the position axis and the base axis together.
# NOTE(review): this equals a reverse complement only if the base axis is
# ordered so that reversal swaps complementary bases (e.g. A,C,G,T) - confirm.
# The `with` block closes the HDF5 handle once the arrays have been copied
# into memory; `f` stays bound afterwards but refers to a closed file.
with h5py.File("Ependymal_contrib_ependymal.counts_scores.h5", "r") as f:
    task_to_scores_epend = [np.array(x) if not rc else np.array(x)[::-1, ::-1]
                            for x, rc in zip(f['projected_shap']['seq'][:n], take_rc)]
    task_to_hyp_scores_epend = [np.array(x) if not rc else np.array(x)[::-1, ::-1]
                                for x, rc in zip(f['shap']['seq'][:n], take_rc)]



In [None]:
# NOTE(review): `regions` is not defined in any cell visible here. It is used
# to index task_to_scores_epend, so it must be defined (as an iterable of
# valid indices) before this cell runs on a fresh kernel.
for region in regions:
    # plot_weights renders the figure itself; wrapping it in print() only
    # emitted its return value (a stray "None" line) after every plot.
    viz_sequence.plot_weights(task_to_scores_epend[region], subticks_frequency=200)

In [None]:
# Other helper functions: count the number of motifs per region, the total number of motifs per cell type, etc.

# write table with total number of patterns
def write_n_seqlets_to_csv(file_path, output_csv) -> None:
    """Write one CSV row per positive pattern with its seqlet count.

    For every child of the ``pos_patterns`` group in the HDF5 file, the
    dataset ``<pattern>/seqlets/n_seqlets`` is read and written next to the
    pattern name under the header ``Pattern,n_seqlets``.

    Args:
        file_path: Path of the HDF5 file to read (opened read-only).
        output_csv: Path of the CSV file to create; an existing file is
            overwritten.

    Raises:
        KeyError: If ``pos_patterns`` or one of the expected children is
            missing from the file.
    """
    rows = []
    with h5py.File(file_path, 'r') as h5_file:
        pos_patterns = h5_file['pos_patterns']
        for pattern in pos_patterns:
            stored = pos_patterns[pattern]['seqlets']['n_seqlets'][()]
            counts = np.asarray(stored).ravel()
            # A one-element dataset is unwrapped to a plain number so the CSV
            # cell reads "123" rather than the array repr "[123]".
            n_seqlets = counts[0].item() if counts.size == 1 else counts.tolist()
            rows.append([pattern, n_seqlets])

    # newline='' lets the csv module control line endings itself.
    with open(output_csv, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Pattern', 'n_seqlets'])  # header
        writer.writerows(rows)

        
# write table with: pattern, peak_id, count
def write_example_idx_to_csv(file_path, output_csv) -> None:
    """Write, per positive pattern, how often each example index occurs.

    For every child of the ``pos_patterns`` group, the dataset
    ``<pattern>/seqlets/example_idx`` is read and its values are tallied.
    One row ``Pattern,Example_idx,Count`` is written per distinct index,
    in order of first appearance within the pattern.

    Args:
        file_path: Path of the HDF5 file to read (opened read-only).
        output_csv: Path of the CSV file to create; an existing file is
            overwritten.

    Raises:
        KeyError: If ``pos_patterns`` or one of the expected children is
            missing from the file.
    """
    rows = []
    with h5py.File(file_path, 'r') as h5_file:
        pos_patterns = h5_file['pos_patterns']
        for pattern in pos_patterns:
            example_idx = pos_patterns[pattern]['seqlets']['example_idx'][:]
            # Flatten to plain Python ints so the Counter keys are hashable
            # builtins regardless of the dataset's stored shape.
            per_example = Counter(np.asarray(example_idx).ravel().tolist())
            rows.extend([pattern, idx, count] for idx, count in per_example.items())

    # newline='' lets the csv module control line endings itself.
    with open(output_csv, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Pattern', 'Example_idx', 'Count'])  # header
        writer.writerows(rows)


# write table with number of patterns per peak
def write_motif_counts_to_csv(file_path, output_csv) -> None:
    """Write the number of seqlets per example index, summed over patterns.

    The ``<pattern>/seqlets/example_idx`` dataset of every child of the
    ``pos_patterns`` group is tallied into a single counter. One row
    ``Example_idx,Count`` is written per distinct index, in order of first
    appearance.

    Args:
        file_path: Path of the HDF5 file to read (opened read-only).
        output_csv: Path of the CSV file to create; an existing file is
            overwritten.

    Raises:
        KeyError: If ``pos_patterns`` or one of the expected children is
            missing from the file.
    """
    overall_counts = Counter()
    with h5py.File(file_path, 'r') as h5_file:
        pos_patterns = h5_file['pos_patterns']
        for pattern in pos_patterns:
            example_idx = pos_patterns[pattern]['seqlets']['example_idx'][:]
            # Flatten to plain Python ints so indices coming from different
            # patterns land on the same Counter key.
            overall_counts.update(np.asarray(example_idx).ravel().tolist())

    rows = [[idx, count] for idx, count in overall_counts.items()]

    # newline='' lets the csv module control line endings itself.
    with open(output_csv, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Example_idx', 'Count'])  # header
        writer.writerows(rows)
