# The webserver notebook
Make sure you have both model files in the model directory (`./models/` in the cell below; adjust `model_params` if your files are in `/data/models/`). The model is initialized from `*_model_params.dat`; the Python script then automatically reads the matching `*_parameter.pth` file.

In [39]:
import os
# Specify your model: `model_file` is the shared base name of the model files
model_file='CTCFaH3K27acaH3K36me3aH3K4me3aH33aH3K27me3aH3K4me1aATAConseq2krcomp_mh0-cv10-1_Cormsek512l19TfEXPGELUmax10rcTvlCota_tc2dNoned1s1r1l7ma5nfc3s1024cbnoTfdo0.1tr1e-05SGD0.9bs64-F'
# Parameter file the model is initialized from
model_params = f'./models/{model_file}_model_params.dat'

# Specify path to drg_tools (directory holding the command line scripts called with `!python` below)
drgclis='~/Git/DRG/scripts/'

## Specify input to server
The model takes 2000bp sequence intervals as input and predicts the ATAC-seq signal within a 250bp window in the center of that interval, as well as the signal in 1000 bp windows of 7 Cut&Run data sets. The sequence attributions show all bases and motifs that contribute to the center window only. 

1. Determine if you want to provide a genomic location (bed file) or a fasta file.

    a. if provided a sequence, it will either be cut into overlapping 2000 bp fragments shifted by 250bp, or padded to 2000bp.

2. If given a location 
    
    a. determine if you want the attribution for the signal in this region (i.e. all attributions that contribute to a window in this region, **output centric view**) or for the sequence (i.e. all attributions that fall into this interval, **input centric view**)

In [40]:
# Specify model input
# These names are read by the following cells; change them here rather than further down.
create_sequence = True # if True, the next cell derives the window bed file and the fasta paths from `bed`
bed = './data/inputs/test.bed'
original_bed = './data/inputs/test.bed'
input_centric = False # False: output centric view, True: input centric view (see text above)
sequence = './data/inputs/mm10test2000.fasta' # only used if bed given and create_sequence set to False.
input_length = 2000 # length of the input sequence
atac_window_size = 250 # size in bp of the ATAC-seq prediction window
cr_window_size = 1000 # size in bp of the Cut&Run prediction windows
window_size = min(atac_window_size, cr_window_size) # the smaller of the two windows is used for tiling
step_size = window_size // 2 # half a window


In [41]:
# Create necessary input files for the model
# Note: include --'save_pos_info' so that this info can be used to create .bw and .bed files for Webserver

import os
import numpy as np
# `bed` is reset to the regions file here because this cell later rebinds `bed` to the
# generated window file; the reset lets the cell be re-run from the original regions.
# NOTE(review): this also overrides a `bed` chosen in the input cell — keep both values in sync.
bed = './data/inputs/test.bed'
mm10='./data/mm10' # contains all chr1-19 in as chr1.fa.gz

def create_centric_bed_file(bed, window_size, step_size, input_length, input_centric):
    """
    Write a bed file of tiled windows for every region in `bed` and return its path.

    For each region one window of width 2*step_size centered on the region is written
    (name suffix '_p'), followed by pairs of windows of width `window_size` that move
    away from the center in steps of `step_size` to the right ('_cp<i>') and to the
    left ('_cm<i>').
    If input_centric is True, the tiled span is the region plus `input_length`, so all
    input sequences that overlap the region are covered.
    If input_centric is False, only the region itself is tiled.

    Parameters
    ----------
    bed : str
        Path to a bed file with at least four whitespace-separated columns
        (chrom, start, end, name). Lines starting with '#' and blank lines are skipped.
    window_size : int
        Width of the shifted windows.
    step_size : int
        Shift between consecutive windows.
    input_length : int
        Length of the model input sequence; only affects the tiling if input_centric.
    input_centric : bool
        Selects the input centric (True) or output centric (False) tiling.

    Returns
    -------
    str
        Path of the new bed file: the input file name with '.bed' replaced by
        '_ic.bed' (input centric) or '_oc.bed' (output centric).
    """
    suffix = '_ic.bed' if input_centric else '_oc.bed'
    # Only touch the file name (not directories), and only its last '.bed'.
    directory, filename = os.path.split(bed)
    stem, found, tail = filename.rpartition('.bed')
    # Without a '.bed' in the name, append the suffix so the input is never overwritten.
    new_name = stem + suffix + tail if found else filename + suffix
    newbed = os.path.join(directory, new_name)

    # Context managers close both files even if a malformed line raises.
    with open(bed) as infile, open(newbed, 'w') as newfile:
        for line in infile:
            if not line.strip() or line.startswith('#'):
                continue
            fields = line.split()
            chrom, start, end, name = fields[0], int(fields[1]), int(fields[2]), fields[3]
            center = (start + end) // 2
            region_length = end - start
            newfile.write(f"{chrom}\t{center-step_size}\t{center + step_size}\t{name}_p\n")
            # The input length is added to the tiled span only if input_centric is True
            span = region_length + input_length * int(input_centric)
            full, rest = divmod(span, 2 * step_size)
            # Number of window pairs on top of the central window; never negative.
            n_splits = max(0, int(full) - 1 + int(rest > 0))
            print(f'Creating {2*n_splits+1} sequences of length {input_length} for {region_length} region with input_centric={input_centric}.')
            for i in range(n_splits):
                newfile.write(f"{chrom}\t{center + i*step_size}\t{center + i*step_size + window_size}\t{name}_cp{i}\n")
                newfile.write(f"{chrom}\t{center -i*step_size-window_size}\t{center - i*step_size}\t{name}_cm{i}\n")
    return newbed


if create_sequence:

    newbed = create_centric_bed_file(bed, window_size, step_size, input_length, input_centric)
    # If input_centric is True, the new bed file will have '_ic.bed' suffix, otherwise '_oc.bed'

    original_bed = bed
    bed = newbed

    prefix = os.path.split(bed)[0]
    if prefix != '':
        prefix = prefix+'/'
    sequence = prefix+mm10.strip('/').split('/')[-1]+os.path.splitext(os.path.split(bed.strip('.gz'))[1])[0]+f'{input_length}.fasta'
    original_sequence = prefix+mm10.strip('/').split('/')[-1]+os.path.splitext(os.path.split(original_bed.strip('.gz'))[1])[0]+'.fasta'
    
    # Create fasta form mm10 genome with bed file
    !python {drgclis+'data_preprocessing/generate_fasta_from_bedgtf_and_genome.py'} {mm10} {original_bed} --'save_pos_info'
    # Create one-hot encoding for input to model and attributions
    !python {drgclis+'data_preprocessing/transform_seqtofeature.py'} {original_sequence}
    original_sequencenpz=f'{os.path.splitext(original_sequence)[0]}_onehot-ACGT_alignleft.npz'


# Create fasta form mm10 genome with bed file
!python {drgclis+'data_preprocessing/generate_fasta_from_bedgtf_and_genome.py'} {mm10} {bed} --extend_to_length {input_length} --'save_pos_info'

# Potentially create background shuffled sequences for motif detection
# Attributions from shuffled sequences will be used to determine significant attributions in the actual sequences.
# If you want to use the background sequences, set the variable 'background' to True
background = False
if background:
    # read fasta file and create background sequences
    from drg_tools.io_utils import readinfasta
    seq_names, seqs = readinfasta(sequence)
    from tangermeme.ersatz import dinucleotide_shuffle
    from tangermeme.utils import one_hot_encode
    import torch
    ohseq = np.array([one_hot_encode(seq) for seq in seqs])
    print('Original sequences:', ohseq.shape)
    # one shuffle per sequence (n=1); the shuffle axis is squeezed away again
    shuffled_seqs = dinucleotide_shuffle(torch.tensor(ohseq), n=1).squeeze(1).numpy()
    print('Shuffled sequences:', shuffled_seqs.shape)
    rseq_names = ['shuffled_'+name for name in seq_names]
    print(rseq_names)
    # convert shuffled sequences back to string format
    # assumes each entry is a (4, length) one-hot array in ACGT order, so argmax over axis -2
    # gives one base index per position — TODO confirm against tangermeme's encoding
    shuffled_seqs = [''.join(np.array(list('ACGT'))[np.argmax(base, axis = -2)]) for base in shuffled_seqs]
    # add the shuffled sequences to the sequence file
    # (opened in append mode, so they follow the original entries of the fasta)
    with open(sequence, 'a') as f:
        for name, seq in zip(rseq_names, shuffled_seqs):
            print(name, seq)
            f.write(f'>{name}\n'+''.join(seq) + '\n')


# Create one-hot encoding for input to model and attributions
!python {drgclis+'data_preprocessing/transform_seqtofeature.py'} {sequence}
# Path of the encoded file that is passed to the model below: the fasta path without
# extension plus '_onehot-ACGT_alignleft.npz' (matches the 'Saved as' line in the output below).
sequencenpz=f'{os.path.splitext(sequence)[0]}_onehot-ACGT_alignleft.npz'

Creating 7 sequences of length 2000 for 1000 region with input_centric=False.
Data format (1, 6)
['chr1']
./data/inputs/mm10test
Read ./data/mm10/chr1.fa.gz
Length 195471971
Generate seq for chr1
Locations in chr 1
Max sequence length ImmGenATAC1219.peak_3 1000
Saved as 
./data/inputs/mm10test_onehot-ACGT_alignleft.npz
Data format (7, 4)
['chr1']
./data/inputs/mm10test_oc2000
Read ./data/mm10/chr1.fa.gz
Length 195471971
Generate seq for chr1
Locations in chr 7
Max sequence length ImmGenATAC1219.peak_3_cm0 2000
Saved as 
./data/inputs/mm10test_oc2000_onehot-ACGT_alignleft.npz


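## Specify output from server
Now, let's specify what we want to get from the model. Below are three different possible choices. Run only one of the three cells: each of them sets the same variables, so the cell that is run last wins.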
1. Return attributions for lineages and all modalities

In [5]:
# Option 1: all cell types and all modalities, attributions summarized to lineages.
# The other option cells assign the same variables; the cell run last takes effect.
return_attribution = True # If False, only returns predictions
attribution_type = 'grad' # only grad or deepshap are feasible for large sequences. deepshap is not recommended for model with weighted avg pooling
cell_types = 'all' # or specific output tracks separated by ',', e.g. B.Fem.Sp,B.Fo.Sp,B.FrE.BM,B.GC.CB.Sp
modalities = 'all' # Define data modalities that you want to look at: choose from ATAC,CTCF,H33,H3K4me1,H3K4me3,H3K27me3,H3K27ac,H3K36me3
lineages = True # Define if Attributions should be summarized to lineages
plotpermodality = True # Determine if a separate figure is generated for each modality

2. Return attributions for only one cell type and one modality

In [42]:
# Option 2: a single cell type and a single modality, no lineage summary.
# The other option cells assign the same variables; the cell run last takes effect.
return_attribution = True # If False, only returns predictions
attribution_type = 'grad' # only grad or deepshap are feasible for large sequences. deepshap is not recommended for model with weighted avg pooling
cell_types = 'B.Fem.Sp' # or specific output tracks separated by ',', e.g. B.Fem.Sp,B.Fo.Sp,B.FrE.BM,B.GC.CB.Sp
modalities = 'ATAC' # Define data modalities that you want to look at: choose from ATAC,CTCF,H33,H3K4me1,H3K4me3,H3K27me3,H3K27ac,H3K36me3
lineages = False # Define if Attributions should be summarized to lineages
plotpermodality = True # Determine if a separate figure is generated for each modality

3. Compare B-cell attributions across modalities

In [None]:
# Option 3: a list of B-cell tracks across all modalities, summarized to lineages, one combined figure.
# The other option cells assign the same variables; the cell run last takes effect.
return_attribution = True # If False, only returns predictions
attribution_type = 'grad' # only grad or deepshap are feasible for large sequences. deepshap is not recommended for model with weighted avg pooling
cell_types = 'B.Fem.Sp,B.Fo.Sp,B.FrE.BM,B.GC.CB.Sp,B.GC.CC.Sp,B.MZ.Sp,B.PB.Sp,B.PC.BM,B.PC.Sp,B.Sp,B.T1.Sp,B.T2.Sp,B.T3.Sp,B.mem.Sp,B1b.PC' # or specific output tracks separated by ',', e.g. B.Fem.Sp,B.Fo.Sp,B.FrE.BM,B.GC.CB.Sp
modalities = 'all' # Define data modalities that you want to look at: choose from ATAC,CTCF,H33,H3K4me1,H3K4me3,H3K27me3,H3K27ac,H3K36me3
lineages = True # Define if Attributions should be summarized to lineages
plotpermodality = False # Determine if a separate figure is generated for each modality

In [43]:
# Use the given input arguments to generate inputs for the model
import pandas as pd

# Load master file for lineages for selection and potential lineage summary
lineage_file = './data/CutandRun_and_ATAC.lineages.txt'
lineage_frame = pd.read_table(lineage_file, header = None, names = ['cell_type', 'lineage'])
data_modalities = 'ATAC,CTCF,H33,H3K4me1,H3K4me3,H3K27me3,H3K27ac,H3K36me3'.split(',')
outdir='./results/' # directory to save files to

# Select tracks that will be returned
# The selections are parsed only while they are still the raw strings from the option
# cell. After this cell ran once they are lists, and parsing them again would wrap the
# list in another list (or fail on the `== 'all'` comparison), so re-runs skip the parsing.
if isinstance(cell_types, str):
    if cell_types == 'all':
        cell_types = lineage_frame['cell_type'].tolist()
    else:
        # also covers a single cell type; whitespace around the commas is ignored
        cell_types = [entry.strip() for entry in cell_types.split(',') if entry.strip()]

if isinstance(modalities, str):
    if modalities == 'all':
        modalities = data_modalities
    else:
        # also covers a single modality; whitespace around the commas is ignored
        modalities = [entry.strip() for entry in modalities.split(',') if entry.strip()]
print(modalities, cell_types)
# One track per (modality, cell type) pair, named '<cell_type>.<modality>'
tracks = '--select_tracks ' + ','.join(
    f'{cell_type}.{modal}' for modal in modalities for cell_type in cell_types
)
print(f"Selected tracks: {tracks}")

# Average over cell lineages
mean_lineage = ''
mean_lineage_file = './data/CutandRun_and_ATAC.lineages.modsep.tsv'
if lineages:
    mean_lineage = f'--average_outclasses {mean_lineage_file}'

# Define sequence attributions that should be returned
seq_atts = ''
if return_attribution:
    seq_atts = f'--sequence_attributions {attribution_type} all'


['ATAC'] ['B.Fem.Sp']
Selected tracks: --select_tracks B.Fem.Sp.ATAC


## Run model to compute predictions (and attributions)

In [44]:
# Run model
## determine if you have a gpu for computations
device='cpu'

# keep track name files in that order because it's the same as during training.
track_names='--load_output_track_names ./data/CTCF_tracks.txt,./data/H3K27ac_tracks.txt,./data/H3K36me3_tracks.txt,./data/H3K4me3_tracks.txt,./data/H33_tracks.txt,./data/H3K27me3_tracks.txt,./data/H3K4me1_tracks.txt,./data/ATAC_tracks.txt'
# The command is printed first so the exact call is visible in the output, then executed.
# `None` fills the second positional argument of the script (the run reports
# "None is not a valid file." and continues) — presumably the output/target file; TODO confirm.
# The print and the shell line must stay identical.
print(f"python {drgclis+'train_models/run_cnn_model.py'} {sequencenpz} None --predictnew --cnn {model_params} {'device='+device} {track_names} {tracks} --save_predictions {seq_atts} {mean_lineage} --outname {outdir}")
!python {drgclis+'train_models/run_cnn_model.py'} {sequencenpz} None --predictnew --cnn {model_params} {'device='+device} {track_names} {tracks} --save_predictions {seq_atts} {mean_lineage} --outname {outdir}


python ~/Git/DRG/scripts/train_models/run_cnn_model.py ./data/inputs/mm10test_oc2000_onehot-ACGT_alignleft.npz None --predictnew --cnn ./models/CTCFaH3K27acaH3K36me3aH3K4me3aH33aH3K27me3aH3K4me1aATAConseq2krcomp_mh0-cv10-1_Cormsek512l19TfEXPGELUmax10rcTvlCota_tc2dNoned1s1r1l7ma5nfc3s1024cbnoTfdo0.1tr1e-05SGD0.9bs64-F_model_params.dat device=cpu --load_output_track_names ./data/CTCF_tracks.txt,./data/H3K27ac_tracks.txt,./data/H3K36me3_tracks.txt,./data/H3K4me3_tracks.txt,./data/H33_tracks.txt,./data/H3K27me3_tracks.txt,./data/H3K4me1_tracks.txt,./data/ATAC_tracks.txt --select_tracks B.Fem.Sp.ATAC --save_predictions --sequence_attributions grad all  --outname ./results/
None is not a valid file.
Input shapes X: (7, 4, 2000)
Selected list of tracks do not match the names in the data
Selected list of tracks do not match the names in the data
Selected list of tracks do not match the names in the data
Selected list of tracks do not match the names in the data
Selected list of tracks do not m

### In case of an output centric view average all the individual attributions over the original window

In [45]:
import numpy as np

def average_attributions_over_windows(original_bed_path, att_names, att_values, att_exp, input_length, input_centric,
                                      step_size, combination_method='average', combination_window_size=250):
    """
    Average the attributions of all shifted windows onto each original region.

    For every region in the bed file the attribution of the central window
    ('<name>_p') is cut to the region, and the attributions of the shifted windows
    ('<name>_cp<i>' to the right, '<name>_cm<i>' to the left) are aligned to the
    region and added, each weighted with 1/(number of windows).

    Parameters:
    -----------
    original_bed_path : str
        Path to the original bed file with regions to average over. Needs at least four
        whitespace-separated columns (chrom, start, end, name); lines starting with '#'
        and blank lines are skipped.
    att_names : np.array
        Array of individual attribution names, need to be in the format 'name_p', 'name_cp0', 'name_cm0', etc.
    att_values : np.array
        Array of individual attribution values; the first axis matches att_names and the
        last axis is the position within the input sequence.
    att_exp : np.array
        Array of experiments, only required when combination_method is 'weighted'
    input_length : int
        Length of the attributions
    input_centric : bool
        Whether input centric view is used
    step_size : int
        Step size for sliding windows
    combination_method : str, optional
        Method to combine attributions, can be 'average', 'weighted', or 'max'. Default is 'average'.
        Only 'average' is implemented; the argument is currently not read.
    combination_window_size : int, or dict, optional
        Size of the window for combination, only meant for combination_method 'weighted'
        and currently not read. Default is 250 for all modalities.
        Can be a single integer or a dictionary with modality names as keys and window sizes as values.

    Returns:
    --------
    tuple
        (attribution_average, attribution_names, att_exp)
        - attribution_average: averaged attributions array, one entry per region
        - attribution_names: names array for averaged attributions (region names)
        - att_exp: experiments array (unchanged)
    """
    attribution_average = []
    attribution_names = []
    half_input = input_length // 2

    # The context manager closes the bed file even if a region fails to parse.
    with open(original_bed_path, 'r') as bed_obj:
        # Average over all windows in the original sequence
        for region in bed_obj:
            if not region.strip() or region.startswith('#'):
                continue
            fields = region.split()
            start, end, name = int(fields[1]), int(fields[2]), fields[3]
            region_length = end - start
            half_region = region_length // 2

            # Same window count as used when the windows were created: the input length
            # only widens the span in the input centric view. Clamped at zero so the
            # weight below can never become negative.
            span = region_length + input_length * int(input_centric)
            full, rest = divmod(span, 2 * step_size)
            n_splits = max(0, int(full) - 1 + int(rest > 0))
            n_ = n_splits * 2 + 1
            weight = 1 / n_

            # Central window: keep the part that covers the region. The multiplication
            # creates a new array, so att_values itself is not modified below.
            left_pad = (input_length - region_length) // 2
            att_average = weight * att_values[att_names == name + '_p'][
                ..., max(0, left_pad): min(input_length, region_length + left_pad)]
            print(f"Attribution average shape: {att_average.shape}")

            for i in range(n_splits):
                shift = step_size * (i + 1)  # distance of window i from the central window
                att_namep = f'{name}_cp{i}'
                att_namem = f'{name}_cm{i}'
                if att_namep in att_names:
                    idx = np.where(att_names == att_namep)[0]
                    # Window shifted to the right.
                    # Position in the region where this window starts (negative: before the region)
                    start_at = half_region - (half_input - shift)
                    # Position in the window where the region starts (negative: before the window)
                    end_at = half_input - half_region - shift
                    start_average = max(0, start_at)
                    # The averaged slice ends at the end of the region or of the window
                    end_average = min(input_length + start_average, region_length)
                    # If half_input > half_region + shift, the region starts inside the window
                    start_attr = max(0, end_at)
                    end_attr = min(input_length, end_at + region_length)
                    att_average[..., start_average: end_average] += weight * att_values[idx][..., start_attr: end_attr]
                if att_namem in att_names:
                    idx = np.where(att_names == att_namem)[0]
                    # Window shifted to the left.
                    start_at = half_region - (half_input + shift)
                    start_average = max(0, start_at)
                    # The averaged slice ends at the end of the region, or earlier if the
                    # window ends before the region does
                    end_average = min(input_length + start_average,
                                      region_length + min(0, half_input - half_region - shift))
                    # Position in the window where the region starts
                    start_attr = max(0, half_input - half_region + shift)
                    end_attr = min(input_length, start_attr + region_length)
                    att_average[..., start_average: end_average] += weight * att_values[idx][..., start_attr: end_attr]

            attribution_average.append(att_average)
            attribution_names.append(name)

    # Concatenate and convert to arrays
    attribution_average = np.concatenate(attribution_average, axis = 0)
    attribution_names = np.array(attribution_names)
    print(f"Attribution average shape: {attribution_average.shape}, names: {attribution_names}")

    return attribution_average, attribution_names, att_exp


# Combination methods: Average, Weighted, Max 
# Weighted would weight each window from the center differently with 1/((j+1)*n_), giving higher weights to intervals that are close to the window
# We would multiply this weight tensor beforehand with the attribution values.
# Need to generate two weighted tensors, one with atac_window_size and one with cr_window_size
combination_method = 'average'

# Create output centric view
# Note: this is only possible if the sequence was created from a bed file.
# Read in attributions
# File written by the model run above: <outdir>from<model_file>_<attribution_type>all.npz
attarrays=outdir+'from'+model_file+'_'+attribution_type+'all.npz'
# NOTE(review): allow_pickle=True can execute code stored in the file; only load files
# produced by your own model run.
attributions = np.load(attarrays, allow_pickle=True)
att_names = attributions['names']
print(f"Attribution names: {att_names}")
att_values = attributions['values']
att_exp = attributions['experiments']

# Call the function to average attributions
attribution_average, attribution_names, att_exp = average_attributions_over_windows(
    original_bed, att_names, att_values, att_exp, input_length, input_centric, step_size, combination_method
)

# Save the average attributions
# From here on `attarrays` points to the averaged file ('...allavg.npz'), which is what
# the visualization cell below reads.
attarrays = attarrays.replace('.npz', 'avg.npz')
np.savez_compressed(attarrays, values=attribution_average, names=attribution_names, experiments=att_exp)

Attribution names: ['ImmGenATAC1219.peak_3_cm0' 'ImmGenATAC1219.peak_3_cm1'
 'ImmGenATAC1219.peak_3_cm2' 'ImmGenATAC1219.peak_3_cp0'
 'ImmGenATAC1219.peak_3_cp1' 'ImmGenATAC1219.peak_3_cp2'
 'ImmGenATAC1219.peak_3_p']
Attribution average shape: (1, 1, 4, 1000)
Attribution average shape: (1, 1, 4, 1000), names: ['ImmGenATAC1219.peak_3']


## Visualize all selected attributions

In [46]:
# Visualize attributions
# Some visualization choices
remove_edges_with_less = 0.5 # removes left and right parts of the attribution map that don't have attribtuions larger than this fraction of the maximum in the sequence
dpi=100 # resolution

from drg_tools.io_utils import readinfasta
#attarrays=outdir+'from'+model_file+'_'+attribution_type+'all.npz'
if create_sequence:
    seq_names, seqs = readinfasta(original_sequence)
    sequencenpz = original_sequencenpz
else:
    seq_names, seqs = readinfasta(sequence)

jpgs = []
if plotpermodality:
    for modal in modalities:
        for s, seq in enumerate(seq_names):
            print(seq, modal)
            !python {drgclis+'sequence_attributions/run_plot_acrosstracks_attribution_maps.py'} {attarrays} {sequencenpz} {seq} {'musthave='+modal} --remove_low_attributions {remove_edges_with_less} --dpi {dpi}
            jpgs.append(f'{attarrays.rsplit('.',1)[0]}_{seq}_{modal}.jpg')
else:
    for s, seq in enumerate(seq_names):
        print(seq)
        !python {drgclis+'sequence_attributions/run_plot_acrosstracks_attribution_maps.py'} {attarrays} {sequencenpz} {seq} 'all' --remove_low_attributions {remove_edges_with_less} --dpi {dpi}
        jpgs.append(f'{attarrays.rsplit('.',1)[0]}_{seq}_all.jpg')

ImmGenATAC1219.peak_3 ATAC
ImmGenATAC1219.peak_3 ImmGenATAC1219.peak_3 ['B.Fem.Sp.ATAC']
(1, 1000, 4)
New range [np.int64(89), np.int64(890)]
./results/fromCTCFaH3K27acaH3K36me3aH3K4me3aH33aH3K27me3aH3K4me1aATAConseq2krcomp_mh0-cv10-1_Cormsek512l19TfEXPGELUmax10rcTvlCota_tc2dNoned1s1r1l7ma5nfc3s1024cbnoTfdo0.1tr1e-05SGD0.9bs64-F_gradallavg_ImmGenATAC1219.peak_3_ATAC.jpg


In [None]:
# Show all the figure files collected in `jpgs` by the visualization cell.
from IPython.display import Image, display
for jpg in jpgs:
    print(jpg)
    # Only the last expression of a cell is rendered automatically; inside a loop the
    # image has to be passed to display() explicitly, otherwise nothing is shown.
    display(Image(filename=jpg))

./results/fromCTCFaH3K27acaH3K36me3aH3K4me3aH33aH3K27me3aH3K4me1aATAConseq2krcomp_mh0-cv10-1_Cormsek512l19TfEXPGELUmax10rcTvlCota_tc2dNoned1s1r1l7ma5nfc3s1024cbnoTfdo0.1tr1e-05SGD0.9bs64-F_gradall_ImmGenATAC1219.peak_3_all.jpg


## Save all selected attributions as bigwig files

In [None]:
chromsizes_path = './data/inputs/chromsizes.pkl'
# NOTE(review): split("_")[0] keeps everything before the FIRST underscore of the whole path.
# A bed name or directory containing '_' shortens the prefix, and the result depends on
# whether the visualization cell already rebound `sequencenpz` — verify that this is the
# pos_info file written by the fasta generation with --save_pos_info.
pos_info_path = f'{sequencenpz.split("_")[0]}_pos_info.pkl' # get this path based on sequencenpz path

In [None]:
from drg_tools.io_utils import readinfasta
attarrays=outdir+'from'+model_file+'_'+attribution_type+'all.npz'
seq_names, seqs = readinfasta(sequence)

jpgs = []
if plotpermodality:
    for modal in modalities:
        for s, seq in enumerate(seq_names):
            print(seq, modal)
            !python {drgclis+'sequence_attributions/run_plot_acrosstracks_attribution_maps.py'} {attarrays} {sequencenpz} {seq} {'musthave='+modal} --chromsizes_path {chromsizes_path} --pos_info_path {pos_info_path} --'save_bw'
            jpgs.append(f'{attarrays.rsplit('.',1)[0]}_{seq}_{modal}.jpg')
else:
    for s, seq in enumerate(seq_names):
        print(seq)
        !python {drgclis+'sequence_attributions/run_plot_acrosstracks_attribution_maps.py'} {attarrays} {sequencenpz} {seq} 'all' --chromsizes_path {chromsizes_path} --pos_info_path {pos_info_path} --'save_bw'
        jpgs.append(f'{attarrays.rsplit('.',1)[0]}_{seq}_all.jpg')

## Extract seqlets from attributions, cluster seqlets, normalize cluster motifs to use with tomtom 
#### These steps are from https://github.com/LXsasse/DRG/blob/main/examples/Attribution_analysis.md

#### Extract seqlets from attributions

In [None]:
sigcut=1.96 # z-score cut off for significant motifs
maxgap=1 # maximum gap length
minsize=4 # minimum number of significant bases in a 'motif'
norm='global' # Z-score normalization derived from all attributions. Alternatiely, 'seq' if each sequence should be normalized individually, 'condition' if all sequences in a track should be normalized individually, 'std 0.1' if we want to devide by the standard deviation of 0.1 

!python {drgclis+'sequence_attributions/run_extract_motifs_from_attributionmaps.py'} {attarrays} {sequencenpz} {sigcut} {maxgap} {minsize} {norm} --select_tracks all

seqleteffects=f'{attarrays.split('.npz')[0]}_globalmotifs1.96_1_4.txt'
seqlets=f'{attarrays.split('.npz')[0]}_globalmotifs1.96_1_4.npz'

#### Cluster seqlets

In [None]:
# Cluster the extracted seqlets (complete linkage, correlation p-value distance, threshold 0.05)
!python {drgclis+'interpret_models/compute_pwm_correlation_cluster_and_combine.py'} {seqlets} complete --distance_threshold 0.05 --distance_metric correlation_pvalue --clusternames

# Expected output files of the clustering call.
# NOTE(review): the suffixes repeat the settings of the call above (minsize 4, complete,
# 0.05, correlation_pvalue); they need to be adapted by hand if those settings change.
seqlet_clusters=f'{os.path.splitext(seqlets)[0]}ms4_cldcomplete0.05corpva.txt'
seqlet_clustermotifs=f'{os.path.splitext(seqlets)[0]}ms4_cldcomplete0.05corpvapfms.meme'

#### Normalize cluster motifs to use with tomtom

In [None]:
# Note: use --custom_outname to avoid errors based on automatically generated filename being too long 

In [None]:
strip=0 # To avoid 'No entries in pwm' error from parse_motifs_tomeme, set strip=0
norm_cluster_outname = 'norm_cluster_motifs' # short custom name, avoids over-long generated file names
!python {drgclis+'interpret_models/parse_motifs_tomeme.py'} {seqlet_clustermotifs} --transform exp,strip={strip},norm --'custom_outname' {norm_cluster_outname}
# Expected location of the normalized motifs: next to the seqlet file.
# NOTE(review): assumes the script writes '<custom_outname>.meme' into that directory — confirm.
norm_seqlet_clustermotifs = f'{os.path.dirname(seqlets)}/{norm_cluster_outname}.meme'

## Use tomtom to match cluster motif to a database, and save .bed files for the positions of significant motif matches 

In [None]:
# Note: the number of .bed files saved will be number of sequences x number of tracks 

In [None]:
# Motif database the cluster motifs are matched against
motif_database_path='./data/mouse_pfms_v4.meme'
save_dir = './results/' # where the .bed files are saved 

In [None]:
# Match the normalized cluster motifs to the database and write the motif positions as .bed files.
# Uses the cluster assignments (`seqlet_clusters`) and the position info saved during fasta generation.
!python  {drgclis+'sequence_attributions/save_motif_labels_as_bed.py'} --motif_database_path {motif_database_path} --input_seqlet_path {norm_seqlet_clustermotifs} --motif_locations_info_path {seqlet_clusters} --pos_info_path {pos_info_path} --save_dir {save_dir}