In [1]:
import sys
import glob
import numpy
import scipy
import h5py

from scipy import sparse

from dragonnfruit.io import LocusGenerator
from dragonnfruit.io import GenomewideGenerator

from dragonnfruit.models import CellStateController
from dragonnfruit.models import DynamicBPNet
from dragonnfruit.models import DragoNNFruit
from dragonnfruit.preprocessing import extract_fragments
from dragonnfruit.preprocessing import preprocess_sparse_atac
from dragonnfruit.preprocessing import read_chrom_sizes
from dragonnfruit.io import save_data, load_data


import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import muon as mu
from muon import atac as ac

import collections
from scipy.sparse import csc_matrix

data_dir = "/workspaces/torch_ddsm/_data_pool1"


In [2]:
### get the filtered cells

import muon as mu

multiome_file = f"{data_dir}/10x_data/pbmc3kv2/pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5"
mdata = mu.read_10x_h5(multiome_file)


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


Added `interval` annotation for features from /workspaces/torch_ddsm/_data_pool1/10x_data/pbmc3kv2/pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5


  data_mod.loc[:, colname] = col
  data_mod.loc[:, colname] = col


In [3]:
chrom_sizes, header = read_chrom_sizes("/workspaces/torch_ddsm/_data_pool1/general/hg38.chrom.sizes")
canonical_chroms = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 
		'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 
		'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 
		'chr22', 'chrX']

filtered_chrom_sizes = {key: value for key, value in chrom_sizes.items() if key in canonical_chroms}

chroms = list(filtered_chrom_sizes.keys())

parameters = {
	'fragments': ["/workspaces/torch_ddsm/_data_pool1/10x_data/pbmc3kv2/pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz"],
	'sequences': "/workspaces/torch_ddsm/_data_pool1/10x_data/refdata-gex-GRCh38-2020-A/fasta/genome.fa",
	'chrom_sizes': filtered_chrom_sizes,
	'chroms': chroms,
	'include_cells': [mdata.obs_names.tolist()],
	'exclude_cells': None,
	'cell_name_prefixes': '',
	'max_fragment_length': 1000,
	'start_offset': 4,
	'end_offset': -5,
	'n_jobs': 16,
	'verbose': True,
	'signals':'dragonnfruit_data.h5',
	'read_depths':'atac_read_depths.npz',
	'save_dir': '/workspaces/torch_ddsm/_data_pool1/test_dragonnfruit_pbmc',
    'loci': '/workspaces/torch_ddsm/_data_pool1/10x_data/pbmc3kv2/pbmc_granulocyte_sorted_3k_atac_peaks.bed',
    'n_neighbors': 500,
	'n_components': 50,
 	'neighbors': 'atac_neighbors.npz',
  	'pca': 'atac_pca.npz',
	'count_matrix': 'atac_cellxpeak_counts.npz',
	'name' : "pbmc_granulocyte_sorted_3k_atac",
}
X_cscs, read_depths = extract_fragments(
		fragments=parameters['fragments'],
		chrom_sizes=parameters['chrom_sizes'],
		chroms=parameters['chroms'],
		include_cells=parameters['include_cells'],
		exclude_cells=parameters['exclude_cells'],
		cell_name_prefixes=parameters['cell_name_prefixes'],
		max_fragment_length=parameters['max_fragment_length'],
		start_offset=parameters['start_offset'],
		end_offset=parameters['end_offset'],
		n_jobs=parameters['n_jobs'],
		verbose=parameters['verbose']
)

os.makedirs(parameters['save_dir'], exist_ok=True)
save_data(f"{parameters['save_dir']}/{parameters['signals']}", X_cscs)
numpy.savez_compressed(f"{parameters['save_dir']}/{parameters['read_depths']}", read_depths)

44110005it [01:56, 379643.05it/s]



/workspaces/torch_ddsm/_data_pool1/10x_data/pbmc3kv2/pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz
Cell included:  2711
Cells excluded:  0
Cells not included:  459458
Cells added:  0
Cell inclusion list size:  2711
Cell exclusion list size:  0
Frag. included count:  39070633
Frag. filtered by chrom:  16397
Frag. filtered by length:  98970



In [4]:
cellxlocus, X_pca, neighbors = preprocess_sparse_atac(
    X_cscs, 
    peaks=parameters['loci'], 
    chroms=parameters['chroms'],
    n_components=parameters['n_components'], 
    n_neighbors=parameters['n_neighbors'],
    verbose=parameters['verbose'])

scipy.sparse.save_npz(f"{parameters['save_dir']}/{parameters['count_matrix']}", cellxlocus)
numpy.savez_compressed(f"{parameters['save_dir']}/{parameters['pca']}", X_pca)
numpy.savez_compressed(f"{parameters['save_dir']}/{parameters['neighbors']}", neighbors)

98319it [00:08, 11375.93it/s]


### chrom pseudobulk

In [5]:
from dragonnfruit.preprocessing import create_pseudobulks
create_pseudobulks(
    reads=X_cscs, 
    output_filename=f"{parameters['save_dir']}/pbulk.bw", 
    chrom_sizes=parameters['chrom_sizes'],
    chroms=parameters['chroms'])

In [6]:
X_cscs['chr1']

<2711x248956422 sparse matrix of type '<class 'numpy.int8'>'
	with 7384156 stored elements in Compressed Sparse Column format>

### Fit bias model

##### Does this need to be cell-wise bias?

In [7]:
from bpnetlite.negatives import extract_matching_loci

bp_parameters = {
	'peaks': '/workspaces/torch_ddsm/_data_pool1/10x_data/pbmc3kv2/pbmc_granulocyte_sorted_3k_atac_peaks.bed',
	'fasta': parameters['sequences'],
	'bin_width': 0.02,
	'max_n_perc': 0.1,
	'bigwig': '/workspaces/torch_ddsm/_data_pool1/test_dragonnfruit_pbmc/pbulk.bw',
	'beta': 0.5,
	'in_window': 2114,
	'out_window': 1000,
	'verbose': True,
    'chroms': chroms,
    'max_jitter': 128,
    'output': 'matched_loci.bed',
}

# Extract regions that match the GC content of the peaks
matched_loci = extract_matching_loci(
    loci=parameters['loci'], 
    fasta=bp_parameters['fasta'],
    gc_bin_width=bp_parameters['bin_width'],
    max_n_perc=bp_parameters['max_n_perc'],
    bigwig=bp_parameters['bigwig'],
    signal_beta=bp_parameters['beta'],
    in_window=bp_parameters['in_window'],
    out_window=bp_parameters['out_window'],
    verbose=bp_parameters['verbose']
)

matched_loci.to_csv(f"{parameters['save_dir']}/{bp_parameters['output']}", header=False, sep='\t', index=False)

Loading Loci: 100%|██████████| 98319/98319 [01:16<00:00, 1279.07it/s]




100%|██████████| 33/33 [00:00<00:00, 176.28it/s]


GC Bin	Background Count	Peak Count	Chosen Count
0.00:        0	       0	       0
0.02:        0	       0	       0
0.04:        0	       0	       0
0.06:        6	       0	       0
0.08:       10	       0	       0
0.10:       12	       0	       0
0.12:       17	       0	       0
0.14:       27	       0	       0
0.16:       39	       0	       0
0.18:       37	       0	       0
0.20:       69	       0	       0
0.22:      124	       4	       4
0.24:      236	       7	       7
0.26:     1361	      12	      12
0.28:     8222	      56	      56
0.30:    30614	     275	     275
0.32:    70314	    1004	    1004
0.34:   117033	    2629	    2629
0.36:   157741	    4986	    4986
0.38:   180971	    7274	    7274
0.40:   155453	    9064	    9064
0.42:   113742	   10147	   10147
0.44:    85768	    9823	    9823
0.46:    62739	    9025	    9025
0.48:    40418	    7931	    7931
0.50:    25249	    7149	    7149
0.52:    15291	    5870	    5870
0.54:     9573	    5069	    6571
0.56:     6428	    4221	    

Loading Loci: 100%|██████████| 98319/98319 [02:32<00:00, 644.67it/s] 


GC-bin KS test stat:0.0429, p-value 6.01e-79
Peak Robust Signal Minimum: 28.0
Matched Signal Maximum: 14.0


#### Train the bias model

In [8]:
# default bias parameters
from bpnetlite.io import PeakGenerator
from bpnetlite.io import extract_loci


training_data = PeakGenerator(
    loci=matched_loci, 
    sequences=parameters['sequences'],
    signals=[f"{parameters['save_dir']}/pbulk.bw"],
    controls=None,
    chroms=chroms[2:],
    in_window=bp_parameters['in_window'],
    out_window=bp_parameters['out_window'],
    max_jitter=bp_parameters['max_jitter'],
    reverse_complement=True,
    min_counts=None,
    max_counts=None,
    random_state=None,
    batch_size=64,
    verbose=True
)

valid_data = extract_loci(
    sequences=parameters['sequences'],
    signals=[f"{parameters['save_dir']}/pbulk.bw"],
    controls=None,
    loci=matched_loci,
    chroms=chroms[:2],
    in_window=bp_parameters['in_window'],
    out_window=bp_parameters['out_window'],
    max_jitter=bp_parameters['max_jitter'],
    verbose=True,
)

valid_sequences, valid_signals = valid_data
valid_controls = None
n_control_tracks = 0

Loading Loci: 100%|██████████| 82225/82225 [01:03<00:00, 1285.36it/s]
Loading Loci: 100%|██████████| 16094/16094 [00:12<00:00, 1280.41it/s]


In [9]:
bias_fit_parameters = {
    "n_filters": 256,
    "n_layers": 4,
    "max_counts": None,
    "loci": "../../../chromatin-atlas/ATAC/ENCSR637XSC/negatives_data/negatives.bed",
    "verbose": True,
    "random_state": None,
	"alpha": 10,
	"beta": 0.5,
    "lr": 0.0001,
    "max_epochs": 40,
    "batch_size": 512,
    "n_control_tracks": n_control_tracks,
    "valid_controls": valid_controls,
    "n_outputs" : 1,
    "validation_iter": 1000,
}

# bias_fit_parameters = parameters.copy()

name = f"{parameters['name']}.chrombpnet.bias.fit.json"
bias_fit_parameters['name'] = f"{parameters['name']}.bias"
parameters['bias_model'] = f"{bias_fit_parameters['name']}.torch"

min_counts = training_data.dataset.signals.sum(dim=(1, 2)).min().item()
bias_fit_parameters['max_counts'] = min_counts * bias_fit_parameters['beta']

#### Run BPnet fit

In [10]:
import torch

from bpnetlite.bpnet import BPNet

trimming = (bp_parameters['in_window'] - bp_parameters['out_window']) // 2

model = BPNet(n_filters=bias_fit_parameters['n_filters'], 
    n_layers=bias_fit_parameters['n_layers'],
    profile_output_bias=True,
    count_output_bias=True,
    alpha=bias_fit_parameters['alpha'],
    trimming=trimming,
    name=f"{parameters['save_dir']}/{parameters['name']}",
    n_control_tracks=bias_fit_parameters['n_control_tracks'],
    n_outputs=bias_fit_parameters['n_outputs']).cuda()

optimizer = torch.optim.AdamW(model.parameters(), lr=bias_fit_parameters['lr'])

model.fit(training_data, optimizer, X_valid=valid_sequences, 
		X_ctl_valid=valid_controls, y_valid=valid_signals, max_epochs=bias_fit_parameters['max_epochs'], 
    batch_size=bias_fit_parameters['batch_size'],validation_iter=bias_fit_parameters['validation_iter'],verbose=bias_fit_parameters['verbose'])


Epoch	Iteration	Training Time	Validation Time	Training MNLL	Training Count MSE	Validation MNLL	Validation Profile Pearson	Validation Count Pearson	Validation Count MSE	Saved?
0	0	2.0452	3.2416	34.0888	4.0128	34.5447	0.011361877	-0.010295238	2.93	True
0	1000	36.2942	2.7342	23.4506	0.6508	28.5127	0.10970225	0.18837732	0.8406	True
Epoch 0: Average Validation Loss = 57.87244234196407
1	2000	23.6901	2.7375	18.6945	0.9043	28.3096	0.11358431	0.204692	0.8482	True
Epoch 1: Average Validation Loss = 36.862131467589144
2	3000	14.267	2.7512	20.8583	0.6011	28.2609	0.11435855	0.23690756	0.9595	False
Epoch 2: Average Validation Loss = 37.499942527206954
3	4000	4.8318	2.7418	20.6951	0.6695	28.238	0.11467582	0.26719305	0.9278	False
3	5000	35.8539	2.7391	24.1429	0.6328	28.2304	0.11476806	0.29167333	0.826	True
Epoch 3: Average Validation Loss = 37.44275113116906
4	6000	28.5262	2.7427	26.1153	0.7199	28.2077	0.11541208	0.28453133	0.9087	False
Epoch 4: Average Validation Loss = 36.75676926193534
5	7000	19.0

## Run attribution/motif diagnosis on the bias model

#### Attribution params

In [11]:
attribute_parameters = {
	'sequences': parameters['sequences'],
	'loci': parameters['loci'],
 	'batch_size': 5,
	'in_window': 2114,
	'out_window': 1000,
	'verbose': True,
	'max_jitter': 0,
	'chroms': ['chr6', 'chr7', 'chrX'],
	'model': None,
	'output': 'profile',
	'n_shuffles':20,
	'random_state':0,
 	'ohe_filename': f"{parameters['save_dir']}/{parameters['name']}.ohe.npz",
	'attr_filename': f"{parameters['save_dir']}/{parameters['name']}.attr.npz",
}

#### Run attribution on the bias model

In [12]:
attribute_parameters['sequences']

'/workspaces/torch_ddsm/_data_pool1/10x_data/refdata-gex-GRCh38-2020-A/fasta/genome.fa'

In [13]:
from bpnetlite.attributions import attribute

model = torch.load(f"{parameters['save_dir']}/{parameters['name']}.final.torch").cuda()

X = extract_loci(
    sequences=attribute_parameters['sequences'],
    loci=attribute_parameters['loci'],
    chroms=attribute_parameters['chroms'],
    max_jitter=attribute_parameters['max_jitter'],
    verbose=attribute_parameters['verbose']
)

Loading Loci: 100%|██████████| 13711/13711 [00:08<00:00, 1711.30it/s]


In [14]:
X_ctl = None
X_attr = attribute(model, X, args=X_ctl,
    model_output=attribute_parameters['output'], hypothetical=True, 
    n_shuffles=attribute_parameters['n_shuffles'],
    batch_size=attribute_parameters['batch_size'],
    random_state=attribute_parameters['random_state'],
    verbose=attribute_parameters['verbose'])

numpy.savez_compressed(attribute_parameters['ohe_filename'], X)
numpy.savez_compressed(attribute_parameters['attr_filename'], X_attr)

100%|██████████| 2743/2743 [05:55<00:00,  7.71it/s]


#### Modisco params

In [15]:
modisco_motifs_parameters = {
    'n_seqlets': 100000,
    'output_filename': f"{parameters['save_dir']}/{parameters['name']}_modisco_results.h5",
    'verbose': True,
    'n_leiden': 2,
    'window': 400,
}

#### Run modisco on the attributions

In [16]:
import modiscolite
from modiscolite.util import calculate_window_offsets

sequences = np.load(attribute_parameters['ohe_filename'])['arr_0']
attributions = np.load(attribute_parameters['attr_filename'])['arr_0']

center = sequences.shape[2] // 2
start, end = calculate_window_offsets(center, modisco_motifs_parameters['window'])

sequences = sequences[:, :, start:end].transpose(0, 2, 1)
attributions = attributions[:, :, start:end].transpose(0, 2, 1)

if sequences.shape[1] < modisco_motifs_parameters['window']:
    raise ValueError("Window ({}) cannot be ".format(modisco_motifs_parameters['window']) +
        "longer than the sequences".format(sequences.shape))

sequences = sequences.astype('float32')
attributions = attributions.astype('float32')

pos_patterns, neg_patterns = modiscolite.tfmodisco.TFMoDISco(
    hypothetical_contribs=attributions, 
    one_hot=sequences,
    max_seqlets_per_metacluster=modisco_motifs_parameters['n_seqlets'],
    sliding_window_size=20,
    flank_size=5,
    target_seqlet_fdr=0.05,
    n_leiden_runs=modisco_motifs_parameters['n_leiden'],
    verbose=modisco_motifs_parameters['verbose'])

modiscolite.io.save_hdf5(modisco_motifs_parameters['output_filename'], pos_patterns, neg_patterns, modisco_motifs_parameters['window'])

Using 11092 positive seqlets
Extracted 3999 negative seqlets


#### Parameterize the modisco report

In [17]:
modisco_motifs_parameters['output_filename']

'/workspaces/torch_ddsm/_data_pool1/test_dragonnfruit_pbmc/pbmc_granulocyte_sorted_3k_atac_modisco_results.h5'

In [18]:
modisco_report_parameters= {
    'motifs': '/workspaces/torch_ddsm/_data_pool1/jaspar/JASPAR2020_CORE_vertebrates_non-redundant_pfms_meme.txt',
    'output_folder': f"{parameters['save_dir']}/{parameters['name']}_modisco/",
    'verbose': True,
    'input': modisco_motifs_parameters['output_filename'],
    
}

#### Generate the modisco report

In [19]:
# Step 5: Generate the tf-modisco report
modiscolite.report.report_motifs(
    modisco_report_parameters['input'], 
    modisco_report_parameters['output_folder'],
    img_path_suffix=modisco_report_parameters['output_folder'], 
    meme_motif_db=modisco_report_parameters['motifs'],
    is_writing_tomtom_matrix=True,
    top_n_matches=3)

In [20]:
from IPython.display import HTML

# Open the HTML file and read its content
with open(
    '/workspaces/torch_ddsm/_data_pool1/test_dragonnfruit_pbmc/pbmc_granulocyte_sorted_3k_atac_modisco/motifs.html', 
    'r') as file:
    html_content = file.read()

# Display the HTML content
HTML(html_content)

pattern,num_seqlets,modisco_cwm_fwd,modisco_cwm_rev,match0,qval0,match0_logo,match1,qval1,match1_logo,match2,qval2,match2_logo
pos_patterns.pattern_0,1749,,,MA1643.1,0.564803,,MA0751.1,0.564803,,MA0697.1,0.564803,
pos_patterns.pattern_1,1396,,,MA0751.1,1.0,,MA1584.1,1.0,,MA0697.1,1.0,
pos_patterns.pattern_2,627,,,MA1615.1,1.0,,MA0162.4,1.0,,MA0484.2,1.0,
pos_patterns.pattern_3,264,,,MA1627.1,1.0,,MA0795.1,1.0,,MA1557.1,1.0,
pos_patterns.pattern_4,228,,,MA0800.1,0.664061,,MA0688.1,0.68277,,MA1155.1,0.68277,
pos_patterns.pattern_5,185,,,MA1522.1,0.03784,,MA1513.1,0.086496,,MA1578.1,0.17119,
pos_patterns.pattern_6,181,,,MA0139.1,1.0,,,,,,,
pos_patterns.pattern_7,166,,,MA0836.2,0.999993,,,,,,,
pos_patterns.pattern_8,166,,,MA1513.1,0.000856,,MA0039.4,0.009236,,MA0146.2,0.015455,
pos_patterns.pattern_9,161,,,MA0146.2,1.0,,MA0503.1,1.0,,MA1616.1,1.0,
