# Set-up

In [227]:
# Imports
import os
import glob
import yaml
import pandas as pd

In [228]:
# Paths
path_fragments = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/4_integration/atac/pseudobulk/rna_celltype/12-1/fragments"
path_narrowPeaks = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/5_peak_analysis/peak_calls/rna_celltype/12-1/snapatac2"

path_genome_fasta = "/cellar/users/aklie/data/ref/genomes/hg38/hg38.fa"
path_chromsizes = "/cellar/users/aklie/data/ref/genomes/hg38/hg38.chrom.sizes"
path_blacklist = "/cellar/users/aklie/data/ref/genomes/hg38/blacklist/blacklist.bed.gz"
path_fold_dir = "/cellar/users/aklie/data/ref/genomes/hg38/chrombpnet/splits"
path_out = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k"

os.makedirs(path_out, exist_ok=True)

# Find files

In [229]:
# Get count bigwigs
fragments = {}
for f in glob.glob(f"{path_fragments}/*.bed.gz"):
    celltype = os.path.basename(f).split(".bed.gz")[0]
    fragments[celltype] = f
fragments

{'SC.alpha': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/4_integration/atac/pseudobulk/rna_celltype/12-1/fragments/SC.alpha.bed.gz',
 'SC.beta': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/4_integration/atac/pseudobulk/rna_celltype/12-1/fragments/SC.beta.bed.gz',
 'SC.EC': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/4_integration/atac/pseudobulk/rna_celltype/12-1/fragments/SC.EC.bed.gz',
 'SC.delta': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/4_integration/atac/pseudobulk/rna_celltype/12-1/fragments/SC.delta.bed.gz'}

In [230]:
# Get narrowPeaks
narrowPeaks = {}
for f in glob.glob(f"{path_narrowPeaks}/*.narrowPeak"):
    celltype = os.path.basename(f).split(".narrowPeak")[0]
    narrowPeaks[celltype] = f
narrowPeaks = {k: v for k, v in narrowPeaks.items() if "top" not in k}
narrowPeaks

{'SC.delta': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/5_peak_analysis/peak_calls/rna_celltype/12-1/snapatac2/SC.delta.narrowPeak',
 'SC.alpha': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/5_peak_analysis/peak_calls/rna_celltype/12-1/snapatac2/SC.alpha.narrowPeak',
 'SC.EC': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/5_peak_analysis/peak_calls/rna_celltype/12-1/snapatac2/SC.EC.narrowPeak',
 'SC.beta': '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/5_peak_analysis/peak_calls/rna_celltype/12-1/snapatac2/SC.beta.narrowPeak'}

In [231]:
# Run for each peak set sort -k8,8nr input.narrowPeak | head -n 250000 > top_250k_peaks.narrowPeak
for celltype, narrowPeak in narrowPeaks.items():
    os.makedirs(f"{path_out}/{celltype}", exist_ok=True)
    cmd = f"sort -k8,8nr {narrowPeak} | head -n 250000 > {path_out}/{celltype}/{celltype}.top250k.narrowPeak"
    print(cmd)
    #os.system(cmd)
    narrowPeaks[celltype] = f"{path_out}/{celltype}/{celltype}.top250k.narrowPeak"

sort -k8,8nr /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/5_peak_analysis/peak_calls/rna_celltype/12-1/snapatac2/SC.delta.narrowPeak | head -n 250000 > /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.delta/SC.delta.top250k.narrowPeak
sort -k8,8nr /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/5_peak_analysis/peak_calls/rna_celltype/12-1/snapatac2/SC.alpha.narrowPeak | head -n 250000 > /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.alpha/SC.alpha.top250k.narrowPeak
sort -k8,8nr /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/5_peak_analysis/peak_calls/rna_celltype/12-1/snapatac2/SC.EC.narrowPeak | head -n 250000 > /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.EC/SC.EC.top250k.narrowPeak
sort -k8,8nr /cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multi

In [232]:
# Make sure keys are the same
assert set(fragments.keys()) == set(narrowPeaks.keys())

In [233]:
# Define the groups to train models on
groups = list(fragments.keys())
groups

['SC.alpha', 'SC.beta', 'SC.EC', 'SC.delta']

# Negatives

In [234]:
path_scripts = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/bin/8_chrombpnet/scripts/negatives"
name = "celltype_250k"

In [235]:
folds = 1
celltypes = []
peaks = []
output_dirs = []
fold_lst = []
for group in groups:
    for i in range(folds):
        outdir = f"{path_out}/{group}/fold_{i}/negatives"
        os.makedirs(outdir, exist_ok=True)
        peakset = narrowPeaks[group]
        celltypes.append(group)
        peaks.append(peakset)
        output_dirs.append(outdir)
        fold_lst.append(i)

In [236]:
# Print out celltypes in same way I would write a bash array so I can copy paste into a script
celltypes_str="celltypes=(\n"
for celltype in celltypes:
    celltypes_str += f"\t{celltype}\n"
celltypes_str += ")"
print(celltypes_str)

celltypes=(
	SC.alpha
	SC.beta
	SC.EC
	SC.delta
)


In [237]:
# Samme for peaks
peaks_str="peaks=(\n"
for peak in peaks:
    peaks_str += f"\t{peak}\n"
peaks_str += ")"
print(peaks_str)

peaks=(
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.alpha/SC.alpha.top250k.narrowPeak
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.beta/SC.beta.top250k.narrowPeak
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.EC/SC.EC.top250k.narrowPeak
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.delta/SC.delta.top250k.narrowPeak
)


In [238]:
# Samme for output_dirs
output_dirs_str="output_dirs=(\n"
for output_dir in output_dirs:
    output_dirs_str += f"\t{output_dir}\n"
output_dirs_str += ")"

In [239]:
#  Samme for fold_lst
fold_lst_str="folds=(\n"
for fold in fold_lst:
    fold_lst_str += f"\t{fold}\n"
fold_lst_str += ")"
print(fold_lst_str)

folds=(
	0
	0
	0
	0
)


In [240]:
# Create a template script for negatives that looks like this
template = '''#! /bin/bash

#####
# Script to run chrombpnet negatives command
# USAGE: sbatch \
--job-name=negatives \
--partition carter-compute \
--output slurm_logs/%x.%A.%a.out \
--mem=16G \
-n 1 \
-t 02-00:00:00 \
--array=1-12%12 \
chrombpnet_negatives.sh
#####

date
echo -e "Job ID: $SLURM_JOB_ID\\n"

# Set-up env
source activate chrombpnet

# file lists
{}
{}
{}
{}

# Grab each for this SLURM task
celltype=${{celltypes[$SLURM_ARRAY_TASK_ID - 1]}}
peak=${{peaks[$SLURM_ARRAY_TASK_ID - 1]}}
fold=${{folds[$SLURM_ARRAY_TASK_ID - 1]}}
output_dir=${{output_dirs[$SLURM_ARRAY_TASK_ID - 1]}}

# echo the celltype and peak
echo -e "Celltype: $celltype"
echo -e "Peakset: $peak"
echo -e "Fold: $fold"
echo -e "Output directory: $output_dir\\n"

# make the output directory
mkdir -p $output_dir

# Run cmd
cmd="chrombpnet prep nonpeaks \
-g {} \
-c {} \
-p $peak \
-fl {}/fold_${{fold}}.json \
-br {} \
-o $output_dir/${{celltype}}"
echo -e "Running command:\\n$cmd\\n"
eval $cmd

# Date
date
'''

In [241]:
# Write the script
with open(f"{path_scripts}/{name}_negatives.sh", "w") as f:
    f.write(template.format(
        celltypes_str,
        peaks_str,
        fold_lst_str,
        output_dirs_str,
        path_genome_fasta,
        path_chromsizes,
        path_fold_dir,
        path_blacklist
    ))

# Bias models

In [242]:
path_scripts = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/bin/8_chrombpnet/scripts/bias_pipeline"
os.makedirs(path_scripts, exist_ok=True)
name = "celltype_250k"
beta = 0.8

In [243]:
# Find negatives
negatives = {}
for outdir in output_dirs:
    group = outdir.split("/")[-3]
    negatives[group] = f"{outdir}/{group}_negatives.bed"

In [245]:
# Update output_dirs to replace "negatives" with bias_model/$beta
output_dirs = [f"{outdir.replace('/negatives', '')}/bias_model/$beta" for outdir in output_dirs]
output_dirs

['/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.alpha/fold_0/bias_model/$beta',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.beta/fold_0/bias_model/$beta',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.EC/fold_0/bias_model/$beta',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.delta/fold_0/bias_model/$beta']

In [246]:
# Same for neegatives
negatives_str="negatives=(\n"
for negative in negatives.values():
    negatives_str += f"\t{negative}\n"
negatives_str += ")"
print(negatives_str)

negatives=(
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.alpha/fold_0/negatives/SC.alpha_negatives.bed
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.beta/fold_0/negatives/SC.beta_negatives.bed
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.EC/fold_0/negatives/SC.EC_negatives.bed
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.delta/fold_0/negatives/SC.delta_negatives.bed
)


In [247]:
# Same for output_dirs
output_dirs_str="output_dirs=(\n"
for output_dir in output_dirs:
    output_dirs_str += f"\t{output_dir}\n"
output_dirs_str += ")"
print(output_dirs_str)

output_dirs=(
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.alpha/fold_0/bias_model/$beta
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.beta/fold_0/bias_model/$beta
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.EC/fold_0/bias_model/$beta
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_chrombpnet/rna_celltype_250k/SC.delta/fold_0/bias_model/$beta
)


In [248]:
# Same for fragments
fragments_str="fragments=(\n"
for fragment in fragments.values():
    fragments_str += f"\t{fragment}\n"
fragments_str += ")"
print(fragments_str)

fragments=(
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/4_integration/atac/pseudobulk/rna_celltype/12-1/fragments/SC.alpha.bed.gz
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/4_integration/atac/pseudobulk/rna_celltype/12-1/fragments/SC.beta.bed.gz
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/4_integration/atac/pseudobulk/rna_celltype/12-1/fragments/SC.EC.bed.gz
	/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/4_integration/atac/pseudobulk/rna_celltype/12-1/fragments/SC.delta.bed.gz
)


In [249]:
# Create a template script for negatives that looks like this
template = '''#! /bin/bash

#####
# Script to run chrombpnet bias model pipeline on single input sample in fragment file
# USAGE: sbatch \
--job-name=chrombpnet_bias_pipeline \
--account carter-gpu \
--partition carter-gpu \
--gpus=a30:1 \
--output slurm_logs/%x.%A.%a.out \
--mem=128G \
-n 4 \
-t 02-00:00:00 \
--array=1-12%12 \
chrombpnet_bias_pipeline.sh
#####

date
echo -e "Job ID: $SLURM_JOB_ID\\n"

# Set-up env
source activate chrombpnet

# file lists
beta={}
{}
{}
{}
{}
{}
{}

# Grab each for this SLURM task
celltype=${{celltypes[$SLURM_ARRAY_TASK_ID - 1]}}
fragment=${{fragments[$SLURM_ARRAY_TASK_ID - 1]}}
peak=${{peaks[$SLURM_ARRAY_TASK_ID - 1]}}
fold=${{folds[$SLURM_ARRAY_TASK_ID - 1]}}
negative=${{negatives[$SLURM_ARRAY_TASK_ID - 1]}}
output_dir=${{output_dirs[$SLURM_ARRAY_TASK_ID - 1]}}

# echo the celltype and peak
echo -e "Celltype: $celltype"
echo -e "Fragment: $fragment"
echo -e "Peakset: $peak"
echo -e "Fold: $fold"
echo -e "Negatives: $negative"
echo -e "Output directory: $output_dir\\n"

# make the output directory
mkdir -p $output_dir

# Run cmd
cmd="chrombpnet bias pipeline \
-ifrag $fragment \
-d "ATAC" \
-g {} \
-c {} \
-p $peak \
-n $negative \
-fl {}/fold_${{fold}}.json \
-b $beta \
-o $output_dir \
-fp $celltype"
echo -e "Running command:\\n$cmd\\n"
eval $cmd

# Date
date
'''

In [250]:
# Write the script
with open(f"{path_scripts}/{name}_bias_pipeline.sh", "w") as f:
    f.write(template.format(
        beta,
        celltypes_str,
        fragments_str,
        peaks_str,
        fold_lst_str,
        negatives_str,
        output_dirs_str,
        path_genome_fasta,
        path_chromsizes,
        path_fold_dir,
    ))


# Prep dataset config

In [7]:
# path to template config file
path_config = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/bin/8_sequence_models/configs/prep_dataset_template.yaml"

In [8]:
# load template config
config = yaml.safe_load(open(path_config, 'r'))
config

{'name': 'celltype_name',
 'threads': 4,
 'random_state': 1234,
 'seqdata': {'fasta': '/cellar/users/aklie/data/ref/genomes/hg38/hg38.fa',
  'seq_var': 'seq',
  'bws': ['unstranded.bw'],
  'bw_names': ['celltype_name'],
  'cov_var': 'cov',
  'loci': 'celltype_name.narrowPeak',
  'batch_size': 1000,
  'fixed_length': 2114,
  'target_length': 1000,
  'alphabet': 'DNA',
  'upper_case': False,
  'add_rev_comp': False,
  'max_jitter': 512},
 'negatives': {'gc_bin_width': 0.02,
  'max_n_perc': 0.1,
  'signal': 'unstranded.bw',
  'signal_beta': 0.5,
  'in_window': 2114,
  'out_window': 1000,
  'random_state': 1234},
 'splits': '/cellar/users/aklie/projects/ML4GLand/tutorials/data/splits/ENCODE_cross-val.json'}

In [9]:
# Only use these signal betas
signal_betas = [0.5]

In [25]:
config_paths = []
for i, group in enumerate(groups):
    for signal_beta in signal_betas:
        out_dir = os.path.join(path_out, group, "prep_dataset", f"{signal_beta}")
        os.makedirs(out_dir, exist_ok=True)
        curr_out = os.path.join(out_dir, f"{group}.yaml")
        curr_config = config.copy()
        curr_config["name"] = group
        curr_config["seqdata"]["bws"] = [unstranded_bws[group]]
        curr_config["seqdata"]["bw_names"] = [group]
        curr_config["seqdata"]["loci"] = narrowPeaks[celltypes[i]]
        curr_config["negatives"]["signal"] = curr_config["seqdata"]["bws"][0]
        curr_config['negatives']['signal_beta'] = round(float(signal_beta), 1)
        with open(curr_out, 'w') as f:
            yaml.dump(curr_config, f)
        print(curr_out)
        config_paths.append(curr_out)

/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.EC_dex_72/prep_dataset/0.5/SC.EC_dex_72.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.delta_palmitate_72/prep_dataset/0.5/SC.delta_palmitate_72.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.alpha_dex_72/prep_dataset/0.5/SC.alpha_dex_72.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.beta_Ex-4_HG_24/prep_dataset/0.5/SC.beta_Ex-4_HG_24.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.beta_control_72/prep_dataset/0.5/SC.beta_control_72.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/

In [27]:
# Make a dataframe with paths of configs in first column and output directory in second column
df = pd.DataFrame()
df["config"] = config_paths
df["out_dir"] = [os.path.dirname(p) for p in config_paths]
df.to_csv("/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/bin/8_sequence_models/metadata/prep_dataset_celltype+condition+timepoint.tsv", sep="\t", index=False, header=False)

# Bpnet fit configs

In [12]:
# path to template config file
path_config = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/bin/8_sequence_models/configs/bpnet_fit_template.yaml"

In [13]:
# load template config
config = yaml.safe_load(open(path_config, 'r'))
config

{'name': 'celltype_name',
 'threads': 4,
 'random_state': 1234,
 'seqdata': {'path': 'celltype_name.minimal.seqdata',
  'seq_var': 'seq',
  'cov_var': 'cov',
  'fold': 'fold_0',
  'seq_length': 2114,
  'target_length': 1000,
  'max_jitter': 512,
  'max_counts': 999999,
  'min_counts': 0,
  'outlier_threshold': 0.9999,
  'neg_sampling_ratio': 0.1},
 'model': {'n_filters': 512, 'n_layers': 8, 'n_outputs': 1, 'alpha': None},
 'training': {'learning_rate': 0.001,
  'batch_size': 64,
  'max_epochs': 50,
  'validation_iter': 1000,
  'rc_augment': True,
  'early_stopping': 10},
 'evaluation': {'batch_size': 256},
 'attribution': {'batch_size': 32, 'subsample': 20000, 'n_shuffles': 10},
 'modisco': {'n_seqlets': 50000,
  'window': 500,
  'motif_db': '/cellar/users/aklie/projects/ML4GLand/tutorials/data/motifs.meme.txt'}}

In [16]:
# Grab paths of seqdatas from the previous step
seqdatas = {}
for f in glob.glob(os.path.join(path_out, "*", "prep_dataset", "*", "*.minimal.seqdata")):
    signal_beta = float(f.split("/")[-2])
    group = os.path.basename(f).split(".minimal.seqdata")[0]
    if group not in seqdatas:
        seqdatas[group] = {}
    seqdatas[group][signal_beta] = f
seqdatas

{'SC.delta_palmitate_72': {0.5: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.delta_palmitate_72/prep_dataset/0.5/SC.delta_palmitate_72.minimal.seqdata'},
 'SC.EC_dex_72': {0.5: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.EC_dex_72/prep_dataset/0.5/SC.EC_dex_72.minimal.seqdata'},
 'SC.alpha_dex_72': {0.5: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.alpha_dex_72/prep_dataset/0.5/SC.alpha_dex_72.minimal.seqdata'}}

In [17]:
# How many folds?
folds = 5

In [18]:
# Which signal betas to use?
signal_betas = [0.5]

In [21]:
# Make config files
config_paths = []
for i, group in enumerate(groups):
    if group not in seqdatas:
        continue
    for signal_beta in signal_betas:
        for fold in range(folds):
            out_dir = os.path.join(path_out, group, "bpnet_fit", f"fold_{fold}", f"{signal_beta}")
            os.makedirs(out_dir, exist_ok=True)
            curr_out = os.path.join(out_dir, f"{group}_fold_{fold}.yaml")
            curr_config = config.copy()
            curr_config["name"] = group
            curr_config["seqdata"]["path"] = seqdatas[group][signal_beta]
            curr_config["seqdata"]["fold"] = f"fold_{fold}"
            with open(curr_out, 'w') as f:
                yaml.dump(curr_config, f)
            print(curr_out)
            config_paths.append(curr_out)

/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.EC_dex_72/bpnet_fit/fold_0/0.5/SC.EC_dex_72_fold_0.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.EC_dex_72/bpnet_fit/fold_1/0.5/SC.EC_dex_72_fold_1.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.EC_dex_72/bpnet_fit/fold_2/0.5/SC.EC_dex_72_fold_2.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.EC_dex_72/bpnet_fit/fold_3/0.5/SC.EC_dex_72_fold_3.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.EC_dex_72/bpnet_fit/fold_4/0.5/SC.EC_dex_72_fold_4.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_

In [22]:
# Keep only fold_0 configs
config_paths = sorted([c for c in config_paths if "fold_0" in c])
config_paths

['/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.EC_dex_72/bpnet_fit/fold_0/0.5/SC.EC_dex_72_fold_0.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.alpha_dex_72/bpnet_fit/fold_0/0.5/SC.alpha_dex_72_fold_0.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.delta_palmitate_72/bpnet_fit/fold_0/0.5/SC.delta_palmitate_72_fold_0.yaml']

In [23]:
# Keep only SC.alpha cellypes
#config_paths = [c for c in config_paths if "SC.alpha" in c]
config_paths

['/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.EC_dex_72/bpnet_fit/fold_0/0.5/SC.EC_dex_72_fold_0.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.alpha_dex_72/bpnet_fit/fold_0/0.5/SC.alpha_dex_72_fold_0.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/8_sequence_models/atac/rna_celltype+condition+timepoint/SC.delta_palmitate_72/bpnet_fit/fold_0/0.5/SC.delta_palmitate_72_fold_0.yaml']

In [25]:
# Make a dataframe with paths of configs in first column and output directory in second column
df = pd.DataFrame()
df["config"] = config_paths
df["out_dir"] = [os.path.dirname(p) for p in config_paths]
df.to_csv("/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/bin/8_sequence_models/metadata/bpnet_fit_celltype+condition+timepoint.tsv", sep="\t", index=False, header=False)

# Bias fit configs

In [16]:
# path to template config file
path_config = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/bin/9_sequence_models/configs/bias_fit_template.yaml"

In [17]:
# load template config
config = yaml.safe_load(open(path_config, 'r'))
config

{'name': 'celltype_name',
 'threads': 4,
 'random_state': 1234,
 'seqdata': {'path': 'celltype_name.minimal.seqdata',
  'seq_var': 'seq',
  'cov_var': 'cov',
  'seq_length': 2114,
  'target_length': 1000,
  'max_jitter': 0,
  'max_counts': 0,
  'min_counts': 0,
  'outlier_threshold': 0.9999},
 'model': {'n_filters': 128, 'n_layers': 4, 'n_outputs': 1, 'alpha': None},
 'training': {'learning_rate': 0.001,
  'batch_size': 64,
  'max_epochs': 50,
  'validation_iter': 1000,
  'rc_augment': True,
  'early_stopping': 10},
 'evaluation': {'batch_size': 256},
 'attribution': {'batch_size': 128, 'subsample': 30000, 'n_shuffles': 20},
 'modisco': {'n_seqlets': 50000,
  'window': 500,
  'motif_db': '/cellar/users/aklie/projects/ML4GLand/tutorials/data/motifs.meme.txt'}}

In [18]:
seqdatas = {}
for f in glob.glob(os.path.join(path_out, "*", "prep_dataset", "*", "*.minimal.seqdata")):
    signal_beta = float(f.split("/")[-2])
    celltype = os.path.basename(f).split(".minimal.seqdata")[0]
    if celltype not in seqdatas:
        seqdatas[celltype] = {}
    seqdatas[celltype][signal_beta] = f
seqdatas

{'SC.alpha': {0.5: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alpha/prep_dataset/0.5/SC.alpha.minimal.seqdata',
  0.6: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alpha/prep_dataset/0.6/SC.alpha.minimal.seqdata',
  0.9: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alpha/prep_dataset/0.9/SC.alpha.minimal.seqdata',
  0.7: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alpha/prep_dataset/0.7/SC.alpha.minimal.seqdata'},
 'SC.EC': {0.9: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.EC/prep_dataset/0.9/SC.EC.minimal.seqdata',
  0.7: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.EC

In [19]:
# How many folds?
folds = 5

In [20]:
# Need to pull out Max Negative Counts: value from the following
reports = {}
for f in glob.glob(os.path.join(path_out, "*", "prep_dataset", "*", "*.report.html"), recursive=True):
    signal_beta = float(f.split("/")[-2])
    celltype = os.path.basename(f).split(".report.html")[0]
    with open(f, 'r') as f:
        content = f.read()
        max_neg_counts = float(content.split("Max Negative Counts: ")[1].split(",")[0])
    if celltype not in reports:
        reports[celltype] = {}
    reports[celltype][signal_beta] = max_neg_counts        
reports

{'SC.alpha': {0.5: 110.0, 0.6: 132.0, 0.9: 198.0, 0.7: 154.0},
 'SC.EC': {0.9: 151.0, 0.7: 117.0, 0.5: 84.0, 0.6: 100.0},
 'SC.delta': {0.5: 6.0, 0.6: 7.0, 0.9: 11.0, 0.7: 9.0},
 'SC.beta': {0.7: 144.0, 0.9: 185.0, 0.6: 123.0, 0.5: 103.0}}

In [21]:
signal_betas = [0.5]

In [22]:
config_paths = []
for i, celltype in enumerate(celltypes):
    for signal_beta in signal_betas:
        for fold in range(folds):
            out_dir = os.path.join(path_out, celltype, "bias_fit", f"fold_{fold}", f"{signal_beta}")
            os.makedirs(out_dir, exist_ok=True)
            curr_out = os.path.join(out_dir, f"{celltype}_fold_{fold}.yaml")
            curr_config = config.copy()
            curr_config["name"] = celltype
            curr_config["seqdata"]["path"] = seqdatas[celltype][signal_beta]
            curr_config["seqdata"]["fold"] = f"fold_{fold}"
            config["seqdata"]["max_counts"] = reports[celltype][signal_beta]
            with open(curr_out, 'w') as f:
                yaml.dump(curr_config, f)
            print(curr_out)
            config_paths.append(curr_out)

/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/bias_fit/fold_0/0.5/SC.beta_fold_0.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/bias_fit/fold_1/0.5/SC.beta_fold_1.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/bias_fit/fold_2/0.5/SC.beta_fold_2.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/bias_fit/fold_3/0.5/SC.beta_fold_3.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/bias_fit/fold_4/0.5/SC.beta_fold_4.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.EC/bias_fit/fold_0/0.5/SC.EC_fold_0.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10

In [58]:
# Keep only fold_0 configs
#config_paths = sorted([c for c in config_paths if "fold_0" in c])
config_paths

['/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/bias_fit/fold_0/0.5/SC.beta_fold_0.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/bias_fit/fold_1/0.5/SC.beta_fold_1.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/bias_fit/fold_2/0.5/SC.beta_fold_2.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/bias_fit/fold_3/0.5/SC.beta_fold_3.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/bias_fit/fold_4/0.5/SC.beta_fold_4.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.EC/bias_fit/fold_0/0.5/SC.EC_fold_0.yaml',
 '/cellar/users/aklie/data

In [61]:
# Make a dataframe with paths of configs in first column and output directory in second column
df = pd.DataFrame()
df["config"] = config_paths
df["out_dir"] = [os.path.dirname(p) for p in config_paths]
df.to_csv("/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/bin/9_sequence_models/metadata/bias_fit_0.5.tsv", sep="\t", index=False, header=False)

# Chrombpnet fit configs

In [23]:
# path to template config file
path_config = "/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/bin/9_sequence_models/configs/chrombpnet_fit_template.yaml"

In [24]:
# load template config
config = yaml.safe_load(open(path_config, 'r'))
config

{'name': 'celltype_name',
 'threads': 4,
 'random_state': 1234,
 'seqdata': {'path': 'celltype_name.minimal.seqdata',
  'seq_var': 'seq',
  'cov_var': 'cov',
  'fold': 'fold_0',
  'seq_length': 2114,
  'target_length': 1000,
  'max_jitter': 512,
  'max_counts': 999999,
  'min_counts': 0,
  'outlier_threshold': 0.9999,
  'neg_sampling_ratio': 0.1},
 'model': {'n_filters': 512,
  'n_layers': 8,
  'n_outputs': 1,
  'alpha': None,
  'bias_model': 'celltype_name.torch'},
 'training': {'learning_rate': 0.001,
  'batch_size': 64,
  'max_epochs': 50,
  'validation_iter': 1000,
  'rc_augment': True,
  'early_stopping': 10},
 'evaluation': {'batch_size': 256},
 'attribution': {'batch_size': 32, 'subsample': 20000, 'n_shuffles': 10},
 'modisco': {'n_seqlets': 50000,
  'window': 500,
  'motif_db': '/cellar/users/aklie/projects/ML4GLand/tutorials/data/motifs.meme.txt'}}

In [25]:
# Grab paths of seqdatas from the previous step
seqdatas = {}
for f in glob.glob(os.path.join(path_out, "*", "prep_dataset", "*", "*.minimal.seqdata")):
    signal_beta = float(f.split("/")[-2])
    celltype = os.path.basename(f).split(".minimal.seqdata")[0]
    if celltype not in seqdatas:
        seqdatas[celltype] = {}
    seqdatas[celltype][signal_beta] = f
seqdatas

{'SC.alpha': {0.5: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alpha/prep_dataset/0.5/SC.alpha.minimal.seqdata',
  0.6: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alpha/prep_dataset/0.6/SC.alpha.minimal.seqdata',
  0.9: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alpha/prep_dataset/0.9/SC.alpha.minimal.seqdata',
  0.7: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alpha/prep_dataset/0.7/SC.alpha.minimal.seqdata'},
 'SC.EC': {0.9: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.EC/prep_dataset/0.9/SC.EC.minimal.seqdata',
  0.7: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.EC

In [26]:
# Need to pull
bias_models = {}
paths = glob.glob(os.path.join(path_out, "*", "bias_fit", "fold_*", "*", "*.torch"), recursive=True)
paths = [p for p in paths if "final" not in p]
for f in paths:
    
    # Grab celltype
    celltype = os.path.basename(f).split(".torch")[0]
    if celltype not in bias_models:
        bias_models[celltype] = {}
    
    # Grab fold
    fold = f.split("/")[-3]
    if fold not in bias_models[celltype]:
        bias_models[celltype][fold] = {}

    # Grab signal_beta
    signal_beta = float(f.split("/")[-2])
    bias_models[celltype][fold][signal_beta] = f
    
bias_models

{'SC.alpha': {'fold_0': {0.5: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alpha/bias_fit/fold_0/0.5/SC.alpha.torch',
   0.6: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alpha/bias_fit/fold_0/0.6/SC.alpha.torch',
   0.9: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alpha/bias_fit/fold_0/0.9/SC.alpha.torch',
   0.7: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alpha/bias_fit/fold_0/0.7/SC.alpha.torch'},
  'fold_4': {0.5: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alpha/bias_fit/fold_4/0.5/SC.alpha.torch'},
  'fold_3': {0.5: '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.alp

In [27]:
# How many folds?
folds = 5

In [28]:
# Which signal betas to use?
signal_betas = [0.5]

In [33]:
config_paths = []
for i, celltype in enumerate(celltypes):
    for fold in bias_models[celltype]:
        for signal_beta in signal_betas:
            out_dir = os.path.join(path_out, celltype, "chrombpnet_fit", fold, f"{signal_beta}")
            os.makedirs(out_dir, exist_ok=True)
            curr_out = os.path.join(out_dir, f"{celltype}_{fold}.yaml")
            curr_config = config.copy()
            curr_config["name"] = celltype
            curr_config["seqdata"]["path"] = seqdatas[celltype][signal_beta]
            curr_config["seqdata"]["fold"] = fold
            curr_config["model"]["bias_model"] = bias_models[celltype][fold][signal_beta]
            with open(curr_out, 'w') as f:
                yaml.dump(curr_config, f)
            print(curr_out)
            config_paths.append(curr_out)

/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/chrombpnet_fit/fold_1/0.5/SC.beta_fold_1.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/chrombpnet_fit/fold_2/0.5/SC.beta_fold_2.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/chrombpnet_fit/fold_0/0.5/SC.beta_fold_0.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/chrombpnet_fit/fold_4/0.5/SC.beta_fold_4.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/chrombpnet_fit/fold_3/0.5/SC.beta_fold_3.yaml
/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.EC/chrombpnet_fit/fold_4/0.5/SC.EC_fold_4.yaml
/cellar/users/

In [34]:
# Keep only fold_0 configs
#config_paths = sorted([c for c in config_paths if "fold_0" in c])
config_paths

['/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/chrombpnet_fit/fold_1/0.5/SC.beta_fold_1.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/chrombpnet_fit/fold_2/0.5/SC.beta_fold_2.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/chrombpnet_fit/fold_0/0.5/SC.beta_fold_0.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/chrombpnet_fit/fold_4/0.5/SC.beta_fold_4.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.beta/chrombpnet_fit/fold_3/0.5/SC.beta_fold_3.yaml',
 '/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/results/9_sequence_models/atac/3-cyt_rna_celltype/SC.EC/chrombpnet_fit/fold_4/0.5/SC.EC_fold

In [35]:
# Make a dataframe with paths of configs in first column and output directory in second column
df = pd.DataFrame()
df["config"] = config_paths
df["out_dir"] = [os.path.dirname(p) for p in config_paths]
df.to_csv("/cellar/users/aklie/data/datasets/igvf_sc-islet_10X-Multiome/bin/9_sequence_models/metadata/chrombpnet_fit_0.5.tsv", sep="\t", index=False, header=False)

# DONE!

---