In [1]:
import sys
from pathlib import Path

# Resolve the project root. This assumes the kernel's working directory is
# the notebooks/ folder sitting directly under the project root.
PROJECT_ROOT = Path.cwd().resolve().parent

# Put the project root at the front of sys.path so that project packages
# (e.g. `scripts.*`) can be imported. The guard keeps this cell idempotent:
# re-running it in a live kernel no longer stacks duplicate entries.
_project_root_str = str(PROJECT_ROOT)
if sys.path[:1] != [_project_root_str]:
    sys.path.insert(0, _project_root_str)

# Define data paths (relative to project root)
# MUT_PATH is a list; only a single BED file is listed here.
MUT_PATH = [
    "data/raw/mutations/filtered_mutations.bed",
]
FAI_PATH = "data/raw/reference/GRCh37.fa.fai"
FASTA_PATH = "data/raw/reference/GRCh37.fa"
TIMING_BW = "data/raw/timing/repliSeq_SknshWaveSignalRep1.bigWig"

DNASE_MAP_PATH = "data/raw/DNase-seq/celltype_dnase_map.json"

# NOTE(review): TUMOUR_FILTER is defined here but is not passed to any call in
# the cells visible in this notebook section - confirm whether it is still needed.
TUMOUR_FILTER = None


In [2]:
from scripts.grid_search.runner import run_grid_experiment

# Sanity-check run: the keyword arguments are collected in one ordered mapping
# and unpacked into run_grid_experiment in the same order as before.
sanity_check_params = {
    # Input files (constants from the configuration cell).
    "mut_path": MUT_PATH,
    "fai_path": FAI_PATH,
    "fasta_path": FASTA_PATH,
    "dnase_map_path": DNASE_MAP_PATH,
    "timing_bigwig": TIMING_BW,
    "per_sample_count": 5,
    # Pearson score options.
    "pearson_score_window_bins": 1,
    "pearson_score_smoothing": "none",
    "pearson_score_smooth_param": None,
    "pearson_score_transform": "none",
    "pearson_score_zscore": False,
    "pearson_score_weights": (0.7, 0.3),
    # Spearman score options: same as Pearson except the window (5 vs 1 bins).
    "spearman_score_window_bins": 5,
    "spearman_score_smoothing": "none",
    "spearman_score_smooth_param": None,
    "spearman_score_transform": "none",
    "spearman_score_zscore": False,
    "spearman_score_weights": (0.7, 0.3),
    "base_seed": 123,
    # Track strategies; every grid below holds exactly one value.
    "track_strategies": ["counts_raw", "exp_decay"],
    "counts_raw_bins": [1000000],
    "exp_decay_bins": [1000000],
    "exp_decay_decay_bp_grid": [200000],
    "exp_decay_max_distance_bp_grid": [1000000],
    "covariate_sets": [["gc", "cpg", "timing"]],
    "include_trinuc": False,
    "downsample_counts": None,
    "save_per_bin": False,
    "out_dir": "outputs/experiments/simple_check",
}
simple_check = run_grid_experiment(**sanity_check_params)
# Show the first rows of the result as the cell output.
simple_check.head()


[18:26:46] INFO Session start
[18:26:46] INFO Inputs
[18:26:46] INFO   mutations_bed:       [PosixPath('/home/lem/projects/mut-epi-origin/data/raw/mutations/filtered_mutations.bed')]
[18:26:46] INFO   fasta:               /home/lem/projects/mut-epi-origin/data/raw/reference/GRCh37.fa
[18:26:46] INFO   fai:                 /home/lem/projects/mut-epi-origin/data/raw/reference/GRCh37.fa.fai
[18:26:46] INFO   dnase_bigwigs:       myel_prog, nkc, kera, smc, eso_epi, neuro_stem, astro_hippo, astro_spinal, mela, fibr
[18:26:46] INFO   timing_bigwig:       /home/lem/projects/mut-epi-origin/data/raw/timing/repliSeq_SknshWaveSignalRep1.bigWig
[18:26:46] INFO   tumour_filter:       AML,LAML,NKTL,BCC,LMS,ESAD,ESCA,NBL,GBM,LGG,SKCM,MELA
[18:26:46] INFO Grid
[18:26:46] INFO   chroms:              22
[18:26:46] INFO   configs:             10
[18:26:46] INFO START Sample selection (per-sample idx=0) ...
[18:28:43] INFO DONE Sample selection (per-sample idx=0) (2m03.2s)
[18:28:43] INFO   sample_slice: 

Unnamed: 0,sample_size_k,repeat,seed_samples,n_selected_samples,sample_slice_start,sample_slice_end,sample_index,sample_id,sample_tag,sample_mode,...,is_correct_pearson_local_score,is_correct_spearman_local_score,downsample_applied,downsample_ratio,rf_top_feature_perm,rf_top_feature_importance_perm,rf_top_is_dnase,exp_decay_bin,exp_decay_decay_bp,exp_decay_max_distance_bp
0,1,0,123,1,0,1,0,filtered_mutations::TNKTL21,filtered_mutations--TNKTL21,per_sample,...,True,False,False,1.0,dnase_nkc,0.383942,True,,,
1,1,0,123,1,0,1,0,filtered_mutations::TNKTL21,filtered_mutations--TNKTL21,per_sample,...,True,False,False,1.0,dnase_nkc,0.31639,True,1000000.0,200000.0,1000000.0
2,1,0,123,1,1,2,1,filtered_mutations::LP6005409-DNA_A03,filtered_mutations--LP6005409-DNA_A03,per_sample,...,True,True,False,1.0,dnase_nkc,0.170635,True,,,
3,1,0,123,1,1,2,1,filtered_mutations::LP6005409-DNA_A03,filtered_mutations--LP6005409-DNA_A03,per_sample,...,True,True,False,1.0,dnase_nkc,0.149644,True,1000000.0,200000.0,1000000.0
4,1,0,123,1,2,3,2,filtered_mutations::LMS31T1,filtered_mutations--LMS31T1,per_sample,...,False,False,False,1.0,timing_mean,0.079408,False,,,
