In [1]:
import sys
from pathlib import Path

# --- Project layout ----------------------------------------------------------
# The notebook is expected to run with analysis/ as the working directory, so
# the project root is the parent of the current working directory. Starting the
# kernel from anywhere else will point every path below at the wrong place.
PROJECT_ROOT = Path.cwd().resolve().parent

# Make project-local packages (e.g. `scripts`) importable. Any existing entry
# for the project root is dropped first, so re-running this cell leaves exactly
# one entry at the front of sys.path instead of piling up duplicates.
_project_root_str = str(PROJECT_ROOT)
sys.path[:] = [entry for entry in sys.path if entry != _project_root_str]
sys.path.insert(0, _project_root_str)

# --- Data paths --------------------------------------------------------------
# Every raw input sits under <project>/data/raw/.
RAW_DIR = PROJECT_ROOT / "data" / "raw"

# Note: despite the singular name this is a *list* of mutation BED files; it is
# handed to run_grid_experiment(mut_path=...) as-is.
MUT_PATH = [
    RAW_DIR / "mutations" / "UV_mutations.bed",
    RAW_DIR / "mutations" / "ICGC_WGS_Feb20_mutations.MELA_SKCM.bed",
]

# Reference FASTA and its .fai index.
FASTA_PATH = RAW_DIR / "reference" / "GRCh37.fa"
FAI_PATH = RAW_DIR / "reference" / "GRCh37.fa.fai"

# Replication-timing bigWig track.
TIMING_BW = RAW_DIR / "timing" / "repliSeq_SknshWaveSignalRep1.bigWig"

# DNase-seq bigWig tracks, keyed by the short cell-type label that appears in
# the result tables.
DNASE_MAP = {
    "mela": RAW_DIR / "DNase-seq" / "mela_ENCFF285GEW.bigWig",
    "kera": RAW_DIR / "DNase-seq" / "kera_ENCFF597YXQ.bigWig",
    "fibr": RAW_DIR / "DNase-seq" / "fibr_ENCFF355OPU.bigWig",
}

# Value passed to run_grid_experiment(tumour_whitelist=...). None supplies no
# whitelist; a list such as ["MELA", "SKCM"] can be used instead
# (NOTE(review): exact filtering semantics live in the pipeline - confirm there).
TUMOUR_WHITELIST = None

In [2]:
from scripts.pipeline_mut_vs_accessibility import run_grid_experiment

# Smoke test: every grid axis holds a single value, so exactly one
# configuration is evaluated (k=5, bin size 100_000, raw counts, gc + cpg).
quickcheck_v1_dir = PROJECT_ROOT / "outputs" / "experiments" / "mut_vs_dnase_quickcheck_v1"

quick_results_1 = run_grid_experiment(
    # -- input files --
    mut_path=MUT_PATH,
    fasta_path=FASTA_PATH,
    fai_path=FAI_PATH,
    dnase_bigwigs=DNASE_MAP,
    timing_bigwig=TIMING_BW,
    # -- sample selection --
    sample_sizes=[5],
    repeats=1,
    base_seed=123,
    tumour_whitelist=TUMOUR_WHITELIST,
    # -- grid axes --
    bin_sizes=[100_000],
    track_strategies=["counts_raw"],
    covariate_sets=[["gc", "cpg"]],
    include_trinuc=False,
    # -- genome scope and track scaling --
    chroms=None,  # None -> every chromosome listed in the .fai
    standardise_tracks=True,
    standardise_scope="per_chrom",
    # -- output and logging --
    out_dir=quickcheck_v1_dir,
    save_per_bin=False,
    verbose=True,
)

# Display the five rows with the smallest linear-residual value (ascending sort).
quick_results_1.sort_values(by="best_celltype_linear_resid_value", ascending=True).head(n=5)


[13:56:33] INFO Session start
[13:56:33] INFO Inputs
[13:56:33] INFO   mutations_bed:       /home/lem/projects/mut-epi-origin/data/raw/mutations/UV_mutations.bed
[13:56:33] INFO   fasta:               /home/lem/projects/mut-epi-origin/data/raw/reference/GRCh37.fa
[13:56:33] INFO   fai:                 /home/lem/projects/mut-epi-origin/data/raw/reference/GRCh37.fa.fai
[13:56:33] INFO   dnase_bigwigs:       mela, kera, fibr
[13:56:33] INFO   timing_bigwig:       /home/lem/projects/mut-epi-origin/data/raw/timing/repliSeq_SknshWaveSignalRep1.bigWig
[13:56:33] INFO Grid
[13:56:33] INFO   chroms:              24
[13:56:33] INFO   configs:             1
[13:56:33] INFO START Sample selection (k=5, rep=0, seed=123) ...
[13:56:38] INFO DONE Sample selection (k=5, rep=0, seed=123) (5.5s)
[13:56:38] INFO   selected_samples:    5
[13:56:38] INFO   mutations_loaded:    431,607
[13:56:38] INFO config bin=100000 track=counts_raw covs=gc-cpg
[13:56:38] INFO Run start  [1/1]
[13:56:38] INFO   id:      

Unnamed: 0,sample_size_k,repeat,seed_samples,n_selected_samples,bin_size,track_strategy,covariates,include_trinuc,n_mutations_total,n_bins_total,...,best_celltype_rf_resid,best_celltype_rf_resid_value,best_minus_second_rf_resid,rf_perm_importances_mean_json,rf_feature_sign_corr_mean_json,ridge_coef_mean_json,rf_r2_mean_weighted,ridge_r2_mean_weighted,rf_top_celltype_feature_perm,rf_top_celltype_importance_perm
0,5,0,123,5,100000,counts_raw,"gc,cpg",False,431607,30970,...,mela,-0.234003,0.065063,"{""gc_fraction"": 0.23695488005873663, ""cpg_per_...","{""gc_fraction"": 0.12515185708863016, ""cpg_per_...","{""gc_fraction"": 0.5994251551438511, ""cpg_per_b...",0.793983,0.311728,mela,0.989604


In [2]:
from scripts.pipeline_mut_vs_accessibility import run_grid_experiment

# Quick-but-slightly-larger search.
# Grid size: 2 sample sizes x 1 repeat x 1 bin size x 3 track strategies
#            x 1 covariate set = 6 configurations.
quickcheck_v2_dir = PROJECT_ROOT / "outputs" / "experiments" / "mut_vs_dnase_quickcheck_v2"

quick_results_2 = run_grid_experiment(
    # -- input files --
    mut_path=MUT_PATH,
    fasta_path=FASTA_PATH,
    fai_path=FAI_PATH,
    dnase_bigwigs=DNASE_MAP,
    timing_bigwig=TIMING_BW,
    # -- sample selection --
    sample_sizes=[5, 10],
    repeats=1,
    base_seed=123,
    tumour_whitelist=TUMOUR_WHITELIST,
    # -- grid axes --
    bin_sizes=[100_000],
    track_strategies=["counts_raw", "counts_gauss", "inv_dist_gauss"],
    covariate_sets=[["gc", "cpg"]],
    include_trinuc=False,
    # -- genome scope and track scaling --
    chroms=None,  # None -> every chromosome listed in the .fai
    standardise_tracks=True,
    standardise_scope="per_chrom",
    # -- output and logging --
    out_dir=quickcheck_v2_dir,
    save_per_bin=False,
    verbose=True,
)

# Display up to ten rows, smallest linear-residual value first (ascending sort).
quick_results_2.sort_values(by="best_celltype_linear_resid_value", ascending=True).head(n=10)


[15:26:48] INFO Session start
[15:26:48] INFO Inputs
[15:26:48] INFO   mutations_bed:       [PosixPath('/home/lem/projects/mut-epi-origin/data/raw/mutations/UV_mutations.bed'), PosixPath('/home/lem/projects/mut-epi-origin/data/raw/mutations/ICGC_WGS_Feb20_mutations.MELA_SKCM.bed')]
[15:26:48] INFO   fasta:               /home/lem/projects/mut-epi-origin/data/raw/reference/GRCh37.fa
[15:26:48] INFO   fai:                 /home/lem/projects/mut-epi-origin/data/raw/reference/GRCh37.fa.fai
[15:26:48] INFO   dnase_bigwigs:       mela, kera, fibr
[15:26:48] INFO   timing_bigwig:       /home/lem/projects/mut-epi-origin/data/raw/timing/repliSeq_SknshWaveSignalRep1.bigWig
[15:26:48] INFO   tumour_whitelist:    none
[15:26:48] INFO Grid
[15:26:48] INFO   chroms:              24
[15:26:48] INFO   configs:             6
[15:26:48] INFO START Sample selection (k=5, rep=0, seed=123) ...
[15:27:59] INFO DONE Sample selection (k=5, rep=0, seed=123) (1m12.3s)
[15:27:59] INFO   selected_samples:    5
[1

Unnamed: 0,sample_size_k,repeat,seed_samples,n_selected_samples,bin_size,track_strategy,covariates,include_trinuc,n_mutations_total,n_bins_total,...,best_celltype_rf_resid,best_celltype_rf_resid_value,best_minus_second_rf_resid,rf_perm_importances_mean_json,rf_feature_sign_corr_mean_json,ridge_coef_mean_json,rf_r2_mean_weighted,ridge_r2_mean_weighted,rf_top_celltype_feature_perm,rf_top_celltype_importance_perm
3,10,0,123,10,100000,counts_raw,"gc,cpg",False,1008569,30970,...,mela,-0.233096,0.057424,"{""gc_fraction"": 0.17750303469211554, ""cpg_per_...","{""gc_fraction"": 0.09959670794484987, ""cpg_per_...","{""gc_fraction"": 0.624586684007952, ""cpg_per_bp...",0.823762,0.344438,mela,0.952435
0,5,0,123,5,100000,counts_raw,"gc,cpg",False,870228,30970,...,mela,-0.230388,0.058487,"{""gc_fraction"": 0.19426387385245789, ""cpg_per_...","{""gc_fraction"": 0.10256830998131991, ""cpg_per_...","{""gc_fraction"": 0.6144617307134598, ""cpg_per_b...",0.810761,0.335927,mela,0.949471
4,10,0,123,10,100000,counts_gauss,"gc,cpg",False,1008569,30970,...,mela,-0.214528,0.053593,"{""gc_fraction"": 0.14487722686724722, ""cpg_per_...","{""gc_fraction"": 0.09464089381011018, ""cpg_per_...","{""gc_fraction"": 0.6274325378326392, ""cpg_per_b...",0.837097,0.350256,mela,0.920506
1,5,0,123,5,100000,counts_gauss,"gc,cpg",False,870228,30970,...,mela,-0.212951,0.054282,"{""gc_fraction"": 0.15329233966385913, ""cpg_per_...","{""gc_fraction"": 0.09798324089813859, ""cpg_per_...","{""gc_fraction"": 0.6184543923161678, ""cpg_per_b...",0.82734,0.343755,mela,0.919171
5,10,0,123,10,100000,inv_dist_gauss,"gc,cpg",False,1008569,30970,...,mela,-0.030705,0.013528,"{""gc_fraction"": 0.1329228668692673, ""cpg_per_b...","{""gc_fraction"": 0.035882963699299136, ""cpg_per...","{""gc_fraction"": 0.11321118164809708, ""cpg_per_...",0.302134,0.021581,fibr,0.20462
2,5,0,123,5,100000,inv_dist_gauss,"gc,cpg",False,870228,30970,...,mela,-0.026054,0.016609,"{""gc_fraction"": 0.12383635708320713, ""cpg_per_...","{""gc_fraction"": 0.03465911096606451, ""cpg_per_...","{""gc_fraction"": 0.10405105268791624, ""cpg_per_...",0.292115,0.026742,fibr,0.231002


In [None]:
from scripts.pipeline_mut_vs_accessibility import run_grid_experiment

# Full grid search.
# Axes: 3 sample sizes x 1 repeat x 2 bin sizes x 3 track strategies
#       x 2 covariate sets (36 configurations if the grid is the full
#       Cartesian product of the axes - TODO confirm against the run log).
# Unlike the quick checks, save_per_bin is switched on for this run.
full_grid_dir = PROJECT_ROOT / "outputs" / "experiments" / "mut_vs_dnase_multi_v1"

results = run_grid_experiment(
    # -- input files --
    mut_path=MUT_PATH,
    fasta_path=FASTA_PATH,
    fai_path=FAI_PATH,
    dnase_bigwigs=DNASE_MAP,
    timing_bigwig=TIMING_BW,
    # -- sample selection --
    sample_sizes=[5, 10, 20],
    repeats=1,
    base_seed=123,
    tumour_whitelist=TUMOUR_WHITELIST,
    # -- grid axes --
    bin_sizes=[50_000, 100_000],
    track_strategies=["counts_raw", "counts_gauss", "inv_dist_gauss"],
    covariate_sets=[
        ["gc", "cpg"],
        ["gc", "cpg", "timing"],
    ],
    include_trinuc=False,
    # -- genome scope and track scaling --
    chroms=None,  # None -> every chromosome listed in the .fai
    standardise_tracks=True,
    standardise_scope="per_chrom",
    # -- output and logging --
    out_dir=full_grid_dir,
    save_per_bin=True,
    verbose=True,
)

# Display the ten rows with the smallest linear-residual value (ascending sort).
results.sort_values(by="best_celltype_linear_resid_value", ascending=True).head(n=10)
