In [1]:
import sys
from pathlib import Path

# Project root = the directory one level above the current working directory
# (this assumes the kernel is started from analysis/).
PROJECT_ROOT = Path.cwd().resolve().parents[0]
# Prepend the root to the import path so project packages can be imported.
sys.path.insert(0, str(PROJECT_ROOT))

# Raw input files; every path lives under <PROJECT_ROOT>/data/raw.
MUT_PATH = [
    PROJECT_ROOT.joinpath("data", "raw", "mutations", "UV_mutations.bed"),
    PROJECT_ROOT.joinpath("data", "raw", "mutations", "ICGC_WGS_Feb20_mutations.bed"),
]
FAI_PATH = PROJECT_ROOT.joinpath("data", "raw", "reference", "GRCh37.fa.fai")
FASTA_PATH = PROJECT_ROOT.joinpath("data", "raw", "reference", "GRCh37.fa")
TIMING_BW = PROJECT_ROOT.joinpath("data", "raw", "timing", "repliSeq_SknshWaveSignalRep1.bigWig")

# DNase-seq bigWig tracks, keyed by a short label (insertion order: mela, kera, fibr).
DNASE_MAP = dict(
    mela=PROJECT_ROOT.joinpath("data", "raw", "DNase-seq", "mela_ENCFF285GEW.bigWig"),
    kera=PROJECT_ROOT.joinpath("data", "raw", "DNase-seq", "kera_ENCFF597YXQ.bigWig"),
    fibr=PROJECT_ROOT.joinpath("data", "raw", "DNase-seq", "fibr_ENCFF355OPU.bigWig"),
)

In [2]:
from scripts.sanity_checks import SanityConfig, run_all_sanity_checks

# Collect every input location and run parameter into a single config object.
cfg = SanityConfig(
    # Input locations (constants from the path-configuration cell).
    project_root=PROJECT_ROOT,
    mut_path=MUT_PATH,
    fasta_path=FASTA_PATH,
    fai_path=FAI_PATH,
    timing_bw=TIMING_BW,
    dnase_bws=DNASE_MAP,
    # Run parameters.
    check_chrom="chr1",
    bin_size=50_000,
    k_samples=1,
    seed=123,
    tumour_whitelist=None,
    # NOTE(review): presumably keeps going after a failed check instead of raising — confirm in scripts.sanity_checks.
    hard_fail=False,
)

# Run the full suite of checks against this configuration.
run_all_sanity_checks(cfg)


=== SANITY CHECKS BEGIN ===
PROJECT_ROOT: /home/lem/projects/mut-epi-origin
mut_path(s):
  - /home/lem/projects/mut-epi-origin/data/raw/mutations/UV_mutations.bed
  - /home/lem/projects/mut-epi-origin/data/raw/mutations/ICGC_WGS_Feb20_mutations.bed
fai_path:     /home/lem/projects/mut-epi-origin/data/raw/reference/GRCh37.fa.fai
fasta_path:   /home/lem/projects/mut-epi-origin/data/raw/reference/GRCh37.fa
dnase_bws:
  - mela: /home/lem/projects/mut-epi-origin/data/raw/DNase-seq/mela_ENCFF285GEW.bigWig
  - kera: /home/lem/projects/mut-epi-origin/data/raw/DNase-seq/kera_ENCFF597YXQ.bigWig
  - fibr: /home/lem/projects/mut-epi-origin/data/raw/DNase-seq/fibr_ENCFF355OPU.bigWig
timing_bw:    /home/lem/projects/mut-epi-origin/data/raw/timing/repliSeq_SknshWaveSignalRep1.bigWig
k=1, seed=123, bin_size=50000, check_chrom=chr1

[PASS] check_chrom uses internal canonical chr-style (chr1)
[PASS] pysam import OK
[PASS] pyBigWig import OK
[PASS] PROJECT_ROOT exists
[PASS] mut_path provided
[PASS] mut_

In [1]:
from pathlib import Path
import pandas as pd

# ICGC WGS mutation table (tab-separated, no header row).
# NOTE(review): absolute, machine-specific path — consider deriving it from a project root.
icgc_path = Path("/home/lem/projects/mut-epi-origin/data/raw/mutations/ICGC_WGS_Feb20_mutations.bed")

# Distinct, non-empty project / cohort labels (e.g. BRCA-UK) found in column 6 (0-based).
seen_labels = set()

# Read only that one column, 500k rows at a time, rather than loading the whole table.
for frame in pd.read_csv(
    icgc_path,
    sep="\t",
    header=None,
    usecols=[6],
    names=["Project"],
    dtype=str,
    chunksize=500_000,
    low_memory=False,
):
    # Missing values become "", labels are whitespace-trimmed, blanks are discarded.
    labels = frame["Project"].fillna("").astype(str).str.strip()
    seen_labels.update(label for label in labels if label)

# Report the labels in alphabetical order, one per line.
projects = sorted(seen_labels)
print(f"n_projects={len(projects)}")
print("\n".join(projects))

n_projects=68
ALL-US
AML-US
BLCA-US
BOCA-FR
BOCA-UK
BRCA-EU
BRCA-FR
BRCA-UK
BRCA-US
BTCA-SG
CESC-US
CLLE-ES
CMDI-UK
COAD-US
COCA-CN
DLBC-US
EOPC-DE
ESAD-UK
ESCA-CN
GACA-CN
GBM-US
HNSC-US
KICH-US
KIRC-US
KIRP-US
LAML-KR
LGG-US
LIAD-FR
LICA-CN
LICA-FR
LIHC-US
LINC-JP
LIRI-JP
LMS-FR
LUAD-US
LUSC-CN
LUSC-KR
LUSC-US
MALY-DE
MELA-AU
NBL-US
NKTL-SG
ORCA-IN
OV-AU
OV-US
PACA-AU
PACA-CA
PAEN-AU
PAEN-IT
PBCA-DE
PBCA-US
PEME-CA
PRAD-CA
PRAD-CN
PRAD-FR
PRAD-UK
PRAD-US
READ-US
RECA-EU
RT-US
SARC-US
SKCA-BR
SKCM-US
STAD-US
THCA-US
UCEC-US
UTCA-FR
WT-US
