In [1]:
import sys
from pathlib import Path

# Resolve the project root. This assumes the kernel's working directory is the
# notebooks/ folder, so the root is that folder's immediate parent.
PROJECT_ROOT = Path.cwd().resolve().parent

# Put the project root at the front of sys.path so project-local imports resolve.
# Any existing entry is dropped first: re-running this cell then leaves exactly
# one entry, still at highest priority, instead of prepending a duplicate each time.
_project_root_str = str(PROJECT_ROOT)
sys.path[:] = [entry for entry in sys.path if entry != _project_root_str]
sys.path.insert(0, _project_root_str)

# All raw inputs live under <project root>/data/raw.
RAW_DATA_DIR = PROJECT_ROOT / "data" / "raw"

# Mutation BED file(s) to analyse, given as a list of paths.
# Alternative input set, kept for reference (swap it in for MUT_PATH below to use it):
# MUT_PATH = [
#     PROJECT_ROOT / "data" / "raw" / "mutations" / "UV_mutations.bed",
#     PROJECT_ROOT / "data" / "raw" / "mutations" / "ICGC_WGS_Feb20_mutations.MELA_SKCM.bed"
# ]
MUT_PATH = [
    RAW_DATA_DIR / "mutations" / "ICGC_WGS_Feb20_mutations.LIHC_LIRI.bed"
]

# Reference genome FASTA and its .fai index.
FAI_PATH = RAW_DATA_DIR / "reference" / "GRCh37.fa.fai"
FASTA_PATH = RAW_DATA_DIR / "reference" / "GRCh37.fa"

# bigWig track under data/raw/timing.
TIMING_BW = RAW_DATA_DIR / "timing" / "repliSeq_SknshWaveSignalRep1.bigWig"

# Passed through as `tumour_filter` to downstream calls; None is presumably
# "no filtering" -- TODO confirm against the callee.
TUMOUR_FILTER = None


In [2]:
from scripts.mutation_stats import compute_mutation_stats, stats_as_dict

# Gather the arguments for the mutation-statistics run in one place so they
# are easy to scan and tweak.
# NOTE(review): `seed` and `n_rows_to_test` presumably control the sampled
# reference-allele check reported under `ref_check` -- confirm in
# scripts.mutation_stats.
stats_kwargs = dict(
    mutations_path=MUT_PATH,
    reference_fasta=FASTA_PATH,
    seed=123,
    n_rows_to_test=50,
    tumour_filter=TUMOUR_FILTER,
)
stats = compute_mutation_stats(**stats_kwargs)

# Bare final expression so the notebook renders the result.
stats


MutationStats(file_path='/home/lem/projects/mut-epi-origin/data/raw/mutations/ICGC_WGS_Feb20_mutations.LIHC_LIRI.bed', file_format='uv_bed_like', n_rows=3497476, n_unique_samples=294, n_unique_patients=294, n_unique_genes=0, n_unique_variants=3497452, avg_mutations_per_sample=11896.176870748299, median_mutations_per_sample=10810.5, median_mutations_per_patient=10810.5, top_variant_classification='NA', top_variant_classification_count=0, snv_count=3497476, indel_count=0, missing_sample_barcode_rows=0, ref_check=RefCheckResult(reference_fasta='/home/lem/projects/mut-epi-origin/data/raw/reference/GRCh37.fa', seed=123, n_tested=50, n_matched=50, n_mismatched=0, n_skipped=0, mismatch_examples=[]))

In [3]:
import pandas as pd

# Turn the stats object into a plain dict so it can be tabulated.
d = stats_as_dict(stats)

# Pull the nested reference-check entry out of the dict; its fields are
# appended below under a "ref_check." prefix instead.
ref = d.pop("ref_check")

# One {"metric", "value"} record per remaining top-level entry.
rows = [{"metric": name, "value": val} for name, val in d.items()]

if ref is not None:
    # "mismatch_examples" is left out of the summary table.
    rows.extend(
        {"metric": f"ref_check.{name}", "value": val}
        for name, val in ref.items()
        if name != "mismatch_examples"
    )

# Build the frame once from the full list of records and display it.
df = pd.DataFrame(rows)
df


Unnamed: 0,metric,value
0,file_path,/home/lem/projects/mut-epi-origin/data/raw/mut...
1,file_format,uv_bed_like
2,n_rows,3497476
3,n_unique_samples,294
4,n_unique_patients,294
5,n_unique_genes,0
6,n_unique_variants,3497452
7,avg_mutations_per_sample,11896.176871
8,median_mutations_per_sample,10810.5
9,median_mutations_per_patient,10810.5
