# VCF Benchmarking Statistics Analysis

This notebook performs comprehensive VCF statistics on CASTLE benchmarking truth sets for DeepSomatic model training and evaluation.

## Analysis Overview
- **Whole-Landscape Analysis**: Cross-sample comparisons and aggregate statistics
- **Per-Sample Analysis**: Individual sample characterization
- **Statistics Included**:
  - Variant type distribution (SNP/INDEL counts and percentages)
  - Chromosome distribution (natural sort order, counts and percentages)
  - FILTER label statistics
  - Ti/Tv ratio (transition/transversion)
  - Base change spectrum (6-type mutation categories)
  - Indel size distribution

## 1. Setup

Import libraries and configure paths.

In [1]:
import sys
import re
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Optional, Any

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from cyvcf2 import VCF

# Import indel analysis utilities from vcf_stats
sys.path.insert(0, str(Path('/t9k/mnt/hdd/work/Vax/pipeline/rnadnavar/bin')))
from vcf_stats.indel_analysis import (
    INDEL_SIZE_BINS,
    categorize_indel_size,
    indel_stats_by_bin,
    aggregate_indel_statistics,
    summarize_indel_sizes
)
from vcf_stats.indel_visualization import (
    INSERTIONS_COLOR,
    DELETIONS_COLOR,
    plot_indel_binned_distribution,
    plot_indel_per_sample_heatmap,
    plot_indel_per_sample_stacked
)

# Paths
# BENCHMARKING_BASE is the root that the VCF_PATHS entries in the Configuration cell are built from.
BENCHMARKING_BASE = Path('/t9k/mnt/WorkSpace/data/ngs/xuzhenyu/CASTLE/benchmarking')
OUTPUT_DIR = Path('/t9k/mnt/hdd/work/Vax/pipeline/rnadnavar/notebook/castle_statistics_output')
# Create the output directory (and any missing parents); a pre-existing directory is not an error.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Display settings
# None removes the pandas limits so wide summary tables are shown without column truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print('✓ Setup complete')


✓ Using shared vcf_config module from bin/common/
✓ Variant classification functions loaded from shared modules (stage-aware)
✓ Workflow Comparison module loaded successfully
✓ Enhanced Statistics Aggregator imported successfully
✓ VCF statistics utility functions loaded
✓ VCF Statistics Extractor with stage-aware classification loaded
✓ VCF Visualizer module loaded successfully
✓ VCF statistics core module initialized
✓ Setup complete


## 2. Configuration

Define benchmarking categories, samples, and VCF file paths.

In [2]:
# Benchmarking categories
# These match the directory names used in the VCF_PATHS entries below.
BENCHMARK_CATEGORIES = [
    'DeepSomatic_multicancer-model_benchmark',
    'DeepSomatic_HCC1395-model_benchmark'
]

# Expected samples
# NOTE(review): HG008 and Hs578 are listed here but have no entry in VCF_PATHS below —
# confirm whether their VCFs are still to be added or the list should be trimmed.
SAMPLES = ['H1437', 'H2009', 'HCC1395', 'HCC1937', 'HCC1954', 'HG008', 'Hs578']

# VCF file paths
# One file per (sample, model) pair, named <sample>_DeepSomatic_<model>.vcf.gz.
VCF_PATHS = [
    # HCC1395-model benchmark
    BENCHMARKING_BASE / 'DeepSomatic_HCC1395-model_benchmark/vcfs/H1437_DeepSomatic_HCC1395-model.vcf.gz',
    BENCHMARKING_BASE / 'DeepSomatic_HCC1395-model_benchmark/vcfs/H2009_DeepSomatic_HCC1395-model.vcf.gz',
    BENCHMARKING_BASE / 'DeepSomatic_HCC1395-model_benchmark/vcfs/HCC1395_DeepSomatic_HCC1395-model.vcf.gz',
    BENCHMARKING_BASE / 'DeepSomatic_HCC1395-model_benchmark/vcfs/HCC1937_DeepSomatic_HCC1395-model.vcf.gz',
    BENCHMARKING_BASE / 'DeepSomatic_HCC1395-model_benchmark/vcfs/HCC1954_DeepSomatic_HCC1395-model.vcf.gz',
    # multicancer-model benchmark
    BENCHMARKING_BASE / 'DeepSomatic_multicancer-model_benchmark/vcfs/H1437_DeepSomatic_multicancer-model.vcf.gz',
    BENCHMARKING_BASE / 'DeepSomatic_multicancer-model_benchmark/vcfs/H2009_DeepSomatic_multicancer-model.vcf.gz',
    BENCHMARKING_BASE / 'DeepSomatic_multicancer-model_benchmark/vcfs/HCC1395_DeepSomatic_multicancer-model.vcf.gz',
    BENCHMARKING_BASE / 'DeepSomatic_multicancer-model_benchmark/vcfs/HCC1937_DeepSomatic_multicancer-model.vcf.gz',
    BENCHMARKING_BASE / 'DeepSomatic_multicancer-model_benchmark/vcfs/HCC1954_DeepSomatic_multicancer-model.vcf.gz',
]

# Validate paths
# Split the configured paths into those present on disk and those missing;
# only valid_vcfs is passed on to the extraction steps.
valid_vcfs = [p for p in VCF_PATHS if p.exists()]
missing_vcfs = [p for p in VCF_PATHS if not p.exists()]

print(f'✓ Found {len(valid_vcfs)} VCF files')
if missing_vcfs:
    print(f'⚠️  Missing {len(missing_vcfs)} files:')
    # Only the first five missing names are printed to keep the cell output short.
    for p in missing_vcfs[:5]:
        print(f'   - {p.name}')

✓ Found 10 VCF files


## 3. Helper Functions

### 3.1 Chromosome Natural Sort Utility

In [3]:
def natural_sort_chromosomes(chromosomes: List[str]) -> List[str]:
    """
    Sort chromosomes in natural order: 1, 2, ..., 22, X, Y, M/MT.

    Handles both 'chr' prefixed and non-prefixed formats (case-insensitive).
    Names that are neither a plain decimal number nor X/Y/M/MT (for example
    alt contigs or decoys) are placed after the known chromosomes and ordered
    by name among themselves.

    Args:
        chromosomes: Chromosome/contig names in any order.

    Returns:
        A new list with the names in natural order; the input is not modified.
    """
    # Rank assigned to the non-numeric primary chromosomes
    special_order = {'x': 23, 'y': 24, 'm': 25, 'mt': 25}
    # Rank for anything unrecognised, so it sorts after the known chromosomes
    unknown_rank = 100

    def chrom_sort_key(chrom: str) -> Tuple:
        c = chrom.lower()
        # Strip only a leading 'chr'; str.replace would also remove the
        # substring from the middle of a contig name.
        if c.startswith('chr'):
            c = c[3:]

        if c in special_order:
            return (special_order[c], 0, chrom)

        # isdecimal() guards int(): on its own int() also accepts signs,
        # surrounding whitespace and underscores ('1_0' -> 10, '-1' -> -1),
        # which would rank oddly named contigs among the autosomes.
        if c.isdecimal():
            return (int(c), 0, chrom)

        # The original name is the final tie-breaker so the order is deterministic
        return (unknown_rank, 0, chrom)

    return sorted(chromosomes, key=chrom_sort_key)


def get_chromosome_order() -> List[str]:
    """Return the fixed plotting order: chr1..chr22, then chrX, chrY, chrM."""
    ordered = [f'chr{number}' for number in range(1, 23)]
    ordered.extend(['chrX', 'chrY', 'chrM'])
    return ordered


# Test natural sort
# Shuffled input covering single-digit, double-digit and X/Y/M names; the printed
# list should come back as chr1, chr2, chr10, chr22, chrX, chrY, chrM.
test_chroms = ['chr2', 'chr10', 'chrX', 'chr1', 'chrY', 'chr22', 'chrM']
print(f'Natural sort test: {natural_sort_chromosomes(test_chroms)}')

Natural sort test: ['chr1', 'chr2', 'chr10', 'chr22', 'chrX', 'chrY', 'chrM']


### 3.2 VCF Header Inspection

In [4]:
def inspect_vcf_header(vcf_path: Path) -> Dict[str, Any]:
    """
    Inspect VCF header to discover available fields.

    Only the header is read; no variant records are iterated. The VCF handle
    is closed even if reading the header raises.

    Args:
        vcf_path: Path to the VCF file to open with cyvcf2.

    Returns:
        Dict with:
          - 'info_fields':   {ID: {'type': ..., 'description': ...}}
          - 'format_fields': {ID: {'type': ..., 'description': ...}}
          - 'filter_fields': {ID: description}
          - 'samples':       list of sample names
          - 'path':          the vcf_path that was inspected
        A header line lacking 'Type' is reported as 'Unknown'; one lacking
        'Description' gets an empty string. Each key is defaulted on its own.
    """
    def _header_value(record: Any, key: str, default: str) -> str:
        """Return record[key], or default when the header line lacks that key."""
        try:
            return record[key]
        except (KeyError, TypeError):
            return default

    info_fields = {}
    format_fields = {}
    filter_fields = {}

    vcf = VCF(str(vcf_path))
    try:
        samples = list(vcf.samples)

        for field in vcf.header_iter():
            header_type = field['HeaderType']
            if header_type not in ('INFO', 'FORMAT', 'FILTER'):
                continue

            field_id = field['ID']
            description = _header_value(field, 'Description', '')

            if header_type == 'FILTER':
                # FILTER lines carry no Type, only a description
                filter_fields[field_id] = description
            else:
                target = info_fields if header_type == 'INFO' else format_fields
                target[field_id] = {
                    'type': _header_value(field, 'Type', 'Unknown'),
                    'description': description
                }
    finally:
        # Release the file handle even when header parsing fails
        vcf.close()

    return {
        'info_fields': info_fields,
        'format_fields': format_fields,
        'filter_fields': filter_fields,
        'samples': samples,
        'path': vcf_path
    }


def summarize_vcf_fields(vcf_paths: List[Path]) -> pd.DataFrame:
    """
    Build a one-row-per-file overview of the header fields in each VCF.

    Paths that do not exist on disk are silently skipped. Each row reports
    how many INFO/FORMAT/FILTER definitions and samples the header declares,
    plus flags for the presence of the GT, DP, AF and AD FORMAT fields.
    """
    def describe(path: Path) -> Dict[str, Any]:
        # Header lookup result for a single file, flattened into one table row
        hdr = inspect_vcf_header(path)
        format_ids = hdr['format_fields']
        return {
            'File': path.name,
            'INFO_Fields': len(hdr['info_fields']),
            'FORMAT_Fields': len(format_ids),
            'FILTER_Fields': len(hdr['filter_fields']),
            'Samples': len(hdr['samples']),
            'Has_GT': 'GT' in format_ids,
            'Has_DP': 'DP' in format_ids,
            'Has_AF': 'AF' in format_ids,
            'Has_AD': 'AD' in format_ids,
        }

    return pd.DataFrame([describe(path) for path in vcf_paths if path.exists()])


# Inspect first VCF to understand structure
# Skipped entirely when no configured VCF was found on disk.
if valid_vcfs:
    sample_header = inspect_vcf_header(valid_vcfs[0])
    print(f"Sample VCF: {valid_vcfs[0].name}")
    # Only the first 10 INFO IDs are shown to keep the output short
    print(f"  INFO fields: {list(sample_header['info_fields'].keys())[:10]}...")
    print(f"  FORMAT fields: {list(sample_header['format_fields'].keys())}")
    print(f"  FILTER fields: {list(sample_header['filter_fields'].keys())}")
    print(f"  Samples: {sample_header['samples']}")

Sample VCF: H1437_DeepSomatic_HCC1395-model.vcf.gz
  INFO fields: ['END']...
  FORMAT fields: ['GT', 'GQ', 'DP', 'MIN_DP', 'AD', 'VAF', 'PL', 'MED_DP']
  FILTER fields: ['PASS', 'RefCall', 'LowQual', 'NoCall', 'GERMLINE']
  Samples: ['1437_Ilmn_tumor', '1437_tumor', '3:1437_tumor', '1437']


### 3.3 Core Statistics Functions

In [5]:
class VCFBenchmarkStats:
    """
    Extract comprehensive statistics from a VCF file.
    Designed to work with any VCF format (universal, no pipeline-specific dependencies).

    Statistics are computed in a single pass by extract_all_stats() and cached
    on ``self.stats``. The ``get_*_summary`` methods run the extraction lazily
    on first use and present the cached numbers as DataFrames.
    """

    # Base change categories (complement-collapsed for standard mutation spectrum)
    BASE_CHANGE_CATEGORIES = {
        ('C', 'A'): 'C>A', ('G', 'T'): 'C>A',
        ('C', 'G'): 'C>G', ('G', 'C'): 'C>G',
        ('C', 'T'): 'C>T', ('G', 'A'): 'C>T',
        ('T', 'A'): 'T>A', ('A', 'T'): 'T>A',
        ('T', 'C'): 'T>C', ('A', 'G'): 'T>C',
        ('T', 'G'): 'T>G', ('A', 'C'): 'T>G',
    }

    TRANSITIONS = {('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')}
    TRANSVERSIONS = {('A', 'C'), ('C', 'A'), ('A', 'T'), ('T', 'A'),
                     ('G', 'C'), ('C', 'G'), ('G', 'T'), ('T', 'G')}

    def __init__(self, vcf_path: Path):
        """Remember the VCF location; nothing is read until statistics are requested."""
        self.vcf_path = vcf_path
        self.stats = None

    def _ensure_stats(self) -> Dict[str, Any]:
        """Return the cached statistics, extracting them first if not yet computed."""
        if self.stats is None:
            self.extract_all_stats()
        return self.stats

    @staticmethod
    def _percent(count: float, total: float) -> float:
        """Return count as a percentage of total, or 0 when total is not positive."""
        return count / total * 100 if total > 0 else 0

    def extract_all_stats(self) -> Dict[str, Any]:
        """
        Extract all statistics from the VCF file.

        Iterates every record once and stores the result on ``self.stats``.
        The VCF handle is closed even if iteration raises.

        Returns:
            Dict with keys 'total', 'snps', 'indels', 'other', 'chromosomes',
            'filters', 'transitions', 'transversions', 'base_changes',
            'indel_sizes', 'qual_scores' and 'titv_ratio' (None when no
            transversions were seen).
        """
        stats = {
            'total': 0,
            'snps': 0,
            'indels': 0,
            'other': 0,
            'chromosomes': defaultdict(int),
            'filters': defaultdict(int),
            'transitions': 0,
            'transversions': 0,
            'base_changes': defaultdict(int),
            'indel_sizes': [],
            'qual_scores': [],
        }

        vcf = VCF(str(self.vcf_path))
        try:
            for variant in vcf:
                stats['total'] += 1

                # Chromosome
                stats['chromosomes'][variant.CHROM] += 1

                # FILTER: a falsy FILTER value is counted as PASS
                filt = variant.FILTER if variant.FILTER else 'PASS'
                stats['filters'][filt] += 1

                # QUAL score: None and -1 are both treated as "no score" here
                if variant.QUAL is not None and variant.QUAL != -1:
                    stats['qual_scores'].append(variant.QUAL)

                # Variant type. Only the first ALT allele is examined, so for
                # multi-allelic records the remaining alleles do not contribute.
                ref = variant.REF.upper()
                alt = variant.ALT[0].upper() if variant.ALT else ''

                if variant.is_snp:
                    stats['snps'] += 1

                    # Ti/Tv calculation
                    if (ref, alt) in self.TRANSITIONS:
                        stats['transitions'] += 1
                    elif (ref, alt) in self.TRANSVERSIONS:
                        stats['transversions'] += 1

                    # Base change spectrum
                    if (ref, alt) in self.BASE_CHANGE_CATEGORIES:
                        category = self.BASE_CHANGE_CATEGORIES[(ref, alt)]
                        stats['base_changes'][category] += 1

                elif variant.is_indel:
                    stats['indels'] += 1
                    # Indel size: positive = insertion, negative = deletion
                    stats['indel_sizes'].append(len(alt) - len(ref))
                else:
                    stats['other'] += 1
        finally:
            # Release the file handle even when a record fails to parse
            vcf.close()

        # Convert defaultdicts to regular dicts so later lookups of unseen
        # keys do not silently create entries
        stats['chromosomes'] = dict(stats['chromosomes'])
        stats['filters'] = dict(stats['filters'])
        stats['base_changes'] = dict(stats['base_changes'])

        # Calculate Ti/Tv ratio (undefined without transversions)
        if stats['transversions'] > 0:
            stats['titv_ratio'] = stats['transitions'] / stats['transversions']
        else:
            stats['titv_ratio'] = None

        self.stats = stats
        return stats

    def get_variant_type_summary(self) -> pd.DataFrame:
        """
        Get variant type counts and percentages.

        Returns:
            DataFrame with columns 'Type', 'Count', 'Percentage' and rows
            SNP, INDEL, Other, Total. Percentages are relative to the total
            record count, rounded to 2 decimals; the Total row is fixed at 100.
        """
        stats = self._ensure_stats()

        total = stats['total']
        counts = [stats['snps'], stats['indels'], stats['other']]
        df = pd.DataFrame({
            'Type': ['SNP', 'INDEL', 'Other', 'Total'],
            'Count': counts + [total],
            'Percentage': [self._percent(n, total) for n in counts] + [100.0]
        })
        df['Percentage'] = df['Percentage'].round(2)
        return df

    def get_chromosome_summary(self) -> pd.DataFrame:
        """
        Get chromosome distribution with counts and percentages (natural sort).

        Returns:
            DataFrame with columns 'Chromosome', 'Count', 'Percentage', one
            row per chromosome seen, ordered by natural_sort_chromosomes.
        """
        stats = self._ensure_stats()

        total = stats['total']
        per_chrom = stats['chromosomes']
        chroms = natural_sort_chromosomes(list(per_chrom.keys()))

        df = pd.DataFrame({
            'Chromosome': chroms,
            'Count': [per_chrom[c] for c in chroms],
            'Percentage': [self._percent(per_chrom[c], total) for c in chroms]
        })
        df['Percentage'] = df['Percentage'].round(2)
        return df

    def get_filter_summary(self) -> pd.DataFrame:
        """
        Get FILTER label distribution with counts and percentages.

        Returns:
            DataFrame with columns 'Filter', 'Count', 'Percentage', sorted by
            descending count with a fresh 0..n-1 index.
        """
        stats = self._ensure_stats()

        total = stats['total']
        per_filter = stats['filters']
        filters = sorted(per_filter.keys())

        df = pd.DataFrame({
            'Filter': filters,
            'Count': [per_filter[f] for f in filters],
            'Percentage': [self._percent(per_filter[f], total) for f in filters]
        })
        df['Percentage'] = df['Percentage'].round(2)
        return df.sort_values('Count', ascending=False).reset_index(drop=True)

    def get_base_change_summary(self) -> pd.DataFrame:
        """
        Get 6-type base change spectrum with counts and percentages.

        Returns:
            DataFrame with columns 'Base_Change', 'Count', 'Percentage' in the
            fixed order C>A, C>G, C>T, T>A, T>C, T>G. Percentages are relative
            to the SNP count (not the total record count).
        """
        stats = self._ensure_stats()

        snp_total = stats['snps']
        base_changes = stats['base_changes']
        categories = ['C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G']

        df = pd.DataFrame({
            'Base_Change': categories,
            'Count': [base_changes.get(c, 0) for c in categories],
            'Percentage': [self._percent(base_changes.get(c, 0), snp_total) for c in categories]
        })
        df['Percentage'] = df['Percentage'].round(2)
        return df

    def get_indel_size_summary(self) -> pd.DataFrame:
        """
        Get indel size distribution summary using module function.

        Returns:
            DataFrame with columns 'Metric' and 'Value' built from the dict
            returned by summarize_indel_sizes; an empty frame with the same
            two columns when the file contains no indels.
        """
        stats = self._ensure_stats()

        sizes = stats['indel_sizes']
        if not sizes:
            return pd.DataFrame({'Metric': [], 'Value': []})

        # Use summarize_indel_sizes from module
        summary = summarize_indel_sizes(sizes)

        data = {
            'Metric': [
                'Total INDELs', 'Insertions', 'Deletions',
                'Median Size', 'Max Insertion', 'Max Deletion (abs)',
                'Mean Insertion Size', 'Mean Deletion Size (abs)'
            ],
            'Value': [
                summary['total'],
                summary['insertions'],
                summary['deletions'],
                summary['median_abs'],
                summary['max_insertion'],
                summary['max_deletion_abs'],
                summary['mean_insertion'],
                summary['mean_deletion_abs']
            ]
        }
        df = pd.DataFrame(data)
        df['Value'] = df['Value'].round(2)
        return df


print('✓ VCFBenchmarkStats class defined')


✓ VCFBenchmarkStats class defined


## 4. Data Extraction

Parse VCF filenames and extract statistics from all files.

In [6]:
def parse_benchmark_filename(vcf_path: Path) -> Dict[str, str]:
    """
    Derive sample / caller / model metadata from a benchmarking VCF filename.

    The name is expected to look like ``Sample_Caller_Model.vcf.gz``
    (e.g. ``H1437_DeepSomatic_HCC1395-model.vcf.gz``); a ``_somatic-only``
    marker, if present, is ignored. When the name does not follow that
    layout, the stripped name is reported as the sample and caller/model
    are set to 'Unknown'.
    """
    stem = vcf_path.name
    for token in ('.vcf.gz', '_somatic-only'):
        stem = stem.replace(token, '')

    # Start from the fallback record and overwrite it when the name parses
    parsed = {
        'sample': stem,
        'caller': 'Unknown',
        'model': 'Unknown',
        'filename': vcf_path.name,
        'path': vcf_path
    }

    # Sample_Caller_Model, e.g. H1437_DeepSomatic_HCC1395-model
    hit = re.match(r'^([A-Za-z0-9]+)_([A-Za-z]+)_([A-Za-z0-9-]+)$', stem)
    if hit is not None:
        parsed['sample'], parsed['caller'], parsed['model'] = hit.groups()

    return parsed


def extract_all_vcf_stats(vcf_paths: List[Path], verbose: bool = True) -> Dict[str, Dict]:
    """
    Run the statistics extraction over a list of VCF files.

    Files that do not exist are skipped (with a warning when verbose).
    Progress numbering follows the position in ``vcf_paths``, so skipped
    files still consume a number.

    Returns:
        Dict mapping filename to {'metadata': {...}, 'stats': {...}, 'extractor': VCFBenchmarkStats}
    """
    collected = {}
    n_paths = len(vcf_paths)

    for position, path in enumerate(vcf_paths, start=1):
        if not path.exists():
            if verbose:
                print(f'⚠️  Skipping missing file: {path.name}')
            continue

        if verbose:
            print(f'[{position}/{n_paths}] Processing {path.name}...')

        file_metadata = parse_benchmark_filename(path)
        worker = VCFBenchmarkStats(path)
        file_stats = worker.extract_all_stats()

        # Keyed by bare filename, as the downstream summary builders expect
        collected[path.name] = {
            'metadata': file_metadata,
            'stats': file_stats,
            'extractor': worker
        }

        if verbose:
            print(f'    → {file_stats["total"]:,} variants (SNP: {file_stats["snps"]:,}, INDEL: {file_stats["indels"]:,})')

    return collected


# Extract statistics from all VCFs
# Each file is read in full; all_vcf_stats (keyed by filename) feeds every summary cell below.
print('Extracting statistics from all VCF files...\n')
all_vcf_stats = extract_all_vcf_stats(valid_vcfs, verbose=True)
print(f'\n✓ Processed {len(all_vcf_stats)} VCF files')

Extracting statistics from all VCF files...

[1/10] Processing H1437_DeepSomatic_HCC1395-model.vcf.gz...
    → 5,328,595 variants (SNP: 4,285,815, INDEL: 1,042,780)
[2/10] Processing H2009_DeepSomatic_HCC1395-model.vcf.gz...
    → 5,406,875 variants (SNP: 4,387,006, INDEL: 1,019,869)
[3/10] Processing HCC1395_DeepSomatic_HCC1395-model.vcf.gz...
    → 5,209,875 variants (SNP: 4,230,499, INDEL: 979,376)
[4/10] Processing HCC1937_DeepSomatic_HCC1395-model.vcf.gz...
    → 5,270,519 variants (SNP: 4,249,467, INDEL: 1,021,052)
[5/10] Processing HCC1954_DeepSomatic_HCC1395-model.vcf.gz...
    → 5,425,920 variants (SNP: 4,374,716, INDEL: 1,051,204)
[6/10] Processing H1437_DeepSomatic_multicancer-model.vcf.gz...
    → 5,333,715 variants (SNP: 4,291,166, INDEL: 1,042,549)
[7/10] Processing H2009_DeepSomatic_multicancer-model.vcf.gz...
    → 5,407,335 variants (SNP: 4,387,452, INDEL: 1,019,883)
[8/10] Processing HCC1395_DeepSomatic_multicancer-model.vcf.gz...
    → 5,213,075 variants (SNP: 4,232,

## 5. Whole-Landscape Analysis

Cross-sample statistics and visualizations.

In [7]:
def create_landscape_summary(all_vcf_stats: Dict) -> pd.DataFrame:
    """
    Create comprehensive summary table with counts and percentages side-by-side.

    Args:
        all_vcf_stats: Mapping of filename to a dict holding 'metadata'
            (with 'sample' and 'model') and 'stats' (with 'total', 'snps',
            'indels', 'other', 'titv_ratio' and 'filters').

    Returns:
        One row per file, sorted by Sample then Model. Percentages are
        relative to the file's total record count and rounded to 2 decimals;
        TiTv_Ratio is rounded to 3 decimals and is NaN where the ratio was
        undefined (None). An empty input yields an empty frame that still
        carries all columns.
    """
    columns = [
        'Sample', 'Model', 'Total_Variants',
        'SNP_Count', 'SNP_Pct', 'INDEL_Count', 'INDEL_Pct',
        'Other_Count', 'TiTv_Ratio', 'PASS_Count', 'PASS_Pct',
    ]

    rows = []
    for data in all_vcf_stats.values():
        metadata = data['metadata']
        stats = data['stats']
        total = stats['total']

        # Calculate PASS count and percentage
        pass_count = stats['filters'].get('PASS', 0)

        rows.append({
            'Sample': metadata['sample'],
            'Model': metadata['model'],
            'Total_Variants': total,
            'SNP_Count': stats['snps'],
            'SNP_Pct': stats['snps'] / total * 100 if total > 0 else 0,
            'INDEL_Count': stats['indels'],
            'INDEL_Pct': stats['indels'] / total * 100 if total > 0 else 0,
            'Other_Count': stats['other'],
            'TiTv_Ratio': stats['titv_ratio'],
            'PASS_Count': pass_count,
            'PASS_Pct': pass_count / total * 100 if total > 0 else 0,
        })

    # Without rows the frame would have no columns at all, and the rounding
    # and sorting below would raise KeyError.
    if not rows:
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows, columns=columns)

    # Round percentages
    pct_cols = [c for c in df.columns if c.endswith('_Pct')]
    df[pct_cols] = df[pct_cols].round(2)
    # A column consisting only of None is object dtype; coerce it to float
    # (None -> NaN) so rounding and later aggregation work.
    df['TiTv_Ratio'] = pd.to_numeric(df['TiTv_Ratio'], errors='coerce').round(3)

    # Sort by Sample then Model
    df = df.sort_values(['Sample', 'Model']).reset_index(drop=True)

    return df


# Create landscape summary
# landscape_df (one row per sample/model file) is reused by the plotting cells that follow.
landscape_df = create_landscape_summary(all_vcf_stats)

print('=' * 80)
print('LANDSCAPE SUMMARY: All Samples and Models')
print('=' * 80)
display(landscape_df)

# Summary statistics
# The mean below is an unweighted average of the per-file ratios, not a pooled Ti/Tv.
print(f'\nTotal variants across all files: {landscape_df["Total_Variants"].sum():,}')
print(f'Mean Ti/Tv ratio: {landscape_df["TiTv_Ratio"].mean():.3f}')

LANDSCAPE SUMMARY: All Samples and Models


Unnamed: 0,Sample,Model,Total_Variants,SNP_Count,SNP_Pct,INDEL_Count,INDEL_Pct,Other_Count,TiTv_Ratio,PASS_Count,PASS_Pct
0,H1437,HCC1395-model,5328595,4285815,80.43,1042780,19.57,0,1.852,87292,1.64
1,H1437,multicancer-model,5333715,4291166,80.45,1042549,19.55,0,1.852,92412,1.73
2,H2009,HCC1395-model,5406875,4387006,81.14,1019869,18.86,0,1.801,168180,3.11
3,H2009,multicancer-model,5407335,4387452,81.14,1019883,18.86,0,1.8,168640,3.12
4,HCC1395,HCC1395-model,5209875,4230499,81.2,979376,18.8,0,1.909,129222,2.48
5,HCC1395,multicancer-model,5213075,4232510,81.19,980565,18.81,0,1.909,132422,2.54
6,HCC1937,HCC1395-model,5270519,4249467,80.63,1021052,19.37,0,1.892,56888,1.08
7,HCC1937,multicancer-model,5271049,4250113,80.63,1020936,19.37,0,1.892,57418,1.09
8,HCC1954,HCC1395-model,5425920,4374716,80.63,1051204,19.37,0,1.895,26532,0.49
9,HCC1954,multicancer-model,5427075,4374976,80.61,1052099,19.39,0,1.895,27687,0.51



Total variants across all files: 53,294,033
Mean Ti/Tv ratio: 1.870


### 5.1 Variant Type Distribution

In [8]:
# Variant type stacked bar chart by sample and model
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=['Variant Counts by Sample', 'Variant Percentages by Sample'],
    horizontal_spacing=0.1
)

# Prepare data
samples = landscape_df['Sample'].unique()
models = landscape_df['Model'].unique()

colors = {'SNP': '#636EFA', 'INDEL': '#EF553B', 'Other': '#00CC96'}

# Two-level (Sample, Model) category axis so every file gets its own bar.
# With plain sample names on x and barmode='stack', the traces of both models
# land on the same category and are stacked together: counts are doubled and
# the percentage bars add up to ~200%.
bar_positions = [landscape_df['Sample'].tolist(), landscape_df['Model'].tolist()]

for variant_type in ('SNP', 'INDEL'):
    count_values = landscape_df[f'{variant_type}_Count']
    pct_values = landscape_df[f'{variant_type}_Pct']

    # Count plot
    fig.add_trace(
        go.Bar(
            name=variant_type,
            x=bar_positions,
            y=count_values,
            marker_color=colors[variant_type],
            text=count_values.apply(lambda x: f'{x:,}'),
            textposition='inside',
            legendgroup=variant_type,
            showlegend=True
        ),
        row=1, col=1
    )

    # Percentage plot (shares the legend entry of the count trace)
    fig.add_trace(
        go.Bar(
            name=f'{variant_type} %',
            x=bar_positions,
            y=pct_values,
            marker_color=colors[variant_type],
            text=pct_values.apply(lambda x: f'{x:.1f}%'),
            textposition='inside',
            legendgroup=variant_type,
            showlegend=False
        ),
        row=1, col=2
    )

fig.update_layout(
    title='Variant Type Distribution by Sample and Model',
    barmode='stack',
    height=500,
    legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='center', x=0.5)
)
fig.update_yaxes(title_text='Count', row=1, col=1)
fig.update_yaxes(title_text='Percentage (%)', row=1, col=2)

fig.show()

### 5.2 Ti/Tv Ratio Comparison

In [9]:
# Ti/Tv ratio comparison bar chart
# One bar per (sample, model) file, grouped side by side within each sample.
fig = px.bar(
    landscape_df,
    x='Sample',
    y='TiTv_Ratio',
    color='Model',
    barmode='group',
    title='Ti/Tv Ratio by Sample and Model',
    text='TiTv_Ratio',
    color_discrete_sequence=px.colors.qualitative.Set2
)

# Show the ratio with two decimals above each bar
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig.update_layout(
    yaxis_title='Ti/Tv Ratio',
    height=450,
    showlegend=True
)

# Add reference lines for expected Ti/Tv ranges
# NOTE(review): the 2.0 / 2.8 values are hard-coded rules of thumb — confirm they are the intended reference for these call sets.
fig.add_hline(y=2.0, line_dash='dash', line_color='gray', 
              annotation_text='Typical WGS (~2.0)', annotation_position='right')
fig.add_hline(y=2.8, line_dash='dash', line_color='lightgray',
              annotation_text='Typical WES (~2.8)', annotation_position='right')

fig.show()

# Ti/Tv summary statistics
# Aggregated across samples, separately for each model
print('\nTi/Tv Ratio Summary:')
titv_summary = landscape_df.groupby('Model')['TiTv_Ratio'].agg(['mean', 'std', 'min', 'max']).round(3)
display(titv_summary)


Ti/Tv Ratio Summary:


Unnamed: 0_level_0,mean,std,min,max
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HCC1395-model,1.87,0.044,1.801,1.909
multicancer-model,1.87,0.044,1.8,1.909


### 5.3 Chromosome Distribution

In [10]:
def create_chromosome_matrix(all_vcf_stats: Dict) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Build per-file chromosome tables: raw counts and percentages of the file total.

    Rows are labelled ``<sample>_<model without '-model'>``; columns are every
    chromosome seen in any file, in natural order. A chromosome absent from a
    file is reported as 0. Percentages are rounded to 2 decimals.

    Returns:
        (count_df, pct_df)
    """
    # Union of the chromosome names across all files, then natural ordering
    observed = set().union(*(entry['stats']['chromosomes'] for entry in all_vcf_stats.values()))
    ordered = natural_sort_chromosomes(list(observed))

    counts_by_label = {}
    pcts_by_label = {}

    for entry in all_vcf_stats.values():
        meta = entry['metadata']
        file_stats = entry['stats']
        row_label = f"{meta['sample']}_{meta['model'].replace('-model', '')}"

        n_variants = file_stats['total']
        per_chrom = file_stats['chromosomes']

        row_counts = {c: per_chrom.get(c, 0) for c in ordered}
        counts_by_label[row_label] = row_counts

        if n_variants > 0:
            pcts_by_label[row_label] = {c: n / n_variants * 100 for c, n in row_counts.items()}
        else:
            pcts_by_label[row_label] = {c: 0 for c in ordered}

    # Transpose so files become rows, then pin the column order
    count_df = pd.DataFrame(counts_by_label).T[ordered]
    pct_df = pd.DataFrame(pcts_by_label).T[ordered].round(2)

    return count_df, pct_df


# Create chromosome matrices
# Rows are sample_model labels, columns are chromosomes in natural order.
chrom_count_df, chrom_pct_df = create_chromosome_matrix(all_vcf_stats)

# Only the first five rows are previewed; the full tables stay in the variables.
print('Chromosome Distribution (Counts):')
display(chrom_count_df.head())

print('\nChromosome Distribution (Percentages):')
display(chrom_pct_df.head())

Chromosome Distribution (Counts):


Unnamed: 0,chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY,chrM
H1437_HCC1395,416921,418397,348029,373671,327316,321094,303581,261503,228919,262285,256402,245083,207716,164989,157343,162671,154216,155339,118079,129823,86059,82904,137712,8500,43
H2009_HCC1395,418508,419945,354560,372821,316853,315651,295092,273544,237833,264780,261048,253107,212609,159417,159449,164171,148569,156795,118507,129270,84184,83881,189367,16896,18
HCC1395_HCC1395,400251,411477,344779,366636,312902,319169,288562,258808,226640,253885,248254,234456,196711,159091,155902,151920,146189,150723,118610,122728,84700,79797,164775,12872,38
HCC1937_HCC1395,405213,405742,345063,361905,311124,311235,294413,268517,231110,256719,250604,239669,208630,159189,159082,162027,153685,153538,116342,127300,86539,80579,165862,16396,36
HCC1954_HCC1395,414150,413448,351969,366920,317195,319958,305034,268746,236801,270238,259920,247797,210531,165671,159171,168159,153210,157866,119112,131536,89539,88717,192257,17923,52



Chromosome Distribution (Percentages):


Unnamed: 0,chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY,chrM
H1437_HCC1395,7.82,7.85,6.53,7.01,6.14,6.03,5.7,4.91,4.3,4.92,4.81,4.6,3.9,3.1,2.95,3.05,2.89,2.92,2.22,2.44,1.62,1.56,2.58,0.16,0.0
H2009_HCC1395,7.74,7.77,6.56,6.9,5.86,5.84,5.46,5.06,4.4,4.9,4.83,4.68,3.93,2.95,2.95,3.04,2.75,2.9,2.19,2.39,1.56,1.55,3.5,0.31,0.0
HCC1395_HCC1395,7.68,7.9,6.62,7.04,6.01,6.13,5.54,4.97,4.35,4.87,4.77,4.5,3.78,3.05,2.99,2.92,2.81,2.89,2.28,2.36,1.63,1.53,3.16,0.25,0.0
HCC1937_HCC1395,7.69,7.7,6.55,6.87,5.9,5.91,5.59,5.09,4.38,4.87,4.75,4.55,3.96,3.02,3.02,3.07,2.92,2.91,2.21,2.42,1.64,1.53,3.15,0.31,0.0
HCC1954_HCC1395,7.63,7.62,6.49,6.76,5.85,5.9,5.62,4.95,4.36,4.98,4.79,4.57,3.88,3.05,2.93,3.1,2.82,2.91,2.2,2.42,1.65,1.64,3.54,0.33,0.0


In [11]:
# Chromosome distribution heatmap (percentage only)
fig = go.Figure()

# Percentage heatmap
# One row per sample_model label, one column per chromosome; each cell is
# annotated with its percentage to one decimal.
fig.add_trace(
    go.Heatmap(
        z=chrom_pct_df.values,
        x=chrom_pct_df.columns,
        y=chrom_pct_df.index,
        colorscale='Greens',
        text=chrom_pct_df.values,
        texttemplate='%{text:.1f}%',
        textfont={'size': 8},
        showscale=True,
        colorbar=dict(title='%')
    )
)

fig.update_layout(
    title='Chromosome Distribution Across Samples - Percentage (Natural Sort Order)',
    height=500,
    width=1200
)

fig.show()


### 5.4 FILTER Label Statistics

In [12]:
def create_filter_summary(all_vcf_stats: Dict) -> pd.DataFrame:
    """
    Create FILTER label summary across all samples.

    Args:
        all_vcf_stats: Mapping of filename to a dict holding 'metadata'
            (with 'sample' and 'model') and 'stats' (with 'total' and
            'filters').

    Returns:
        One row per file, sorted by Sample then Model, with columns
        'Sample', 'Model', 'Total' followed by '<filter>_Count' and
        '<filter>_Pct' for every FILTER label seen in any file (labels in
        alphabetical order). A label absent from a file counts as 0.
        Percentages are relative to the file total, rounded to 2 decimals.
        An empty input yields an empty frame with the three base columns.
    """
    # Without files the frame would have no columns and the final sort
    # would raise KeyError.
    if not all_vcf_stats:
        return pd.DataFrame(columns=['Sample', 'Model', 'Total'])

    # Collect all unique filters
    all_filters = set()
    for data in all_vcf_stats.values():
        all_filters.update(data['stats']['filters'])

    sorted_filters = sorted(all_filters)

    rows = []
    for data in all_vcf_stats.values():
        metadata = data['metadata']
        stats = data['stats']
        total = stats['total']

        row = {
            'Sample': metadata['sample'],
            'Model': metadata['model'],
            'Total': total
        }

        for filt in sorted_filters:
            count = stats['filters'].get(filt, 0)
            row[f'{filt}_Count'] = count
            row[f'{filt}_Pct'] = count / total * 100 if total > 0 else 0

        rows.append(row)

    df = pd.DataFrame(rows)

    # Round percentages
    pct_cols = [c for c in df.columns if c.endswith('_Pct')]
    df[pct_cols] = df[pct_cols].round(2)

    return df.sort_values(['Sample', 'Model']).reset_index(drop=True)


# Build the per-file FILTER table and show it
filter_df = create_filter_summary(all_vcf_stats)

print('FILTER Label Summary:')
display(filter_df)

# Recover the filter names from the '<name>_Count' columns for plotting
filter_cols = [c.replace('_Count', '') for c in filter_df.columns if c.endswith('_Count')]

if filter_cols:
    # Grouped bars: one trace per FILTER label, one bar group per sample_model file
    fig = go.Figure()

    colors = px.colors.qualitative.Set3

    # The x labels are the same for every trace, so build them once
    x_labels = [
        f"{sample_name}_{model_name.replace('-model', '')}"
        for sample_name, model_name in zip(filter_df['Sample'], filter_df['Model'])
    ]

    for i, filt in enumerate(filter_cols):
        pct_labels = filter_df[f'{filt}_Pct'].map('{:.1f}%'.format)
        fig.add_trace(go.Bar(
            name=filt,
            x=x_labels,
            y=filter_df[f'{filt}_Count'],
            marker_color=colors[i % len(colors)],
            text=pct_labels,
            textposition='auto'
        ))

    fig.update_layout(
        title='FILTER Label Distribution (Counts with % Labels)',
        xaxis_title='Sample_Model',
        yaxis_title='Count',
        barmode='group',
        height=500,
        legend=dict(orientation='h', yanchor='bottom', y=1.02)
    )
    fig.show()

FILTER Label Summary:


Unnamed: 0,Sample,Model,Total,GERMLINE_Count,GERMLINE_Pct,PASS_Count,PASS_Pct
0,H1437,HCC1395-model,5328595,5241303,98.36,87292,1.64
1,H1437,multicancer-model,5333715,5241303,98.27,92412,1.73
2,H2009,HCC1395-model,5406875,5238695,96.89,168180,3.11
3,H2009,multicancer-model,5407335,5238695,96.88,168640,3.12
4,HCC1395,HCC1395-model,5209875,5080653,97.52,129222,2.48
5,HCC1395,multicancer-model,5213075,5080653,97.46,132422,2.54
6,HCC1937,HCC1395-model,5270519,5213631,98.92,56888,1.08
7,HCC1937,multicancer-model,5271049,5213631,98.91,57418,1.09
8,HCC1954,HCC1395-model,5425920,5399388,99.51,26532,0.49
9,HCC1954,multicancer-model,5427075,5399388,99.49,27687,0.51


### 5.5 Base Change Spectrum (6-Type Mutation Categories)

In [13]:
def create_base_change_matrix(all_vcf_stats: Dict) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Create base change spectrum matrices (counts and percentages).

    Args:
        all_vcf_stats: Mapping of VCF filename to a record holding
            ``metadata`` (with ``sample`` and ``model``) and ``stats``
            (with the ``base_changes`` counts and the ``snps`` total).

    Returns:
        ``(count_df, pct_df)``. Both have one row per ``<sample>_<model>``
        label (with ``-model`` removed from the model name) and one column
        per category in the order C>A, C>G, C>T, T>A, T>C, T>G. Percentages
        are relative to the file's SNP total, rounded to two decimals, and
        are 0 when the file has no SNPs. When ``all_vcf_stats`` is empty both
        frames are empty but still carry the six category columns.
    """
    categories = ['C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G']

    count_rows = {}
    pct_rows = {}

    for data in all_vcf_stats.values():
        metadata = data['metadata']
        label = f"{metadata['sample']}_{metadata['model'].replace('-model', '')}"

        base_changes = data['stats']['base_changes']
        snp_total = data['stats']['snps']

        # Categories absent from the file count as zero
        counts = {c: base_changes.get(c, 0) for c in categories}
        count_rows[label] = counts
        pct_rows[label] = {
            c: n / snp_total * 100 if snp_total > 0 else 0
            for c, n in counts.items()
        }

    # from_dict(..., columns=...) fixes the column set up front, so an empty
    # input produces an empty frame instead of a KeyError on column selection.
    count_df = pd.DataFrame.from_dict(count_rows, orient='index', columns=categories)
    pct_df = pd.DataFrame.from_dict(pct_rows, orient='index', columns=categories).round(2)

    return count_df, pct_df


# Build the count and percentage matrices for the six substitution classes
bc_count_df, bc_pct_df = create_base_change_matrix(all_vcf_stats)

# Show the counts table first, then the percentages table
print('Base Change Spectrum (Counts):')
display(bc_count_df)

print('\nBase Change Spectrum (Percentages of SNPs):')
display(bc_pct_df)

# One fixed colour per mutation category, reused in both panels
spectrum_colors = {
    'C>A': '#3498db',
    'C>G': '#000000',
    'C>T': '#e74c3c',
    'T>A': '#95a5a6',
    'T>C': '#2ecc71',
    'T>G': '#f39c12',
}
categories = list(spectrum_colors)

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Base Change Counts', 'Base Change Percentages'],
    horizontal_spacing=0.1
)

# (matrix, subplot column, show in legend): only the count bars carry the
# legend entry; the percentage bars join the same legend group.
panels = [
    (bc_count_df, 1, True),
    (bc_pct_df, 2, False),
]

for cat in categories:
    for matrix, subplot_col, in_legend in panels:
        fig.add_trace(
            go.Bar(
                name=cat,
                x=matrix.index,
                y=matrix[cat],
                marker_color=spectrum_colors[cat],
                legendgroup=cat,
                showlegend=in_legend
            ),
            row=1, col=subplot_col
        )

fig.update_layout(
    title='6-Type Mutation Spectrum Across Samples',
    barmode='group',
    height=500,
    legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='center', x=0.5)
)
fig.update_xaxes(tickangle=45)
for subplot_col, axis_title in ((1, 'Count'), (2, 'Percentage (%)')):
    fig.update_yaxes(title_text=axis_title, row=1, col=subplot_col)

fig.show()

Base Change Spectrum (Counts):


Unnamed: 0,C>A,C>G,C>T,T>A,T>C,T>G
H1437_HCC1395,403308,390596,1423138,339998,1359808,368967
H2009_HCC1395,438138,408336,1451083,349891,1369460,370098
HCC1395_HCC1395,376652,387852,1423566,327663,1352690,362076
HCC1937_HCC1395,380825,386579,1420896,335377,1359030,366760
HCC1954_HCC1395,391280,396411,1464627,344905,1398820,378673
H1437_multicancer,403924,391056,1424974,340379,1361403,369430
H2009_multicancer,438288,408353,1451189,349934,1369588,370100
HCC1395_multicancer,376901,388075,1424131,327939,1353176,362288
HCC1937_multicancer,380898,386607,1421158,335457,1359202,366791
HCC1954_multicancer,391303,396445,1464715,344939,1398869,378705



Base Change Spectrum (Percentages of SNPs):


Unnamed: 0,C>A,C>G,C>T,T>A,T>C,T>G
H1437_HCC1395,9.41,9.11,33.21,7.93,31.73,8.61
H2009_HCC1395,9.99,9.31,33.08,7.98,31.22,8.44
HCC1395_HCC1395,8.9,9.17,33.65,7.75,31.97,8.56
HCC1937_HCC1395,8.96,9.1,33.44,7.89,31.98,8.63
HCC1954_HCC1395,8.94,9.06,33.48,7.88,31.98,8.66
H1437_multicancer,9.41,9.11,33.21,7.93,31.73,8.61
H2009_multicancer,9.99,9.31,33.08,7.98,31.22,8.44
HCC1395_multicancer,8.9,9.17,33.65,7.75,31.97,8.56
HCC1937_multicancer,8.96,9.1,33.44,7.89,31.98,8.63
HCC1954_multicancer,8.94,9.06,33.48,7.88,31.97,8.66


### 5.6 Indel Size Distribution

In [14]:
# aggregate_indel_statistics returns three tables: a per-file summary, the
# bins aggregated over all files, and the bins broken down per file.
indel_tables = aggregate_indel_statistics(all_vcf_stats)
indel_summary_df, indel_binned_df, per_sample_binned_df = indel_tables

print('Indel Size Summary:')
display(indel_summary_df)

print('\nIndel Size Distribution by Bins (Aggregated):')
display(indel_binned_df)

# Grouped bar chart of the aggregated bins (insertions next to deletions)
fig = plot_indel_binned_distribution(
    indel_binned_df,
    'Indel Size Distribution by Bins (Aggregated Across All Samples)'
)
fig.show()

# Headline numbers: totals and the most populated bin for each indel type
insertion_counts = indel_binned_df["Insertions_Count"]
deletion_counts = indel_binned_df["Deletions_Count"]

print('\n📊 Indel Statistics Summary:')
print(f'Total Insertions: {insertion_counts.sum():,}')
print(f'Total Deletions: {deletion_counts.sum():,}')
print(f'Most common insertion bin: {indel_binned_df.loc[insertion_counts.idxmax(), "Size_Bin"]}')
print(f'Most common deletion bin: {indel_binned_df.loc[deletion_counts.idxmax(), "Size_Bin"]}')


Indel Size Summary:


Unnamed: 0,Sample,Model,Total_INDELs,Insertions_Count,Insertions_Pct,Deletions_Count,Deletions_Pct,Median_Size,Max_Insertion,Max_Deletion,Mean_Insertion,Mean_Deletion
0,H1437,HCC1395-model,1042780,462961,44.4,570808,54.74,1.0,4250,11364,7.12,4.17
1,H1437,multicancer-model,1042549,462658,44.38,570880,54.76,1.0,4250,11364,7.12,4.17
2,H2009,HCC1395-model,1019869,486474,47.7,523957,51.37,2.0,3720,6061,7.46,4.31
3,H2009,multicancer-model,1019883,486635,47.71,523815,51.36,1.0,3720,6061,7.46,4.31
4,HCC1395,HCC1395-model,979376,473528,48.35,499527,51.0,2.0,4754,6191,7.94,4.45
5,HCC1395,multicancer-model,980565,474025,48.34,500219,51.01,2.0,4754,6191,7.94,4.45
6,HCC1937,HCC1395-model,1021052,470779,46.11,541547,53.04,1.0,3266,6209,6.72,4.2
7,HCC1937,multicancer-model,1020936,470802,46.11,541409,53.03,1.0,3266,6209,6.72,4.19
8,HCC1954,HCC1395-model,1051204,481959,45.85,560233,53.29,1.0,2935,12913,6.79,4.21
9,HCC1954,multicancer-model,1052099,482421,45.85,560666,53.29,1.0,2935,12913,6.79,4.2



Indel Size Distribution by Bins (Aggregated):


Unnamed: 0,Size_Bin,Insertions_Count,Insertions_Pct,Deletions_Count,Deletions_Pct
0,1,2447815,51.51,2668299,48.71
1,2,771315,16.23,937058,17.11
2,3,273962,5.76,347823,6.35
3,4-14,986460,20.76,1240397,22.64
4,15-29,137947,2.9,201276,3.67
5,30-49,46769,0.98,57534,1.05
6,>50,87974,1.85,25684,0.47



📊 Indel Statistics Summary:
Total Insertions: 4,752,242
Total Deletions: 5,478,071
Most common insertion bin: 1
Most common deletion bin: 1


### 5.6.1 Per-Sample Indel Size Distribution - Heatmaps

In [15]:
# Add a combined 'Sample_Model' label column ('-model' removed from the model name)
short_model = per_sample_binned_df['Model'].str.replace('-model', '')
per_sample_binned_df['Sample_Model'] = per_sample_binned_df['Sample'] + '_' + short_model

# Insertions heatmap (green scale)
insertion_heatmap_options = dict(
    value_col='Insertions_Pct',
    title='Per-Sample Insertion Size Distribution (% of Total Insertions)',
    color_scale='Greens'
)
fig_ins_heatmap = plot_indel_per_sample_heatmap(per_sample_binned_df, **insertion_heatmap_options)
fig_ins_heatmap.show()

# Deletions heatmap (red scale)
deletion_heatmap_options = dict(
    value_col='Deletions_Pct',
    title='Per-Sample Deletion Size Distribution (% of Total Deletions)',
    color_scale='Reds'
)
fig_del_heatmap = plot_indel_per_sample_heatmap(per_sample_binned_df, **deletion_heatmap_options)
fig_del_heatmap.show()


### 5.6.2 Per-Sample Indel Size Distribution - Stacked Bar Charts

In [16]:
# Light-to-dark green gradient, one shade per size bin; shared by all four charts
bin_color_map = dict(zip(
    ('1', '2', '3', '4-14', '15-29', '30-49', '>50'),
    ('#c7e9c0', '#a1d99b', '#74c476', '#41ab5d', '#238b45', '#006d2c', '#00441b'),
))


def _stacked_indel_figure(variant_type, metric, title):
    """Build one per-sample stacked bar chart for ``<variant_type>_<metric>``."""
    return plot_indel_per_sample_stacked(
        per_sample_binned_df,
        variant_type=variant_type,
        value_col=f'{variant_type}_{metric}',
        title=title,
        color_map=bin_color_map,
        height=550
    )


# Insertions - Counts
fig_ins_counts = _stacked_indel_figure(
    'Insertions', 'Count', 'Per-Sample Insertion Counts by Size Bin'
)
fig_ins_counts.show()

# Insertions - Percentages
fig_ins_pct = _stacked_indel_figure(
    'Insertions', 'Pct', 'Per-Sample Insertion Percentages by Size Bin'
)
fig_ins_pct.show()

# Deletions - Counts
fig_del_counts = _stacked_indel_figure(
    'Deletions', 'Count', 'Per-Sample Deletion Counts by Size Bin'
)
fig_del_counts.show()

# Deletions - Percentages
fig_del_pct = _stacked_indel_figure(
    'Deletions', 'Pct', 'Per-Sample Deletion Percentages by Size Bin'
)
fig_del_pct.show()


### 5.7 Model Comparison Summary

In [17]:
# Aggregate the landscape table per model (sums, means and spreads)
aggregations = {
    'Total_Variants': ['sum', 'mean', 'std'],
    'SNP_Count': 'sum',
    'SNP_Pct': 'mean',
    'INDEL_Count': 'sum',
    'INDEL_Pct': 'mean',
    'TiTv_Ratio': ['mean', 'std'],
    'PASS_Pct': 'mean'
}
model_comparison = landscape_df.groupby('Model').agg(aggregations).round(2)

# Flatten the (column, aggregation) pairs into '<column>_<aggregation>' names
model_comparison.columns = ['_'.join(col).strip() for col in model_comparison.columns.values]

banner = '=' * 80
print(banner)
print('MODEL COMPARISON: Aggregate Statistics')
print(banner)
display(model_comparison)

# Three side-by-side bar panels, one bar per model in each
fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=['Total Variants', 'Mean Ti/Tv Ratio', 'Mean SNP Percentage'],
    horizontal_spacing=0.1
)

models = model_comparison.index.tolist()
colors = px.colors.qualitative.Set2[:len(models)]

# (metric column, bar label formatter, extra Bar options, y-axis title);
# only the Ti/Tv panel carries error bars, taken from the per-model std.
bar_panels = [
    ('Total_Variants_sum', lambda x: f'{x:,.0f}', {}, 'Count'),
    (
        'TiTv_Ratio_mean',
        lambda x: f'{x:.2f}',
        {'error_y': dict(type='data', array=model_comparison['TiTv_Ratio_std'].values)},
        'Ti/Tv Ratio',
    ),
    ('SNP_Pct_mean', lambda x: f'{x:.1f}%', {}, 'Percentage (%)'),
]

for panel_col, (metric, formatter, extra_options, _) in enumerate(bar_panels, start=1):
    fig.add_trace(
        go.Bar(
            x=models,
            y=model_comparison[metric],
            marker_color=colors,
            text=model_comparison[metric].apply(formatter),
            textposition='outside',
            **extra_options
        ),
        row=1, col=panel_col
    )

fig.update_layout(
    title='Model Comparison Overview',
    height=400,
    showlegend=False
)
for panel_col, (_, _, _, axis_title) in enumerate(bar_panels, start=1):
    fig.update_yaxes(title_text=axis_title, row=1, col=panel_col)

fig.show()

MODEL COMPARISON: Aggregate Statistics


Unnamed: 0_level_0,Total_Variants_sum,Total_Variants_mean,Total_Variants_std,SNP_Count_sum,SNP_Pct_mean,INDEL_Count_sum,INDEL_Pct_mean,TiTv_Ratio_mean,TiTv_Ratio_std,PASS_Pct_mean
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
HCC1395-model,26641784,5328356.8,90921.52,21527503,80.81,5114281,19.19,1.87,0.04,1.76
multicancer-model,26652249,5330449.8,90227.11,21536217,80.8,5116032,19.2,1.87,0.04,1.8


## 6. Per-Sample Analysis

Detailed statistics for each individual sample.

In [18]:
def create_sample_report(sample_name: str, all_vcf_stats: Dict) -> Optional[Dict[str, Any]]:
    """
    Create comprehensive report for a single sample across all models.

    Args:
        sample_name: Sample to select; matched against
            ``data['metadata']['sample']`` of every entry.
        all_vcf_stats: Mapping of VCF filename to a record with ``metadata``
            (``sample``, ``model``) and ``stats``.

    Returns:
        ``None`` when no entry belongs to ``sample_name``. Otherwise a dict
        with three keys:

        * ``sample``: the sample name;
        * ``models``: per-model statistics keyed by model name (the values
          reference the objects stored in ``all_vcf_stats``, not copies);
        * ``combined``: totals, chromosome counts and base change counts
          summed over all of the sample's models, plus the concatenation of
          their indel size lists.
    """
    # Get all entries for this sample
    sample_data = {
        fname: data for fname, data in all_vcf_stats.items()
        if data['metadata']['sample'] == sample_name
    }

    if not sample_data:
        return None

    per_model_keys = (
        'total', 'snps', 'indels', 'titv_ratio',
        'filters', 'chromosomes', 'base_changes', 'indel_sizes',
    )

    models = {}
    all_chroms = defaultdict(int)
    all_base_changes = defaultdict(int)
    all_indel_sizes = []
    total_snps = 0
    total_indels = 0
    total_variants = 0

    for data in sample_data.values():
        stats = data['stats']

        # Per-model statistics
        models[data['metadata']['model']] = {key: stats[key] for key in per_model_keys}

        # Aggregate across this sample's models
        total_variants += stats['total']
        total_snps += stats['snps']
        total_indels += stats['indels']

        for c, count in stats['chromosomes'].items():
            all_chroms[c] += count
        for bc, count in stats['base_changes'].items():
            all_base_changes[bc] += count
        all_indel_sizes.extend(stats['indel_sizes'])

    return {
        'sample': sample_name,
        'models': models,
        'combined': {
            'total_variants': total_variants,
            'total_snps': total_snps,
            'total_indels': total_indels,
            'chromosomes': dict(all_chroms),
            'base_changes': dict(all_base_changes),
            'indel_sizes': all_indel_sizes
        }
    }


def display_sample_analysis(sample_name: str, all_vcf_stats: Dict):
    """Print a summary table and render a four-panel figure for one sample.

    The report comes from ``create_sample_report``. When it is empty, a
    notice is printed and ``None`` is returned. Otherwise a per-model
    variant-type table is displayed, followed by a 2x2 Plotly figure built
    from the combined (summed over models) statistics: SNP/INDEL pie, the ten
    chromosomes with the highest counts, the six-class base change spectrum,
    and binned insertion/deletion counts. The report dict is returned.
    """
    report = create_sample_report(sample_name, all_vcf_stats)

    if not report:
        print(f'No data found for sample: {sample_name}')
        return

    banner = '=' * 80
    print(banner)
    print(f'SAMPLE: {sample_name}')
    print(banner)

    def _share(part, whole):
        # Share of `whole` as a percentage label; '0%' when there is nothing to divide by
        return f"{part/whole*100:.1f}%" if whole > 0 else '0%'

    # Summary table: one row per model
    summary_df = pd.DataFrame([
        {
            'Model': model_name,
            'Total': model_stats['total'],
            'SNP_Count': model_stats['snps'],
            'SNP_Pct': _share(model_stats['snps'], model_stats['total']),
            'INDEL_Count': model_stats['indels'],
            'INDEL_Pct': _share(model_stats['indels'], model_stats['total']),
            'Ti/Tv': f"{model_stats['titv_ratio']:.3f}" if model_stats['titv_ratio'] else 'N/A'
        }
        for model_name, model_stats in report['models'].items()
    ])
    print('\n📊 Variant Type Summary:')
    display(summary_df)

    # 2x2 grid: pie in the top-left cell, bar charts in the other three
    fig = make_subplots(
        rows=2,
        cols=2,
        specs=[
            [{'type': 'pie'}, {'type': 'bar'}],
            [{'type': 'bar'}, {'type': 'bar'}],
        ],
        subplot_titles=[
            'Variant Type Distribution (Combined)',
            'Chromosome Distribution (Top 10)',
            'Base Change Spectrum',
            'Indel Size Distribution (Binned)'
        ]
    )

    combined = report['combined']

    # Panel 1: SNP vs INDEL share of the combined totals
    variant_pie = go.Pie(
        labels=['SNP', 'INDEL'],
        values=[combined['total_snps'], combined['total_indels']],
        marker_colors=['#636EFA', '#EF553B'],
        textinfo='label+percent+value',
        hole=0.3
    )
    fig.add_trace(variant_pie, row=1, col=1)

    # Panel 2: the ten chromosomes with the highest counts. Pairs start in
    # natural chromosome order; the stable sort keeps that order among ties.
    chrom_totals = combined['chromosomes']
    chrom_pairs = [
        (name, chrom_totals[name])
        for name in natural_sort_chromosomes(list(chrom_totals.keys()))
    ]
    total_chrom = sum(count for _, count in chrom_pairs)
    chrom_pairs.sort(key=lambda pair: pair[1], reverse=True)
    leading_pairs = chrom_pairs[:10]
    top_chroms = [name for name, _ in leading_pairs]
    top_counts = [count for _, count in leading_pairs]

    chrom_bars = go.Bar(
        x=top_chroms,
        y=top_counts,
        marker_color='#636EFA',
        text=[f'{c:,}<br>({c/total_chrom*100:.1f}%)' for c in top_counts],
        textposition='outside'
    )
    fig.add_trace(chrom_bars, row=1, col=2)

    # Panel 3: six-class base change spectrum with count and share labels
    bc_palette = {
        'C>A': '#3498db',
        'C>G': '#000000',
        'C>T': '#e74c3c',
        'T>A': '#95a5a6',
        'T>C': '#2ecc71',
        'T>G': '#f39c12',
    }
    bc_categories = list(bc_palette)
    bc_counts = [combined['base_changes'].get(category, 0) for category in bc_categories]
    snp_total = combined['total_snps']
    if snp_total > 0:
        bc_labels = [f'{c:,}<br>({c/snp_total*100:.1f}%)' for c in bc_counts]
    else:
        bc_labels = [f'{c:,}' for c in bc_counts]

    spectrum_bars = go.Bar(
        x=bc_categories,
        y=bc_counts,
        marker_color=list(bc_palette.values()),
        text=bc_labels,
        textposition='outside'
    )
    fig.add_trace(spectrum_bars, row=2, col=1)

    # Panel 4: binned indel sizes, insertions first and then deletions
    if combined['indel_sizes']:
        bin_stats = indel_stats_by_bin(combined['indel_sizes'])
        indel_series = (
            ('Insertions', 'insertions', INSERTIONS_COLOR, 0),
            ('Deletions', 'deletions', DELETIONS_COLOR, 1),
        )
        for trace_name, stats_key, bar_color, group_index in indel_series:
            fig.add_trace(
                go.Bar(
                    name=trace_name,
                    x=INDEL_SIZE_BINS,
                    y=[bin_stats[stats_key][bin_label] for bin_label in INDEL_SIZE_BINS],
                    marker_color=bar_color,
                    offsetgroup=group_index,
                    showlegend=True
                ),
                row=2, col=2
            )

    fig.update_layout(
        title=f'Sample Analysis: {sample_name}',
        height=800,
        barmode='group'
    )
    fig.update_xaxes(title_text='Size Bin (bp)', row=2, col=2)
    fig.update_yaxes(title_text='Count', row=2, col=2)

    fig.show()

    return report


# Collect the distinct sample names present in the loaded VCF statistics
unique_samples = list({entry['metadata']['sample'] for entry in all_vcf_stats.values()})
print(f'Found {len(unique_samples)} unique samples: {sorted(unique_samples)}')


Found 5 unique samples: ['H1437', 'H2009', 'HCC1395', 'HCC1937', 'HCC1954']


In [19]:
# Render the analysis for every sample in alphabetical order and keep the
# reports of the samples that produced one.
sample_reports = {}
ordered_samples = sorted(unique_samples)

for sample in ordered_samples:
    report = display_sample_analysis(sample, all_vcf_stats)
    if report:
        sample_reports.update({sample: report})
    # blank separator between consecutive samples
    print('\n')

SAMPLE: H1437

📊 Variant Type Summary:


Unnamed: 0,Model,Total,SNP_Count,SNP_Pct,INDEL_Count,INDEL_Pct,Ti/Tv
0,HCC1395-model,5328595,4285815,80.4%,1042780,19.6%,1.852
1,multicancer-model,5333715,4291166,80.5%,1042549,19.5%,1.852




SAMPLE: H2009

📊 Variant Type Summary:


Unnamed: 0,Model,Total,SNP_Count,SNP_Pct,INDEL_Count,INDEL_Pct,Ti/Tv
0,HCC1395-model,5406875,4387006,81.1%,1019869,18.9%,1.801
1,multicancer-model,5407335,4387452,81.1%,1019883,18.9%,1.8




SAMPLE: HCC1395

📊 Variant Type Summary:


Unnamed: 0,Model,Total,SNP_Count,SNP_Pct,INDEL_Count,INDEL_Pct,Ti/Tv
0,HCC1395-model,5209875,4230499,81.2%,979376,18.8%,1.909
1,multicancer-model,5213075,4232510,81.2%,980565,18.8%,1.909




SAMPLE: HCC1937

📊 Variant Type Summary:


Unnamed: 0,Model,Total,SNP_Count,SNP_Pct,INDEL_Count,INDEL_Pct,Ti/Tv
0,HCC1395-model,5270519,4249467,80.6%,1021052,19.4%,1.892
1,multicancer-model,5271049,4250113,80.6%,1020936,19.4%,1.892




SAMPLE: HCC1954

📊 Variant Type Summary:


Unnamed: 0,Model,Total,SNP_Count,SNP_Pct,INDEL_Count,INDEL_Pct,Ti/Tv
0,HCC1395-model,5425920,4374716,80.6%,1051204,19.4%,1.895
1,multicancer-model,5427075,4374976,80.6%,1052099,19.4%,1.895






## 7. Combined Summary and Export

Export all statistics to Excel and CSV formats.

In [20]:
def export_all_statistics(
    landscape_df: pd.DataFrame,
    chrom_count_df: pd.DataFrame,
    chrom_pct_df: pd.DataFrame,
    bc_count_df: pd.DataFrame,
    bc_pct_df: pd.DataFrame,
    filter_df: pd.DataFrame,
    indel_summary_df: pd.DataFrame,
    indel_binned_df: pd.DataFrame,
    per_sample_binned_df: pd.DataFrame,
    output_dir: Path
) -> Path:
    """Export all statistics to a multi-sheet Excel file and per-table CSVs.

    Writes ``vcf_benchmarking_statistics.xlsx`` (one sheet per table) into
    ``output_dir`` and one CSV per table into ``output_dir / 'csv'``. The
    chromosome and base change matrices are written with their index; all
    other tables are written without it. ``output_dir`` is created, parents
    included, when it does not exist yet.

    Args:
        landscape_df: Landscape summary table.
        chrom_count_df: Chromosome count matrix.
        chrom_pct_df: Chromosome percentage matrix.
        bc_count_df: Base change count matrix.
        bc_pct_df: Base change percentage matrix.
        filter_df: FILTER label table.
        indel_summary_df: Indel summary table.
        indel_binned_df: Aggregated indel size bins.
        per_sample_binned_df: Per-sample indel size bins.
        output_dir: Directory receiving the Excel file and the ``csv`` folder.

    Returns:
        Path of the written Excel file.
    """
    # (frame, Excel sheet name, CSV file name, write the index?)
    tables = [
        (landscape_df, 'Landscape_Summary', 'landscape_summary.csv', False),
        (chrom_count_df, 'Chromosome_Counts', 'chromosome_counts.csv', True),
        (chrom_pct_df, 'Chromosome_Percentages', 'chromosome_percentages.csv', True),
        (bc_count_df, 'BaseChange_Counts', 'basechange_counts.csv', True),
        (bc_pct_df, 'BaseChange_Percentages', 'basechange_percentages.csv', True),
        (filter_df, 'Filter_Labels', 'filter_labels.csv', False),
        (indel_summary_df, 'Indel_Summary', 'indel_summary.csv', False),
        (indel_binned_df, 'Indel_Binned', 'indel_binned.csv', False),
        (per_sample_binned_df, 'Indel_Per_Sample_Binned', 'indel_per_sample_binned.csv', False),
    ]

    # The Excel writer does not create missing directories, so make sure the
    # target exists before the first write.
    output_dir.mkdir(parents=True, exist_ok=True)

    excel_path = output_dir / 'vcf_benchmarking_statistics.xlsx'

    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
        for frame, sheet_name, _, keep_index in tables:
            frame.to_excel(writer, sheet_name=sheet_name, index=keep_index)

    print(f'✓ Excel exported: {excel_path}')

    # Also export individual CSVs
    csv_dir = output_dir / 'csv'
    csv_dir.mkdir(exist_ok=True)

    for frame, _, csv_name, keep_index in tables:
        frame.to_csv(csv_dir / csv_name, index=keep_index)

    print(f'✓ CSV files exported to: {csv_dir}')

    return excel_path


# Gather every table produced above and write them out in one call
export_inputs = dict(
    landscape_df=landscape_df,
    chrom_count_df=chrom_count_df,
    chrom_pct_df=chrom_pct_df,
    bc_count_df=bc_count_df,
    bc_pct_df=bc_pct_df,
    filter_df=filter_df,
    indel_summary_df=indel_summary_df,
    indel_binned_df=indel_binned_df,
    per_sample_binned_df=per_sample_binned_df,
    output_dir=OUTPUT_DIR,
)
excel_path = export_all_statistics(**export_inputs)


✓ Excel exported: /t9k/mnt/hdd/work/Vax/pipeline/rnadnavar/notebook/castle_statistics_output/vcf_benchmarking_statistics.xlsx
✓ CSV files exported to: /t9k/mnt/hdd/work/Vax/pipeline/rnadnavar/notebook/castle_statistics_output/csv


### 7.1 Final Summary Dashboard

In [21]:
# Final dashboard: headline totals, a sample-by-model cross-tab, and rankings
banner = '=' * 80
print(banner)
print('FINAL SUMMARY DASHBOARD')
print(banner)

# Totals over every row of the landscape table (one row per VCF file)
total_variants, total_snps, total_indels = (
    landscape_df[column].sum()
    for column in ('Total_Variants', 'SNP_Count', 'INDEL_Count')
)
mean_titv = landscape_df['TiTv_Ratio'].mean()

print(f'''
📊 OVERALL STATISTICS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Total VCF Files Analyzed:  {len(all_vcf_stats)}
Total Unique Samples:      {len(unique_samples)}
Total Models:              {len(landscape_df['Model'].unique())}

Total Variants:            {total_variants:,}
  • SNPs:                  {total_snps:,} ({total_snps/total_variants*100:.1f}%)
  • INDELs:                {total_indels:,} ({total_indels/total_variants*100:.1f}%)

Mean Ti/Tv Ratio:          {mean_titv:.3f}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
''')

# Samples as rows, (metric, model) pairs as columns
print('\n📋 CROSS-TABULATION: Sample × Key Metrics')
cross_tab = landscape_df.pivot_table(
    index='Sample',
    columns='Model',
    values=['Total_Variants', 'SNP_Pct', 'TiTv_Ratio'],
    aggfunc='first'
)
display(cross_tab.round(2))

# Five chromosomes with the largest summed counts, as a share of all variants
print('\n🧬 TOP 5 CHROMOSOMES BY VARIANT COUNT (All Samples Combined):')
chrom_totals = chrom_count_df.sum()
combined_chroms = chrom_totals.sort_values(ascending=False).head(5)
for chrom, count in combined_chroms.items():
    pct = count / total_variants * 100
    print(f'  {chrom}: {count:,} ({pct:.1f}%)')

# Base change categories ranked by summed count, as a share of all SNPs
print('\n🔬 BASE CHANGE SPECTRUM SUMMARY (All Samples Combined):')
bc_totals = bc_count_df.sum()
combined_bc = bc_totals.sort_values(ascending=False)
for bc, count in combined_bc.items():
    pct = count / total_snps * 100 if total_snps > 0 else 0
    print(f'  {bc}: {count:,} ({pct:.1f}%)')

print(f'''
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
✓ Analysis complete!
✓ Results exported to: {OUTPUT_DIR}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
''')

FINAL SUMMARY DASHBOARD

📊 OVERALL STATISTICS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Total VCF Files Analyzed:  10
Total Unique Samples:      5
Total Models:              2

Total Variants:            53,294,033
  • SNPs:                  43,063,720 (80.8%)
  • INDELs:                10,230,313 (19.2%)

Mean Ti/Tv Ratio:          1.870
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━


📋 CROSS-TABULATION: Sample × Key Metrics


Unnamed: 0_level_0,SNP_Pct,SNP_Pct,TiTv_Ratio,TiTv_Ratio,Total_Variants,Total_Variants
Model,HCC1395-model,multicancer-model,HCC1395-model,multicancer-model,HCC1395-model,multicancer-model
Sample,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
H1437,80.43,80.45,1.85,1.85,5328595,5333715
H2009,81.14,81.14,1.8,1.8,5406875,5407335
HCC1395,81.2,81.19,1.91,1.91,5209875,5213075
HCC1937,80.63,80.63,1.89,1.89,5270519,5271049
HCC1954,80.63,80.61,1.9,1.9,5425920,5427075



🧬 TOP 5 CHROMOSOMES BY VARIANT COUNT (All Samples Combined):
  chr2: 4,138,178 (7.8%)
  chr1: 4,110,255 (7.7%)
  chr4: 3,684,134 (6.9%)
  chr3: 3,489,101 (6.5%)
  chr6: 3,175,363 (6.0%)

🔬 BASE CHANGE SPECTRUM SUMMARY (All Samples Combined):
  C>T: 14,369,477 (33.4%)
  T>C: 13,682,046 (31.8%)
  C>A: 3,981,517 (9.2%)
  C>G: 3,940,310 (9.1%)
  T>G: 3,693,888 (8.6%)
  T>A: 3,396,482 (7.9%)

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
✓ Analysis complete!
✓ Results exported to: /t9k/mnt/hdd/work/Vax/pipeline/rnadnavar/notebook/castle_statistics_output
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

