# DNA-Only VCF Statistics Analysis

**Purpose**: Comprehensive VCF statistics analysis for DNA-only pipeline output.

**Key Features:**
- 🧬 **DNA-Only Workflow Support**: Automatic detection of DNA-only pipeline output
- 📊 **Per-Caller Statistics**: Variant counts by type (SNP/INDEL) and classification distribution
- 🎯 **Consensus Statistics**: Combined variant analysis from multiple callers
- 📈 **3-Way Venn Diagrams**: Caller comparison (Strelka, DeepSomatic, Mutect2)
- ✅ **Truth Set Comparison**: Precision, Recall, F1 metrics against reference truth sets
- 📁 **Export Capabilities**: Statistics to CSV and visualizations to HTML

**DNA-Only Pipeline Stages:**
1. `variant_calling/` - Individual caller outputs (Strelka, DeepSomatic, Mutect2)
2. `consensus/` - Combined consensus VCF

**Required Configuration**: Edit the next cell with your sample-specific paths before running.

In [None]:
# ==================================================================================
# CONFIGURATION - EDIT THIS CELL FOR YOUR SAMPLE
# ==================================================================================

from pathlib import Path

# Sample Configuration
SAMPLE_ID = "YOUR_SAMPLE_ID"  # Change this to your sample ID

# Base directory containing DNA-only pipeline output
# Expected structure: {BASE_DIR}/variant_calling/, {BASE_DIR}/consensus/
BASE_DIR = Path("/path/to/your/dna_only/output")

# Output directory for statistics and visualizations
OUTPUT_DIR = Path(f"{SAMPLE_ID}_dna_only_statistics")

# ==================================================================================
# TRUTH SET COMPARISON CONFIGURATION (Optional)
# ==================================================================================

# Path to truth VCF file (e.g., SEQC2 high-confidence somatic variants)
# Set to None to skip truth set comparison
TRUTH_VCF = None  # Path("/path/to/truth.vcf.gz")

# Path to high-confidence regions BED file
# Set to None to compare all regions
HIGH_CONFIDENCE_BED = None  # Path("/path/to/hc_regions.bed")

# Path to som.py metrics directory (alternative to direct comparison)
# If provided, will parse pre-computed metrics instead of running bcftools
SOM_PY_METRICS_DIR = None  # Path("/path/to/som_py_output")

# ==================================================================================
# VALIDATION - DO NOT EDIT
# ==================================================================================

# The base directory is mandatory: stop immediately when it is missing.
if not BASE_DIR.exists():
    raise FileNotFoundError(f"❌ Base directory not found: {BASE_DIR}")

# Create output directory (including any missing parents; no error if present)
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Optional inputs, labelled for the summary below. These never abort the run;
# a configured path that does not exist is only reported as a warning so that
# a mistyped path is visible right away.
optional_inputs = {
    "Truth VCF": TRUTH_VCF,
    "High-confidence BED": HIGH_CONFIDENCE_BED,
    "som.py metrics directory": SOM_PY_METRICS_DIR,
}

print("✅ Configuration validated")
print(f"   Sample ID: {SAMPLE_ID}")
print(f"   Base directory: {BASE_DIR}")
print(f"   Output directory: {OUTPUT_DIR}")
for input_label, input_path in optional_inputs.items():
    print(f"   {input_label}: {input_path or 'Not configured'}")

for input_label, input_path in optional_inputs.items():
    if input_path and not Path(input_path).exists():
        print(f"⚠️  {input_label} not found: {input_path}")

## 1. Setup and Imports

Import the VCF statistics analysis modules and configure the Python environment.

In [None]:
# Core third-party and standard-library imports used throughout the notebook
import sys
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go
from IPython.display import display, HTML

# Make the project code importable: bin/ (vcf_stats modules) and bin/common
# (shared config modules). Each directory is pushed to the front of sys.path
# in turn, so bin/common ends up ahead of bin/ when both are newly added.
bin_path = Path.cwd().parent / "bin"
bin_common_path = bin_path / "common"
for extra_dir in map(str, (bin_path, bin_common_path)):
    if extra_dir not in sys.path:
        sys.path.insert(0, extra_dir)

# VCF statistics package: workflow management, file discovery, statistics,
# caller comparison, and shared constants
from vcf_stats import (
    WorkflowManager,
    WorkflowType,
    VCFFileDiscovery,
    StatisticsAggregator,
    CallerComparator,
    TOOLS,
    CATEGORY_ORDER,
    CATEGORY_COLORS,
)

# Truth set comparison helpers
from vcf_stats.truth_comparison import TruthSetComparator, TruthComparisonResult

print("✅ VCF statistics modules imported successfully")
print(f"   - Supported callers: {TOOLS}")
print(f"   - Categories: {', '.join(CATEGORY_ORDER)}")

## 2. Workflow Detection and File Discovery

Automatically detect DNA-only workflow and discover VCF files.

**DNA-Only Detection Criteria:**
- `variant_calling/` directory exists
- `consensus/` directory exists
- `vcf_realignment/` directory does NOT exist
- `rescue/` directory does NOT exist

In [None]:
# Detect which workflows are present under BASE_DIR and report on DNA-only
banner = "=" * 80
print(banner)
print("WORKFLOW DETECTION")
print(banner)

workflow_manager = WorkflowManager(BASE_DIR)
detected_workflows = workflow_manager.detect_workflows()

print(f"\nDetected workflows: {[wf.value for wf in detected_workflows]}")

if WorkflowType.DNA_ONLY in detected_workflows:
    print("\n✅ DNA-only workflow detected successfully!")

    # Report the configuration the manager holds for the DNA-only workflow
    dna_only_config = workflow_manager.get_workflow_config(WorkflowType.DNA_ONLY)
    print("\nDNA-Only Workflow Configuration:")
    print(f"  Base path: {dna_only_config.base_path}")
    print(f"  Stages: {', '.join(dna_only_config.stages)}")

    # Report the stages the manager considers available
    available_stages = workflow_manager.get_available_stages(WorkflowType.DNA_ONLY)
    print(f"  Available stages: {', '.join(available_stages)}")
else:
    # Explain what was expected, then show which directories actually exist
    warning_lines = (
        "\n⚠️  WARNING: DNA-only workflow not detected!",
        "   This notebook is designed for DNA-only pipeline output.",
        "   Expected directory structure:",
        "     - variant_calling/ (present)",
        "     - consensus/ (present)",
        "     - vcf_realignment/ (absent)",
        "     - rescue/ (absent)",
        "\n   Detected directories:",
    )
    for warning_line in warning_lines:
        print(warning_line)

    for dir_name in ('variant_calling', 'consensus', 'vcf_realignment', 'rescue'):
        marker = '✓' if (BASE_DIR / dir_name).exists() else '✗'
        print(f"     {marker} {dir_name}/")

In [None]:
# Discover the VCF files that belong to the DNA-only workflow
print("\n" + "=" * 80)
print("VCF FILE DISCOVERY")
print("=" * 80)

# Discovery is driven by the workflow manager created in the detection cell
discovery = VCFFileDiscovery(BASE_DIR, workflow_manager=workflow_manager)

# Results for every workflow, keyed by workflow name
all_workflow_vcfs = discovery.discover_all_workflows()

if 'dna_only' in all_workflow_vcfs:
    dna_only_vcfs = all_workflow_vcfs['dna_only']

    print("\nDiscovered VCF files:")
    total_files = 0
    for stage_name, stage_files in dna_only_vcfs.items():
        if not stage_files:
            print(f"\n  {stage_name.upper()}: No files found")
            continue

        print(f"\n  {stage_name.upper()} ({len(stage_files)} files):")
        total_files += len(stage_files)
        for vcf_id, vcf_meta in stage_files.items():
            if not isinstance(vcf_meta, dict):
                # Non-dict entries carry no tool/sample fields to show
                print(f"    - {vcf_id}")
                continue
            tool_name = vcf_meta.get('tool', 'unknown')
            sample_name = vcf_meta.get('sample', 'unknown')
            print(f"    - {vcf_id}: {tool_name} | {sample_name}")

    print(f"\n✅ Total VCF files discovered: {total_files}")
else:
    # Later cells read dna_only_vcfs, so it is always defined
    print("\n⚠️  No DNA-only VCF files discovered.")
    print("   Please check your directory structure.")
    dna_only_vcfs = {}

## 3. Per-Caller Statistics (variant_calling stage)

Analyze variant statistics from each caller at the variant_calling stage:
- **Variant counts by type**: SNP, INDEL, OTHER
- **Classification distribution**: Somatic, Germline, Reference, Artifact

In [None]:
# Compute statistics for the DNA-only workflow and show the variant_calling rows
print("=" * 80)
print("PER-CALLER STATISTICS (variant_calling stage)")
print("=" * 80)

# Entries discovered for the variant_calling stage (empty dict when absent)
variant_calling_vcfs = dna_only_vcfs.get('variant_calling', {})

if not variant_calling_vcfs:
    print("\n⚠️  No variant_calling VCFs found. Skipping per-caller statistics.")
    per_caller_stats_df = None
else:
    aggregator = StatisticsAggregator({}, workflow_type="dna_only")

    # Merge the per-stage mappings into one {file_id: metadata} dict;
    # a file_id that appears in several stages keeps its last entry.
    all_vcf_files = {
        vcf_id: vcf_meta
        for stage_files in dna_only_vcfs.values()
        for vcf_id, vcf_meta in stage_files.items()
    }

    # Statistics for every stage; later cells filter this frame by 'Stage'
    per_caller_stats_df = aggregator.aggregate_dna_only_stats(all_vcf_files)

    if per_caller_stats_df.empty:
        print("\n⚠️  Statistics aggregation returned empty results.")
    else:
        is_variant_calling = per_caller_stats_df['Stage'] == 'variant_calling'
        variant_calling_df = per_caller_stats_df[is_variant_calling].copy()

        if variant_calling_df.empty:
            print("\n⚠️  No variant_calling statistics computed.")
        else:
            print("\n📊 Per-Caller Variant Statistics:")
            print()

            # Show only the summary columns that are present in the frame
            wanted_cols = ['Tool', 'Sample', 'Total_Variants', 'SNP', 'INDEL', 'Somatic', 'Germline', 'Reference', 'Artifact']
            present_cols = [col for col in wanted_cols if col in variant_calling_df.columns]
            display(variant_calling_df[present_cols])

In [None]:
# Visualize per-caller statistics: one grouped bar chart of variant types and
# one stacked bar chart of classifications, both restricted to the
# variant_calling stage.
has_stats = per_caller_stats_df is not None and not per_caller_stats_df.empty
if has_stats:
    variant_calling_df = per_caller_stats_df[per_caller_stats_df['Stage'] == 'variant_calling'].copy()

# Report the "no data" case both when no statistics exist at all and when the
# statistics contain no variant_calling rows (previously the latter was silent).
if not has_stats or variant_calling_df.empty:
    print("⚠️  No data available for visualization.")
else:
    # Variant type distribution: one trace per type column present in the frame
    fig_variant_types = go.Figure()
    for var_type, color in [('SNP', '#2ecc71'), ('INDEL', '#e74c3c'), ('OTHER', '#95a5a6')]:
        if var_type in variant_calling_df.columns:
            fig_variant_types.add_trace(go.Bar(
                name=var_type,
                x=variant_calling_df['Tool'],
                y=variant_calling_df[var_type],
                marker_color=color
            ))

    fig_variant_types.update_layout(
        title='Variant Type Distribution by Caller',
        xaxis_title='Caller',
        yaxis_title='Variant Count',
        barmode='group',
        height=400
    )
    fig_variant_types.show()

    # Classification distribution: one trace per category column present;
    # colours come from CATEGORY_COLORS with a grey fallback.
    fig_classification = go.Figure()
    for category in ['Somatic', 'Germline', 'Reference', 'Artifact']:
        if category in variant_calling_df.columns:
            fig_classification.add_trace(go.Bar(
                name=category,
                x=variant_calling_df['Tool'],
                y=variant_calling_df[category],
                marker_color=CATEGORY_COLORS.get(category, '#7f8c8d')
            ))

    fig_classification.update_layout(
        title='Classification Distribution by Caller',
        xaxis_title='Caller',
        yaxis_title='Variant Count',
        barmode='stack',
        height=400
    )
    fig_classification.show()

## 4. Consensus Statistics

Analyze the consensus VCF that combines variants from multiple callers.

In [None]:
# Show the consensus-stage rows of the aggregated statistics, with a chart
print("=" * 80)
print("CONSENSUS STATISTICS")
print("=" * 80)

# Entries discovered for the consensus stage (empty dict when absent)
consensus_vcfs = dna_only_vcfs.get('consensus', {})

# consensus_stats_df ends up as None on every path that has nothing to show
if not consensus_vcfs:
    print("\n⚠️  No consensus VCFs found. Skipping consensus statistics.")
    consensus_stats_df = None
elif per_caller_stats_df is None or per_caller_stats_df.empty:
    print("\n⚠️  No statistics available for consensus stage.")
    consensus_stats_df = None
else:
    is_consensus = per_caller_stats_df['Stage'] == 'consensus'
    consensus_stats_df = per_caller_stats_df[is_consensus].copy()

    if consensus_stats_df.empty:
        print("\n⚠️  No consensus statistics computed.")
        consensus_stats_df = None
    else:
        print("\n📊 Consensus Variant Statistics:")
        print()

        # Show only the summary columns that are present in the frame
        wanted_cols = ['Tool', 'Sample', 'Total_Variants', 'SNP', 'INDEL', 'Somatic', 'Germline', 'Reference', 'Artifact']
        present_cols = [col for col in wanted_cols if col in consensus_stats_df.columns]
        display(consensus_stats_df[present_cols])

        # Stacked bars per sample, one trace per category column present
        fig_consensus = go.Figure()
        consensus_categories = ['Somatic', 'Germline', 'Reference', 'Artifact', 'NoConsensus']
        for category in consensus_categories:
            if category not in consensus_stats_df.columns:
                continue
            fig_consensus.add_trace(go.Bar(
                name=category,
                x=consensus_stats_df['Sample'],
                y=consensus_stats_df[category],
                marker_color=CATEGORY_COLORS.get(category, '#7f8c8d')
            ))

        fig_consensus.update_layout(
            title='Consensus Classification Distribution',
            xaxis_title='Sample',
            yaxis_title='Variant Count',
            barmode='stack',
            height=400
        )
        fig_consensus.show()

## 5. 3-Way Venn Diagram Comparison

Compare variant calls across DNA callers (Strelka, DeepSomatic, Mutect2) using Venn diagrams.

This shows:
- Variants unique to each caller
- Variants shared between pairs of callers
- Variants called by all three callers (high confidence)

In [None]:
# Compare the DNA callers at the variant_calling stage and show the diagrams
print("=" * 80)
print("3-WAY VENN DIAGRAM COMPARISON")
print("=" * 80)

# venn_summary / venn_figures are always defined afterwards; the export and
# summary cells read venn_figures.
if dna_only_vcfs.get('variant_calling', {}):
    caller_comparator = CallerComparator(all_workflow_vcfs)

    try:
        venn_summary, venn_figures = caller_comparator.compare_dna_callers(
            workflow="dna_only",
            stage="variant_calling"
        )

        if venn_summary is None:
            print("\n⚠️  Could not generate Venn diagram comparison.")
            print("   Need at least 2 DNA callers for comparison.")
            venn_figures = {}
        else:
            print("\n📊 Caller Comparison Summary:")
            display(venn_summary)

            print("\n📈 Venn Diagrams:")
            for diagram_name, diagram in venn_figures.items():
                print(f"\n  {diagram_name}:")
                diagram.show()

    except Exception as exc:
        # Any failure (comparison or rendering) resets both results
        print(f"\n⚠️  Error generating Venn diagrams: {exc}")
        venn_summary = None
        venn_figures = {}
else:
    print("\n⚠️  No variant_calling VCFs found. Skipping Venn diagram comparison.")
    venn_summary = None
    venn_figures = {}

## 6. Truth Set Comparison

Compare variant calls against a truth set to calculate:
- **Precision**: TP / (TP + FP) - proportion of called variants that are true
- **Recall**: TP / (TP + FN) - proportion of true variants that were called
- **F1 Score**: 2 × Precision × Recall / (Precision + Recall) - harmonic mean of precision and recall

**Note**: This section requires either:
1. Pre-computed som.py metrics files, OR
2. Truth VCF and bcftools installed for direct comparison

In [None]:
# Truth set comparison: fill truth_results from pre-computed som.py metrics
# when available, otherwise by comparing each variant_calling VCF directly.
print("=" * 80)
print("TRUTH SET COMPARISON")
print("=" * 80)

# Maps a caller label to its comparison result; read by the display, export
# and summary cells.
truth_results = {}

# Check if truth comparison is configured
if TRUTH_VCF is None and SOM_PY_METRICS_DIR is None:
    print("\n⚠️  Truth set comparison not configured.")
    print("   To enable, set one of the following in the configuration cell:")
    print("   - TRUTH_VCF: Path to truth VCF file")
    print("   - SOM_PY_METRICS_DIR: Path to som.py metrics directory")
else:
    # Initialize comparator
    comparator = TruthSetComparator(
        truth_vcf=TRUTH_VCF,
        high_confidence_bed=HIGH_CONFIDENCE_BED
    )

    # Try to load som.py metrics if available
    if SOM_PY_METRICS_DIR is not None:
        som_py_dir = Path(SOM_PY_METRICS_DIR)
        if som_py_dir.exists():
            print("\n📊 Loading som.py metrics...")

            # Candidate metrics file names per caller, in order of preference
            caller_metrics_map = {
                'deepsomatic': ['DS.metrics.json', 'deepsomatic.metrics.json'],
                'mutect2': ['M2.metrics.json', 'mutect2.metrics.json'],
                'strelka': ['S2.metrics.json', 'strelka.metrics.json', 'strelka2.metrics.json']
            }

            for caller, possible_files in caller_metrics_map.items():
                # Only the first candidate that exists is used, even if
                # loading it fails.
                metrics_path = next(
                    (som_py_dir / name for name in possible_files if (som_py_dir / name).exists()),
                    None,
                )
                if metrics_path is None:
                    continue
                try:
                    truth_results[caller] = comparator.compare_from_som_py_metrics(metrics_path)
                    print(f"   ✓ Loaded metrics for {caller}")
                except Exception as e:
                    print(f"   ⚠️  Error loading {caller} metrics: {e}")
        else:
            print(f"\n⚠️  som.py metrics directory not found: {som_py_dir}")

    # If no som.py metrics, try direct comparison with bcftools
    if not truth_results and TRUTH_VCF is not None:
        print("\n📊 Running bcftools isec comparison...")

        # Get variant_calling VCFs for comparison
        variant_calling_vcfs = dna_only_vcfs.get('variant_calling', {})

        for file_id, metadata in variant_calling_vcfs.items():
            if isinstance(metadata, dict):
                vcf_path = metadata.get('path')
                caller = metadata.get('tool', 'unknown')
            else:
                # Non-dict entries are treated as the path itself; the caller
                # label is the part of file_id before the first underscore
                # (the whole file_id when it has none).
                vcf_path = metadata
                caller = file_id.split('_')[0]

            # Report skipped entries instead of dropping them silently
            if not vcf_path or not Path(vcf_path).exists():
                print(f"   ⚠️  Skipping {file_id}: VCF not found ({vcf_path})")
                continue

            # Several VCFs can share one caller label (including 'unknown');
            # later ones get a file-specific key so no result is overwritten.
            result_key = caller if caller not in truth_results else f"{caller} ({file_id})"

            try:
                truth_results[result_key] = comparator.compare(Path(vcf_path))
                print(f"   ✓ Compared {result_key}")
            except Exception as e:
                print(f"   ⚠️  Error comparing {result_key}: {e}")

    # Display results
    if truth_results:
        print("\n" + "=" * 60)
        print("TRUTH SET COMPARISON RESULTS")
        print("=" * 60)
    else:
        print("\n⚠️  No truth set comparison results available.")

In [None]:
# Display truth comparison results as a table and a grouped bar chart
if truth_results:
    # Table rows: metrics are formatted to 4 decimals for display only
    truth_rows = [
        {
            'Caller': caller.capitalize(),
            'TP': result.tp,
            'FP': result.fp,
            'FN': result.fn,
            'Precision': f"{result.precision:.4f}",
            'Recall': f"{result.recall:.4f}",
            'F1 Score': f"{result.f1_score:.4f}",
            'Source': result.source
        }
        for caller, result in truth_results.items()
    ]

    truth_df = pd.DataFrame(truth_rows)

    print("\n📊 Precision, Recall, F1 Metrics:")
    print()

    # Rendered with the notebook's default DataFrame display
    display(truth_df)

    # Create visualization
    fig_metrics = go.Figure()

    callers = [row['Caller'] for row in truth_rows]

    # Plot the unrounded metric values taken from the results. Parsing the
    # 4-decimal table strings back to floats would round twice before the
    # 3-decimal bar labels are produced.
    for metric, attr, color in [
        ('Precision', 'precision', '#3498db'),
        ('Recall', 'recall', '#2ecc71'),
        ('F1 Score', 'f1_score', '#9b59b6'),
    ]:
        values = [float(getattr(result, attr)) for result in truth_results.values()]
        fig_metrics.add_trace(go.Bar(
            name=metric,
            x=callers,
            y=values,
            marker_color=color,
            text=[f"{v:.3f}" for v in values],
            textposition='outside'
        ))

    fig_metrics.update_layout(
        title='Truth Set Comparison Metrics by Caller',
        xaxis_title='Caller',
        yaxis_title='Score',
        yaxis_range=[0, 1.1],  # headroom above 1.0 for the outside labels
        barmode='group',
        height=450
    )
    fig_metrics.show()
else:
    print("\n⚠️  No truth comparison results to display.")

## 7. Export Results

Export the statistics and truth-comparison metrics to CSV and the Venn diagrams to HTML for sharing and further analysis.

In [None]:
# Export statistics (CSV), truth comparison metrics (CSV) and Venn diagrams
# (single HTML page) into OUTPUT_DIR.
from html import escape

print("=" * 80)
print("EXPORT RESULTS")
print("=" * 80)

# (label, path) pairs of everything written; also read by the summary cell
export_files = []

# Export per-caller statistics to CSV
if per_caller_stats_df is not None and not per_caller_stats_df.empty:
    stats_csv_path = OUTPUT_DIR / f"{SAMPLE_ID}_dna_only_statistics.csv"
    per_caller_stats_df.to_csv(stats_csv_path, index=False)
    export_files.append(('Statistics CSV', stats_csv_path))
    print(f"\n✓ Exported statistics to: {stats_csv_path}")

# Export truth comparison results to CSV (raw, unformatted metric values)
if truth_results:
    truth_rows = [
        {
            'Caller': caller,
            'TP': result.tp,
            'FP': result.fp,
            'FN': result.fn,
            'Precision': result.precision,
            'Recall': result.recall,
            'F1_Score': result.f1_score,
            'Query_Total': result.query_total,
            'Truth_Total': result.truth_total,
            'Source': result.source
        }
        for caller, result in truth_results.items()
    ]

    truth_csv_path = OUTPUT_DIR / f"{SAMPLE_ID}_truth_comparison.csv"
    pd.DataFrame(truth_rows).to_csv(truth_csv_path, index=False)
    export_files.append(('Truth Comparison CSV', truth_csv_path))
    print(f"✓ Exported truth comparison to: {truth_csv_path}")

# Export Venn diagrams to HTML
if venn_figures:
    venn_html_path = OUTPUT_DIR / f"{SAMPLE_ID}_venn_diagrams.html"

    # Combine all figures into a single HTML page. Names are escaped before
    # being placed in markup, and the charset is declared to match the UTF-8
    # encoding used when writing the file.
    html_parts = [
        '<html><head><meta charset="utf-8"><title>Venn Diagrams</title></head><body>',
        f"<h1>DNA Caller Comparison - {escape(str(SAMPLE_ID))}</h1>",
    ]

    for position, (fig_name, fig) in enumerate(venn_figures.items()):
        html_parts.append(f"<h2>{escape(str(fig_name))}</h2>")
        # The page only needs the plotly.js script tag once: the first figure
        # references the CDN, the remaining ones reuse it.
        html_parts.append(
            fig.to_html(full_html=False, include_plotlyjs='cdn' if position == 0 else False)
        )

    html_parts.append("</body></html>")
    html_content = "".join(html_parts)

    # Explicit UTF-8 so the output does not depend on the platform default
    venn_html_path.write_text(html_content, encoding="utf-8")

    export_files.append(('Venn Diagrams HTML', venn_html_path))
    print(f"✓ Exported Venn diagrams to: {venn_html_path}")

# Summary
print("\n" + "=" * 60)
print("EXPORT SUMMARY")
print("=" * 60)

if export_files:
    print(f"\n✅ Exported {len(export_files)} file(s) to: {OUTPUT_DIR}")
    for name, path in export_files:
        print(f"   - {name}: {path.name}")
else:
    print("\n⚠️  No files exported. Run analysis cells first.")

## 8. Analysis Summary

Summary of the DNA-only VCF statistics analysis.

In [None]:
# Recap of everything the earlier cells produced
rule = "=" * 80
print(rule)
print("ANALYSIS SUMMARY")
print(rule)

print(f"\n📋 Sample: {SAMPLE_ID}")
print(f"📁 Base Directory: {BASE_DIR}")
print(f"📂 Output Directory: {OUTPUT_DIR}")

# Workflow detection
print("\n🔍 Workflow Detection:")
dna_only_detected = WorkflowType.DNA_ONLY in detected_workflows
print("   ✓ DNA-only workflow detected" if dna_only_detected else "   ⚠️  DNA-only workflow not detected")

# File discovery: number of entries per stage
print("\n📄 VCF Files Discovered:")
for stage_name, stage_files in dna_only_vcfs.items():
    print(f"   - {stage_name}: {len(stage_files)} file(s)")

# Statistics: row counts per stage in the aggregated frame
print("\n📊 Statistics:")
if per_caller_stats_df is None or per_caller_stats_df.empty:
    print("   - No statistics computed")
else:
    stage_column = per_caller_stats_df['Stage']
    variant_calling_count = int((stage_column == 'variant_calling').sum())
    consensus_count = int((stage_column == 'consensus').sum())
    print(f"   - Per-caller statistics: {variant_calling_count} caller(s)")
    print(f"   - Consensus statistics: {consensus_count} sample(s)")

# Venn diagrams
print("\n📈 Venn Diagrams:")
if not venn_figures:
    print("   - No Venn diagrams generated")
else:
    print(f"   - Generated {len(venn_figures)} Venn diagram(s)")

# Truth comparison: one F1 line per compared caller
print("\n✅ Truth Set Comparison:")
if not truth_results:
    print("   - Not configured or no results")
else:
    print(f"   - Compared {len(truth_results)} caller(s)")
    for caller_label, caller_result in truth_results.items():
        print(f"     • {caller_label}: F1={caller_result.f1_score:.4f}")

# Exports: labels only; the paths were printed by the export cell
print("\n📁 Exported Files:")
if not export_files:
    print("   - No files exported")
else:
    for export_label, _export_path in export_files:
        print(f"   - {export_label}")

print("\n" + rule)
print("Analysis complete!")
print(rule)