In [2]:
# Setup and imports
import sys
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set project root
project_root = Path("/home/Plutonium/Documents/BioinfoMidterm")
os.chdir(project_root)
sys.path.insert(0, str(project_root / "scripts"))

print(f"Working directory: {os.getcwd()}")

# Core imports
import pandas as pd
import numpy as np
import subprocess

# Project imports
from config import PATHS

print("All imports successful!")

Working directory: /home/Plutonium/Documents/BioinfoMidterm
All imports successful!


## Define Paths

In [3]:
# Paths
PART2_DIR = project_root / "output" / "part2"
PART2_DIR.mkdir(parents=True, exist_ok=True)

# Input VCF - use the unified SNP-only biallelic VCF
INPUT_VCF = project_root / "1000genomes" / "main_vcf" / "main_vcf.vcf.gz"
SAMPLES_CSV = str(PATHS.EAS_SAMPLES_CSV)

print(f"Input VCF: {INPUT_VCF}")
print(f"VCF exists: {INPUT_VCF.exists()}")
print(f"Samples CSV: {SAMPLES_CSV}")
print(f"Part 2 directory: {PART2_DIR}")

# List available BED files
bed_files = list(PART2_DIR.glob("*.bed"))
# Exclude overlap BED files
bed_files = [f for f in bed_files if '_overlap' not in f.stem]
print(f"\nFound {len(bed_files)} BED files:")
for f in bed_files:
    print(f"  - {f.name}")

Input VCF: /home/Plutonium/Documents/BioinfoMidterm/1000genomes/main_vcf/main_vcf.vcf.gz
VCF exists: True
Samples CSV: 1000genomes/EAS_subpopulation_samples.csv
Part 2 directory: /home/Plutonium/Documents/BioinfoMidterm/output/part2

Found 1 BED files:
  - forenseq.bed


## Step 1: Load Sample List (EAS subpopulations)

In [4]:
# Load sample info (CSV has no header: sample, pop, super_pop)
samples_df = pd.read_csv(SAMPLES_CSV, header=None, names=['sample', 'pop', 'super_pop'])
print(f"Total samples: {len(samples_df)}")
print(f"Population distribution:")
print(samples_df['pop'].value_counts())

# Create sample list file for bcftools
sample_list_file = str(PART2_DIR / "eas_samples.txt")
samples_df['sample'].to_csv(sample_list_file, index=False, header=False)
print(f"\nSample list saved: {sample_list_file}")

Total samples: 306
Population distribution:
pop
JPT    104
CHB    103
KHV     99
Name: count, dtype: int64

Sample list saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/eas_samples.txt


## Step 2: Extract VCF for Each AISNP Panel using bcftools

In [6]:
# Extract VCF for each BED file using bcftools
extracted_vcfs = {}

for bed_file in bed_files:
    source_name = bed_file.stem
    print(f"\n{'='*70}")
    print(f"Processing: {source_name}")
    print(f"{'='*70}")
    
    output_vcf = str(PART2_DIR / f"{source_name}_extracted.vcf")
    
    # bcftools view:
    # -R: regions from BED file
    # -S: samples to include
    # -O v: output VCF format
    cmd = [
        "bcftools", "view",
        "-R", str(bed_file),
        "-S", sample_list_file,
        "-O", "v",
        "-o", output_vcf,
        str(INPUT_VCF)
    ]
    
    print(f"Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    
    if result.returncode != 0:
        print(f"Error: {result.stderr}")
        continue
    
    # Count variants in output
    count_cmd = ["grep", "-c", "-v", "^#", output_vcf]
    count_result = subprocess.run(count_cmd, capture_output=True, text=True)
    n_variants = int(count_result.stdout.strip()) if count_result.returncode == 0 else 0
    
    print(f"Extracted {n_variants} variants")
    extracted_vcfs[source_name] = output_vcf

print(f"\n\nExtracted VCFs for {len(extracted_vcfs)} sources")


Processing: forenseq
Running: bcftools view -R /home/Plutonium/Documents/BioinfoMidterm/output/part2/forenseq.bed -S /home/Plutonium/Documents/BioinfoMidterm/output/part2/eas_samples.txt -O v -o /home/Plutonium/Documents/BioinfoMidterm/output/part2/forenseq_extracted.vcf /home/Plutonium/Documents/BioinfoMidterm/1000genomes/main_vcf/main_vcf.vcf.gz
Error: [E::hts_open_format] Failed to open file "/home/Plutonium/Documents/BioinfoMidterm/1000genomes/main_vcf/main_vcf.vcf.gz" : No such file or directory
Failed to read from /home/Plutonium/Documents/BioinfoMidterm/1000genomes/main_vcf/main_vcf.vcf.gz: No such file or directory



Extracted VCFs for 0 sources


## Step 3: Convert VCF to Genotype Matrix

In [7]:
def vcf_to_genotype_matrix(vcf_path: str, samples_df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert VCF to genotype matrix (samples x SNPs) using bcftools.
    Genotypes encoded as 0, 1, 2 (count of ALT alleles).
    
    Uses bcftools query for speed and accuracy.
    """
    # Step 1: Get sample IDs from VCF using bcftools
    sample_cmd = ["bcftools", "query", "-l", vcf_path]
    sample_result = subprocess.run(sample_cmd, capture_output=True, text=True, check=True)
    sample_ids = sample_result.stdout.strip().split('\n')
    
    # Step 2: Extract genotypes using bcftools query
    # Format: CHROM, POS, ID, then GT for each sample
    query_cmd = [
        "bcftools", "query",
        "-f", "%CHROM\t%POS\t%ID[\t%GT]\n",
        vcf_path
    ]
    query_result = subprocess.run(query_cmd, capture_output=True, text=True, check=True)
    
    # Parse the output
    genotypes = {}
    
    for line in query_result.stdout.strip().split('\n'):
        if not line:
            continue
        
        fields = line.split('\t')
        chrom = fields[0]
        pos = fields[1]
        rsid = fields[2]
        
        # Use rsid if available, else chr:pos
        snp_id = rsid if rsid != '.' else f"{chrom}:{pos}"
        
        # Parse genotypes (starting from field 3)
        gt_values = []
        for gt in fields[3:]:
            if gt in ['./.', '.|.', '.']:
                gt_values.append(np.nan)
            else:
                # Count ALT alleles (handle both | and / separators)
                alleles = gt.replace('|', '/').split('/')
                alt_count = sum(1 for a in alleles if a != '0' and a != '.')
                gt_values.append(alt_count)
        
        genotypes[snp_id] = gt_values
    
    # Create DataFrame
    df = pd.DataFrame(genotypes, index=sample_ids)
    df.index.name = 'sample'
    df = df.reset_index()
    
    # Add population labels
    df = df.merge(samples_df[['sample', 'pop']], on='sample', how='left')
    
    # Reorder columns: sample, pop, then SNPs
    snp_cols = [c for c in df.columns if c not in ['sample', 'pop']]
    df = df[['sample', 'pop'] + snp_cols]
    
    return df

# Convert each extracted VCF to matrix
ml_matrices = {}

for source_name, vcf_path in extracted_vcfs.items():
    print(f"\n{'='*50}")
    print(f"Converting: {source_name}")
    print(f"{'='*50}")
    
    df = vcf_to_genotype_matrix(vcf_path, samples_df)
    
    n_snps = len([c for c in df.columns if c not in ['sample', 'pop']])
    print(f"Shape: {df.shape} ({n_snps} SNPs)")
    print(f"Population distribution:")
    print(df['pop'].value_counts())
    
    # Save matrix
    output_path = str(PART2_DIR / f"{source_name}_ml_matrix.csv")
    df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")
    
    ml_matrices[source_name] = df

print(f"\n\nConverted {len(ml_matrices)} sources to ML matrices")


Converting: forenseq
Shape: (306, 2) (0 SNPs)
Population distribution:
pop
JPT    104
CHB    103
KHV     99
Name: count, dtype: int64
Saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/forenseq_ml_matrix.csv


Converted 1 sources to ML matrices


## Step 4: Summary

In [8]:
# Display summary
extraction_summary = []

for source_name, df in ml_matrices.items():
    n_snps = len([c for c in df.columns if c not in ['sample', 'pop']])
    extraction_summary.append({
        'Source': source_name,
        'SNPs': n_snps,
        'Samples': len(df),
        'VCF': f"{source_name}_extracted.vcf",
        'Matrix': f"{source_name}_ml_matrix.csv"
    })

summary_df = pd.DataFrame(extraction_summary)

print("="*70)
print("EXTRACTION SUMMARY")
print("="*70)
display(summary_df)

# Save summary
summary_path = str(PART2_DIR / "extraction_summary.csv")
summary_df.to_csv(summary_path, index=False)
print(f"\nSummary saved: {summary_path}")

EXTRACTION SUMMARY


Unnamed: 0,Source,SNPs,Samples,VCF,Matrix
0,forenseq,0,306,forenseq_extracted.vcf,forenseq_ml_matrix.csv



Summary saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/extraction_summary.csv


In [9]:
# Preview each matrix
for name, df in ml_matrices.items():
    print(f"\n{'='*50}")
    print(f"{name}")
    print(f"{'='*50}")
    display(df.head(3))


forenseq


Unnamed: 0,sample,pop
0,HG01595,KHV
1,HG01596,KHV
2,HG01597,KHV


## Next Steps

In [10]:
print("="*70)
print("BED TO ML MATRIX CONVERSION COMPLETE")
print("="*70)

print(f"\nWorkflow completed:")
print(f"  1. BED files → bcftools extract → VCF (variants + samples)")
print(f"  2. VCF → Genotype matrix (samples × SNPs)")

print(f"\nML matrices created for {len(ml_matrices)} AISNP panels:")
for name in ml_matrices.keys():
    print(f"  - {name}")

print(f"\nOutput files saved to: {PART2_DIR}")

print(f"\nNext Steps:")
print(f"  Run 08_known_aisnps_ml.ipynb to train and evaluate models")

BED TO ML MATRIX CONVERSION COMPLETE

Workflow completed:
  1. BED files → bcftools extract → VCF (variants + samples)
  2. VCF → Genotype matrix (samples × SNPs)

ML matrices created for 1 AISNP panels:
  - forenseq

Output files saved to: /home/Plutonium/Documents/BioinfoMidterm/output/part2

Next Steps:
  Run 08_known_aisnps_ml.ipynb to train and evaluate models
