In [33]:
# Setup and imports
import sys
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set project root.
# The location can be overridden with the BIOINFO_PROJECT_ROOT environment
# variable so the notebook is runnable on other machines; the original
# absolute path is kept as the default so existing behaviour is unchanged.
project_root = Path(
    os.environ.get("BIOINFO_PROJECT_ROOT", "/home/Plutonium/Documents/BioinfoMidterm")
).expanduser().absolute()  # absolute: later paths must survive the chdir below

if not project_root.is_dir():
    raise FileNotFoundError(
        f"Project root not found: {project_root} "
        "(set BIOINFO_PROJECT_ROOT to point at your checkout)"
    )

os.chdir(project_root)

# Make the project's scripts importable; guard so re-running this cell does
# not keep prepending duplicate entries to sys.path.
scripts_dir = str(project_root / "scripts")
if scripts_dir not in sys.path:
    sys.path.insert(0, scripts_dir)

print(f"Working directory: {os.getcwd()}")

# Core imports
import pandas as pd
import numpy as np

# Part 2 imports
from part2.rsid_utils import (
    load_rsid_list,
    rsid_to_bed,
    batch_rsid_lookup
)

# Project imports
from config import PATHS

print("All imports successful!")

Working directory: /home/Plutonium/Documents/BioinfoMidterm
All imports successful!


## Define Paths

In [34]:
# Input/Output directories
KNOWN_AISNPS_DIR = project_root / "data" / "known_aisnps"
OUTPUT_DIR = project_root / "output" / "part2"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Known AISNPs directory: {KNOWN_AISNPS_DIR}")
print(f"Output directory: {OUTPUT_DIR}")

# List available rsID files.
# rsid_files is always bound (possibly empty) so later references never hit a
# NameError, and it is sorted because glob() order is not guaranteed.
rsid_files = []
if KNOWN_AISNPS_DIR.is_dir():
    rsid_files = sorted(
        path
        for pattern in ("*.csv", "*.txt")
        for path in KNOWN_AISNPS_DIR.glob(pattern)
    )

# A missing directory and an existing-but-empty directory both mean there is
# nothing to process, so both get the same guidance message.
if rsid_files:
    print(f"\nFound {len(rsid_files)} rsID files:")
    for f in rsid_files:
        print(f"  - {f.name}")
else:
    print("\nNo rsID files found. Please add files to data/known_aisnps/")

Known AISNPs directory: /home/Plutonium/Documents/BioinfoMidterm/data/known_aisnps
Output directory: /home/Plutonium/Documents/BioinfoMidterm/output/part2

Found 1 rsID files:
  - forenseq_ancestry.csv


## Step 1: Define Known AISNP Sources

Add your rsID sources here. Each source should have:
- `name`: Short identifier
- `file`: Path to rsID file
- `description`: Source description

In [35]:
# Registry of known AISNP panels.
# Each entry carries a short `name`, the `file` holding its rsIDs, and a
# human-readable `description`. Uncomment / add entries as files become available.
SOURCES = [
    # dict(
    #     name='kidd_55',
    #     file=KNOWN_AISNPS_DIR / 'kidd_55_aisnps.csv',
    #     description='Kidd et al. 55 AISNPs panel',
    # ),
    # dict(
    #     name='precision_id',
    #     file=KNOWN_AISNPS_DIR / 'precision_id_ancestry.csv',
    #     description='ThermoFisher Precision ID Ancestry Panel',
    # ),
    dict(
        name='forenseq',
        file=KNOWN_AISNPS_DIR / 'forenseq_ancestry.csv',
        description='Verogen ForenSeq Ancestry SNPs',
    ),
    # Add more sources as needed
]

# Keep only the sources whose rsID file is actually present on disk,
# reporting the status of every configured source along the way.
available_sources = []
for source in SOURCES:
    if not source['file'].exists():
        print(f"✗ {source['name']}: File not found - {source['file']}")
        continue
    print(f"✓ {source['name']}: {source['file'].name}")
    available_sources.append(source)

print(f"\nAvailable sources: {len(available_sources)}/{len(SOURCES)}")

✓ forenseq: forenseq_ancestry.csv

Available sources: 1/1


## Step 2: Load and Preview rsID Lists

In [36]:
# Read the rsID list of every available source into rsid_data (keyed by
# source name) and show a short preview of each list.
rsid_data = {}

for source in available_sources:
    # Header banner for this source
    print(
        f"\n{'='*60}\n"
        f"Source: {source['name']}\n"
        f"Description: {source['description']}\n"
        f"{'='*60}"
    )

    rsids = load_rsid_list(source['file'])
    rsid_data[source['name']] = rsids

    # Preview the head and tail of the list
    print(
        f"First 10 rsIDs: {rsids[:10]}",
        f"Last 5 rsIDs: {rsids[-5:]}",
        sep="\n",
    )


Source: forenseq
Description: Verogen ForenSeq Ancestry SNPs
Loaded 56 rsIDs from /home/Plutonium/Documents/BioinfoMidterm/data/known_aisnps/forenseq_ancestry.csv
First 10 rsIDs: ['rs2814778', 'rs3737576', 'rs7554936', 'rs10497191', 'rs1834619', 'rs1876482', 'rs260690', 'rs3827760', 'rs6754311', 'rs798443']
Last 5 rsIDs: ['rs4891825', 'rs7226659', 'rs7251928', 'rs310644', 'rs2024566']


## Step 3: Convert rsIDs to BED Format

Uses the Ensembl REST API to look up genomic coordinates for each rsID.
Results are cached to avoid repeated API calls.

In [37]:
# Resolve every source's rsIDs to coordinates and write one BED file per source.
# GRCh37 (hg19) is used to match 1000 Genomes Phase 3.
ASSEMBLY = 'GRCh37'

# Maps source name -> value returned by rsid_to_bed for that source
bed_files = {}

for source in available_sources:
    name = source['name']
    print(f"\n{'='*60}\nProcessing: {name}\n{'='*60}")

    output_bed = str(OUTPUT_DIR / f"{name}.bed")
    conversion_args = dict(
        rsids=rsid_data[name],
        output_path=output_bed,
        assembly=ASSEMBLY,
        source_name=name,
    )
    bed_path = rsid_to_bed(**conversion_args)
    bed_files[name] = bed_path

# Recap of everything that was written
print("\n\nGenerated BED files:")
for name, path in bed_files.items():
    print(f"  - {name}: {path}")


Processing: forenseq
Looking up 56 rsIDs...
Loaded 56 cached entries


Looking up rsIDs: 100%|██████████| 56/56 [00:00<00:00, 599186.29it/s]

Cache saved to /home/Plutonium/Documents/BioinfoMidterm/output/part2/forenseq_rsid_cache.csv
Successfully resolved 56/56 rsIDs
BED file saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/forenseq.bed
Annotated CSV saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/forenseq_annotated.csv


Generated BED files:
  - forenseq: /home/Plutonium/Documents/BioinfoMidterm/output/part2/forenseq.bed





## Step 4: Verify BED Files

In [38]:
# Verify and summarize BED files: for each source, compare the number of
# rows in the generated BED file with the number of rsIDs that were requested.
summary = []

for name, bed_path in bed_files.items():
    # Load BED. Chromosome is forced to str so that unprefixed names
    # ("1", "2", ...) are not silently parsed as integers.
    bed_df = pd.read_csv(bed_path, sep='\t', header=None,
                         names=['chr', 'start', 'end', 'rsid'],
                         dtype={'chr': str})

    n_total = len(rsid_data[name])
    n_resolved = len(bed_df)
    # An empty source list would otherwise raise ZeroDivisionError
    success_pct = n_resolved / n_total * 100 if n_total else 0.0

    summary.append({
        'Source': name,
        'Total_rsIDs': n_total,
        'Resolved': n_resolved,
        'Success_Rate': f"{success_pct:.1f}%",
        'Chromosomes': bed_df['chr'].nunique(),
        'BED_File': bed_path
    })

    print(f"\n{name}:")
    print(f"  Resolved: {n_resolved}/{n_total}")
    print(f"  Chromosome distribution:")
    # Only the five most frequent chromosomes are shown
    print(bed_df['chr'].value_counts().head(5).to_string())

summary_df = pd.DataFrame(summary)
print("\n" + "="*70)
print("SUMMARY")
print("="*70)
display(summary_df)


forenseq:
  Resolved: 56/56
  Chromosome distribution:
chr
chr2     7
chr13    5
chr15    5
chr17    5
chr18    4

SUMMARY


Unnamed: 0,Source,Total_rsIDs,Resolved,Success_Rate,Chromosomes,BED_File
0,forenseq,56,56,100.0%,22,/home/Plutonium/Documents/BioinfoMidterm/outpu...


## Step 5: Check Overlap with Our Dataset

In [39]:
# Check which panel SNPs are present in our dataset by matching on
# (chromosome, 1-based position) rather than on rsID.
# NOTE: the comparison is against the LD-pruned variant set, so panel SNPs
# that are absent from that file for any reason count as "not available".
pvar_file = str(PATHS.PLINK_LD_PRUNED) + '.pvar'

if Path(pvar_file).exists():
    # Load our SNPs with coordinates. Lines starting with '#' (header/meta)
    # are skipped; the first three tab-separated fields are read as
    # chromosome, position and variant ID.
    our_snps = []
    with open(pvar_file) as f:
        for line in f:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 3:
                chrom = fields[0]
                pos = int(fields[1])
                rsid = fields[2]
                our_snps.append({'chr': chrom, 'pos': pos, 'rsid': rsid})

    # Explicit columns keep the frame well-formed even if no rows were parsed
    our_snps_df = pd.DataFrame(our_snps, columns=['chr', 'pos', 'rsid'])

    # Create coordinate set for fast lookup (chr, pos). A leading "chr" is
    # stripped so the comparison works whichever naming style the pvar uses.
    our_chr_clean = our_snps_df['chr'].astype(str).str.replace(r'^chr', '', regex=True)
    our_coords = set(zip(our_chr_clean, our_snps_df['pos']))

    print(f"Our dataset has {len(our_snps_df)} SNPs")

    # Check overlap for each source by coordinates
    print(f"\nOverlap with known AISNP panels (by chr:pos coordinates):")
    print("="*70)

    for name, bed_path in bed_files.items():
        # Load BED file (chromosome kept as str, see normalisation below)
        bed_df = pd.read_csv(bed_path, sep='\t', header=None,
                             names=['chr', 'start', 'end', 'rsid'],
                             dtype={'chr': str})

        # BED uses 0-based start, so end position = 1-based position.
        # Match using chromosome and end position (1-based coordinate).
        # Only a leading "chr" is removed, mirroring the dataset side.
        bed_df['chr_clean'] = bed_df['chr'].str.replace(r'^chr', '', regex=True)
        bed_keys = list(zip(bed_df['chr_clean'], bed_df['end']))
        bed_coords = set(bed_keys)

        overlap_coords = bed_coords & our_coords

        # One boolean per BED row; reused for both the rsID list and the
        # saved file so the two can never disagree.
        in_our_data = pd.Series(
            [key in our_coords for key in bed_keys],
            index=bed_df.index,
            dtype=bool,
        )
        overlapping_rsids = bed_df.loc[in_our_data, 'rsid'].tolist()

        overlap_pct = len(overlap_coords) / len(bed_df) * 100 if len(bed_df) else 0.0

        print(f"\n{name}:")
        print(f"  Total in panel: {len(bed_df)}")
        print(f"  Available in our data: {len(overlap_coords)} ({overlap_pct:.1f}%)")
        print(f"  Overlapping rsIDs: {overlapping_rsids[:10]}{'...' if len(overlapping_rsids) > 10 else ''}")

        # Save overlapping SNPs for downstream use
        if overlapping_rsids:
            overlap_df = bed_df.loc[in_our_data, ['chr', 'start', 'end', 'rsid']]
            overlap_path = str(OUTPUT_DIR / f"{name}_overlap.bed")
            overlap_df.to_csv(overlap_path, sep='\t', header=False, index=False)
            print(f"  Saved overlap BED: {overlap_path}")
else:
    print(f"pvar file not found: {pvar_file}")
    print("Run Part 1 notebooks first to generate LD-pruned pfile.")

Our dataset has 390615 SNPs

Overlap with known AISNP panels (by chr:pos coordinates):

forenseq:
  Total in panel: 56
  Available in our data: 4 (7.1%)
  Overlapping rsIDs: ['rs9522149', 'rs11652805', 'rs1834619', 'rs16891982']
  Saved overlap BED: /home/Plutonium/Documents/BioinfoMidterm/output/part2/forenseq_overlap.bed


## Step 6: Save Summary

In [40]:
# Persist the per-source conversion summary table
summary_path = str(OUTPUT_DIR / "rsid_conversion_summary.csv")
summary_df.to_csv(summary_path, index=False)
print(f"Summary saved: {summary_path}")

# Flatten every source's rsID list into one long (rsid, source) table
all_rsids = []
for name, rsids in rsid_data.items():
    all_rsids.extend({'rsid': rsid, 'source': name} for rsid in rsids)

all_rsids_path = str(OUTPUT_DIR / "all_known_aisnps.csv")
all_rsids_df = pd.DataFrame(all_rsids)
all_rsids_df.to_csv(all_rsids_path, index=False)
print(f"Combined rsIDs saved: {all_rsids_path}")

Summary saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/rsid_conversion_summary.csv
Combined rsIDs saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/all_known_aisnps.csv


## Summary

In [41]:
# Final recap: sources processed, assembly used, files written, next notebooks.
banner = "=" * 70
print(banner, "rsID TO BED CONVERSION SUMMARY", banner, sep="\n")

print(f"\nSources Processed: {len(available_sources)}")
for source in available_sources:
    print(f"  - {source['name']}: {source['description']}")

print(f"\nGenome Assembly: {ASSEMBLY}")

# Three artefacts are listed per converted source
print("\nOutput Files:")
for name in bed_files:
    print(
        f"  - {name}.bed",
        f"  - {name}_annotated.csv",
        f"  - {name}_rsid_cache.csv (API cache)",
        sep="\n",
    )

print(
    "\nNext Steps:",
    "  1. Run 07_bed_to_ml_matrix.ipynb to create ML matrices",
    "  2. Run 08_known_aisnps_ml.ipynb for model comparison",
    sep="\n",
)

rsID TO BED CONVERSION SUMMARY

Sources Processed: 1
  - forenseq: Verogen ForenSeq Ancestry SNPs

Genome Assembly: GRCh37

Output Files:
  - forenseq.bed
  - forenseq_annotated.csv
  - forenseq_rsid_cache.csv (API cache)

Next Steps:
  1. Run 07_bed_to_ml_matrix.ipynb to create ML matrices
  2. Run 08_known_aisnps_ml.ipynb for model comparison
