In [1]:
# Setup and imports
#
# Order matters in this cell: the project's ``scripts`` directory must be on
# ``sys.path`` before the ``part2`` and ``config`` imports further down.
import sys
import os
from pathlib import Path
import warnings

# NOTE(review): this silences every warning for the rest of the session;
# narrow the filter if deprecation or dtype warnings are of interest.
warnings.filterwarnings('ignore')

# Set project root
# The location can be overridden with the BIOINFO_PROJECT_ROOT environment
# variable so the notebook also runs where the checkout lives elsewhere;
# when the variable is unset the original path is used.
DEFAULT_PROJECT_ROOT = "/home/Plutonium/Documents/BioinfoMidterm"
project_root = Path(os.environ.get("BIOINFO_PROJECT_ROOT", DEFAULT_PROJECT_ROOT)).expanduser()
if not project_root.is_dir():
    # Same exception type os.chdir would raise, with an actionable message
    raise FileNotFoundError(
        f"Project root not found: {project_root}. "
        "Set BIOINFO_PROJECT_ROOT to the repository checkout."
    )
os.chdir(project_root)

# Avoid stacking duplicate entries when the cell is re-run
scripts_dir = str(project_root / "scripts")
if scripts_dir not in sys.path:
    sys.path.insert(0, scripts_dir)

print(f"Working directory: {os.getcwd()}")

# Core imports
import pandas as pd
import numpy as np

# Part 2 imports
from part2.rsid_utils import (
    load_rsid_list,
    rsid_to_bed,
    batch_rsid_lookup
)

# Project imports
from config import PATHS

print("All imports successful!")

Working directory: /home/Plutonium/Documents/BioinfoMidterm
All imports successful!


## Define Paths

In [2]:
# Input/Output directories
KNOWN_AISNPS_DIR = project_root / "data" / "known_aisnps"
OUTPUT_DIR = project_root / "output" / "part2"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Known AISNPs directory: {KNOWN_AISNPS_DIR}")
print(f"Output directory: {OUTPUT_DIR}")

# List available rsID files
# rsid_files is always bound (possibly empty), so the name exists even when
# the input directory is missing.
rsid_files = []
if KNOWN_AISNPS_DIR.is_dir():
    rsid_files = [*KNOWN_AISNPS_DIR.glob("*.csv"), *KNOWN_AISNPS_DIR.glob("*.txt")]

# A missing directory and an empty directory both mean there is nothing to
# process, so both get the same guidance message.
if rsid_files:
    print(f"\nFound {len(rsid_files)} rsID files:")
    for f in rsid_files:
        print(f"  - {f.name}")
else:
    print("\nNo rsID files found. Please add files to data/known_aisnps/")

Known AISNPs directory: /home/Plutonium/Documents/BioinfoMidterm/data/known_aisnps
Output directory: /home/Plutonium/Documents/BioinfoMidterm/output/part2

Found 6 rsID files:
  - seldin_128.csv
  - forenseq_ancestry.csv
  - cal_2016.csv
  - kiddlab_.csv
  - cal_et_al_2025.csv
  - hsiao_lin_hwa_2017.csv


## Step 1: Define Known AISNP Sources

Add your rsID sources here. Each source should have:
- `name`: Short identifier
- `file`: Path to rsID file
- `description`: Source description

In [3]:
# Known AISNP panels, one tuple per panel:
#   (short name, file name inside KNOWN_AISNPS_DIR, description)
# Append a tuple to register another panel.
_PANEL_SPECS = (
    ('kidd_55', 'kiddlab_.csv', 'Kidd et al. 55 AISNPs panel'),
    ('hsiao_lin_hwa', 'hsiao_lin_hwa_2017.csv', 'Hsiao Lin Hwa Ancestry Panel'),
    ('forenseq', 'forenseq_ancestry.csv', 'Verogen ForenSeq Ancestry SNPs'),
    ('cal_et_al', 'cal_et_al_2025.csv', 'Cal et al. AISNPs panel 2025'),
    ('seldin_128', 'seldin_128.csv', 'Seldin et al. 128 AISNPs panel'),
    # Currently disabled panels:
    # ('snipper_34', 'snipper_34.txt', 'Snipper 34 AISNPs panel'),
    # ('precision_id', 'precision_id_ancestry.csv', 'Thermo Fisher Precision ID Ancestry Panel'),
)

# Expand the specs into the dict records the rest of the notebook consumes
SOURCES = [
    {'name': panel, 'file': KNOWN_AISNPS_DIR / filename, 'description': blurb}
    for panel, filename, blurb in _PANEL_SPECS
]

# Keep only the panels whose file is present, reporting each one as it is checked
available_sources = []
for entry in SOURCES:
    panel_file = entry['file']
    if not panel_file.exists():
        print(f"✗ {entry['name']}: File not found - {panel_file}")
        continue
    available_sources.append(entry)
    print(f"✓ {entry['name']}: {panel_file.name}")

print(f"\nAvailable sources: {len(available_sources)}/{len(SOURCES)}")

✓ kidd_55: kiddlab_.csv
✓ hsiao_lin_hwa: hsiao_lin_hwa_2017.csv
✓ forenseq: forenseq_ancestry.csv
✓ cal_et_al: cal_et_al_2025.csv
✓ seldin_128: seldin_128.csv

Available sources: 5/5


## Step 2: Load and Preview rsID Lists

In [4]:
# Read every available panel into rsid_data (panel name -> whatever
# load_rsid_list returns for that file) and print a head/tail preview of each
rsid_data = {}
banner = '=' * 60

for panel in available_sources:
    label = panel['name']
    print(f"\n{banner}")
    print(f"Source: {label}")
    print(f"Description: {panel['description']}")
    print(banner)

    loaded = load_rsid_list(panel['file'])
    rsid_data[label] = loaded

    # Slices keep the preview short regardless of panel size
    print(f"First 10 rsIDs: {loaded[:10]}")
    print(f"Last 5 rsIDs: {loaded[-5:]}")


Source: kidd_55
Description: Kidd et al. 55 AISNPs panel
Loaded 54 rsIDs from /home/Plutonium/Documents/BioinfoMidterm/data/known_aisnps/kiddlab_.csv
First 10 rsIDs: ['rs1079597', 'rs11652805', 'rs1229984', 'rs12439433', 'rs12498138', 'rs12913832', 'rs1426654', 'rs1462906', 'rs1572018', 'rs16891982']
Last 5 rsIDs: ['rs798443', 'rs7997709', 'rs870347', 'rs917115', 'rs9522149']

Source: hsiao_lin_hwa
Description: Hsiao Lin Hwa Ancestry Panel
Loaded 129 rsIDs from /home/Plutonium/Documents/BioinfoMidterm/data/known_aisnps/hsiao_lin_hwa_2017.csv
First 10 rsIDs: ['rs7532151', 'rs9286879', 'rs891700', 'rs2192512', 'rs6544723', 'rs1275953', 'rs1063', 'rs10200485', 'rs1109037', 'rs10185531']
Last 5 rsIDs: ['rs221956', 'rs401396', 'rs1155419', 'rs987640', 'rs12166817']

Source: forenseq
Description: Verogen ForenSeq Ancestry SNPs
Loaded 56 rsIDs from /home/Plutonium/Documents/BioinfoMidterm/data/known_aisnps/forenseq_ancestry.csv
First 10 rsIDs: ['rs2814778', 'rs3737576', 'rs7554936', 'rs10497

## Step 3: Convert rsIDs to BED Format

Uses the Ensembl REST API to look up genomic coordinates for each rsID.
Results are cached to avoid repeated API calls.

In [5]:
# Run the rsID -> BED conversion for every available panel.
# GRCh37 (hg19) is used so coordinates match 1000 Genomes Phase 3.
ASSEMBLY = 'GRCh37'

# Panel name -> value returned by rsid_to_bed (printed below as the BED path)
bed_files = {}
rule = '=' * 60

for panel in available_sources:
    panel_name = panel['name']
    print(f"\n{rule}\nProcessing: {panel_name}\n{rule}")

    bed_files[panel_name] = rsid_to_bed(
        rsids=rsid_data[panel_name],
        output_path=str(OUTPUT_DIR / f"{panel_name}.bed"),
        assembly=ASSEMBLY,
        source_name=panel_name,
    )

print("\n\nGenerated BED files:")
for panel_name, bed_location in bed_files.items():
    print(f"  - {panel_name}: {bed_location}")


Processing: kidd_55
Looking up 54 rsIDs...
Loaded 54 cached entries


Looking up rsIDs: 100%|██████████| 54/54 [00:00<00:00, 518289.28it/s]


Cache saved to /home/Plutonium/Documents/BioinfoMidterm/output/part2/kidd_55_rsid_cache.csv
Successfully resolved 54/54 rsIDs
BED file saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/kidd_55.bed
Annotated CSV saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/kidd_55_annotated.csv

Processing: hsiao_lin_hwa
Looking up 129 rsIDs...
Loaded 129 cached entries


Looking up rsIDs: 100%|██████████| 129/129 [00:00<00:00, 957637.55it/s]


Cache saved to /home/Plutonium/Documents/BioinfoMidterm/output/part2/hsiao_lin_hwa_rsid_cache.csv
Successfully resolved 129/129 rsIDs
BED file saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/hsiao_lin_hwa.bed
Annotated CSV saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/hsiao_lin_hwa_annotated.csv

Processing: forenseq
Looking up 56 rsIDs...
Loaded 56 cached entries


Looking up rsIDs: 100%|██████████| 56/56 [00:00<00:00, 541200.52it/s]


Cache saved to /home/Plutonium/Documents/BioinfoMidterm/output/part2/forenseq_rsid_cache.csv
Successfully resolved 56/56 rsIDs
BED file saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/forenseq.bed
Annotated CSV saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/forenseq_annotated.csv

Processing: cal_et_al
Looking up 57 rsIDs...
Loaded 57 cached entries


Looking up rsIDs: 100%|██████████| 57/57 [00:00<00:00, 1013031.05it/s]


Cache saved to /home/Plutonium/Documents/BioinfoMidterm/output/part2/cal_et_al_rsid_cache.csv
Successfully resolved 57/57 rsIDs
BED file saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/cal_et_al.bed
Annotated CSV saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/cal_et_al_annotated.csv

Processing: seldin_128
Looking up 127 rsIDs...
Loaded 127 cached entries


Looking up rsIDs: 100%|██████████| 127/127 [00:00<00:00, 597841.31it/s]

Cache saved to /home/Plutonium/Documents/BioinfoMidterm/output/part2/seldin_128_rsid_cache.csv
Successfully resolved 127/127 rsIDs
BED file saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/seldin_128.bed
Annotated CSV saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/seldin_128_annotated.csv


Generated BED files:
  - kidd_55: /home/Plutonium/Documents/BioinfoMidterm/output/part2/kidd_55.bed
  - hsiao_lin_hwa: /home/Plutonium/Documents/BioinfoMidterm/output/part2/hsiao_lin_hwa.bed
  - forenseq: /home/Plutonium/Documents/BioinfoMidterm/output/part2/forenseq.bed
  - cal_et_al: /home/Plutonium/Documents/BioinfoMidterm/output/part2/cal_et_al.bed
  - seldin_128: /home/Plutonium/Documents/BioinfoMidterm/output/part2/seldin_128.bed





## Step 4: Verify BED Files

In [6]:
# Verify and summarize BED files
#
# For every generated BED file, count the entries that sit on a purely numeric
# chromosome name and compare that with the size of the input rsID list.
# Rows on any other sequence name are excluded, so "Resolved" here can be
# lower than the number of rows written to the BED file.
summary = []

for name, bed_path in bed_files.items():
    # Load BED. The chromosome column is forced to str so the .str accessor
    # works even when the file holds bare numeric chromosome names, which
    # pandas would otherwise parse as integers.
    bed_raw = pd.read_csv(bed_path, sep='\t', header=None,
                          names=['chr', 'start', 'end', 'rsid'],
                          dtype={'chr': str})

    # Remove 'chr' prefix and filter for numeric chromosomes only
    chrom = bed_raw['chr'].str.replace(r'^chr', '', regex=True)
    is_numeric_chrom = chrom.fillna('').str.isnumeric()  # missing -> not numeric
    bed_df = bed_raw.assign(chr=chrom)[is_numeric_chrom].reset_index(drop=True)

    n_total = len(rsid_data[name])
    n_resolved = len(bed_df)
    n_excluded = len(bed_raw) - n_resolved
    # Guard against an empty input list instead of dividing by zero
    success_rate = f"{n_resolved / n_total * 100:.1f}%" if n_total else "n/a"

    summary.append({
        'Source': name,
        'Total_rsIDs': n_total,
        'Resolved': n_resolved,
        'Success_Rate': success_rate,
        'Chromosomes': bed_df['chr'].nunique(),
        'BED_File': bed_path
    })

    print(f"\n{name}:")
    print(f"  Resolved: {n_resolved}/{n_total}")
    if n_excluded:
        # Makes the gap between BED rows and "Resolved" explicit
        print(f"  Excluded (non-numeric chromosome): {n_excluded}")
    print(f"  Chromosome distribution:")
    print(bed_df['chr'].value_counts().head(5).to_string())

summary_df = pd.DataFrame(summary)
print("\n" + "="*70)
print("SUMMARY")
print("="*70)
display(summary_df)


kidd_55:
  Resolved: 53/54
  Chromosome distribution:
chr
2     6
15    5
17    5
13    5
18    4

hsiao_lin_hwa:
  Resolved: 126/129
  Chromosome distribution:
chr
2     11
4     10
6     10
7      8
10     8

forenseq:
  Resolved: 55/56
  Chromosome distribution:
chr
2     7
15    5
17    5
13    5
18    4

cal_et_al:
  Resolved: 52/57
  Chromosome distribution:
chr
3     7
2     6
5     5
11    5
15    4

seldin_128:
  Resolved: 125/127
  Chromosome distribution:
chr
1    13
2     9
3     9
5     8
7     8

SUMMARY


Unnamed: 0,Source,Total_rsIDs,Resolved,Success_Rate,Chromosomes,BED_File
0,kidd_55,54,53,98.1%,21,/home/Plutonium/Documents/BioinfoMidterm/outpu...
1,hsiao_lin_hwa,129,126,97.7%,22,/home/Plutonium/Documents/BioinfoMidterm/outpu...
2,forenseq,56,55,98.2%,21,/home/Plutonium/Documents/BioinfoMidterm/outpu...
3,cal_et_al,57,52,91.2%,16,/home/Plutonium/Documents/BioinfoMidterm/outpu...
4,seldin_128,127,125,98.4%,22,/home/Plutonium/Documents/BioinfoMidterm/outpu...


## Step 5: Check Overlap with Our Dataset

In [7]:
# Load our SNP coordinates from pfile
pvar_file = str(PATHS.PLINK_LD_PRUNED) + '.pvar'


def _strip_chr_prefix(chrom_series):
    """Return chromosome labels as strings with a leading 'chr' removed.

    Applied to both the dataset and the BED side so that the two are compared
    in the same naming scheme whichever one carries the prefix.
    """
    return chrom_series.astype(str).str.replace(r'^chr', '', regex=True)


if Path(pvar_file).exists():
    # Load our SNPs with coordinates (first three tab-separated fields of
    # every non-comment line: chromosome, position, variant ID)
    our_snps = []
    with open(pvar_file) as f:
        for line in f:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if len(fields) >= 3:
                our_snps.append({'chr': fields[0], 'pos': int(fields[1]), 'rsid': fields[2]})

    # Explicit columns keep the frame usable even if no variant line was read
    our_snps_df = pd.DataFrame(our_snps, columns=['chr', 'pos', 'rsid'])
    # Create coordinate set for fast lookup (chr, pos)
    our_coords = set(zip(_strip_chr_prefix(our_snps_df['chr']), our_snps_df['pos']))

    print(f"Our dataset has {len(our_snps_df)} SNPs")

    # Check overlap for each source by coordinates
    print(f"\nOverlap with known AISNP panels (by chr:pos coordinates):")
    print("="*70)

    for name, bed_path in bed_files.items():
        # Load BED file
        bed_df = pd.read_csv(bed_path, sep='\t', header=None,
                             names=['chr', 'start', 'end', 'rsid'])

        # BED uses 0-based start, so end position = 1-based position
        # Match using chromosome and end position (1-based coordinate)
        bed_df['chr_clean'] = _strip_chr_prefix(bed_df['chr'])
        bed_coords = set(zip(bed_df['chr_clean'], bed_df['end']))

        overlap_coords = bed_coords & our_coords

        # Row-level membership mask; a plain set lookup per row replaces the
        # row-wise DataFrame.apply
        in_our_data = pd.Series(
            [coord in our_coords for coord in zip(bed_df['chr_clean'], bed_df['end'])],
            index=bed_df.index,
            dtype=bool,
        )

        # Select by coordinate mask so only rows that actually matched are kept
        overlap_df = bed_df.loc[in_our_data, ['chr', 'start', 'end', 'rsid']]
        overlapping_rsids = overlap_df['rsid'].tolist()

        # Guard against an empty panel instead of dividing by zero
        overlap_pct = len(overlap_coords) / len(bed_df) * 100 if len(bed_df) else 0.0

        print(f"\n{name}:")
        print(f"  Total in panel: {len(bed_df)}")
        print(f"  Available in our data: {len(overlap_coords)} ({overlap_pct:.1f}%)")
        print(f"  Overlapping rsIDs: {overlapping_rsids[:10]}{'...' if len(overlapping_rsids) > 10 else ''}")

        # Save overlapping SNPs for downstream use
        if overlapping_rsids:
            overlap_path = str(OUTPUT_DIR / f"{name}_overlap.bed")
            overlap_df.to_csv(overlap_path, sep='\t', header=False, index=False)
            print(f"  Saved overlap BED: {overlap_path}")
else:
    print(f"pvar file not found: {pvar_file}")
    print("Run Part 1 notebooks first to generate LD-pruned pfile.")

Our dataset has 390615 SNPs

Overlap with known AISNP panels (by chr:pos coordinates):

kidd_55:
  Total in panel: 54
  Available in our data: 4 (7.4%)
  Overlapping rsIDs: ['rs9522149', 'rs11652805', 'rs1834619', 'rs16891982']
  Saved overlap BED: /home/Plutonium/Documents/BioinfoMidterm/output/part2/kidd_55_overlap.bed

hsiao_lin_hwa:
  Total in panel: 129
  Available in our data: 9 (7.0%)
  Overlapping rsIDs: ['rs2009833', 'rs8078417', 'rs1109037', 'rs4607417', 'rs9320147', 'rs1407728', 'rs1790006', 'rs367311', 'rs4141883']
  Saved overlap BED: /home/Plutonium/Documents/BioinfoMidterm/output/part2/hsiao_lin_hwa_overlap.bed

forenseq:
  Total in panel: 56
  Available in our data: 4 (7.1%)
  Overlapping rsIDs: ['rs9522149', 'rs11652805', 'rs1834619', 'rs16891982']
  Saved overlap BED: /home/Plutonium/Documents/BioinfoMidterm/output/part2/forenseq_overlap.bed

cal_et_al:
  Total in panel: 57
  Available in our data: 9 (15.8%)
  Overlapping rsIDs: ['rs9522149', 'rs11652805', 'rs53035716

## Step 6: Save Summary

In [8]:
# Persist the per-source summary table built in the verification step
summary_path = str(OUTPUT_DIR / "rsid_conversion_summary.csv")
summary_df.to_csv(summary_path, index=False)
print(f"Summary saved: {summary_path}")

# Flatten every panel into one long table (one row per rsID/source pair)
# and persist it alongside the summary
all_rsids = [
    {'rsid': snp_id, 'source': panel}
    for panel, panel_rsids in rsid_data.items()
    for snp_id in panel_rsids
]
all_rsids_df = pd.DataFrame(all_rsids)

all_rsids_path = str(OUTPUT_DIR / "all_known_aisnps.csv")
all_rsids_df.to_csv(all_rsids_path, index=False)
print(f"Combined rsIDs saved: {all_rsids_path}")

Summary saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/rsid_conversion_summary.csv
Combined rsIDs saved: /home/Plutonium/Documents/BioinfoMidterm/output/part2/all_known_aisnps.csv


## Summary

In [9]:
# Final recap: sources processed, assembly used, files written, next notebooks
divider = "=" * 70
print(divider)
print("rsID TO BED CONVERSION SUMMARY")
print(divider)

print(f"\nSources Processed: {len(available_sources)}")
for panel in available_sources:
    print(f"  - {panel['name']}: {panel['description']}")

print(f"\nGenome Assembly: {ASSEMBLY}")

# Each panel produced three files sharing the panel name as a stem
print("\nOutput Files:")
for panel_name in bed_files:
    for suffix in (".bed", "_annotated.csv", "_rsid_cache.csv (API cache)"):
        print(f"  - {panel_name}{suffix}")

print("\nNext Steps:")
print("  1. Run 07_bed_to_ml_matrix.ipynb to create ML matrices")
print("  2. Run 08_known_aisnps_ml.ipynb for model comparison")

rsID TO BED CONVERSION SUMMARY

Sources Processed: 5
  - kidd_55: Kidd et al. 55 AISNPs panel
  - hsiao_lin_hwa: Hsiao Lin Hwa Ancestry Panel
  - forenseq: Verogen ForenSeq Ancestry SNPs
  - cal_et_al: Cal et al. AISNPs panel 2025
  - seldin_128: Seldin et al. 128 AISNPs panel

Genome Assembly: GRCh37

Output Files:
  - kidd_55.bed
  - kidd_55_annotated.csv
  - kidd_55_rsid_cache.csv (API cache)
  - hsiao_lin_hwa.bed
  - hsiao_lin_hwa_annotated.csv
  - hsiao_lin_hwa_rsid_cache.csv (API cache)
  - forenseq.bed
  - forenseq_annotated.csv
  - forenseq_rsid_cache.csv (API cache)
  - cal_et_al.bed
  - cal_et_al_annotated.csv
  - cal_et_al_rsid_cache.csv (API cache)
  - seldin_128.bed
  - seldin_128_annotated.csv
  - seldin_128_rsid_cache.csv (API cache)

Next Steps:
  1. Run 07_bed_to_ml_matrix.ipynb to create ML matrices
  2. Run 08_known_aisnps_ml.ipynb for model comparison
