# Week 5: Pharmacogenomics Analysis

## Analysis Pipeline
1. Download reference genome (chromosome 10)
2. Align Illumina and PacBio reads with minimap2
3. Call variants with bcftools
4. Phase variants with HapCUT2
5. Compare VCFs and identify discordant variants
6. Determine star-alleles using PharmVar database

### Import Dependencies

In [None]:
import os
import sys
import subprocess
import pandas as pd
from pathlib import Path
import gzip
from collections import defaultdict
import urllib.request
import shutil
import bz2
import platform

### Set up working directories

In [None]:
# Anchor every path in this notebook to the current working directory and
# make sure the ./data folder that holds all downloads and outputs exists.
notebook_dir = Path.cwd()
data_dir = notebook_dir.joinpath("data")
data_dir.mkdir(parents=True, exist_ok=True)
for _title, _location in (
    ("Working directory", notebook_dir),
    ("Data directory", data_dir),
):
    print(f"{_title}: {_location}")

## Step 1: Download Reference Genome (Chromosome 10)

All target genes are located on chromosome 10:
- CYP2C8
- CYP2C9
- CYP2C19

In [None]:
# Download chromosome 10 reference genome
#
# Fetches the gzip-compressed hg38 chr10 FASTA from UCSC, decompresses it into
# data_dir and then deletes the archive. The whole step is skipped when
# chr10.fa is already present.
chr10_url = "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr10.fa.gz"
chr10_gz_path = data_dir / "chr10.fa.gz"
chr10_fa_path = data_dir / "chr10.fa"
# Decompression writes to this scratch file and is renamed to chr10_fa_path
# only once it has finished. Otherwise an interrupted run would leave a
# truncated chr10.fa that the exists() check below accepts on the next run.
chr10_fa_part = data_dir / "chr10.fa.part"

print("="*60)
print("DOWNLOADING REFERENCE GENOME")
print("="*60)

if not chr10_fa_path.exists():
    print(f"Downloading chromosome 10 from UCSC Genome Browser")
    print(f"Source: {chr10_url}")
    print(f"Target: {chr10_gz_path}")

    try:
        urllib.request.urlretrieve(chr10_url, chr10_gz_path)
        print("Download complete!")

        print(f"Decompressing {chr10_gz_path.name}...")
        with gzip.open(chr10_gz_path, 'rb') as f_in, open(chr10_fa_part, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Same-directory rename: the final name appears only for a complete file.
        chr10_fa_part.replace(chr10_fa_path)

        print(f"Decompression complete!")
        print(f"Output file: {chr10_fa_path}")

        # Remove compressed file to save space
        chr10_gz_path.unlink()
        print(f"Removed compressed file: {chr10_gz_path}")

    except Exception as e:
        print(f"Error during download: {e}")
        raise
    finally:
        # Runs on errors and on kernel interrupts alike; after a successful
        # rename the scratch file no longer exists and this is a no-op.
        if chr10_fa_part.exists():
            chr10_fa_part.unlink()
else:
    print(f"Reference genome already exists: {chr10_fa_path}")

print("")

### Download Sequencing Data

In [None]:
# Download Illumina data
#
# Fetches the bzip2-compressed Illumina FASTQ from GitHub, decompresses it into
# data_dir and then deletes the archive. Skipped when illumina.fq already exists.
illumina_url = "https://github.com/inumanag/fall25-csc-bioinf/raw/refs/heads/main/week4/data/illumina.fq.bz2"
illumina_bz2_path = data_dir / "illumina.fq.bz2"
illumina_fq_path = data_dir / "illumina.fq"
# Decompression writes to this scratch file and is renamed to illumina_fq_path
# only once it has finished, so an interrupted run cannot leave a truncated
# FASTQ that the exists() check below would treat as complete.
illumina_fq_part = data_dir / "illumina.fq.part"

print("="*60)
print("DOWNLOADING ILLUMINA SEQUENCING DATA")
print("="*60)

if not illumina_fq_path.exists():
    print(f"Downloading Illumina data from GitHub")
    print(f"Source: {illumina_url}")

    try:
        urllib.request.urlretrieve(illumina_url, illumina_bz2_path)
        print("Download complete!")

        print(f"Decompressing {illumina_bz2_path.name}...")
        with bz2.open(illumina_bz2_path, 'rb') as f_in, open(illumina_fq_part, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Same-directory rename: the final name appears only for a complete file.
        illumina_fq_part.replace(illumina_fq_path)

        print(f"Decompression complete!")
        print(f"Output file: {illumina_fq_path}")

        # Remove compressed file
        illumina_bz2_path.unlink()
        print(f"Removed compressed file: {illumina_bz2_path}")

    except Exception as e:
        print(f"Error during download: {e}")
        raise
    finally:
        # Runs on errors and on kernel interrupts alike; a no-op after success.
        if illumina_fq_part.exists():
            illumina_fq_part.unlink()
else:
    print(f"Illumina data already exists: {illumina_fq_path}")

print("")

# Download PacBio data
#
# Fetches the bzip2-compressed PacBio FASTQ from GitHub, decompresses it into
# data_dir and then deletes the archive. Skipped when pacbio.fq already exists.
pacbio_url = "https://github.com/inumanag/fall25-csc-bioinf/raw/refs/heads/main/week4/data/pacbio.fq.bz2"
pacbio_bz2_path = data_dir / "pacbio.fq.bz2"
pacbio_fq_path = data_dir / "pacbio.fq"
# Decompression writes to this scratch file and is renamed to pacbio_fq_path
# only once it has finished, so an interrupted run cannot leave a truncated
# FASTQ that the exists() check below would treat as complete.
pacbio_fq_part = data_dir / "pacbio.fq.part"

print("="*60)
print("DOWNLOADING PACBIO SEQUENCING DATA")
print("="*60)

if not pacbio_fq_path.exists():
    print(f"Downloading PacBio data from GitHub")
    print(f"Source: {pacbio_url}")

    try:
        urllib.request.urlretrieve(pacbio_url, pacbio_bz2_path)
        print("Download complete!")

        print(f"Decompressing {pacbio_bz2_path.name}...")
        with bz2.open(pacbio_bz2_path, 'rb') as f_in, open(pacbio_fq_part, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Same-directory rename: the final name appears only for a complete file.
        pacbio_fq_part.replace(pacbio_fq_path)

        print(f"Decompression complete!")
        print(f"Output file: {pacbio_fq_path}")

        # Remove compressed file
        pacbio_bz2_path.unlink()
        print(f"Removed compressed file: {pacbio_bz2_path}")

    except Exception as e:
        print(f"Error during download: {e}")
        raise
    finally:
        # Runs on errors and on kernel interrupts alike; a no-op after success.
        if pacbio_fq_part.exists():
            pacbio_fq_part.unlink()
else:
    print(f"PacBio data already exists: {pacbio_fq_path}")

print("")

## Step 2: Align Illumina and PacBio reads with minimap2

Align both FASTQ samples (Illumina and PacBio) to chromosome 10 with minimap2, using the preset suited to each technology (`sr` for Illumina short reads, `map-pb` for PacBio long reads).

### Index Reference Genome

In [None]:
# Build a minimap2 index (.mmi) for the chromosome 10 reference.
# Skipped when chr10.mmi already exists.
print("="*60)
print("INDEXING REFERENCE GENOME")
print("="*60)

chr10_mmi_path = data_dir / "chr10.mmi"
# minimap2 writes here first; the file is renamed to chr10_mmi_path only after
# minimap2 exits successfully, so an interrupted build cannot leave a partial
# index that the exists() check below would treat as complete.
chr10_mmi_part = data_dir / "chr10.mmi.part"

if not chr10_mmi_path.exists():
    print(f"Creating minimap2 index for {chr10_fa_path}")
    # NOTE: no preset (-x) is given, so the index uses minimap2's default
    # indexing parameters. Per the minimap2 documentation, -k/-w are fixed at
    # index-build time and cannot be changed by a preset chosen at mapping time.
    cmd = [
        "minimap2",
        "-d", str(chr10_mmi_part),
        str(chr10_fa_path)
    ]

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(result.stderr)  # minimap2 outputs to stderr
        chr10_mmi_part.replace(chr10_mmi_path)
        print(f"Index created: {chr10_mmi_path}")
    except subprocess.CalledProcessError as e:
        print(f"Error creating index: {e}")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        raise
    finally:
        # Runs on errors and on kernel interrupts alike; a no-op after success.
        if chr10_mmi_part.exists():
            chr10_mmi_part.unlink()
else:
    print(f"Index already exists: {chr10_mmi_path}")

print("")

### Align Illumina Reads

In [None]:
# Align the Illumina reads to chromosome 10, then convert, sort and index the
# result with samtools. Produces illumina_sorted.bam and illumina_sorted.bam.bai.
print("="*60)
print("ALIGNING ILLUMINA READS")
print("="*60)

illumina_sam = data_dir / "illumina.sam"
illumina_bam = data_dir / "illumina.bam"
illumina_sorted_bam = data_dir / "illumina_sorted.bam"
illumina_bai = data_dir / "illumina_sorted.bam.bai"
# Scratch name for the sort output. It keeps the .bam suffix so samtools still
# deduces BAM output from the filename, and it is renamed to
# illumina_sorted_bam only after the sort succeeds.
illumina_sorted_part = data_dir / "illumina_sorted.part.bam"

if not illumina_sorted_bam.exists():
    print("Aligning Illumina reads with minimap2 (sr preset for short reads)")

    # An index left over from an earlier sorted BAM would not match the new one.
    if illumina_bai.exists():
        illumina_bai.unlink()

    # Align with minimap2 using short-read preset.
    # The reference is passed as FASTA rather than a prebuilt .mmi: minimap2
    # cannot change -k/-w once an index is built, so a prebuilt index made
    # without this preset would override the preset's indexing parameters.
    align_cmd = [
        "minimap2",
        "-ax", "sr",  # short-read preset
        "-t", "4",     # threads
        str(chr10_fa_path),
        str(illumina_fq_path)
    ]

    try:
        with open(illumina_sam, 'w') as sam_file:
            result = subprocess.run(align_cmd, stdout=sam_file, stderr=subprocess.PIPE, text=True, check=True)
        print(result.stderr)  # minimap2 outputs to stderr
        print(f"Alignment complete: {illumina_sam}")

        # Convert SAM to BAM
        print("Converting SAM to BAM...")
        subprocess.run([
            "samtools", "view",
            "-b", "-o", str(illumina_bam),
            str(illumina_sam)
        ], check=True)
        print(f"BAM created: {illumina_bam}")

        # Sort BAM
        print("Sorting BAM file...")
        subprocess.run([
            "samtools", "sort",
            "-o", str(illumina_sorted_part),
            str(illumina_bam)
        ], check=True)
        illumina_sorted_part.replace(illumina_sorted_bam)
        print(f"Sorted BAM created: {illumina_sorted_bam}")
    except subprocess.CalledProcessError as e:
        print(f"Error during Illumina alignment: {e}")
        # Only the minimap2 call captures stderr; for samtools it is None.
        if e.stderr:
            print(f"STDERR: {e.stderr}")
        raise
    finally:
        # Clean up intermediate files on success, failure or interrupt.
        for _leftover in (illumina_sam, illumina_bam, illumina_sorted_part):
            if _leftover.exists():
                _leftover.unlink()
    print("Cleaned up intermediate files")
else:
    print(f"Illumina alignment already exists: {illumina_sorted_bam}")

# Indexing is checked separately so that a run which produced the sorted BAM
# but not its index is repaired here instead of being skipped.
if not illumina_bai.exists():
    print("Indexing BAM file...")
    _index_ok = False
    try:
        subprocess.run([
            "samtools", "index",
            str(illumina_sorted_bam)
        ], check=True)
        _index_ok = True
    finally:
        # Do not leave a partially written index behind.
        if not _index_ok and illumina_bai.exists():
            illumina_bai.unlink()
    print(f"Index created: {illumina_bai}")

    print(f"\nFinal output: {illumina_sorted_bam}")
    print(f"Final index: {illumina_bai}")

print("")

### Align PacBio Reads

In [None]:
# Align the PacBio reads to chromosome 10, then convert, sort and index the
# result with samtools. Produces pacbio_sorted.bam and pacbio_sorted.bam.bai.
print("="*60)
print("ALIGNING PACBIO READS")
print("="*60)

pacbio_sam = data_dir / "pacbio.sam"
pacbio_bam = data_dir / "pacbio.bam"
pacbio_sorted_bam = data_dir / "pacbio_sorted.bam"
pacbio_bai = data_dir / "pacbio_sorted.bam.bai"
# Scratch name for the sort output. It keeps the .bam suffix so samtools still
# deduces BAM output from the filename, and it is renamed to pacbio_sorted_bam
# only after the sort succeeds.
pacbio_sorted_part = data_dir / "pacbio_sorted.part.bam"

if not pacbio_sorted_bam.exists():
    print("Aligning PacBio reads with minimap2 (map-pb preset for PacBio)")

    # An index left over from an earlier sorted BAM would not match the new one.
    if pacbio_bai.exists():
        pacbio_bai.unlink()

    # Align with minimap2 using PacBio preset.
    # The reference is passed as FASTA rather than a prebuilt .mmi: minimap2
    # cannot change -k/-w once an index is built, so a prebuilt index made
    # without this preset would override the preset's indexing parameters.
    align_cmd = [
        "minimap2",
        "-ax", "map-pb",  # PacBio CLR reads preset
        "-t", "4",         # threads
        str(chr10_fa_path),
        str(pacbio_fq_path)
    ]

    try:
        with open(pacbio_sam, 'w') as sam_file:
            result = subprocess.run(align_cmd, stdout=sam_file, stderr=subprocess.PIPE, text=True, check=True)
        print(result.stderr)  # minimap2 outputs to stderr
        print(f"Alignment complete: {pacbio_sam}")

        # Convert SAM to BAM
        print("Converting SAM to BAM...")
        subprocess.run([
            "samtools", "view",
            "-b", "-o", str(pacbio_bam),
            str(pacbio_sam)
        ], check=True)
        print(f"BAM created: {pacbio_bam}")

        # Sort BAM
        print("Sorting BAM file...")
        subprocess.run([
            "samtools", "sort",
            "-o", str(pacbio_sorted_part),
            str(pacbio_bam)
        ], check=True)
        pacbio_sorted_part.replace(pacbio_sorted_bam)
        print(f"Sorted BAM created: {pacbio_sorted_bam}")
    except subprocess.CalledProcessError as e:
        print(f"Error during PacBio alignment: {e}")
        # Only the minimap2 call captures stderr; for samtools it is None.
        if e.stderr:
            print(f"STDERR: {e.stderr}")
        raise
    finally:
        # Clean up intermediate files on success, failure or interrupt.
        for _leftover in (pacbio_sam, pacbio_bam, pacbio_sorted_part):
            if _leftover.exists():
                _leftover.unlink()
    print("Cleaned up intermediate files")
else:
    print(f"PacBio alignment already exists: {pacbio_sorted_bam}")

# Indexing is checked separately so that a run which produced the sorted BAM
# but not its index is repaired here instead of being skipped.
if not pacbio_bai.exists():
    print("Indexing BAM file...")
    _index_ok = False
    try:
        subprocess.run([
            "samtools", "index",
            str(pacbio_sorted_bam)
        ], check=True)
        _index_ok = True
    finally:
        # Do not leave a partially written index behind.
        if not _index_ok and pacbio_bai.exists():
            pacbio_bai.unlink()
    print(f"Index created: {pacbio_bai}")

    print(f"\nFinal output: {pacbio_sorted_bam}")
    print(f"Final index: {pacbio_bai}")

print("")

### Verify Output Files

In [None]:
# Report, for each sorted BAM and its index, whether the file is on disk, and
# stop with exit status 1 if any of the four is absent.
_bar = "=" * 60
print(_bar)
print("VERIFYING OUTPUT FILES")
print(_bar)

_expected_outputs = (
    ("Illumina BAM", illumina_sorted_bam),
    ("Illumina BAI", illumina_bai),
    ("PacBio BAM", pacbio_sorted_bam),
    ("PacBio BAI", pacbio_bai),
)
for _label, _artifact in _expected_outputs:
    print(f"{_label}: {_artifact.exists()} - {_artifact}")

_any_missing = not all(_artifact.exists() for _label, _artifact in _expected_outputs)
if _any_missing:
    print("\n✗ Some alignment files are missing!")
    sys.exit(1)
print("\n✓ All alignment files successfully created!")

print("")

## Step 3: Call Variants with bcftools

Find all variants in each sample for all genes of interest and obtain VCF files.

**Expected output:** Two VCF files (one for each sample)

In [None]:
# Step 3 placeholder: variant calling is not implemented yet. This cell only
# prints a banner and a reminder of the work still to be done.
_rule = "=" * 60
print(_rule, "VARIANT CALLING", _rule, sep="\n")
print("\nTODO: Implement variant calling with bcftools")
print("This step will call variants for both Illumina and PacBio samples")
print("")

## Step 4: Phase Variants with HapCUT2

Phase the variant VCFs using HapCUT2 or HapTree-X.

**Expected output:** Two phased VCF files (one for each sample)

In [None]:
# Step 4 placeholder: phasing is not implemented yet. This cell only prints a
# banner and a reminder of the work still to be done.
_rule = "=" * 60
print(_rule, "PHASING VARIANTS", _rule, sep="\n")
print("\nTODO: Implement phasing with HapCUT2")
print("This step will phase variants from both samples")
print("")

## Step 5: Compare VCFs and Identify Discordant Variants

Compare the two phased VCF files and analyze discordant variants.

**Expected output:** Analysis with IGV screenshots and discussion

In [None]:
# Step 5 placeholder: the VCF comparison is not implemented yet. This cell only
# prints a banner and a reminder of the work still to be done.
_rule = "=" * 60
print(_rule, "COMPARING VCFS", _rule, sep="\n")
print("\nTODO: Compare VCFs and identify discordant variants")
print("This step will analyze shared and unique variants between technologies")
print("")

## Step 6: Determine Star-Alleles using PharmVar

Identify star-alleles for each gene using the PharmVar database.

**Expected output:** Discussion of star-allele determination

In [None]:
# Step 6 placeholder: star-allele determination is not implemented yet. This
# cell only prints a banner and a reminder of the work still to be done.
_rule = "=" * 60
print(_rule, "STAR-ALLELE DETERMINATION", _rule, sep="\n")
print("\nTODO: Determine star-alleles using PharmVar database")
print("This step will identify CYP2C8, CYP2C9, and CYP2C19 star-alleles")
print("")