# Get Reference LD Panel
This notebook calculates the Linkage Disequilibrium (LD) matrix for a specific set of SNPs using reference genotypes from the 1000 Genomes Project (Phase 3).

**Features:**
- Uses **EUR** (European) super-population samples by default (configurable via `POPULATION`).
- Streams data directly from the 1000 Genomes FTP site over HTTP (via `pysam`) to minimize disk usage.
- Calculates pairwise correlation (R) matrix.
- Saves the LD matrix and corresponding SNP metadata.

**Requirements:**
- `pysam` (must be installed with libcurl support for remote file access)
- `pandas`, `numpy`

**Note for Windows Users:**
`pysam` is notoriously difficult to install on Windows because it requires compiling C extensions (htslib). If you cannot install `pysam`, you may need to use **WSL (Windows Subsystem for Linux)** or a Docker container. Alternatively, you can try `cyvcf2` if a wheel is available, but `pysam` is the standard for remote VCF access.


In [1]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import pysam
from tqdm import tqdm

# Locate the project root: the parent of the current working directory,
# which is taken to be the folder holding this notebook.
NOTEBOOK_DIR = Path.cwd().resolve()
PROJECT_ROOT = NOTEBOOK_DIR.parent

# Make project-local modules importable; the membership test keeps
# sys.path free of duplicates when this cell is re-run.
project_root_str = str(PROJECT_ROOT)
if project_root_str not in sys.path:
    sys.path.append(project_root_str)

print(f"Project Root: {PROJECT_ROOT}")
print(f"Pysam version: {pysam.__version__}")


ModuleNotFoundError: No module named 'pysam'

In [None]:
# -----------------------------------------------------------------------------
# Run configuration -- everything a reader might want to tune lives here
# -----------------------------------------------------------------------------

GENE_NAME = "CBX8"
# Super-population code; one of EUR, AFR, EAS, SAS, AMR
POPULATION = "EUR"

# Inputs and outputs share a single folder under the project root
OUTPUT_DIR = PROJECT_ROOT / "output"

# VCF listing the SNPs of interest (written by get_z_scores.ipynb)
INPUT_VCF_PATH = OUTPUT_DIR / f"{GENE_NAME}_variants.vcf"

# Files written by this notebook
LD_MATRIX_PATH = OUTPUT_DIR / f"{GENE_NAME}_LD_matrix.csv"
LD_SNPS_PATH = OUTPUT_DIR / f"{GENE_NAME}_LD_snps.csv"

# 1000 Genomes Phase 3 genotype VCFs; {chrom} is substituted per chromosome
KG_URL_TEMPLATE = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr{chrom}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
# Sample panel (TSV) -- preferred over the Excel sheet: more reliable, fewer dependencies
KG_SAMPLE_INFO_URL = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel"

print(f"Target Gene: {GENE_NAME}")
print(f"Population: {POPULATION}")
print(f"Input VCF: {INPUT_VCF_PATH}")


In [None]:
# 1. Load Target SNPs
print("Loading target SNPs from local VCF...")
if not INPUT_VCF_PATH.exists():
    raise FileNotFoundError(f"Input VCF not found: {INPUT_VCF_PATH}")

# Column names come from the "#CHROM ..." header line when the file has one;
# otherwise fall back to the eight fixed VCF columns. pandas cannot supply the
# names itself because comment="#" makes it skip that header line.
header = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
with open(INPUT_VCF_PATH, 'r') as f:
    for line in f:
        if line.startswith("#CHROM"):
            header = line.strip().lstrip("#").split("\t")
            break
        if not line.startswith("#"):
            # First data line reached: there is no header line to find.
            break

# Single parse of the data lines (every line starting with '#' is skipped).
target_snps = pd.read_csv(
    INPUT_VCF_PATH, sep="\t", comment="#", header=None, names=header
)

if target_snps.empty:
    raise ValueError(f"Input VCF contains no variant records: {INPUT_VCF_PATH}")

# Normalise contig names to the bare form ("17", not "chr17") used when
# building the reference URL and querying the reference VCF. The pattern is
# anchored so only a leading "chr" prefix is removed.
target_snps['CHROM'] = (
    target_snps['CHROM'].astype(str).str.replace(r"^chr", "", regex=True)
)

print(f"Loaded {len(target_snps)} SNPs.")
target_snps.head()


In [None]:
# 2. Get Reference Sample Info
print("Downloading 1000 Genomes sample info...")

# The panel is a tab-separated table with columns: sample, pop, super_pop, gender
sample_info = pd.read_csv(KG_SAMPLE_INFO_URL, sep='\t')

# Keep the IDs of samples that belong to the configured super-population
in_population = sample_info['super_pop'] == POPULATION
eur_samples = sample_info.loc[in_population, 'sample'].tolist()

print(f"Found {len(eur_samples)} samples for super-population {POPULATION}")
print(f"First 5 samples: {eur_samples[:5]}")


In [None]:
# 3. Fetch Genotypes
# For every target SNP, look up the matching record in the remote reference
# VCF and convert the selected samples' genotypes to ALT-allele dosages.
# SNPs are grouped by chromosome so each remote file is opened only once.
#
# NOTE(review): an earlier comment here assumed the reference is on hg38. The
# 20130502 Phase 3 release is, to my knowledge, aligned to GRCh37/hg19 --
# confirm that the POS values in the input VCF use the same genome build,
# otherwise lookups by position will miss or hit unrelated variants.


def _alt_dosage(gt, alt_index):
    """Count the copies of allele ``alt_index`` in one sample's genotype.

    Parameters
    ----------
    gt : tuple or None
        Allele indices from the sample's ``GT`` field, e.g. ``(0, 1)``.
    alt_index : int
        Index of the allele of interest within the record (0 is REF).

    Returns
    -------
    int or float
        Number of alleles equal to ``alt_index``; ``np.nan`` when the genotype
        is absent or any allele is missing. Handles any ploidy, so haploid
        calls do not raise.
    """
    if not gt or any(allele is None for allele in gt):
        return np.nan
    return sum(1 for allele in gt if allele == alt_index)


chromosomes = target_snps['CHROM'].unique()
genotypes_list = []    # one list of dosages per retained SNP
found_snps_list = []   # matching metadata rows, same order as genotypes_list
n_uninformative = 0    # matched SNPs dropped for lack of variation

# A set gives O(1) membership tests while scanning the VCF sample list.
population_samples = set(eur_samples)

for chrom in chromosomes:
    vcf_url = KG_URL_TEMPLATE.format(chrom=chrom)
    print(f"Connecting to {vcf_url}...")

    try:
        # Opening a URL requires pysam built with libcurl support. The
        # context manager closes the remote handle even if an error occurs.
        with pysam.VariantFile(vcf_url) as vcf_in:
            # Samples of the chosen population, kept in VCF column order.
            sample_names = [
                s for s in vcf_in.header.samples if s in population_samples
            ]

            if not sample_names:
                raise ValueError(f"No matching samples found in VCF for population {POPULATION}")

            print(f"  Extracting genotypes for {len(sample_names)} samples...")

            # Target SNPs located on this chromosome
            chrom_snps = target_snps[target_snps['CHROM'] == chrom]

            for _, row in tqdm(chrom_snps.iterrows(), total=len(chrom_snps), desc=f"Chr {chrom}"):
                pos = int(row['POS'])
                ref = row['REF']
                alt = row['ALT']

                # fetch(contig, start, stop) is 0-based, half-open while VCF
                # POS is 1-based, so [pos-1, pos) selects exactly this base.
                try:
                    records = list(vcf_in.fetch(chrom, pos - 1, pos))
                except ValueError as e:
                    # Contig not found or other error
                    print(f"  Error fetching {chrom}:{pos}: {e}")
                    continue

                # Find the record with the same position and REF whose ALT
                # list contains our ALT (the reference may be multi-allelic).
                match = None
                alt_index = None
                for rec in records:
                    # rec.pos is 1-based. A REF mismatch may be a strand flip
                    # or a different variant; such records are skipped.
                    if rec.pos != pos or rec.ref != ref:
                        continue
                    alts = rec.alts or ()  # alts is None when the record has no ALT
                    if alt in alts:
                        match = rec
                        alt_index = alts.index(alt) + 1  # GT allele index; 0 is REF
                        break

                if match is None:
                    # SNP not present in the reference panel
                    continue

                row_gts = [
                    _alt_dosage(match.samples[name]['GT'], alt_index)
                    for name in sample_names
                ]

                # A SNP whose called dosages are all identical (or all
                # missing) has zero variance, so its correlation with every
                # other SNP is undefined (NaN). Leave it out of the LD panel.
                called = {d for d in row_gts if d == d}  # d != d only for NaN
                if len(called) < 2:
                    n_uninformative += 1
                    continue

                genotypes_list.append(row_gts)
                found_snps_list.append(row)

    except Exception as e:
        # Best effort: report the failure and move on to the next chromosome.
        print(f"Failed to process chromosome {chrom}: {e}")

if n_uninformative:
    print(f"Skipped {n_uninformative} SNPs with no genotype variation in {POPULATION} samples.")
print(f"Extracted genotypes for {len(found_snps_list)} / {len(target_snps)} SNPs.")


In [None]:
# 4. Calculate LD Matrix
if not genotypes_list:
    raise ValueError("No genotypes extracted. Cannot calculate LD.")

# Rows are SNPs, columns are samples. Float dtype so NaN can mark missing calls.
G = np.array(genotypes_list, dtype=float)

# Replace missing genotypes with the SNP's mean dosage over called samples.
missing = np.isnan(G)
if missing.any():
    print("Warning: Missing genotypes detected. Imputing with mean...")
    n_called = (~missing).sum(axis=1)
    # Guard the division so a SNP with no calls at all gives NaN quietly
    # instead of a "Mean of empty slice" warning.
    row_means = np.where(
        n_called > 0,
        np.nansum(G, axis=1) / np.maximum(n_called, 1),
        np.nan,
    )
    G = np.where(missing, row_means[:, None], G)

print(f"Genotype matrix shape: {G.shape}")

# A SNP with zero variance (or no calls) has an undefined correlation with
# every other SNP. Report it explicitly rather than via a numpy RuntimeWarning.
row_std = G.std(axis=1)
undefined = ~(row_std > 0)  # also True where the standard deviation is NaN
if undefined.any():
    print(
        f"Warning: {int(undefined.sum())} SNPs have no genotype variation; "
        "their rows/columns in the LD matrix will be NaN."
    )

# np.corrcoef treats each row as one variable (SNP). For a single SNP it
# returns a 0-d scalar, so atleast_2d keeps the result a square matrix.
with np.errstate(divide="ignore", invalid="ignore"):
    ld_matrix = np.atleast_2d(np.corrcoef(G))

print(f"LD Matrix shape: {ld_matrix.shape}")
print("LD calculation complete.")


In [None]:
# 5. Save Results

# Metadata of the SNPs that form the rows/columns of the matrix, in matrix order
ld_snps_df = pd.DataFrame(found_snps_list)
ld_snps_df.to_csv(LD_SNPS_PATH, index=False)
print(f"Saved SNP metadata to: {LD_SNPS_PATH}")

# LD matrix labelled by SNP ID on both axes. CSV is used for simplicity;
# .npy or .parquet would be a better fit for large matrices.
snp_ids = ld_snps_df['ID']
ld_df = pd.DataFrame(ld_matrix, index=snp_ids, columns=snp_ids)
ld_df.to_csv(LD_MATRIX_PATH)
print(f"Saved LD matrix to: {LD_MATRIX_PATH}")

# Quick look at the top-left corner of the matrix
print("\nFirst 5x5 LD Matrix:")
ld_preview = ld_df.iloc[:5, :5]
print(ld_preview)
