In [None]:
# Create Gene to DRAGEN pVCFs mapping file
## Run once

In [None]:
import pandas as pd
import os
import requests
import gzip

# --- CONFIGURATION ---
# Remote location of the GENCODE release 46 human annotation (gzip-compressed GTF) on the EBI server.
GTF_URL = "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_46/gencode.v46.annotation.gtf.gz"
# Local path the annotation is downloaded to and later parsed from.
GTF_FILE = "data/blocks/gencode.v46.annotation.gtf.gz"
# DRAGEN pVCF block coordinate table, loaded with pd.read_csv. The notebook uses its
# "filename", "chromosome" and "starting_position" columns.
COORDINATE_FILE = "/mnt/project/Bulk/DRAGEN WGS/DRAGEN population level WGS variants, pVCF format [500k release]/helper_files/dragen_pvcf_coordinates.zip"  # Point this at your own copy of the (zipped) coordinate table
# Destination of the tab-separated gene-to-block overlap table written at the end of the notebook.
OUTPUT_FILE = "data/blocks/gene_vcf_overlaps.tsv"
# Assumed block length in base pairs; added to each starting_position to derive ending_position.
BLOCK_SIZE = 20000

In [28]:
def download_gtf(url, filename="data/blocks/gencode.v46.annotation.gtf.gz"):
    """Download ``url`` to ``filename`` unless the file is already present.

    The response is streamed into a temporary ``<filename>.part`` sibling and
    only renamed to ``filename`` once the transfer has finished. A failed or
    interrupted download therefore never leaves a truncated file behind that
    a later run would mistake for a complete one.

    Args:
        url: HTTP(S) address of the file to fetch.
        filename: Local destination path. Missing parent directories are
            created.

    Returns:
        None. Progress and errors are reported with ``print``; download
        errors are not re-raised (best-effort behaviour is kept).
    """
    if os.path.exists(filename):
        print(f"File '{filename}' already exists. Skipping download.")
        return

    partial = f"{filename}.part"

    print(f"Downloading {filename} from EBI...")
    try:
        parent = os.path.dirname(filename)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # The timeout bounds the connect phase and each wait for data, so a
        # stalled server cannot hang the notebook indefinitely.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        # Promote the finished download in a single step.
        os.replace(partial, filename)
        print("Download complete.")
    except Exception as e:
        # Remove the incomplete temp file so nothing half-written lingers.
        try:
            if os.path.exists(partial):
                os.remove(partial)
        except OSError:
            pass
        print(f"Error downloading file: {e}")


# Fetch the annotation into GTF_FILE; download_gtf returns early when the file is already on disk.
download_gtf(GTF_URL, GTF_FILE)

File '/Users/skn506/Documents/gogoGPCR2/data/blocks/gencode.v46.annotation.gtf.gz' already exists. Skipping download.


In [None]:
print(f"Loading VCF blocks from {COORDINATE_FILE}...")
raw_blocks = pd.read_csv(COORDINATE_FILE)
initial_count = len(raw_blocks)

# Rows lacking a chromosome or a start position cannot be placed on the genome;
# they are treated as intentionally excluded blocks and discarded.
located_blocks = raw_blocks.dropna(subset=["chromosome", "starting_position"])
dropped_count = initial_count - len(located_blocks)

if dropped_count > 0:
    print(f"Dropped {dropped_count} rows with missing coordinates (NA).")

# Cast starts to integers, then derive each block's end from a fixed window:
# a block is assumed to cover [start, start + BLOCK_SIZE].
vcf_df = located_blocks.assign(
    starting_position=lambda frame: frame["starting_position"].astype(int),
    ending_position=lambda frame: frame["starting_position"] + BLOCK_SIZE,
)

print(f"Prepared {len(vcf_df)} VCF blocks.")
vcf_df[["filename", "chromosome", "starting_position", "ending_position"]].head()

Loading VCF blocks from /Users/skn506/Documents/gogoGPCR2/data/blocks/dragen_pvcf_coordinates.zip...
Dropped 8000 rows with missing coordinates (NA).
Prepared 146430 VCF blocks.


Unnamed: 0,filename,chromosome,starting_position,ending_position
0,ukb24310_c1_b0_v1.vcf.gz,chr1,10061,30061
1,ukb24310_c1_b1_v1.vcf.gz,chr1,20019,40019
2,ukb24310_c1_b10_v1.vcf.gz,chr1,200001,220001
3,ukb24310_c1_b100_v1.vcf.gz,chr1,2000001,2020001
4,ukb24310_c1_b1000_v1.vcf.gz,chr1,20000004,20020004


In [30]:
def parse_gtf_attributes(attribute_string):
    """Turn a GTF attribute column into a ``{key: value}`` dictionary.

    The column is a ``;``-separated list of ``key value`` pairs, for example
    ``gene_id "X"; gene_name "Y";``. Each entry is split at its first space,
    and every double quote is removed from the value. Entries that are empty
    or contain no space are skipped. When a key occurs more than once, the
    last occurrence wins.
    """
    parsed = {}
    for entry in attribute_string.strip().split(";"):
        key, separator, raw_value = entry.strip().partition(" ")
        if not separator:
            # Blank entry or a bare token without a value.
            continue
        parsed[key] = raw_value.replace('"', "")
    return parsed


print(f"Parsing Genes from {GTF_FILE}...")
gene_list = []

# Walk the gzip-compressed GTF as text and keep one record per "gene" feature.
with gzip.open(GTF_FILE, "rt") as gtf_handle:
    for record in gtf_handle:
        # Header/comment lines start with '#'.
        if record.startswith("#"):
            continue

        fields = record.split("\t")

        # Need all nine tab-separated columns, and only rows whose feature type is "gene".
        is_gene_record = len(fields) >= 9 and fields[2] == "gene"
        if not is_gene_record:
            continue

        attributes = parse_gtf_attributes(fields[8])
        gene_list.append(
            dict(
                gene_id=attributes.get("gene_id", "NA"),
                gene_name=attributes.get("gene_name", "NA"),
                chromosome=fields[0],
                start=int(fields[3]),
                stop=int(fields[4]),
            )
        )

gene_df = pd.DataFrame(gene_list)
print(f"Parsed {len(gene_df)} genes.")

Parsing Genes from /Users/skn506/Documents/gogoGPCR2/data/blocks/gencode.v46.annotation.gtf.gz...
Parsed 63086 genes.


In [33]:
# Map every gene to the pVCF blocks whose [start, end] window it intersects.
# Outputs:
#   final_df            - one row per gene that overlaps at least one block
#   touched_vcf_indices - row labels of vcf_df blocks hit by at least one gene
results = []
touched_vcf_indices = set()
unique_chroms = gene_df["chromosome"].unique()

# Fixed schema so final_df keeps its columns even if no gene overlaps anything.
result_columns = [
    "gene_id",
    "gene_name",
    "chromosome",
    "start",
    "stop",
    "vcf_count",
    "overlapping_vcfs",
    "vcf_start",
    "vcf_stop",
]

print("Calculating overlaps (including block coordinates)...")

for chrom in unique_chroms:
    chrom_genes = gene_df[gene_df["chromosome"] == chrom]

    # The coordinate table is ordered by filename (b0, b1, b10, b100, ...), not by
    # position. Sort by start so the per-gene block lists come out in genomic order.
    # sort_values keeps the original row labels, so touched_vcf_indices still
    # refers to rows of vcf_df.
    chrom_vcfs = vcf_df[vcf_df["chromosome"] == chrom].sort_values(
        "starting_position", kind="mergesort"
    )

    # Chromosomes/contigs present in the annotation but absent from the block table.
    if chrom_vcfs.empty:
        continue

    # Both ends are treated as inclusive, matching the gene intervals below.
    vcf_intervals = pd.IntervalIndex.from_arrays(
        chrom_vcfs["starting_position"], chrom_vcfs["ending_position"], closed="both"
    )

    for gene in chrom_genes.itertuples(index=False):
        gene_interval = pd.Interval(gene.start, gene.stop, closed="both")
        # Boolean mask aligned positionally with chrom_vcfs.
        overlaps = vcf_intervals.overlaps(gene_interval)

        if not overlaps.any():
            continue

        overlapping_rows = chrom_vcfs.loc[overlaps]
        touched_vcf_indices.update(overlapping_rows.index)

        files = overlapping_rows["filename"].tolist()
        starts = overlapping_rows["starting_position"].tolist()
        stops = overlapping_rows["ending_position"].tolist()

        results.append(
            {
                "gene_id": gene.gene_id,
                "gene_name": gene.gene_name,
                "chromosome": gene.chromosome,
                "start": gene.start,
                "stop": gene.stop,
                "vcf_count": len(files),
                # Parallel comma-separated lists: the i-th file has the i-th start/stop.
                "overlapping_vcfs": ",".join(files),
                "vcf_start": ",".join(map(str, starts)),
                "vcf_stop": ",".join(map(str, stops)),
            }
        )

final_df = pd.DataFrame(results, columns=result_columns)
print(f"Calculation complete. Mapped {len(final_df)} genes.")

Calculating overlaps (including block coordinates)...
Calculation complete. Mapped 63000 genes.


In [34]:
# --- 1. GIPR CHECK ---
# Spot-check a single known gene to confirm the mapping looks plausible.
print("--- Check: GIPR ---")
gipr_check = final_df[final_df["gene_name"] == "GIPR"]

if gipr_check.empty:
    print("WARNING: GIPR not found in the overlap results.")
else:
    row = gipr_check.iloc[0]
    gipr_report = [
        f"Found GIPR! It overlaps {row['vcf_count']} VCF block(s).",
        f"Gene Location: {row['chromosome']}:{row['start']}-{row['stop']}",
        f"VCF Files:     {row['overlapping_vcfs']}",
        f"VCF Starts:    {row['vcf_start']}",
        f"VCF Stops:     {row['vcf_stop']}",
    ]
    for report_line in gipr_report:
        print(report_line)

# --- 2. GENES WITH NO VCF ---
# final_df holds one row per mapped gene, so the difference is the unmapped count.
print("\n--- Check: Genes without VCFs ---")
total_genes, mapped_genes = len(gene_df), len(final_df)
orphaned_genes = total_genes - mapped_genes
print(f"Total Genes: {total_genes}")
print(f"Genes with >0 VCFs: {mapped_genes}")
print(f"Genes with NO VCFs: {orphaned_genes} ({orphaned_genes/total_genes:.2%})")

# --- 3. VCFS WITH NO GENE ---
# touched_vcf_indices collects the row labels of every block hit by some gene.
print("\n--- Check: VCF Blocks without Genes ---")
total_vcfs, touched_vcfs = len(vcf_df), len(touched_vcf_indices)
empty_vcfs = total_vcfs - touched_vcfs
print(f"Total VCF Blocks: {total_vcfs}")
print(f"VCFs overlapping at least 1 gene: {touched_vcfs}")
print(
    f"VCFs covering NO genes (Intergenic): {empty_vcfs} ({empty_vcfs/total_vcfs:.2%})"
)

--- Check: GIPR ---
Found GIPR! It overlaps 2 VCF block(s).
Gene Location: chr19:45668221-45683722
VCF Files:     ukb24310_c19_b2283_v1.vcf.gz,ukb24310_c19_b2284_v1.vcf.gz
VCF Starts:    45658480,45678477
VCF Stops:     45678480,45698477

--- Check: Genes without VCFs ---
Total Genes: 63086
Genes with >0 VCFs: 63000
Genes with NO VCFs: 86 (0.14%)

--- Check: VCF Blocks without Genes ---
Total VCF Blocks: 146430
VCFs overlapping at least 1 gene: 107647
VCFs covering NO genes (Intergenic): 38783 (26.49%)


In [35]:
# Show the gene-to-block mapping table as this cell's output.
final_df

Unnamed: 0,gene_id,gene_name,chromosome,start,stop,vcf_count,overlapping_vcfs,vcf_start,vcf_stop
0,ENSG00000290825.1,DDX11L2,chr1,11869,14409,1,ukb24310_c1_b0_v1.vcf.gz,10061,30061
1,ENSG00000223972.6,DDX11L1,chr1,12010,13670,1,ukb24310_c1_b0_v1.vcf.gz,10061,30061
2,ENSG00000227232.6,WASH7P,chr1,14696,24886,2,"ukb24310_c1_b0_v1.vcf.gz,ukb24310_c1_b1_v1.vcf.gz",1006120019,3006140019
3,ENSG00000278267.1,MIR6859-1,chr1,17369,17436,1,ukb24310_c1_b0_v1.vcf.gz,10061,30061
4,ENSG00000243485.5,MIR1302-2HG,chr1,29554,31109,2,"ukb24310_c1_b0_v1.vcf.gz,ukb24310_c1_b1_v1.vcf.gz",1006120019,3006140019
...,...,...,...,...,...,...,...,...,...
62995,ENSG00000224240.1,CYCSP49,chrY,26549425,26549743,1,ukb24310_cY_b1327_v1.vcf.gz,26540004,26560004
62996,ENSG00000227629.1,SLC25A15P1,chrY,26586642,26591601,1,ukb24310_cY_b1329_v1.vcf.gz,26580007,26600007
62997,ENSG00000237917.1,PARP4P1,chrY,26594851,26634652,3,"ukb24310_cY_b1329_v1.vcf.gz,ukb24310_cY_b1330_...",265800072660000226620009,266000072662000226640009
62998,ENSG00000231514.1,CCNQP2,chrY,26626520,26627159,1,ukb24310_cY_b1331_v1.vcf.gz,26620009,26640009


In [None]:
# Write the mapping to OUTPUT_FILE as tab-separated text, omitting the DataFrame index.
final_df.to_csv(OUTPUT_FILE, sep="\t", index=False)
print(f"Saved extended results to {OUTPUT_FILE}")

Saved extended results to /Users/skn506/Documents/gogoGPCR2/data/blocks/gene_vcf_overlaps.tsv
