In [23]:
import os
import csv
import vcfpy
from dotenv import load_dotenv


In [24]:
load_dotenv()

VCF_DATA_PATH = os.getenv("VCF_DATA")
VCF_INDEX_PATH = os.getenv("VCF_DATA_INDEX")

print(VCF_DATA_PATH)
print(VCF_INDEX_PATH)


C:\Users\Noodl\Projects\Big_O\Hackathon\Rift2k26\Data\ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz
C:\Users\Noodl\Projects\Big_O\Hackathon\Rift2k26\Data\ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz.tbi


In [5]:
vcf_reader = vcfpy.Reader.from_path(
    VCF_DATA_PATH,
    tabix_path=VCF_INDEX_PATH
)


In [6]:
for record in vcf_reader:
    print(record.CHROM, record.POS, record.ID)
    break


22 16050075 []


In [7]:
clinical_db = {}

CLINICAL_FILE = r"../Data/clinicalVariants/clinicalVariants.tsv"

with open(CLINICAL_FILE, newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter="\t")

    print("Columns:", reader.fieldnames)

    for row in reader:
        variant = row["variant"]

        # keep only rsIDs
        if not variant.startswith("rs"):
            continue

        clinical_db.setdefault(variant, []).append({
            "gene": row["gene"],
            "drug": row["chemicals"],
            "phenotype": row["phenotypes"],
            "evidence": row["level of evidence"],
            "type": row["type"]
        })

print("Loaded:", len(clinical_db), "clinical variants")


Columns: ['variant', 'gene', 'type', 'level of evidence', 'chemicals', 'phenotypes']
Loaded: 2844 clinical variants


In [7]:
GENES = {
    "CYP2D6": ("22", 42125962, 42131236),
}


In [8]:
hits = []

chrom, start, end = GENES["CYP2D6"]

for record in vcf_reader.fetch(chrom, start, end):

    if not record.ID:
        continue

    rsid = record.ID[0] if isinstance(record.ID, list) else record.ID

    if rsid in clinical_db:
        for ann in clinical_db[rsid]:
            hit = {
                "rsid": rsid,
                "position": record.POS,
                "gene": ann["gene"],
                "drug": ann["drug"],
                "evidence": ann["evidence"]
            }

            hits.append(hit)
            print("MATCH:", hit)


In [9]:
print("Total clinical hits:", len(hits))


Total clinical hits: 0


In [10]:
for record in vcf_reader.fetch("22", 42125962, 42131236):
    print("POS:", record.POS)
    print("ID:", record.ID)
    break


POS: 42126007
ID: []


In [16]:
import requests

url = "https://rest.ensembl.org/variation/human/rs17376848"

r = requests.get(
    url,
    headers={
        "Content-Type": "application/json",
        "User-Agent": "test"
    },
    timeout=30
)

print(r.status_code)
print(r.text[:500])


200
{"synonyms":["RCV001787906","RCV000244711","VCV000100088","RCV001787905","RCV000270743","RCV000086470","NM_000110.4:c.1896T>C","NM_000110.3:c.1896T>C","rs58485702","rs52815410","rs117467766","PA166153874"],"mappings":[{"end":97450068,"allele_string":"A/G","ancestral_allele":"A","location":"1:97450068-97450068","assembly_name":"GRCh38","coord_system":"chromosome","start":97450068,"seq_region_name":"1","strand":1}],"evidence":["Frequency","1000Genomes","Cited","ESP","Phenotype_or_Disease","ExAC","


In [11]:
import requests
import time

clinical_pos_db = {}

for rsid in list(clinical_db.keys())[:1]:

    print("Querying:", rsid)

    url = f"https://rest.ensembl.org/variation/human/{rsid}"

    try:
        r = requests.get(
            url,
            headers={"Content-Type": "application/json"},
            timeout=10   # ← VERY IMPORTANT
        )

        print("Status:", r.status_code)

    except requests.exceptions.Timeout:
        print("Timeout for", rsid)
        continue

    if not r.ok:
        print("Failed:", rsid)
        continue

    data = r.json()

    if "mappings" not in data:
        print("No mapping")
        continue

    mapping = data["mappings"][0]

    chrom = mapping["seq_region_name"]
    pos = mapping["start"]

    clinical_pos_db[(chrom, pos)] = clinical_db[rsid]

    time.sleep(0.2)

print("Mapped:", len(clinical_pos_db))


Querying: rs17376848
Timeout for rs17376848
Mapped: 0


In [12]:
# show first 20 rsIDs inside CYP2D6 region
count = 0

for record in vcf_reader.fetch("22", 42125962, 42131236):
    if record.ID:
        print(record.ID)
        count += 1
    if count == 20:
        break


In [17]:
import requests

def get_gene_coordinates(gene):
    url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene}"

    headers = {
        "Accept": "application/json",
        "User-Agent": "Mozilla/5.0"
    }

    r = requests.get(url, headers=headers, timeout=10)

    r.raise_for_status()   # show real error if any

    data = r.json()

    return {
        "gene": gene,
        "chrom": str(data["seq_region_name"]),
        "start": data["start"],
        "end": data["end"]
    }

print(get_gene_coordinates("CYP2D6"))


{'gene': 'CYP2D6', 'chrom': '22', 'start': 42125962, 'end': 42131236}


In [18]:
PGX_GENES = [
    "CYP2D6",
    "CYP2C19",
    "CYP2C9",
    "SLCO1B1",
    "TPMT",
    "DPYD"
]

gene_regions = {}

for gene in PGX_GENES:
    info = get_gene_coordinates(gene)
    gene_regions[gene] = info
    print(info)


{'gene': 'CYP2D6', 'chrom': '22', 'start': 42125962, 'end': 42131236}
{'gene': 'CYP2C19', 'chrom': '10', 'start': 94762662, 'end': 94856282}
{'gene': 'CYP2C9', 'chrom': '10', 'start': 94938588, 'end': 94990148}
{'gene': 'SLCO1B1', 'chrom': '12', 'start': 21131120, 'end': 21241074}
{'gene': 'TPMT', 'chrom': '6', 'start': 18128311, 'end': 18155348}
{'gene': 'DPYD', 'chrom': '1', 'start': 97077743, 'end': 97995000}


In [19]:
region = gene_regions["CYP2D6"]

for record in vcf_reader.fetch(
        region["chrom"],
        region["start"],
        region["end"]):

    print(
        record.POS,
        record.REF,
        [a.value for a in record.ALT]
    )


42126007 A ['G']
42126013 G ['T']
42126043 A ['G']
42126046 A ['G']
42126135 C ['T']
42126182 A ['T']
42126193 T ['A']
42126230 G ['A']
42126260 T ['G']
42126279 C ['T']
42126301 C ['G']
42126357 A ['G']
42126366 GTTTA ['G']
42126409 T ['C']
42126420 A ['G']
42126430 G ['A']
42126431 T ['C']
42126462 T ['C']
42126470 G ['C']
42126520 G ['C']
42126652 A ['G']
42126683 C ['T']
42126691 T ['C']
42126705 A ['C']
42126708 C ['T']
42126716 T ['TG']
42126786 G ['C']
42126819 C ['G']
42126821 G ['T']
42126827 C ['T']
42126839 G ['A']
42126854 A ['T']
42126857 C ['T']
42126881 C ['T']
42126980 A ['G']
42127027 T ['G']
42127031 G ['C']
42127092 C ['T']
42127094 G ['A']
42127114 A ['G']
42127119 G ['A']
42127139 C ['T']
42127211 C ['T']
42127215 G ['A']
42127246 A ['G']
42127347 C ['T']
42127372 A ['G']
42127375 C ['T']
42127377 G ['T']
42127401 G ['T']
42127402 G ['C']
42127444 T ['C']
42127481 C ['T']
42127525 A ['AT']
42127540 T ['C']
42127549 C ['T']
42127568 G ['A']
42127614 G ['A']
42127620

In [22]:
import requests
import time

clinical_pos_db = {}

headers = {
    "Accept": "application/json",
    "User-Agent": "PGxHackathon/1.0"
}

for rsid, annotations in list(clinical_db.items())[:100]:  # small batch first

    url = f"https://rest.ensembl.org/variation/human/{rsid}"

    try:
        r = requests.get(url, headers=headers, timeout=20)
        if not r.ok:
            continue

        data = r.json()

        if "mappings" not in data:
            continue

        for m in data["mappings"]:
            key = (m["seq_region_name"], m["start"])
            clinical_pos_db.setdefault(key, []).extend(annotations)

    except Exception:
        continue

    time.sleep(0.15)

print("Mapped variants:", len(clinical_pos_db))


Mapped variants: 100


In [24]:
hits = []

for record in vcf_reader.fetch("22", 42125962, 42131236):

    key = (record.CHROM, record.POS)

    if key in clinical_pos_db:
        print("CLINICAL MATCH:", key, clinical_pos_db[key])
        hits.append((key, clinical_pos_db[key]))


In [25]:
# show chromosomes of mapped clinical variants
chroms = set()

for chrom, pos in clinical_pos_db.keys():
    chroms.add(chrom)

print(chroms)


{'MT', '16', '7', '1', '19', '10', '12', '13', '4'}


In [1]:
clinical_pos_db = {}

for rsid, annotations in clinical_db.items():

    url = f"https://rest.ensembl.org/variation/human/{rsid}"

    try:
        r = requests.get(url, headers=headers, timeout=20)
        if not r.ok:
            continue

        data = r.json()

        for m in data.get("mappings", []):

            # ⭐ KEEP ONLY chr22 (CYP2D6 chromosome)
            if m["seq_region_name"] != "22":
                continue

            key = (m["seq_region_name"], m["start"])
            clinical_pos_db.setdefault(key, []).extend(annotations)

    except:
        continue

    time.sleep(0.1)


NameError: name 'clinical_db' is not defined

In [9]:
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm   # progress bar

ENSEMBL_HEADERS = {
    "Accept": "application/json",
    "User-Agent": "PGx-Scanner"
}

MAX_WORKERS = 16   # use your CPU threads


In [11]:
def map_rsid(rsid, annotations):
    url = f"https://rest.ensembl.org/variation/human/{rsid}"

    try:
        r = requests.get(url, headers=ENSEMBL_HEADERS, timeout=15)

        if not r.ok:
            return None

        data = r.json()

        results = []

        for m in data.get("mappings", []):
            if m["seq_region_name"] != "22":   # CYP2D6 chromosome
                continue

            key = (m["seq_region_name"], m["start"])
            results.append((key, annotations))

        return results

    except Exception as e:
        print(f"FAILED {rsid}: {e}")
        return None


In [12]:
clinical_pos_db = {}

items = list(clinical_db.items())

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:

    futures = {
        executor.submit(map_rsid, rsid, ann): rsid
        for rsid, ann in items
    }

    for future in tqdm(as_completed(futures), total=len(futures)):
        result = future.result()

        if not result:
            continue

        for key, annotations in result:
            clinical_pos_db.setdefault(key, []).extend(annotations)


  2%|▏         | 55/2844 [00:15<24:58,  1.86it/s] 

FAILED rs2297595: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


  3%|▎         | 75/2844 [00:20<08:04,  5.71it/s]

FAILED rs2231142: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 13%|█▎        | 375/2844 [01:13<17:25,  2.36it/s]

FAILED rs5030655: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 14%|█▍        | 407/2844 [01:24<09:44,  4.17it/s]

FAILED rs17880887: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)
FAILED rs1799735: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 16%|█▌        | 456/2844 [01:35<05:54,  6.74it/s]

FAILED rs6265: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 17%|█▋        | 492/2844 [01:43<03:54, 10.02it/s]

FAILED rs4630: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 24%|██▍       | 683/2844 [02:16<06:06,  5.90it/s]

FAILED rs17626122: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 27%|██▋       | 774/2844 [02:32<04:18,  8.02it/s]

FAILED rs7867504: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 28%|██▊       | 797/2844 [02:36<08:06,  4.21it/s]

FAILED rs1135989: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 31%|███       | 868/2844 [02:50<03:18,  9.96it/s]

FAILED rs3788189: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 33%|███▎      | 925/2844 [03:01<04:02,  7.93it/s]

FAILED rs13207351: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 44%|████▍     | 1255/2844 [03:56<04:35,  5.77it/s]

FAILED rs6545803: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 50%|████▉     | 1416/2844 [04:30<06:08,  3.87it/s]

FAILED rs80223967: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 61%|██████    | 1734/2844 [05:40<02:54,  6.38it/s]

FAILED rs12459996: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 61%|██████▏   | 1744/2844 [05:42<02:41,  6.82it/s]

FAILED rs75041078: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 64%|██████▍   | 1819/2844 [05:59<04:21,  3.92it/s]

FAILED rs2645400: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Max retries exceeded with url: /variation/human/rs2645400 (Caused by SSLError(SSLZeroReturnError(6, 'TLS/SSL connection has been closed (EOF) (_ssl.c:1081)')))


 74%|███████▍  | 2105/2844 [06:59<02:14,  5.49it/s]

FAILED rs1799930: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))


 78%|███████▊  | 2228/2844 [07:32<03:08,  3.27it/s]

FAILED rs80338792: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 80%|███████▉  | 2265/2844 [07:47<08:36,  1.12it/s]

FAILED rs2886059: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 80%|███████▉  | 2268/2844 [07:48<06:19,  1.52it/s]

FAILED rs76394784: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 80%|███████▉  | 2269/2844 [07:49<05:07,  1.87it/s]

FAILED rs5030849: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 80%|███████▉  | 2270/2844 [07:49<04:30,  2.12it/s]

FAILED rs62516101: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 80%|████████  | 2281/2844 [07:54<04:15,  2.20it/s]

FAILED rs62435418: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 80%|████████  | 2283/2844 [07:55<03:15,  2.87it/s]

FAILED rs9384825: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 81%|████████  | 2295/2844 [08:03<06:55,  1.32it/s]

FAILED rs12521868: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Max retries exceeded with url: /variation/human/rs12521868 (Caused by ConnectTimeoutError(<HTTPSConnection(host='rest.ensembl.org', port=443) at 0x2639908ac10>, 'Connection to rest.ensembl.org timed out. (connect timeout=15)'))


 81%|████████  | 2300/2844 [08:07<05:24,  1.68it/s]

FAILED rs28404156: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Max retries exceeded with url: /variation/human/rs28404156 (Caused by SSLError(SSLZeroReturnError(6, 'TLS/SSL connection has been closed (EOF) (_ssl.c:1081)')))


 81%|████████  | 2301/2844 [08:08<06:52,  1.31it/s]

FAILED rs114077267: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 82%|████████▏ | 2322/2844 [08:19<04:12,  2.06it/s]

FAILED rs2392165: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 83%|████████▎ | 2351/2844 [08:28<02:18,  3.55it/s]

FAILED rs10885: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 83%|████████▎ | 2353/2844 [08:28<02:04,  3.93it/s]

FAILED rs3130907: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))FAILED rs11229: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))



 84%|████████▍ | 2388/2844 [08:40<01:44,  4.38it/s]

FAILED rs797397: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
FAILED rs577001: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 84%|████████▍ | 2399/2844 [08:42<01:21,  5.43it/s]

FAILED rs3802279: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 89%|████████▊ | 2520/2844 [09:10<01:28,  3.65it/s]

FAILED rs1323040: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 94%|█████████▎| 2665/2844 [09:54<01:31,  1.95it/s]

FAILED rs74569896: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 94%|█████████▎| 2666/2844 [09:55<01:48,  1.64it/s]

FAILED rs17064642: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Max retries exceeded with url: /variation/human/rs17064642 (Caused by ConnectTimeoutError(<HTTPSConnection(host='rest.ensembl.org', port=443) at 0x26398ffb890>, 'Connection to rest.ensembl.org timed out. (connect timeout=15)'))


 94%|█████████▍| 2680/2844 [10:00<00:59,  2.78it/s]

FAILED rs917881: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 96%|█████████▌| 2733/2844 [10:16<00:53,  2.07it/s]

FAILED rs140039091: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 96%|█████████▌| 2736/2844 [10:18<00:45,  2.38it/s]

FAILED rs558354142: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 97%|█████████▋| 2751/2844 [10:27<00:30,  3.02it/s]

FAILED rs75961395: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 97%|█████████▋| 2754/2844 [10:28<00:21,  4.17it/s]

FAILED rs75549581: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 97%|█████████▋| 2755/2844 [10:29<00:43,  2.03it/s]

FAILED rs12615320: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 97%|█████████▋| 2762/2844 [10:32<00:32,  2.53it/s]

FAILED rs9516519: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 97%|█████████▋| 2766/2844 [10:35<00:54,  1.43it/s]

FAILED rs2229774: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 98%|█████████▊| 2778/2844 [10:40<00:28,  2.32it/s]

FAILED rs3765467: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


 98%|█████████▊| 2784/2844 [10:42<00:26,  2.23it/s]

FAILED rs3808607: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 98%|█████████▊| 2787/2844 [10:43<00:17,  3.21it/s]

FAILED rs2285676: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 98%|█████████▊| 2797/2844 [10:47<00:16,  2.79it/s]

FAILED rs1042858: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 98%|█████████▊| 2798/2844 [10:48<00:16,  2.72it/s]

FAILED rs2241883: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 98%|█████████▊| 2799/2844 [10:48<00:19,  2.34it/s]

FAILED rs833069: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 98%|█████████▊| 2801/2844 [10:49<00:16,  2.61it/s]

FAILED rs11226: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 99%|█████████▉| 2824/2844 [10:55<00:04,  4.20it/s]

FAILED rs729147: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


 99%|█████████▉| 2825/2844 [10:56<00:06,  3.03it/s]

FAILED rs1149222: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


100%|█████████▉| 2830/2844 [10:58<00:06,  2.28it/s]

FAILED rs2235047: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


100%|█████████▉| 2841/2844 [11:06<00:03,  1.09s/it]

FAILED rs1901633: HTTPSConnectionPool(host='rest.ensembl.org', port=443): Read timed out. (read timeout=15)


100%|██████████| 2844/2844 [11:09<00:00,  4.25it/s]


In [13]:
print("Mapped positions:", len(clinical_pos_db))


Mapped positions: 60


In [15]:
hits = []

for record in vcf_reader.fetch("22", 42125962, 42131236):

    key = (record.CHROM, record.POS)

    if key in clinical_pos_db:

        for ann in clinical_pos_db[key]:
            hit = {
                "pos": record.POS,
                "gene": ann["gene"],
                "drug": ann["drug"],
                "phenotype": ann["phenotype"],
                "evidence": ann["evidence"],
                "ref": record.REF,
                "alt": [a.value for a in record.ALT]
            }


            hits.append(hit)
            print("CLINICAL HIT:", hit)


CLINICAL HIT: {'pos': 42127614, 'gene': 'CYP2D6', 'drug': 'bufuralol,dextromethorphan', 'phenotype': '', 'evidence': '3', 'ref': 'G', 'alt': ['A']}


In [16]:
from collections import defaultdict

gene_report = defaultdict(list)

for h in hits:
    gene_report[h["gene"]].append(h)

for gene, variants in gene_report.items():
    print(f"\n=== {gene} ===")
    print("Variants detected:", len(variants))

    drugs = set()
    for v in variants:
        drugs.update(v["drug"].split(","))

    print("Associated drugs:", ", ".join(drugs))



=== CYP2D6 ===
Variants detected: 1
Associated drugs: dextromethorphan, bufuralol


In [17]:
def analyze_drug(drug_name):
    for h in hits:
        if drug_name.lower() in h["drug"].lower():
            print("Relevant variant found:", h)


In [18]:
analyze_drug("dextromethorphan")


Relevant variant found: {'pos': 42127614, 'gene': 'CYP2D6', 'drug': 'bufuralol,dextromethorphan', 'phenotype': '', 'evidence': '3', 'ref': 'G', 'alt': ['A']}


In [19]:
def generate_drug_report(drug_name, hits):

    print("\n==============================")
    print(f"PHARMACOGENOMIC REPORT")
    print("==============================\n")

    print(f"Drug analyzed: {drug_name}\n")

    relevant = [
        h for h in hits
        if drug_name.lower() in h["drug"].lower()
    ]

    if not relevant:
        print("No pharmacogenomic variants detected.")
        return

    for h in relevant:
        print(f"Gene: {h['gene']}")
        print(f"Variant: chr22:{h['pos']} {h['ref']}→{','.join(h['alt'])}")
        print(f"Evidence level: {h['evidence']}")

        # interpretation logic
        if h["evidence"] == "1A":
            impact = "Strong clinical guideline available."
        elif h["evidence"] in ["1B", "2"]:
            impact = "Moderate clinical evidence."
        else:
            impact = "Research association only."

        print(f"Interpretation: {impact}")
        print("Possible effect: altered drug metabolism.\n")


In [20]:
generate_drug_report("dextromethorphan", hits)



PHARMACOGENOMIC REPORT

Drug analyzed: dextromethorphan

Gene: CYP2D6
Variant: chr22:42127614 G→A
Evidence level: 3
Interpretation: Research association only.
Possible effect: altered drug metabolism.



In [21]:
drug_gene_map = {}

for rsid, anns in clinical_db.items():
    for a in anns:
        for drug in a["drug"].split(","):
            drug_gene_map.setdefault(drug.lower(), set()).add(a["gene"])


In [22]:
print(drug_gene_map["dextromethorphan"])


{'CYP2D6'}


In [14]:
VCF_DATA_PATH

'C:\\Users\\Noodl\\Projects\\Big_O\\Hackathon\\Rift2k26\\Data\\ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz'

In [15]:
VCF_DATA_INDEX_PATH

NameError: name 'VCF_DATA_INDEX_PATH' is not defined

In [None]:
with open(VCF_DATA_PATH, "rb") as f:
    print(f.read(4))


b'\x1f\x8b\x08\x04'


In [None]:
reader = vcfpy.Reader.from_path(
    VCF_DATA_PATH,
    tabix_path=VCF_DATA_INDEX_PATH
)


In [9]:
for record in reader.fetch("22", 42125962, 42131236):
    print(record.POS, record.ID)
    break


42126007 []


In [10]:
for record in reader.fetch("22", 42125962, 42131236):
    print("POS:", record.POS)
    print("REF:", record.REF)
    print("ALT:", [a.value for a in record.ALT])
    print("INFO:", record.INFO)
    print("-----")
    break


POS: 42126007
REF: A
ALT: ['G']
INFO: {'AC': [4], 'AF': [0.000798722], 'AN': 5008, 'NS': 2504, 'DP': 22118, 'EAS_AF': [0.0], 'AMR_AF': [0.0014], 'AFR_AF': [0.0], 'EUR_AF': [0.003], 'SAS_AF': [0.0], 'AA': 'A|||', 'VT': ['SNP']}
-----


In [6]:
import os
from dotenv import load_dotenv

load_dotenv()

VCF_DATA = os.getenv("VCF_DATA")
VCF_DATA_INDEX = os.getenv("VCF_DATA_INDEX")
DATA_PATH = os.getenv("DATA_PATH")

print(VCF_DATA)


C:\Users\Noodl\Projects\Big_O\Hackathon\Rift2k26\Data\ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz


In [5]:
from pathlib import Path

print("VCF exists:", Path(VCF_DATA).exists())
print("Index exists:", Path(VCF_DATA_INDEX).exists())


VCF exists: True
Index exists: True


In [10]:
import csv
from pathlib import Path

CLINICAL_FILE = Path(DATA_PATH) / "clinicalVariants" / "clinicalVariants.tsv"

clinical_db = {}

with open(CLINICAL_FILE, encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter="\t")

    for row in reader:
        rsid = row["variant"]

        if not rsid:
            continue

        clinical_db.setdefault(rsid, []).append({
            "gene": row["gene"],
            "drug": row["chemicals"],        # <-- drug column
            "phenotype": row["phenotypes"], # <-- phenotype column
            "evidence": row["level of evidence"],
            "type": row["type"]
        })

print("Loaded variants:", len(clinical_db))


Loaded variants: 3161


In [11]:
for k, v in list(clinical_db.items())[:3]:
    print(k, v)


CYP2C9*1, CYP2C9*3, CYP2C9*13 [{'gene': 'CYP2C9', 'drug': 'lornoxicam', 'phenotype': '', 'evidence': '1A', 'type': 'Metabolism/PK'}, {'gene': 'CYP2C9', 'drug': 'zafirlukast', 'phenotype': '', 'evidence': '3', 'type': 'Metabolism/PK'}]
rs17376848 [{'gene': 'DPYD', 'drug': 'capecitabine', 'phenotype': 'Neoplasms', 'evidence': '1A', 'type': 'Toxicity'}, {'gene': 'DPYD', 'drug': 'fluorouracil', 'phenotype': 'Neoplasms', 'evidence': '1A', 'type': 'Toxicity'}]
rs2297595 [{'gene': 'DPYD', 'drug': 'capecitabine', 'phenotype': 'Neoplasms', 'evidence': '1A', 'type': 'Toxicity'}, {'gene': 'DPYD', 'drug': 'fluorouracil', 'phenotype': 'Neoplasms', 'evidence': '1A', 'type': 'Toxicity'}]


In [20]:
import csv

clinical_db = {}
CLINICAL_FILE = "C:\\Users\\Noodl\\Projects\\Big_O\\Hackathon\\Rift2k26\\Data\\clinicalVariants\\clinicalVariants.tsv"

with open(CLINICAL_FILE, newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)

    for row in reader:
        variant = row["variant"]

        # keep only rsIDs
        if not variant.startswith("rs"):
            continue

        clinical_db.setdefault(variant, []).append({
            "gene": row["gene"],
            "drug": row["chemicals"],
            "phenotype": row["phenotypes"],
            "evidence": row["level of evidence"],
            "type": row["type"]
        })

print("Loaded:", len(clinical_db))


KeyError: 'variant'

In [22]:
with open(CLINICAL_FILE, encoding="utf-8") as f:
    reader = csv.DictReader(f)
    print(reader.fieldnames)


['variant\tgene\ttype\tlevel of evidence\tchemicals\tphenotypes']


In [23]:
import csv

clinical_db = {}

with open(CLINICAL_FILE, newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter="\t")

    print("Columns:", reader.fieldnames)  # sanity check

    for row in reader:
        variant = row["variant"]

        if not variant.startswith("rs"):
            continue

        clinical_db.setdefault(variant, []).append({
            "gene": row["gene"],
            "drug": row["chemicals"],
            "phenotype": row["phenotypes"],
            "evidence": row["level of evidence"],
            "type": row["type"]
        })

print("Loaded clinical variants:", len(clinical_db))


Columns: ['variant', 'gene', 'type', 'level of evidence', 'chemicals', 'phenotypes']
Loaded clinical variants: 2844


In [24]:
hits = []

for record in reader:
    if not record.ID:
        continue

    rsid = record.ID[0] if isinstance(record.ID, list) else record.ID

    if rsid in clinical_db:
        for annotation in clinical_db[rsid]:
            hits.append({
                "rsid": rsid,
                "gene": annotation["gene"],
                "drug": annotation["drug"],
                "phenotype": annotation["phenotype"],
                "evidence": annotation["evidence"],
                "position": record.POS
            })


ValueError: I/O operation on closed file.

In [25]:
vcf_reader = vcfpy.Reader.from_path(
    str(VCF_DATA),
    tabix_path=str(VCF_DATA_INDEX)
)


In [28]:
CYP2D6 = ("22", 42125962, 42131236)

hits = []

for record in vcf_reader.fetch(*CYP2D6):
    if not record.ID:
        continue

    rsid = record.ID[0] if isinstance(record.ID, list) else record.ID

    if rsid in clinical_db:
        print("MATCH:", rsid, record.POS)


# Begin


In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

VCF_DATA_PATH = os.getenv("VCF_DATA")
VCF_INDEX_PATH = os.getenv("VCF_DATA_INDEX")
DATA_PATH = os.getenv("DATA_PATH")

print(VCF_DATA_PATH)


C:\Users\Noodl\Projects\Big_O\Hackathon\Rift2k26\Data\ALL.chr22.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz


In [2]:
import vcfpy
import csv
import requests
from collections import defaultdict


In [3]:
vcf_reader = vcfpy.Reader.from_path(
    VCF_DATA_PATH,
    tabix_path=VCF_INDEX_PATH
)


In [5]:
clinical_db = {}

with open(f"{DATA_PATH}/clinicalVariants/clinicalVariants.tsv",
          newline="",
          encoding="utf-8") as f:

    reader = csv.DictReader(f, delimiter="\t")

    for row in reader:
        variant = row["variant"]

        if not variant.startswith("rs"):
            continue

        clinical_db.setdefault(variant, []).append({
            "gene": row["gene"],
            "drug": row["chemicals"],
            "phenotype": row["phenotypes"],
            "evidence": row["level of evidence"]
        })

print("Loaded:", len(clinical_db))


Loaded: 2844


In [6]:
drug_gene_map = defaultdict(set)

for anns in clinical_db.values():
    for a in anns:
        for drug in a["drug"].split(","):
            drug_gene_map[drug.lower()].add(a["gene"])


In [7]:
def get_gene_coordinates(gene):

    url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene}"

    r = requests.get(
        url,
        headers={"Accept":"application/json"},
        timeout=10
    )

    data = r.json()

    return (
        str(data["seq_region_name"]),
        data["start"],
        data["end"]
    )


In [10]:
TARGET_GENES = {
    "CYP2D6",
    "CYP2C19",
    "CYP2C9",
    "SLCO1B1",
    "TPMT",
    "DPYD"
}

filtered_rsids = []

for rsid, anns in clinical_db.items():
    if any(a["gene"] in TARGET_GENES for a in anns):
        filtered_rsids.append(rsid)

print(len(filtered_rsids))


123


In [11]:
session = requests.Session()

headers = {
    "Content-Type": "application/json",
    "Accept": "application/json",
    "User-Agent": "RIFT2K26-PGX/1.0"
}


In [12]:
clinical_pos_db = {}

rsids = list(clinical_db.keys())

BATCH_SIZE = 25   # safe size

for i in range(0, len(rsids), BATCH_SIZE):

    batch = rsids[i:i+BATCH_SIZE]

    try:
        r = session.post(
            "https://rest.ensembl.org/variation/human",
            json={"ids": batch},
            headers=headers,
            timeout=120
        )

        if not r.ok:
            print("Batch failed:", i)
            continue

        data = r.json()

        for rsid, info in data.items():

            for m in info.get("mappings", []):

                if m["seq_region_name"] != "22":
                    continue

                key = (m["seq_region_name"], m["start"])
                clinical_pos_db.setdefault(key, []).extend(
                    clinical_db[rsid]
                )

    except Exception as e:
        print("Batch error:", e)

    print(f"Processed {i+len(batch)} / {len(rsids)}")


Processed 25 / 2844
Processed 50 / 2844
Processed 75 / 2844
Processed 100 / 2844
Processed 125 / 2844
Processed 150 / 2844
Processed 175 / 2844
Processed 200 / 2844
Processed 225 / 2844
Processed 250 / 2844
Processed 275 / 2844
Processed 300 / 2844
Processed 325 / 2844
Processed 350 / 2844
Processed 375 / 2844
Processed 400 / 2844
Processed 425 / 2844
Processed 450 / 2844
Processed 475 / 2844
Processed 500 / 2844
Processed 525 / 2844
Processed 550 / 2844
Processed 575 / 2844
Processed 600 / 2844
Processed 625 / 2844
Processed 650 / 2844
Processed 675 / 2844
Processed 700 / 2844
Processed 725 / 2844
Processed 750 / 2844
Processed 775 / 2844
Processed 800 / 2844
Processed 825 / 2844
Processed 850 / 2844
Processed 875 / 2844
Processed 900 / 2844
Processed 925 / 2844
Processed 950 / 2844
Processed 975 / 2844
Processed 1000 / 2844
Processed 1025 / 2844
Processed 1050 / 2844
Processed 1075 / 2844
Processed 1100 / 2844
Processed 1125 / 2844
Processed 1150 / 2844
Processed 1175 / 2844
Processe

In [14]:
drug = "dextromethorphan"
genes = drug_gene_map[drug]

In [16]:
gene_regions = {
    "CYP2D6": ("22", 42125962, 42131236),
    "CYP2C19": ("10", 96522437, 96612962),
    "CYP2C9": ("10", 96698402, 96761229),
    "SLCO1B1": ("12", 21176840, 21256333),
    "TPMT": ("6", 18128306, 18143127),
    "DPYD": ("1", 97450000, 97590000)
}

print(gene_regions)


{'CYP2D6': ('22', 42125962, 42131236), 'CYP2C19': ('10', 96522437, 96612962), 'CYP2C9': ('10', 96698402, 96761229), 'SLCO1B1': ('12', 21176840, 21256333), 'TPMT': ('6', 18128306, 18143127), 'DPYD': ('1', 97450000, 97590000)}


In [19]:
available_chroms = set()

for line in vcf_reader.header.lines:
    if line.key == "contig":
        available_chroms.add(line.mapping["ID"])

print("VCF chromosomes:", available_chroms)


VCF chromosomes: {'GL000206.1', '5', 'GL000207.1', 'GL000196.1', 'GL000199.1', '18', '4', 'GL000238.1', '19', '9', 'GL000205.1', 'GL000229.1', 'GL000226.1', 'GL000194.1', 'GL000247.1', 'GL000222.1', 'GL000239.1', '8', 'GL000218.1', '7', 'GL000219.1', 'GL000198.1', 'hs37d5', 'GL000242.1', 'GL000204.1', '1', 'GL000203.1', 'GL000223.1', 'GL000202.1', '15', 'GL000191.1', '22', '3', 'GL000225.1', 'GL000211.1', 'Y', 'NC_007605', '11', 'GL000245.1', 'GL000240.1', 'GL000200.1', 'GL000231.1', 'GL000237.1', 'GL000192.1', 'GL000235.1', 'GL000236.1', 'GL000220.1', '2', 'GL000215.1', 'MT', 'GL000209.1', 'GL000249.1', 'GL000195.1', 'GL000234.1', 'GL000232.1', '17', '20', 'GL000216.1', '14', 'GL000201.1', 'GL000214.1', '6', 'GL000248.1', 'GL000213.1', 'X', 'GL000243.1', 'GL000246.1', '21', 'GL000221.1', 'GL000212.1', '10', 'GL000210.1', '12', 'GL000193.1', 'GL000228.1', 'GL000230.1', 'GL000208.1', 'GL000217.1', 'GL000227.1', 'GL000224.1', '16', 'GL000197.1', 'GL000244.1', 'GL000233.1', '13', 'GL00024

In [20]:
filtered_gene_regions = {
    gene: region
    for gene, region in gene_regions.items()
    if region[0] in available_chroms
}

print(filtered_gene_regions)


{'CYP2D6': ('22', 42125962, 42131236), 'CYP2C19': ('10', 96522437, 96612962), 'CYP2C9': ('10', 96698402, 96761229), 'SLCO1B1': ('12', 21176840, 21256333), 'TPMT': ('6', 18128306, 18143127), 'DPYD': ('1', 97450000, 97590000)}


In [21]:
available_chroms = set()

for line in vcf_reader.header.lines:
    if line.key == "contig":
        available_chroms.add(line.mapping["ID"])

print(available_chroms)


{'GL000206.1', '5', 'GL000207.1', 'GL000196.1', 'GL000199.1', '18', '4', 'GL000238.1', '19', '9', 'GL000205.1', 'GL000229.1', 'GL000226.1', 'GL000194.1', 'GL000247.1', 'GL000222.1', 'GL000239.1', '8', 'GL000218.1', '7', 'GL000219.1', 'GL000198.1', 'hs37d5', 'GL000242.1', 'GL000204.1', '1', 'GL000203.1', 'GL000223.1', 'GL000202.1', '15', 'GL000191.1', '22', '3', 'GL000225.1', 'GL000211.1', 'Y', 'NC_007605', '11', 'GL000245.1', 'GL000240.1', 'GL000200.1', 'GL000231.1', 'GL000237.1', 'GL000192.1', 'GL000235.1', 'GL000236.1', 'GL000220.1', '2', 'GL000215.1', 'MT', 'GL000209.1', 'GL000249.1', 'GL000195.1', 'GL000234.1', 'GL000232.1', '17', '20', 'GL000216.1', '14', 'GL000201.1', 'GL000214.1', '6', 'GL000248.1', 'GL000213.1', 'X', 'GL000243.1', 'GL000246.1', '21', 'GL000221.1', 'GL000212.1', '10', 'GL000210.1', '12', 'GL000193.1', 'GL000228.1', 'GL000230.1', 'GL000208.1', 'GL000217.1', 'GL000227.1', 'GL000224.1', '16', 'GL000197.1', 'GL000244.1', 'GL000233.1', '13', 'GL000241.1'}


In [22]:
import vcfpy

vcf_reader = vcfpy.Reader.from_path(VCF_DATA_)


NameError: name 'VCF_DATA_' is not defined

In [17]:
hits = []

for gene, (chrom,start,end) in gene_regions.items():

    for record in vcf_reader.fetch(chrom, start, end):

        key = (record.CHROM, record.POS)

        if key in clinical_pos_db:

            for ann in clinical_pos_db[key]:
                hits.append({
                    "gene": gene,
                    "pos": record.POS,
                    "ref": record.REF,
                    "alt": [a.value for a in record.ALT],
                    **ann
                })


ValueError: Reference 10 not found in index

In [None]:
def generate_drug_report(drug, hits):

    print("\n=== PHARMACOGENOMIC REPORT ===\n")
    print("Drug:", drug)

    relevant = [
        h for h in hits
        if drug.lower() in h["drug"].lower()
    ]

    if not relevant:
        print("No relevant variants detected.")
        return

    for h in relevant:

        print("\nGene:", h["gene"])
        print(f"Variant: chr22:{h['pos']} {h['ref']}→{','.join(h['alt'])}")
        print("Evidence:", h["evidence"])

        if h["evidence"] == "1A":
            print("Clinical impact: Strong guideline exists.")
        elif h["evidence"] in ["1B","2"]:
            print("Clinical impact: Moderate evidence.")
        else:
            print("Clinical impact: Research association.")
