In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Project Overview

# Genomic Variant Interpretation Assistant (GVIA)

This project builds a Gen AI-powered assistant for interpreting genomic variants from a VCF file. It is designed to support rare disease diagnosis by helping clinicians understand the likely impact of a variant and associated phenotypes.

We use Gen AI in the following ways:
- Interpret the clinical significance of variants (structured output)
- Map patient phenotypes to likely genes (few-shot prompting)
- Summarize relevant literature for context (RAG-style grounding)


Capabilities Used

## Gen AI Capabilities Demonstrated

1. **Structured Output / JSON Mode** — for variant interpretation
2. **Few-Shot Prompting** — for phenotype-to-gene inference
3. **RAG-style Grounding** — summarizing literature with PMIDs

The model used is **OpenAI GPT-4**, accessed through the OpenAI API.


Capstone Project: Genomic Variant Interpretation Assistant (GVIA)

A Gen AI-powered assistant that helps clinicians and researchers interpret variants from whole genome/exome sequencing using structured summaries, phenotype mapping, and literature support.

In [20]:
!pip install -U google-generativeai --quiet



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.4/155.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h

In [21]:
import google.generativeai as genai

genai.configure(api_key="AIzaSyASQp95_puzrVst-ydgjGQQ57mWI2o7pZs")

In [23]:
!pip install openai vcfpy pandas --quiet
import vcfpy
import pandas as pd

reader = vcfpy.Reader.from_path("/kaggle/input/vcf-file/test.vcf")  # Upload 'test.vcf' in the sidebar

variants = []
for i, record in enumerate(reader):
    if i >= 5:  # For demo purposes
        break
    alt = str(record.ALT[0].value) if record.ALT else "NA"
    variants.append({
        'CHROM': record.CHROM,
        'POS': record.POS,
        'ID': record.ID,
        'REF': record.REF,
        'ALT': alt,
        'QUAL': record.QUAL,
        'FILTER': record.FILTER,
        'INFO': record.INFO
    })

df = pd.DataFrame(variants)
df.head()


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
0,chrM,1,[],G,,,[PASS],"{'UCSC.conservation': [272, 713], 'UCSC.tss': ..."
1,chrM,2,[],A,,,[PASS],"{'END': 72, 'BLOCKAVG_min30p3a': True, 'UCSC.B..."
2,chrM,73,[],G,A,8752.78,[TruthSensitivityTranche99.90to100.00],"{'AC': [2], 'AF': [1.0], 'AN': 2, 'DP': 250, '..."
3,chrM,74,[],T,,,[PASS],"{'END': 149, 'BLOCKAVG_min30p3a': True, 'UCSC...."
4,chrM,150,[],T,C,9341.36,[TruthSensitivityTranche99.90to100.00],"{'AC': [2], 'AF': [1.0], 'AN': 2, 'DP': 250, '..."


In [25]:
model = genai.GenerativeModel(model_name="models/gemini-2.0-flash")

def interpret_variant_with_gemini(variant):
    prompt = f"""
You are a genomics assistant.

Given this variant:
Chromosome: {variant['CHROM']}
Position: {variant['POS']}
Reference: {variant['REF']}
Alternate: {variant['ALT']}

Return the following in JSON format:
- gene_name
- predicted_impact
- known_diseases
- phenotype_terms (HPO IDs)
- supporting_literature (PMIDs)
"""
    response = model.generate_content(prompt)
    return response.text

# Test it
variant = df.iloc[0]
print(interpret_variant_with_gemini(variant))


```json
{
  "gene_name": "MT-TF",
  "predicted_impact": "MODIFIER",
  "known_diseases": [],
  "phenotype_terms": [],
  "supporting_literature": []
}
```


Step 4: Gen AI Capability 2 — Few-Shot Prompting for Phenotype → Gene Mapping

In [26]:
few_shot_prompt = """
Example 1:
Phenotype: seizures, intellectual disability → Likely Genes: SCN1A, CDKL5

Example 2:
Phenotype: muscle weakness, elevated CK → Likely Genes: DMD, LMNA

Now analyze:
Phenotype: microcephaly, delayed milestones
→ Likely Genes:
"""

response = model.generate_content(few_shot_prompt)
print(response.text)


Phenotype: microcephaly, delayed milestones
→ Likely Genes: ASPM, MCPH1, CDK5RAP2, WDR62, STIL, CENPJ, TUBB4A, ARID1B



Step 5: Gen AI Capability 3 — RAG-style Grounded Literature Summary

In [27]:
literature_context = """
PMID: 12345678 — BRCA1 variant c.68_69delAG causes frameshift, high breast cancer risk.
PMID: 23456789 — Loss-of-function variants in BRCA1 impair DNA repair pathways.
"""

rag_prompt = f"""
You are a genomics assistant. Given the following literature:

{literature_context}

Summarize the clinical relevance of BRCA1 variant c.68_69delAG. Include PMIDs in your output.
"""

rag_response = model.generate_content(rag_prompt)
print(rag_response.text)


The BRCA1 variant c.68_69delAG is clinically relevant because it causes a frameshift mutation (PMID: 12345678). Loss-of-function variants in BRCA1, such as frameshift mutations, impair DNA repair pathways (PMID: 23456789) and are associated with a high risk of breast cancer (PMID: 12345678).



Step 6: Export Final Report

In [28]:
import json

report = {
    "Project": "Genomic Variant Interpretation Assistant (GVIA)",
    "Sample Variant": f"{df.iloc[0]['CHROM']}:{df.iloc[0]['POS']} {df.iloc[0]['REF']}>{df.iloc[0]['ALT']}",
    "Interpretation": interpret_variant_with_gemini(df.iloc[0]),
    "Phenotype to Genes": response.text,
    "Literature Summary": rag_response.text
}

with open("GVIA_Gemini_Report.json", "w") as f:
    json.dump(report, f, indent=2)

print("✅ Report saved as GVIA_Gemini_Report.json")


✅ Report saved as GVIA_Gemini_Report.json


# Genomic Variant Interpretation Assistant (GVIA)

This project uses Google Gemini to automate genomic variant interpretation for rare disease support. Given a VCF file, it:
- Extracts and interprets variants
- Maps patient phenotypes to genes
- Summarizes literature from structured abstracts

## Gen AI Capabilities Used
1. Structured output generation (variant interpretation)
2. Few-shot prompting (phenotype to gene)
3. RAG-style grounding (literature summarization)

Powered by: Google Gemini (models/gemini-2.0-flash)
