In [1]:
!pip install pysam

Collecting pysam
  Downloading pysam-0.23.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.6 kB)
Downloading pysam-0.23.0-cp311-cp311-manylinux_2_28_x86_64.whl (26.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.0/26.0 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pysam
Successfully installed pysam-0.23.0


In [2]:
!pip install google-generativeai



In [3]:
import os  # For environment variable access and file path handling
import json  # To work with JSON data (e.g., storing/loading model inputs or outputs)
import pysam  # For reading and parsing VCF/BAM files (variant/calling data)
from datetime import datetime  # For timestamping logs, outputs, or reports

import numpy as np  # For numerical operations (e.g., array manipulations)
import pandas as pd  # For handling tabular data (e.g., VCFs converted to DataFrames)
import google.generativeai as genai  # To interact with Google Generative AI models (e.g., Gemini or Bison)


In [4]:
VCF_FILEPATH = "/kaggle/input/vcf-data/clinvar.vcf"
GEN_API = "YOUR_GENERATIVE_API"


In [5]:
def parse_vcf(filepath):
    """Parse VCF file and extract relevant variant information"""
    vcf_file = pysam.VariantFile(filepath)  # supports .vcf.gz too
    variants = []
    for record in vcf_file.fetch():
        #print(record.info.keys())
        variant = {
                "chromosome": record.contig,
                "position": record.pos,
                "reference": record.ref,
                "alternative": record.alts[0]
        }
        if 'CLNSIG' in record.info.keys():
            variant["significance"] = record.info['CLNSIG'][0]
        if 'CLNHGVS' in record.info.keys():
            variant['hgvs'] = record.info['CLNHGVS'][0]
        if 'GENEINFO' in record.info.keys():
            variant["gene"] = record.info['GENEINFO']
        if 'MC' in record.info.keys():
            variant['mc'] = record.info['MC'][0]
        variants.append(variant)
    return variants

# Example Usage
variants = parse_vcf(VCF_FILEPATH)
print(variants)


[{'chromosome': 'Y', 'position': 338615, 'reference': 'G', 'alternative': 'A', 'significance': 'Likely_benign', 'hgvs': 'NC_000024.10:g.338615G>A', 'gene': 'PPP2R3B:28227', 'mc': 'SO:0001819|synonymous_variant'}, {'chromosome': 'Y', 'position': 624386, 'reference': 'C', 'alternative': 'CT', 'significance': 'Benign', 'hgvs': 'NC_000024.10:g.624399dup', 'gene': 'SHOX:6473', 'mc': 'SO:0001623|5_prime_UTR_variant'}, {'chromosome': 'Y', 'position': 624386, 'reference': 'C', 'alternative': 'CTT', 'significance': 'Likely_benign', 'hgvs': 'NC_000024.10:g.624398_624399dup', 'gene': 'SHOX:6473', 'mc': 'SO:0001623|5_prime_UTR_variant'}, {'chromosome': 'Y', 'position': 624386, 'reference': 'C', 'alternative': 'G', 'significance': 'Likely_benign', 'hgvs': 'NC_000024.10:g.624386C>G', 'gene': 'SHOX:6473', 'mc': 'SO:0001623|5_prime_UTR_variant'}, {'chromosome': 'Y', 'position': 624386, 'reference': 'CT', 'alternative': 'C', 'significance': 'Benign', 'hgvs': 'NC_000024.10:g.624399del', 'gene': 'SHOX:64

[W::vcf_parse] Contig 'Y' is not defined in the header. (Quick workaround: index the file with tabix.)


In [6]:
# Your API key from AI Studio
genai.configure(api_key=GEN_API)

# List available models
for model in genai.list_models():
    print(model.name)


models/chat-bison-001
models/text-bison-001
models/embedding-gecko-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.5-pro-exp-03-25
models/gemini-2.5-pro-preview-03-25
models/gemini-2.5-flash-preview-04-17
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01-21
mod

In [7]:
class GeneticExplainer:
    def __init__(self, model_name="gemma-3-27b-it", api_key=None):
        """
        Initialize the GeneticExplainer with a generative model
        
        Args:
            model_name (str): Name of the generative model to use
            api_key (str): API key for the generative model service
        """
        if api_key:
            genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name)
        
    def parse_vcf(self, filepath):
        """
        Parse VCF file and extract relevant variant information
        
        Args:
            filepath (str): Path to the VCF file
            
        Returns:
            list: List of variant dictionaries
        """
        vcf_file = pysam.VariantFile(filepath)  # supports .vcf.gz too
        variants = []
        
        for record in vcf_file.fetch():
            variant = {
                "chromosome": record.contig,
                "position": record.pos,
                "reference": record.ref,
                "alternative": record.alts[0] if record.alts else "",
                "id": record.id if record.id else ".",
                "quality": record.qual if record.qual else None,
                "filter": list(record.filter) if record.filter else [],
                "format": record.format.keys() if record.format else [],
                "info": {}
            }
            
            # Extract all INFO fields
            for key in record.info.keys():
                try:
                    # Handle different types of INFO fields
                    if isinstance(record.info[key], tuple) and len(record.info[key]) == 1:
                        variant["info"][key] = record.info[key][0]
                    else:
                        variant["info"][key] = record.info[key]
                except TypeError:
                    # Some INFO fields might cause type errors
                    variant["info"][key] = str(record.info[key])
            
            # Extract common fields for easier access
            if 'CLNSIG' in record.info.keys():
                variant["significance"] = record.info['CLNSIG'][0] if isinstance(record.info['CLNSIG'], tuple) else record.info['CLNSIG']
            if 'CLNHGVS' in record.info.keys():
                variant['hgvs'] = record.info['CLNHGVS'][0] if isinstance(record.info['CLNHGVS'], tuple) else record.info['CLNHGVS']
            if 'GENEINFO' in record.info.keys():
                variant["gene"] = record.info['GENEINFO']
            if 'MC' in record.info.keys():
                variant['mc'] = record.info['MC'][0] if isinstance(record.info['MC'], tuple) else record.info['MC']
            
            # Add additional useful fields if they exist
            if 'CLNDISDB' in record.info.keys():
                variant['disease_db'] = record.info['CLNDISDB']
            if 'CLNDN' in record.info.keys():
                variant['disease_name'] = record.info['CLNDN']
            if 'CLNREVSTAT' in record.info.keys():
                variant['review_status'] = record.info['CLNREVSTAT']
            
            variants.append(variant)
            
        return variants
    
    def explain_variant(self, variant, user_type="patient"):
        """
        Generate an explanation for a genetic variant
        
        Args:
            variant (dict): Variant information
            user_type (str): Type of user ("patient" or "doctor")
            
        Returns:
            str: Generated explanation
        """
        # Prepare all available fields
        variant_fields = {}
        for key, value in variant.items():
            if key != "info" and value:  # Skip empty values and the info dict
                variant_fields[key] = value
        
        # Add selected info fields if they exist
        for info_key in ["significance", "hgvs", "gene", "mc", "disease_name", "review_status"]:
            if info_key in variant:
                variant_fields[info_key] = variant[info_key]
        
        # Create field descriptions
        field_descriptions = []
        for key, value in variant_fields.items():
            field_descriptions.append(f"{key.capitalize()}: {value}")
        
        field_text = "\n".join(field_descriptions)
        
        # Adjust tone based on user type
        tone_guidance = {
            "patient": "Use simple language a non-expert can understand. Avoid technical jargon and explain any medical terms.",
            "doctor": "Use professional medical terminology appropriate for a healthcare provider with genetic knowledge."
        }
        
        prompt = f"""
        You are a helpful genetic assistant.
        Please explain the following genetic variant to a {user_type} in a way that is clear and accurate.
        
        {field_text}
        
        {tone_guidance.get(user_type, tone_guidance["patient"])}
        Do not include greetings or questions.
        Focus only on explaining the variant and its implications.
        If any critical information is missing, mention what would be needed for a more complete assessment.
        """
        
        try:
            response = self.model.generate_content(prompt)
            return response.text.strip()
        except Exception as e:
            return f"Error generating explanation: {str(e)}"
    
    def process_vcf_file(self, filepath, user_type="patient", output_filepath=None):
        """
        Process a VCF file, generate explanations, and save as structured JSON
        
        Args:
            filepath (str): Path to the VCF file
            user_type (str, default: patient): User type to generate explanations for
            output_filepath (str, optional): Path to save the output JSON
            
        Returns:
            dict: Structured report data
        """
        # Parse the VCF file
        print(f"Parsing VCF file: {filepath}")
        variants = self.parse_vcf(filepath)
        print(f"Found {len(variants)} variants")
        
        # Create report structure
        report = {
            "metadata": {
                "filename": os.path.basename(filepath),
                "processed_date": datetime.now().isoformat(),
                "variant_count": len(variants)
            },
            "variants": []
        }
        
        # Process each variant
        for i, variant in enumerate(variants):
            print(f"Processing variant {i+1}/{len(variants)}")
            
            variant_report = {
                "raw_data": variant,
                "explanations": {}
            }
            
            # Generate explanations for different user types
            print(f"  Generating {user_type} explanation...")
            explanation = self.explain_variant(variant, user_type)
            variant_report["explanations"] = explanation
            
            report["variants"].append(variant_report)
        
        # Generate summary explanations
        print("Generating report summaries...")
        
        variant_count = len(variants)
        pathogenic_count = sum(1 for v in variants if v.get("significance", "").lower() == "pathogenic")
        vus_count = sum(1 for v in variants if "uncertain" in str(v.get("significance", "")).lower())
        
        summary_prompt = f"""
        Generate a brief summary of a genetic report containing {variant_count} variants.
        Pathogenic variants: {pathogenic_count}
        Variants of uncertain significance: {vus_count}
        
        Write this summary for a {user_type} in 2-3 sentences.
        Focus on what this means overall and any important next steps.
        """
        
        try:
            response = self.model.generate_content(summary_prompt)
            report["summary"] = response.text.strip()
        except Exception as e:
            report["summary"] = f"Error generating summary: {str(e)}"
        
        # Save the report if an output filepath is provided
        if output_filepath:
            print(f"Saving report to {output_filepath}")
            with open(output_filepath, 'w') as f:
                json.dump(report, f, indent=2)
        
        return report


In [8]:
# Initialize with your API key
explainer = GeneticExplainer(api_key=GEN_API)

# Process a VCF file
output_filepath = "genetic_report.json"

report = explainer.process_vcf_file(
    VCF_FILEPATH,
    user_type="patient",
    output_filepath=output_filepath
)

# Display summary
print("\nReport Summary:", report["summary"])

# Display first variant explanation as example
if report["variants"]:
    print("\nExample Variant Explanation (Patient):")
    print(report["variants"][0]["explanations"])


Parsing VCF file: /kaggle/input/vcf-data/clinvar.vcf
Found 5 variants
Processing variant 1/5
  Generating patient explanation...


[W::vcf_parse] Contig 'Y' is not defined in the header. (Quick workaround: index the file with tabix.)


Processing variant 2/5
  Generating patient explanation...
Processing variant 3/5
  Generating patient explanation...
Processing variant 4/5
  Generating patient explanation...
Processing variant 5/5
  Generating patient explanation...
Generating report summaries...
Saving report to genetic_report.json

Report Summary: Here's a summary you could provide to a patient:

"Your genetic test results have come back showing five variants in the genes analyzed, but thankfully none of these are known to cause disease and none require immediate concern. All identified variants were considered normal variations, meaning they are common in the population and not expected to impact your health. No further testing is recommended based on these results."

Example Variant Explanation (Patient):
This report describes a change in your Y chromosome, which is a sex chromosome found only in males. Specifically, there’s a difference in the genetic code at a particular location – think of it like a typo in a