In [2]:
import os 
# Pin the working directory to the parent folder exactly once per kernel.
# A bare os.chdir("..") is relative: every re-run of this cell would climb one
# more level and break the relative "data/..." paths used by later cells.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("..")
os.chdir(PROJECT_ROOT)
os.getcwd()  # last expression, so the notebook displays the active directory

In [3]:
from langchain.schema import Document
import pandas as pd
import json

# --- Load abstracts ---
# JSON object keyed by PMID (JSON keys are always strings, which is why
# PUBMEDID is normalised to str below). Explicit UTF-8 avoids depending on the
# platform's default text encoding.
with open("data/pubmed_abstracts_top1000.json", encoding="utf-8") as f:
    pmid_abstracts = json.load(f)

# --- Load and filter data ---
# NOTE(review): the recorded run emitted a warning on this read_csv call,
# presumably pandas' mixed-dtype DtypeWarning; low_memory=False makes pandas
# infer each column's dtype from the whole file at once. Confirm on your data.
df = (
    pd.read_csv("data/significant_associations.csv", low_memory=False)
    .dropna(subset=["PUBMEDID"])
    # float -> int -> str so that "12345.0" becomes "12345" and matches the JSON keys
    .assign(PUBMEDID=lambda frame: frame["PUBMEDID"].astype(int).astype(str))
)
df = df[df["PUBMEDID"].isin(list(pmid_abstracts))]

# (metadata key, CSV column, label used in the document text), in output order.
# "Ancestries" is filled from the INITIAL SAMPLE SIZE column.
STUDY_FIELDS = [
    ("snps", "SNPS", "SNPs"),
    ("chromosomes", "CHR_ID", "Chromosomes"),
    ("positions", "CHR_POS", "Positions"),
    ("genes_reported", "GENE", "Genes (Reported)"),
    ("genes_mapped", "MAPPED_GENE", "Genes (Mapped)"),
    ("traits", "DISEASE/TRAIT", "Traits"),
    ("p_values", "P-VALUE", "P-values"),
    ("effect_sizes", "OR or BETA", "Effect sizes (OR/Beta)"),
    ("ancestries", "INITIAL SAMPLE SIZE", "Ancestries"),
    ("authors", "FIRST AUTHOR", "Authors"),
    ("journals", "JOURNAL", "Journal(s)"),
    ("publication_dates", "DATE", "Publication dates"),
]


def _unique_sorted(series):
    """Return the distinct non-null values of ``series`` as a sorted list of strings.

    Every value is cast to ``str`` so that numeric or mixed-type columns can be
    joined into text without raising ``TypeError``. Sorting is therefore
    lexicographic, not numeric.
    """
    return sorted(set(map(str, series.dropna())))


def build_study_document(pmid, group, abstract):
    """Build one ``Document`` summarising all association rows of a study.

    Args:
        pmid: PubMed ID of the study, as a string.
        group: DataFrame holding the association rows that share this PMID.
        abstract: Abstract text appended after the field summary.

    Returns:
        A ``Document`` whose ``page_content`` is a bullet list of the study's
        fields followed by the abstract, and whose ``metadata`` carries the
        same fields as comma-separated strings plus the PubMed URL.
    """
    # Join each field once and reuse it for both the text and the metadata.
    joined = {
        key: ", ".join(_unique_sorted(group[column]))
        for key, column, _ in STUDY_FIELDS
    }
    bullet_lines = "\n".join(
        f"- {label}: {joined[key]}" for key, _, label in STUDY_FIELDS
    )
    content = (
        f"Study PMID {pmid} reports genetic associations from a GWAS.\n\n"
        f"{bullet_lines}\n\n"
        f"Abstract:\n{abstract}"
    )
    metadata = {
        "pmid": pmid,
        **joined,
        "source": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
    }
    return Document(page_content=content, metadata=metadata)


# --- Group by study (PMID): one document per study ---
grouped = df.groupby("PUBMEDID")
docs = []

for pmid, group in grouped:
    abstract = pmid_abstracts.get(pmid)
    if not abstract:
        # Skip studies whose abstract is missing or empty.
        continue
    docs.append(build_study_document(pmid, group, abstract))

print(f"✅ Created {len(docs)} unique documents (one per study)")


  df = pd.read_csv("data/significant_associations.csv")


✅ Created 1000 unique documents (one per study)


In [4]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
from langchain.docstore.document import Document
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
from tqdm import tqdm
import shutil

# 📌 Parameters
N_DOCS = 1000
INDEX_DIR = "rag_gwas_index"

# 📚 Sub-sample of the corpus built by the previous cell
subset_docs = docs[:N_DOCS]

# 🔤 Text and metadata
texts = [doc.page_content for doc in subset_docs]
metas = [doc.metadata for doc in subset_docs]

# 🧠 Ollama embedding model
embedding = OllamaEmbeddings(model="nomic-embed-text")

# 🔢 Embeddings with a progress bar.
# Corpus texts go through embed_documents (the document-side API);
# embed_query is meant for the search query at retrieval time.
# One text per call keeps the per-document tqdm progress.
print("⏳ Calcul des embeddings...")
embeddings = [embedding.embed_documents([text])[0] for text in tqdm(texts)]

# 📄 Documents handed to the indexing cell.
# The vectors above are NOT attached to these objects: a Document only carries
# text and metadata. The indexing cell passes `embedding` to Chroma, so the
# `embeddings` list is only useful for inspection or timing.
docs_embedded = [
    Document(page_content=text, metadata=meta)
    for text, meta in zip(texts, metas)
]




  embedding = OllamaEmbeddings(model="nomic-embed-text")


⏳ Calcul des embeddings...


100%|██████████| 1000/1000 [00:48<00:00, 20.77it/s]


In [5]:
import shutil
from langchain.vectorstores import Chroma

import os

# 🔧 Folder where the index is saved
INDEX_DIR = "rag_gwas_index"

# 🧹 Remove the previous index if there is one.
# Deletion errors are deliberately NOT ignored: if the old folder survives
# (for instance because a file in it is still open), from_documents below
# would be pointed at the stale index instead of a fresh folder.
if os.path.exists(INDEX_DIR):
    shutil.rmtree(INDEX_DIR)

# 🧠 Build the vector store from the prepared documents
print(f"📦 Indexation de {len(docs_embedded)} documents...")
vectorstore = Chroma.from_documents(
    documents=docs_embedded,
    embedding=embedding,
    persist_directory=INDEX_DIR
)

# 💾 Save to disk.
# NOTE(review): the recorded run shows a warning on this line, presumably a
# deprecation notice for manual persistence. Confirm against the installed
# chromadb version before removing the call.
vectorstore.persist()
print(f"✅ Index sauvegardé dans '{INDEX_DIR}' avec {len(docs_embedded)} documents.")





📦 Indexation de 1000 documents...
✅ Index sauvegardé dans 'rag_gwas_index' avec 1000 documents.


  vectorstore.persist()


In [6]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# --- Parameters ---
INDEX_DIR = "rag_gwas_index"
# Characters of each retrieved document shown to the critic model.
CRITIC_DOC_CHARS = 1000

# --- RAG QA PIPELINE ---

# 🔁 Reload vectorstore from disk with the same embedding model used to build it
print("🔁 Loading vectorstore for QA...")
embedding = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = Chroma(
    persist_directory=INDEX_DIR,
    embedding_function=embedding
)

# 🔍 Retriever. k must stay in sync with "each of the 5 documents" in the prompt below.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 5}
)

# 🤖 Local LLM
llm = Ollama(model="mistral:instruct")

# 📜 Structured Prompt
system_prompt = """
You are a human genetics expert. Answer the user's question **using only the provided documents**.

Your answer must have two parts:

1. 💡 **Gene summary**:
   - Start with a concise list or sentence stating which genes are associated with the disease mentioned in the question ALWAYS citing the source.

2. 📚 **Study-by-study details**:
   - For each of the 5 documents:
     - Mention the **PMID**
     - Summarize the **associated genes**, **population studied**, **identified loci**, and any other relevant info.
     - If the document does not mention any genes, say: “This study does not mention any gene associated with the condition.”

❌ Do NOT infer or guess.
❌ Do NOT fabricate any information.
✅ Use only the content of the documents below.



Question: {question}
=========
{context}
=========
Answer:
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=system_prompt
)

# 🔗 QA Chain; source documents are returned so they can be listed and judged below
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

# 🧠 Ask your question
query = "Which genes are associated with hypertension?"
# .invoke replaces the direct chain call, which raised a warning in the
# recorded run, and matches the llm.invoke call used for the critic below.
result = qa_chain.invoke({"query": query})
source_docs = result["source_documents"]

# --- Display Results ---
print("\n🔎 Retrieved Documents:")
pmid_set = set()
for doc in source_docs:
    pmid = doc.metadata.get("pmid", "unknown")
    if pmid not in pmid_set:  # print each study once even if retrieved twice
        print("→ PMID:", pmid)
        pmid_set.add(pmid)

print("\n🧠 Generated Answer:\n")
print(result["result"])

print("\n📚 Sources:")
for doc in source_docs:
    print("-", doc.metadata.get("source", "no source"))

# --- Automated Judgment ---
# Render the excerpts as plain text. Interpolating a Python list would show the
# model brackets, quotes and escaped "\n" sequences instead of readable text.
# NOTE(review): each document starts with its field lists and ends with the
# abstract, so a CRITIC_DOC_CHARS-long excerpt may not reach the abstract the
# answer was based on. Raise the limit if the model's context window allows.
critic_documents = "\n\n---\n\n".join(
    f"[PMID {doc.metadata.get('pmid', 'unknown')}]\n{doc.page_content[:CRITIC_DOC_CHARS]}"
    for doc in source_docs
)

critic_prompt = f"""
You are an expert scientific reviewer.

Evaluate the following RESPONSE and compare it to the DOCUMENTS.

⚠️ Only answer "YES" if the response is **explicitly supported** by facts present in the documents.

Rules:
- If you are not completely sure, say "NO".
- First line MUST be "YES" or "NO" (in ALL CAPS, nothing else).
- If "NO", give a short explanation on the second line.

--- RESPONSE TO EVALUATE ---
{result['result']}

--- DOCUMENTS USED ---
{critic_documents}

Was the response 100% faithful to the information provided in the documents?
"""

judgment = llm.invoke(critic_prompt)
print("\n🧠 Verdict:\n", judgment.strip())


  vectorstore = Chroma(
  llm = Ollama(model="mistral:instruct")
  result = qa_chain({"query": query})


🔁 Loading vectorstore for QA...

🔎 Retrieved Documents:
→ PMID: 19430479
→ PMID: 28498854
→ PMID: 28739976
→ PMID: 25249183
→ PMID: 26390057

🧠 Generated Answer:

 The study you've provided is a genome-wide association and replication study of blood pressure phenotypes among individuals of East Asian, European, and South Asian ancestry. The research was conducted by up to 320,251 individuals from various contributing institutions. They identified 12 new genetic loci associated with blood pressure (P value ranges from 3.9e-11 to 5.0e-21).

The sentinel SNPs at the newly discovered loci suggest that DNA methylation may play a role in blood pressure regulation, as they are enriched for association with multiple nearby CpG sites. The genes involved in vascular smooth muscle (IGFBP3, KCNK3, PDE3A, and PRDM6) and renal function (ARHGAP24, OSR1, SLC22A7, and TBX2) are implicated in these new genetic variants.

These newly identified genetic variations are found to predict increased left ventr

In [16]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.schema import Document

# --- Parameters ---
INDEX_DIR = "rag_gwas_index"
embedding = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = Chroma(persist_directory=INDEX_DIR, embedding_function=embedding)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
llm = Ollama(model="mistral:instruct")

# Single source of truth for the question used in retrieval, per-document QA
# and the synthesis prompt.
question = "Which genes are associated with hypertension?"

# --- Step 1: Retrieve the top-k documents
# Stored under its own name: reusing `docs` would overwrite the full corpus
# list built earlier, and re-running the embedding cell would then index only
# these few documents.
retrieved_docs = retriever.invoke(question)

# --- Step 2: Ask the question on each document individually
single_doc_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a human genetics expert.

Using ONLY the content below, answer the question.

Content:
---------
{context}
---------
Question: {question}

Answer:"""
)

# (pmid, answer) pairs; falls back to a positional label when a document has no PMID
answers = []
for i, doc in enumerate(retrieved_docs, start=1):
    # Named doc_prompt so the `prompt` template of the RetrievalQA cell is not overwritten.
    doc_prompt = single_doc_prompt.format(context=doc.page_content, question=question)
    response = llm.invoke(doc_prompt)
    answers.append((doc.metadata.get("pmid", f"doc_{i}"), response.strip()))

# --- Step 3: Summarize the answers together
all_responses = "\n\n".join(
    f"PMID {pmid}:\n{answer}" for pmid, answer in answers
)

# The answer count and the question are interpolated so the prompt stays
# accurate if k or the question changes.
synthesis_prompt = f"""
You are a scientific summarizer.

You are given {len(answers)} independent answers to the question "{question}", each based on a scientific document.

Your task is to write a detailed, structured synthesis that:

1. **Starts with a clear summary** of the main genetic findings across all studies.
2. **Groups genes by biological relevance** (e.g., vascular function, renal function, signal transduction, etc.) if such information is available.
3. **Mentions the study (PMID)** that supports each gene's association.
4. Provides short contextual details for each gene (e.g., role, type of variant, pathway) **only if present in the text**.
5. Does **NOT invent or infer** anything not in the original answers.

Here is the content to synthesize:
---
{all_responses}
---
Now write the final answer.
"""


final_answer = llm.invoke(synthesis_prompt)

# --- Display
print("\n🧠 Synthesized Answer:\n")
print(final_answer)

print("\n📚 Sources:")
for doc in retrieved_docs:
    print("-", doc.metadata.get("source", "no source"))




🧠 Synthesized Answer:

 Summary:
The studies collectively suggest several genes associated with hypertension. These include ATP2B1 (PMID 19430479, 25249183), CACNB2, TBX3-AS1, SH2B3, ULK4 (PMID 19430479), GPR20, TARID, and FRMD3 (PMID 28498854), HSPB7, TNXB (PMID 28739976, 25249183), CACNA1D (PMID 25249183), CASZ1, FGF5, HECTD4, LINC01752 - LINC02871, PRDM8, RN7SL865P - LINC02463, SOX6 (PMID 25249183), and AKT2, EBF2, NFKBIA, IGFBP3, KCNK3, PDE3A, PRDM6, ARHGAP24, OSR1, SLC22A7, TBX2 (PMID 26390057).

Grouped by Biological Relevance:

1. **Vascular Function**: TNXB (associated in multiple studies), IGFBP3, KCNK3, PDE3A, ARHGAP24, and OSR1 (PMID 26390057). These genes play roles in vascular smooth muscle function.

2. **Renal Function**: TNXB (associated in multiple studies), SLC22A7, and TBX2 (PMID 26390057) are associated with renal function. Additionally, EBF2 and NFKBIA were found to be involved in blood pressure regulation in a study (PMID 28739976), but their direct relevance to 