In [None]:
!pip install biopython



In [None]:
from Bio import Entrez
import pandas as pd
import time

# NCBI requirement — use your own email here
Entrez.email = "vtyag@illinois.edu"

# Load the search queries from file. Each row's "search_query" value is what
# the search loop below sends to NCBI.
# NOTE(review): absolute "/content/..." path — presumably a Colab upload
# location; confirm before running in another environment.
df = pd.read_excel("/content/assertions_with_search_queries.xlsx")

def search_ncbi(keywords, db="pubmed", retmax=5):
    """
    Search an NCBI database and return the top hits as display strings.

    Runs an ESearch for ``keywords`` and, when any IDs come back, an ESummary
    for those IDs.

    Args:
        keywords: Query string passed to ESearch as the search term.
        db: Entrez database name (this notebook uses "pubmed" and "books").
        retmax: Maximum number of IDs requested from ESearch.

    Returns:
        A list of strings formatted as "<title> (ID: <uid>)", one per summary
        record; an empty list when the search yields no IDs; or the
        single-element list ["Error: <message>"] if any step raises.
    """
    try:
        handle = Entrez.esearch(db=db, term=keywords, retmax=retmax)
        try:
            record = Entrez.read(handle)
        finally:
            # Release the connection even when parsing the response fails.
            handle.close()

        id_list = record.get("IdList", [])
        if not id_list:
            return []

        handle = Entrez.esummary(db=db, id=",".join(id_list))
        try:
            summaries = Entrez.read(handle)
        finally:
            handle.close()

        return [
            f"{summary.get('Title', 'No title')} (ID: {summary.get('Id', '')})"
            for summary in summaries
        ]
    except Exception as e:
        # Deliberately best-effort: one failed query must not abort the whole
        # batch, so the failure is reported in-band as a result string.
        return [f"Error: {e}"]

# Loop over each query and get top 5 results from PubMed + Bookshelf.
# Both lists are filled in row order so they can be attached as columns below.
pubmed_list = []
books_list = []

for i, row in df.iterrows():
    query = row["search_query"]

    # search_ncbi returns a list of "<title> (ID: <uid>)" strings, an empty
    # list, or a single "Error: ..." entry — it does not raise.
    pubmed_results = search_ncbi(query, db="pubmed", retmax=5)
    bookshelf_results = search_ncbi(query, db="books", retmax=5)

    pubmed_list.append(pubmed_results)
    books_list.append(bookshelf_results)

    # Respect NCBI rate limits
    time.sleep(0.4)

# Add results to DataFrame (one Python list per cell).
# NOTE(review): the later verdict cell parses string cells back into lists,
# so these presumably come back from Excel as text — confirm.
df["pubmed_results"] = pubmed_list
df["bookshelf_results"] = books_list

# Save output; this file is the input of the fact-checking cell further down.
df.to_excel("fact_checker_results.xlsx", index=False)

print("Search complete. Results saved to fact_checker_results.xlsx")


Search complete. Results saved to fact_checker_results.xlsx


In [None]:
import pandas as pd
import re
import time
from Bio import Entrez
import google.generativeai as genai

# -------------------------
# CONFIGURATION
# -------------------------
import os
from getpass import getpass

Entrez.email = "vtyag@illinois.edu"  # Required by NCBI

# Never store the Gemini key in the notebook: take it from the environment and
# fall back to an interactive, non-echoing prompt. The key that used to be
# hardcoded on this line must be treated as leaked and revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=GOOGLE_API_KEY)

MODEL = "gemini-2.5-flash"

# -------------------------
# FUNCTIONS
# -------------------------
def extract_ids_from_results(results_list):
    """
    Extract NCBI IDs from a results list.

    Each entry is expected to look like 'Title (ID: 12345678)'; the digits
    inside '(ID: ...)' are returned as strings, in input order.

    Args:
        results_list: Either a list of result strings, or the string form of
            such a list (e.g. "['Title (ID: 1)']"). Presumably the latter is
            what the pipeline gets after the results round-trip through
            Excel — confirm against the caller.

    Returns:
        A list of ID strings. Empty when the input is not a list, cannot be
        parsed as one, or contains no '(ID: ...)' markers. Entries that are
        not strings are skipped.
    """
    import ast  # local import keeps this function self-contained

    if isinstance(results_list, str):
        try:
            # literal_eval accepts Python literals only, so spreadsheet text
            # can never execute code the way eval() would.
            results_list = ast.literal_eval(results_list)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            results_list = []

    if not isinstance(results_list, list):
        return []

    ids = []
    for entry in results_list:
        if not isinstance(entry, str):
            continue
        match = re.search(r"\(ID:\s*(\d+)\)", entry)
        if match:
            ids.append(match.group(1))
    return ids

def fetch_ncbi_documents(id_list, db):
    """
    Fetch abstracts or summaries from NCBI PubMed or Bookshelf.

    All IDs are requested in a single EFetch call with rettype="abstract" and
    retmode="text".

    Args:
        id_list: NCBI IDs to fetch; each is converted to str before joining.
        db: Entrez database name (the notebook passes "pubmed" and "books").

    Returns:
        [] when ``id_list`` is empty or None; otherwise a single-element list
        holding either the full response text for all IDs, or the message
        "Error fetching from <db>: <error>" if the request or read fails.

    NOTE(review): confirm that EFetch accepts db="books" with these
    rettype/retmode values; if it does not, every Bookshelf fetch ends up as
    the error string above.
    """
    if not id_list:
        return []

    try:
        handle = Entrez.efetch(
            db=db,
            id=",".join(str(uid) for uid in id_list),
            rettype="abstract",
            retmode="text",
        )
        try:
            docs = handle.read()
        finally:
            # Release the connection even when reading the body fails.
            handle.close()
        return [docs]
    except Exception as e:
        # Best-effort: report the failure in-band so the batch keeps going.
        return [f"Error fetching from {db}: {e}"]

def fact_check_with_gemini(assertion, evidence_text):
    """
    Ask the Gemini model named by MODEL to judge an assertion against evidence.

    Args:
        assertion: The statement to verify; interpolated into the prompt.
        evidence_text: Literature text to judge against; interpolated into
            the prompt as-is.

    Returns:
        The model's raw response text (the prompt asks for a JSON object with
        "verdict" and "reasoning"), or "Error: <message>" if the call fails.
    """
    llm_prompt = f"""
    You are a scientific fact-checking assistant.
    Given the following scientific assertion:

    Assertion:
    {assertion}

    And the following evidence from scientific literature:
    {evidence_text}

    Task:
    1. Determine if the assertion is factually correct, incorrect, or partially correct.
    2. Provide a brief reasoning.
    3. Return the answer in JSON format:
       {{
         "verdict": "Factually correct / Incorrect / Partially correct",
         "reasoning": "Short reasoning here"
       }}
    """
    try:
        model = genai.GenerativeModel(MODEL)
        reply = model.generate_content(llm_prompt)
        # Reading .text stays inside the try so a failure there is reported
        # the same way as a failed request.
        return reply.text
    except Exception as err:
        return f"Error: {err}"

# -------------------------
# MAIN PIPELINE
# -------------------------
# Load your fact checker search results (the workbook written by the search
# cell earlier in this notebook).
df = pd.read_excel("fact_checker_results.xlsx")

# One entry per input row, in row order: the model's raw response text or an
# "Error: ..." string.
final_verdicts = []

for idx, row in df.iterrows():
    assertion = row["statement_text"]

    # Extract PubMed & Bookshelf IDs from the "<title> (ID: <uid>)" strings
    pubmed_ids = extract_ids_from_results(row["pubmed_results"])
    books_ids = extract_ids_from_results(row["bookshelf_results"])

    # Fetch documents: each call yields [] (no IDs) or a one-element list
    # holding either the fetched text or an error message.
    pubmed_docs = fetch_ncbi_documents(pubmed_ids, db="pubmed")
    books_docs = fetch_ncbi_documents(books_ids, db="books")

    # Combine evidence. Fetch error messages are plain strings too, so they
    # end up inside the evidence text that is sent to the model.
    all_evidence = "\n\n".join(pubmed_docs + books_docs)

    # Fact check with Gemini (called even when no evidence was found)
    verdict_json = fact_check_with_gemini(assertion, all_evidence)

    final_verdicts.append(verdict_json)

    # NCBI rate limiting
    time.sleep(0.5)

# Add verdicts to DataFrame (stored as returned; not parsed as JSON here)
df["fact_check_verdict"] = final_verdicts

# Save results
df.to_excel("fact_checker_with_verdicts.xlsx", index=False)

print("Fact checking complete. Results saved to fact_checker_with_verdicts.xlsx")


Fact checking complete. Results saved to fact_checker_with_verdicts.xlsx


In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.3/3.3 MB[0m [31m148.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
from Bio import Entrez

# Always include your email per NCBI policy
Entrez.email = "vtyag@illinois.edu"

# Your query
query = "present-day prokaryotes eukaryotes descended ancestor single"

# Search PubMed: request up to 10 IDs and read them from "IdList"
print("Searching PubMed...")
handle = Entrez.esearch(db="pubmed", term=query, retmax=10)
pubmed_results = Entrez.read(handle)
handle.close()
pubmed_ids = pubmed_results["IdList"]
print("PubMed IDs:", pubmed_ids)

# Fetch summaries for PubMed IDs (skipped when the search returned none)
if pubmed_ids:
    handle = Entrez.esummary(db="pubmed", id=",".join(pubmed_ids))
    pubmed_summaries = Entrez.read(handle)
    handle.close()
    print("\nPubMed Summaries:")
    for summary in pubmed_summaries:
        print("-", summary.get("Title", "No title"))

# Search NCBI Bookshelf with the same query and limit
print("\nSearching Bookshelf...")
handle = Entrez.esearch(db="books", term=query, retmax=10)
books_results = Entrez.read(handle)
handle.close()
books_ids = books_results["IdList"]
print("Bookshelf IDs:", books_ids)

# Fetch summaries for Bookshelf IDs (skipped when the search returned none)
if books_ids:
    handle = Entrez.esummary(db="books", id=",".join(books_ids))
    books_summaries = Entrez.read(handle)
    handle.close()
    print("\nBookshelf Summaries:")
    for summary in books_summaries:
        print("-", summary.get("Title", "No title"))


Searching PubMed...
PubMed IDs: []

Searching Bookshelf...
Bookshelf IDs: ['1603254', '1599746']

Bookshelf Summaries:
- THE ORIGIN AND EVOLUTION OF CELLS
- The Origin and Evolution of Cells


In [None]:
from Bio import Entrez

# Always include your email per NCBI policy
Entrez.email = "vtyag@illinois.edu"

# Your query
query = "DNA production cellular proteins carries genetic"

# Search PubMed: request up to 10 IDs and read them from "IdList"
print("Searching PubMed...")
handle = Entrez.esearch(db="pubmed", term=query, retmax=10)
pubmed_results = Entrez.read(handle)
handle.close()
pubmed_ids = pubmed_results["IdList"]
print("PubMed IDs:", pubmed_ids)

# Fetch summaries for PubMed IDs (skipped when the search returned none)
if pubmed_ids:
    handle = Entrez.esummary(db="pubmed", id=",".join(pubmed_ids))
    pubmed_summaries = Entrez.read(handle)
    handle.close()
    print("\nPubMed Summaries:")
    for summary in pubmed_summaries:
        print("-", summary.get("Title", "No title"))

# Search NCBI Bookshelf with the same query and limit
print("\nSearching Bookshelf...")
handle = Entrez.esearch(db="books", term=query, retmax=10)
books_results = Entrez.read(handle)
handle.close()
books_ids = books_results["IdList"]
print("Bookshelf IDs:", books_ids)

# Fetch summaries for Bookshelf IDs (skipped when the search returned none)
if books_ids:
    handle = Entrez.esummary(db="books", id=",".join(books_ids))
    books_summaries = Entrez.read(handle)
    handle.close()
    print("\nBookshelf Summaries:")
    for summary in books_summaries:
        print("-", summary.get("Title", "No title"))


Searching PubMed...
PubMed IDs: ['40748513', '40691627', '40629055', '40623929', '40608358', '40598996', '40597313', '40597040', '40560800', '40558095']

PubMed Summaries:
- Loss of MALT1 Function in a Patient With Combined Immunodeficiency: a Novel Pathogenic Variant and Immunological Insights.
- "Small extracellular vesicles: messengers at the service of breast cancer agenda in the primary and distant microenvironments".
- Extremely low-frequency electromagnetic field (ELF-EMF) enhances mitochondrial energy production in NARP cybrids.
- [Exploration of the pathogenic mechanism of a novel c.661_664dup (p.P222Lfs*60) variant of SOX10 gene].
- Conjugative delivery of toxin genes ccdB and kil confers synergistic killing of bacterial recipients.
- Development of a CRISPR/Cas9 RNP-mediated genetic engineering system in Paecilomyces variotii.
- Outer membrane vesicles of Glaesserlla parasuis activate the endosomal cGAS-STING-IRF3 pathway through nucleic acid payload delivery: a biological p

In [None]:
!pip install -q tavily-python

In [None]:
import os
from getpass import getpass

from tavily import TavilyClient

# Never store the Tavily key in the notebook: use TAVILY_API_KEY from the
# environment when it is set, otherwise ask for it with a non-echoing prompt.
# The key that used to be hardcoded here must be treated as leaked and revoked.
if not os.environ.get("TAVILY_API_KEY"):
    os.environ["TAVILY_API_KEY"] = getpass("Tavily API key: ")

tavily = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])

query = "give link for ncbi that talks about 'Ribosomes approximately diameter'"

search_results = tavily.search(query=query)

print(search_results)

ModuleNotFoundError: No module named 'tavily'