In [1]:
import csv
import time
from typing import List, Dict, Any

import torch
import torch.nn.functional as F
from Bio import Entrez
from vllm import LLM

# --- CONFIGURATION ---

# NCBI asks every Entrez API client to identify itself with a contact email.
ENTREZ_EMAIL = "your.email@example.com"
Entrez.email = ENTREZ_EMAIL
if ENTREZ_EMAIL == "your.email@example.com":
    # The placeholder is still in place: nudge the user to set a real address.
    print("⚠️ WARNING: Please replace 'your.email@example.com' with your actual email address.")

# Broad PubMed search term used to collect candidate papers.
PUBMED_QUERY = "myostatin inhibitor"

# Narrow research question, phrased in PICO form, that candidates are ranked
# against:
#   Population   - patients with Duchenne Muscular Dystrophy (DMD)
#   Intervention - myostatin inhibitor therapy
#   Comparison   - placebo or standard of care
#   Outcome      - improvement in muscle function and strength
PICO_QUESTION = (
    "In patients with Duchenne Muscular Dystrophy, "
    "does myostatin inhibitor therapy, compared to placebo, "
    "lead to a significant improvement in muscle function and strength?"
)

# Embedding model, fetch limit and output location.
MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"
MAX_PAPERS_TO_FETCH = 75  # upper bound on PMIDs requested from PubMed
OUTPUT_CSV_FILE = "myostatin_pico_similarity_results.csv"

# --- FUNCTIONS ---

def fetch_pubmed_data(query: str, max_count: int) -> List[Dict[str, Any]]:
    """
    Fetch article data (PMID, title, abstract) from PubMed for a given query.

    Args:
        query: Search term passed to ``Entrez.esearch``.
        max_count: Maximum number of PMIDs to request (``retmax``).

    Returns:
        One dict per article that has a non-empty abstract, with the keys
        ``"pmid"``, ``"title"`` and ``"abstract"``. Returns an empty list when
        the search yields no PMIDs or when any error occurs; errors are
        reported on stdout rather than raised (best-effort behaviour).
    """
    print(f"Searching PubMed for: '{query}' (limit: {max_count} articles)...")

    try:
        # Step 1: Search PubMed to get a list of article IDs (PMIDs).
        handle = Entrez.esearch(db="pubmed", term=query, retmax=max_count)
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even if parsing fails so it is never leaked.
            handle.close()
        pmids = record["IdList"]

        if not pmids:
            print("No articles found for the query.")
            return []

        print(f"Found {len(pmids)} PMIDs. Fetching details...")

        # Step 2: Fetch the detailed records for the retrieved PMIDs.
        handle = Entrez.efetch(db="pubmed", id=pmids, rettype="medline", retmode="xml")
        try:
            records = Entrez.read(handle)
        finally:
            handle.close()

        articles = []
        for pubmed_article in records.get('PubmedArticle', []):
            medline_citation = pubmed_article.get('MedlineCitation', {})
            article_info = medline_citation.get('Article', {})

            # The abstract may be missing or split into several sections.
            abstract_parts = article_info.get('Abstract', {}).get('AbstractText', [])
            if isinstance(abstract_parts, str):
                # A lone string would otherwise be joined character by character.
                abstract_parts = [abstract_parts]
            abstract = ' '.join(str(part) for part in abstract_parts).strip()

            if abstract:  # Only include articles that have an abstract
                articles.append({
                    "pmid": str(medline_citation.get('PMID', '')),
                    "title": str(article_info.get('ArticleTitle', 'No Title Found')),
                    "abstract": abstract
                })

        print(f"Successfully fetched {len(articles)} articles with abstracts.")
        return articles

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller see
        # "no articles" instead of a traceback.
        print(f"An error occurred while fetching data from PubMed: {e}")
        return []

def create_pico_instruction(pico_question: str) -> str:
    """Wrap a PICO question in the two-line ``Instruct:`` / ``Query:`` prompt."""
    task_line = (
        "Instruct: Find medical abstracts that address the following clinical research question."
    )
    query_line = f"Query: {pico_question}"
    # The task description and the query sit on separate lines.
    return "\n".join((task_line, query_line))

def main():
    """Run the pipeline: fetch abstracts, embed, rank by similarity, export CSV."""

    # --- Fetch candidate papers; nothing to do if PubMed returned none. ---
    articles = fetch_pubmed_data(PUBMED_QUERY, MAX_PAPERS_TO_FETCH)
    if not articles:
        return

    # --- Load the embedding model through vLLM. ---
    print(f"\nInitializing embedding model: {MODEL_NAME}...")
    # To spread the model over several GPUs, the original author suggests also
    # passing tensor_parallel_size=torch.cuda.device_count() here.
    llm = LLM(model=MODEL_NAME, task="embed")

    # --- Build the input batch: instructed query first, then every abstract. ---
    texts_to_embed = [create_pico_instruction(PICO_QUESTION)]
    texts_to_embed.extend(entry["abstract"] for entry in articles)

    # --- Embed the whole batch and report the wall-clock time. ---
    print(f"Generating embeddings for {len(texts_to_embed)} texts...")
    started = time.time()
    outputs = llm.embed(texts_to_embed)
    finished = time.time()
    print(f"Embedding generation took {finished - started:.2f} seconds.")

    # --- Cosine similarity of the query against every document. ---
    raw_vectors = torch.tensor([result.outputs.embedding for result in outputs])
    # With unit-length rows a plain dot product equals the cosine similarity.
    unit_vectors = F.normalize(raw_vectors, p=2, dim=1)
    query_vector, document_vectors = unit_vectors[0], unit_vectors[1:]
    scores = (query_vector @ document_vectors.T).tolist()

    # --- Attach each score to its article (same order as the abstracts). ---
    for entry, score in zip(articles, scores):
        entry['similarity_score'] = score

    # --- Rank: highest similarity first. ---
    sorted_articles = list(articles)
    sorted_articles.sort(key=lambda entry: entry['similarity_score'], reverse=True)

    # --- Write the ranked list to disk. ---
    print(f"\nSaving results to '{OUTPUT_CSV_FILE}'...")
    columns = ['pmid', 'similarity_score', 'title', 'abstract']
    with open(OUTPUT_CSV_FILE, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        writer.writerows(sorted_articles)

    # --- Console summary of the best matches. ---
    print("\n✅ Success! Process complete.")
    print("\nTop 5 most relevant articles based on the PICO question:")
    for rank, entry in enumerate(sorted_articles[:5], start=1):
        print(f"{rank}. PMID: {entry['pmid']} | Score: {entry['similarity_score']:.4f} | Title: {entry['title']}")

# Run the pipeline only when this code is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


INFO 08-07 22:00:26 [__init__.py:235] Automatically detected platform cuda.


ImportError: DLL load failed while importing _C: Não foi possível encontrar o módulo especificado.

In [1]:
import os, ctypes, torch, importlib.util, sys
# Environment report: interpreter, PyTorch build, and whether CUDA is usable.
print("Python:", sys.version)
print("Torch:", torch.__version__, "CUDA:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

# Try to load each DLL by bare name and report whether the loader finds it.
# ctypes.WinDLL exists only on Windows, so this cell is Windows-specific.
for library_name in ("cudart64_124.dll", "torch_cuda.dll"):
    try:
        ctypes.WinDLL(library_name)
    except OSError as load_error:
        print(library_name, " ✗", load_error)
    else:
        print(library_name, " ✓ found")


Python: 3.12.9 | packaged by conda-forge | (main, Mar  4 2025, 22:37:18) [MSC v.1943 64 bit (AMD64)]
Torch: 2.6.0+cu124 CUDA: 12.4
CUDA available: True
cudart64_124.dll  ✗ Could not find module 'cudart64_124.dll' (or one of its dependencies). Try using the full path with constructor syntax.
torch_cuda.dll  ✓ found


In [None]:
"""
Find PubMed abstracts that best answer a PICO question
— WITHOUT vLLM —
Uses Qwen/Qwen3-Embedding-0.6B via Sentence-Transformers.
"""

import csv, time, torch, torch.nn.functional as F
from typing import List, Dict, Any
from Bio import Entrez
from sentence_transformers import SentenceTransformer           # <-- NEW

# ---------- USER CONFIG -----------------------------------------------------
# Contact address that NCBI requires from Entrez clients.
ENTREZ_EMAIL = "levi@example.com"

# Broad PubMed search term used to collect candidate abstracts.
PUBMED_QUERY = "myostatin inhibitor"

# Research question the abstracts are ranked against.
PICO_QUESTION = (
    "In patients with Duchenne Muscular Dystrophy, "
    "does myostatin inhibitor therapy, compared to placebo, "
    "lead to significant improvement in muscle function and strength?"
)

# Maximum number of PMIDs requested from PubMed.
MAX_PAPERS = 75

# Embedding model identifier (any Sentence-Transformers model can be swapped in).
MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"

# Request half precision; this cell's loader only honours it when CUDA is available.
USE_FP16 = True

# Destination file for the ranked results.
OUT_CSV = "myostatin_pico_similarity_results.csv"
# ---------------------------------------------------------------------------

Entrez.email = ENTREZ_EMAIL


# -------------------- 1. Fetch PubMed data ---------------------------------
def fetch_pubmed(query: str, limit: int) -> List[Dict[str, Any]]:
    """Return PubMed articles matching *query* that have a non-empty abstract.

    Args:
        query: Search term passed to ``Entrez.esearch``.
        limit: Maximum number of PMIDs to request (``retmax``).

    Returns:
        A list of dicts with the keys ``"pmid"``, ``"title"`` and
        ``"abstract"``; empty when the search yields no PMIDs. Errors raised
        by the Entrez calls propagate to the caller.
    """
    handle = Entrez.esearch(db="pubmed", term=query, retmax=limit)
    try:
        pmids = Entrez.read(handle)["IdList"]
    finally:
        # Close the handle even when parsing fails so it is never leaked.
        handle.close()
    if not pmids:
        return []

    handle = Entrez.efetch(db="pubmed", id=pmids, rettype="medline", retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()

    articles = []
    for art in records.get("PubmedArticle", []):
        cite = art["MedlineCitation"]
        info = cite.get("Article", {})
        # An abstract may be split into several sections; join them with
        # spaces and drop abstracts that turn out to be only whitespace.
        abst = " ".join(
            str(part) for part in info.get("Abstract", {}).get("AbstractText", [])
        ).strip()
        if abst:
            articles.append({
                # Plain str keeps the record independent of the parser's types.
                "pmid": str(cite.get("PMID", "")),
                "title": str(info.get("ArticleTitle", "—")),
                "abstract": abst,
            })
    return articles


# -------------------- 2. Load embedding model ------------------------------
# Choose the device first, then derive the precision from it: half precision is
# used only when CUDA is present and USE_FP16 is enabled; otherwise float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float32
if device == "cuda" and USE_FP16:
    dtype = torch.float16

# NOTE(review): both `device` and `device_map="auto"` are passed below; confirm
# that the installed sentence-transformers / accelerate versions accept the
# combination.
model = SentenceTransformer(
    MODEL_NAME,
    device=device,
    # Downloaded weights are cached in a local folder.
    cache_folder=".hf_cache",
    model_kwargs={
        "torch_dtype": dtype,   # precision of the loaded weights
        "device_map": "auto",   # presumably lets the loader place layers across GPUs
    },
    # Left padding: the original author notes this is recommended for Qwen3.
    tokenizer_kwargs={"padding_side": "left"},
)

# -------------------- 3. Prepare texts -------------------------------------
articles = fetch_pubmed(PUBMED_QUERY, MAX_PAPERS)
if not articles:
    # Explicit raise rather than `assert`: assertions are stripped when Python
    # runs with -O, which would let an empty batch reach the encoder.
    raise RuntimeError("No PubMed abstracts found.")

# The query carries a task instruction; the abstracts are embedded as-is.
instruction = (
    "Instruct: Find medical abstracts that address the following clinical research question.\n"
    f"Query: {PICO_QUESTION}"
)
# Index 0 is the query; indices 1.. line up with `articles`.
texts = [instruction] + [a["abstract"] for a in articles]

# -------------------- 4. Encode --------------------------------------------
# Embed the query and all abstracts in one call and time it.
start = time.time()
emb = model.encode(
    texts,
    batch_size=32,              # lower this if the GPU runs out of memory
    convert_to_tensor=True,     # one torch.Tensor instead of a list of arrays
    normalize_embeddings=True,  # rows come back L2-normalised
)
elapsed = time.time() - start
print(f"Embedding time: {elapsed:.1f}s")

# Row 0 is the query; the remaining rows are the documents.
query_vec = emb[0]
doc_vecs = emb[1:]
# Dot product of unit vectors equals the cosine similarity.
sims = doc_vecs @ query_vec

# -------------------- 5. Rank & export -------------------------------------
# Attach each score to its article; one tolist() call replaces a separate
# .item() conversion for every element.
for art, score in zip(articles, sims.tolist()):
    art["similarity_score"] = score

# Highest similarity first.
articles.sort(key=lambda d: d["similarity_score"], reverse=True)

with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, ["pmid", "similarity_score", "title", "abstract"])
    writer.writeheader()
    writer.writerows(articles)

print("\nTop-5 hits:")
for i, art in enumerate(articles[:5], 1):
    title = str(art["title"])
    # Add the ellipsis only when the title was actually cut short.
    shown = title if len(title) <= 90 else title[:90] + "…"
    print(f"{i}. PMID {art['pmid']}  ({art['similarity_score']:.4f})  {shown}")
