# 🧬 PubMed to Qdrant Cloud Ingestion Notebook

This notebook allows you to fetch biomedical articles from PubMed, generate embeddings using Colab's GPU, and upsert them into **Qdrant Cloud**.

### 🚀 Setup Instructions:
1. Go to **Edit -> Notebook settings** and ensure **GPU** is selected.
2. Replace `QDRANT_URL` and `QDRANT_API_KEY` in the configuration cell with your Qdrant Cloud credentials, and set `Entrez.email` to your own email address.
3. Run all cells.

In [None]:
# Install dependencies
!pip install -q qdrant-client sentence-transformers biopython

## ⚙️ Configuration
Replace these placeholders with your Qdrant Cloud Cluster details.

In [None]:
import json
import os
from datetime import datetime

import torch
from Bio import Entrez
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from sentence_transformers import SentenceTransformer

# --- QDRANT CLOUD CONFIG ---
# Credentials are taken from the environment when the variables are set, so real
# secrets do not have to be saved inside the notebook. If they are not set, the
# placeholders below are used and must be replaced by hand.
QDRANT_URL = os.environ.get("QDRANT_URL", "YOUR_QDRANT_CLOUD_URL")  # e.g., https://xxxxxx.aws.cloud.qdrant.io:6333
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "YOUR_API_KEY")
COLLECTION_NAME = "Articles"
# Size of the vectors stored in the collection; it has to equal the length of
# the embeddings produced by the model used in run_ingestion().
VECTOR_SIZE = 384
# Name of the named-vector slot used both when creating the collection and when upserting.
VECTOR_NAME = "text"

# --- PUBMED CONFIG ---
# Contact address sent along with every Entrez request.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "your_email@example.com")

# Initialize Client
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

## 🛠️ Define Functions

In [None]:
def init_collection():
    """Make sure the target Qdrant collection exists.

    When ``COLLECTION_NAME`` is already present nothing is modified; otherwise
    the collection is created with a single named vector (``VECTOR_NAME``) of
    ``VECTOR_SIZE`` dimensions compared by cosine distance.
    """
    # Guard clause: nothing to create when the collection is already there.
    if client.collection_exists(COLLECTION_NAME):
        print(f"Collection {COLLECTION_NAME} already exists.")
        return

    print(f"Creating collection: {COLLECTION_NAME}")
    named_vectors = {
        VECTOR_NAME: VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
    }
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=named_vectors,
    )

def fetch_pubmed_articles(from_date=None, max_results=500):
    """Search PubMed for articles that have an abstract and return them as dicts.

    Args:
        from_date: Optional lower bound for the publication date, written the
            way it should appear inside a ``[PDAT]`` range (e.g. ``"2024/01/31"``).
            When falsy, no date filter is added to the query.
        max_results: Maximum number of PMIDs requested from ESearch (``retmax``).

    Returns:
        A list of dicts with the keys ``pmid``, ``title``, ``abstract`` and
        ``url``, all plain ``str`` values. The list is empty when the search
        returns no PMIDs.
    """
    query = "hasabstract[text]"
    if from_date:
        # Open-ended range: 3000 acts as a "far future" upper bound.
        query += f" AND ({from_date}[PDAT] : 3000[PDAT])"

    print(f"Searching PubMed for: {query}")
    # Bio.Entrez URL-encodes parameter values, so the URL-style spelling
    # "pub+date" would reach NCBI with a literal "+". "pub_date" is the value
    # documented for PubMed.
    search_handle = Entrez.esearch(
        db="pubmed", term=query, retmax=max_results, sort="pub_date"
    )
    try:
        search_results = Entrez.read(search_handle)
    finally:
        # Always release the HTTP connection, even if parsing fails.
        search_handle.close()
    pmids = search_results["IdList"]

    if not pmids:
        return []

    print(f"Fetching {len(pmids)} articles...")
    fetch_handle = Entrez.efetch(
        db="pubmed", id=",".join(pmids), rettype="abstract", retmode="xml"
    )
    try:
        records = Entrez.read(fetch_handle)
    finally:
        fetch_handle.close()

    articles = []
    for article in records["PubmedArticle"]:
        medline = article["MedlineCitation"]
        pmid = str(medline["PMID"])
        art = medline["Article"]

        # An article may lack an abstract (or its text parts) entirely; fall
        # back to an empty string instead of raising.
        abstract_parts = art.get("Abstract", {}).get("AbstractText", [])
        abstract = " ".join(str(part) for part in abstract_parts)

        articles.append({
            "pmid": pmid,
            # Convert the parser's string subclass to a plain str so that every
            # payload field has the same, serialisation-friendly type.
            "title": str(art.get("ArticleTitle", "")),
            "abstract": abstract,
            "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
        })
    return articles

def run_ingestion(from_date=None, max_results=1000, batch_size=256):
    """Fetch PubMed articles, embed their abstracts and upsert them into Qdrant.

    Args:
        from_date: Optional lower publication-date bound, passed through to
            ``fetch_pubmed_articles``.
        max_results: Maximum number of articles to request from PubMed.
        batch_size: Number of points sent per upsert request. Splitting the
            upload keeps each request small.

    Returns:
        None. Progress is reported via ``print``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Validate before touching any remote service.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # 1. Init
    init_collection()

    # 2. Fetch
    fetched = fetch_pubmed_articles(from_date=from_date, max_results=max_results)
    # A blank abstract has nothing to embed; storing its vector would only add
    # noise to similarity search, so such articles are left out.
    articles = [a for a in fetched if a["abstract"].strip()]
    skipped = len(fetched) - len(articles)
    if skipped:
        print(f"Skipping {skipped} articles without an abstract.")
    if not articles:
        print("No articles found.")
        return

    # 3. Embed (GPU)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Generating embeddings on: {device.upper()}")
    model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

    abstracts = [a["abstract"] for a in articles]
    vectors = model.encode(abstracts, show_progress_bar=True)

    # 4. Upsert
    print(f"Upserting {len(articles)} points to Qdrant Cloud...")
    points = [
        PointStruct(
            # The PMID doubles as the point id, so re-ingesting an article
            # overwrites its existing point instead of duplicating it.
            id=int(art["pmid"]),
            vector={VECTOR_NAME: emb.tolist()},
            payload=art,
        )
        for art, emb in zip(articles, vectors)
    ]

    for start in range(0, len(points), batch_size):
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=points[start:start + batch_size],
        )
    print("✅ Ingestion Complete!")

## 🚀 Run Ingestion

In [None]:
# Lower bound for the PubMed publication-date filter (e.g. "2026/01/29").
# Use None to skip the date filter, for instance on the initial load.
DATE_TO_FETCH = "2026/01/29"

run_ingestion(
    max_results=1000,
    from_date=DATE_TO_FETCH,
)