#### Warning:
To run this notebook, your working directory must be the "blackwell" package root directory.

# Medline XML Indexing Notebook

In [None]:
import requests
import xml.etree.ElementTree as ET
from typing import List, Dict
import re
import json

# Download the Medline XML file
XML_URL = "https://medlineplus.gov/xml/mplus_topics_2025-11-29.xml"

print(f"Downloading XML from {XML_URL}...")
# requests applies no timeout by default; without one a stalled connection
# would hang this cell indefinitely.
response = requests.get(XML_URL, timeout=60)
# Fail here on a 4xx/5xx response so an error page is never handed to the
# XML parser further down.
response.raise_for_status()
xml_content = response.content
print(f"Downloaded {len(xml_content)} bytes")

Downloading XML from https://medlineplus.gov/xml/mplus_topics_2025-11-29.xml...
Downloaded 29957417 bytes


#### Complete XML Topic-Parsing logic

In [43]:
def parse_health_topic(topic_elem, source_url: str) -> Dict:
    """
    Convert one health-topic XML element into a single document dictionary.

    Args:
        topic_elem: The element to read; its attributes and direct children
            ('also-called', 'full-summary', 'primary-institute',
            'see-reference', 'group', 'mesh-heading', 'site') are used.
        source_url: Stored unchanged under metadata['source'].

    Returns:
        A dict with two keys: 'content' (plain-text rendering of the topic)
        and 'metadata' (flat dict; 'site_links' is a JSON-encoded string).
    """
    def stripped_texts(tag):
        # Stripped text of every direct child called `tag` that has text.
        return [node.text.strip() for node in topic_elem.findall(tag) if node.text]

    # Topic attributes
    title = topic_elem.get('title', '')
    meta_desc = topic_elem.get('meta-desc', '')

    # Simple repeated children, each flattened to a comma-separated string
    also_called = ', '.join(stripped_texts('also-called'))
    see_also = ', '.join(stripped_texts('see-reference'))
    categories = ', '.join(stripped_texts('group'))

    # Full summary with anything that looks like a markup tag removed
    summary_node = topic_elem.find('full-summary')
    raw_summary = ''
    if summary_node is not None and summary_node.text:
        raw_summary = summary_node.text
    summary = re.sub(r'<[^>]+>', '', raw_summary)

    # Primary institute
    institute_node = topic_elem.find('primary-institute')
    institute = ''
    if institute_node is not None and institute_node.text:
        institute = institute_node.text.strip()

    # MeSH headings: keep the descriptor text of each heading that has one
    mesh_names = []
    for heading in topic_elem.findall('mesh-heading'):
        descriptor = heading.find('descriptor')
        if descriptor is not None and descriptor.text:
            mesh_names.append(descriptor.text.strip())
    mesh = ', '.join(mesh_names)

    # Site links bucketed by information category, in first-seen order
    resources = {}
    for site in topic_elem.findall('site'):
        cat_node = site.find('information-category')
        if cat_node is not None and cat_node.text:
            bucket = cat_node.text.strip()
        else:
            bucket = 'General'
        resources.setdefault(bucket, []).append(
            {'title': site.get('title', ''), 'url': site.get('url', '')}
        )

    # Text body: one labelled section per non-empty field, in fixed order
    sections = (
        ("Title: ", title),
        ("\nDescription: ", meta_desc),
        ("\nAlso Called: ", also_called),
        ("\nFull Summary:\n", summary),
        ("\nPrimary Institute: ", institute),
        ("\nSee Also: ", see_also),
        ("\nCategories: ", categories),
        ("\nMedical Subject Headings: ", mesh),
    )
    pieces = [label + value for label, value in sections if value]

    if resources:
        pieces.append("\n\nAdditional Resources:\n")
        for bucket, entries in resources.items():
            pieces.append(f"\n{bucket}:\n")
            pieces.extend(
                f"  - {entry['title']}: {entry['url']}\n" for entry in entries
            )

    # Every link once more as a flat list, tagged with its category
    flat_links = [
        {'title': entry['title'], 'url': entry['url'], 'category': bucket}
        for bucket, entries in resources.items()
        for entry in entries
    ]

    metadata = {
        'source': source_url,
        'topic_title': title,
        'topic_url': topic_elem.get('url', ''),
        'topic_id': topic_elem.get('id', ''),
        'date_created': topic_elem.get('date-created', ''),
        'num_site_links': len(flat_links),
        'language': topic_elem.get('language', 'English'),
        'site_links': json.dumps(flat_links),
        'type': 'medlineplus_topic'
    }

    return {'content': "".join(pieces), 'metadata': metadata}

# Build the element tree from the downloaded bytes, then gather every
# <health-topic> element below the document root into a list.
root = ET.fromstring(xml_content)
topics = list(root.iterfind('.//health-topic'))

#### Parse all topics into documents

In [44]:
from langchain_core.documents import Document

# One Document per topic: the parsed text becomes page_content and the
# parsed metadata dict is attached unchanged.
documents = [
    Document(page_content=parsed['content'], metadata=parsed['metadata'])
    for parsed in (parse_health_topic(topic, XML_URL) for topic in topics)
]

print(f"Created {len(documents)} documents from {len(topics)} topics")

Created 2033 documents from 2033 topics


#### Perform chunking over the documents

In [45]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings: chunk_size and chunk_overlap are measured with `len`,
# i.e. in characters of page content.
splitter_settings = {
    "chunk_size": 1536,
    "chunk_overlap": 256,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Split every document; the result is a flat list of chunk documents.
chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks from {len(documents)} documents")

Created 10328 chunks from 2033 documents


#### Ingest Documents into ChromaDB

In [46]:
# Ingest into ChromaDB
import time

# NOTE(review): `Chroma` was used in this cell without being imported anywhere
# in the notebook. `langchain_chroma` is assumed here because the notebook
# already uses the split `langchain_*` packages — confirm against the
# project's environment.
from langchain_chroma import Chroma

from blackwell.config import embeddings_model, DB_PATH, DB_COLLECTION

# NOTE(review): DB_COLLECTION is imported but the collection name below is
# hard-coded to "test_collection" — confirm which collection is intended
# before running this against the real database.
vector_store = Chroma(
    collection_name="test_collection",
    embedding_function=embeddings_model,
    persist_directory=DB_PATH,
)

batch_size = 3000
batch_pause_seconds = 60

# Add in batches of 3000, pausing 60s between batches given rate limits of
# Tier 1 API. The pause happens before every batch except the first, so no
# time is wasted sleeping after the final batch.
for i in range(0, len(chunks), batch_size):
    if i:
        time.sleep(batch_pause_seconds)
    # Slicing past the end of a list is safe, so the last (short) batch
    # needs no special case.
    batch = chunks[i:i + batch_size]
    print(f"Importing chunks {i} to {i + len(batch) - 1}")
    vector_store.add_documents(batch)

print(f"Total docs in DB: {len(vector_store.get()['ids'])}")

Importing chunks 0 to 2999
Importing chunks 3000 to 5999
Importing chunks 3000 to 5999
Importing chunks 6000 to 8999
Importing chunks 6000 to 8999
Importing chunks 9000 to 10327
Importing chunks 9000 to 10327
Done! Total docs in DB: 10328
Done! Total docs in DB: 10328
