# Advanced Full Text Retrieval Examples

This notebook demonstrates the advanced features of the PyEuropePMC FullTextClient, including:

1. **HTML Full Text Download** - New HTML content retrieval
2. **Integrated Search and Download** - End-to-end workflow from search to full text
3. **Multi-format Batch Downloads** - Efficient bulk processing including HTML
4. **Smart Content Filtering** - Automatic availability checking

## Setup

In [None]:
import logging
from pathlib import Path
import tempfile
import time

from pyeuropepmc import FullTextClient

# Configure the root logger: INFO and above, rendered as "LEVEL: message".
LOG_FORMAT = '%(levelname)s: %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Logger named after the running module, for this notebook's own messages.
logger = logging.getLogger(__name__)

print("‚úÖ PyEuropePMC imported successfully")

## 1. HTML Full Text Download

The `FullTextClient` can now download an article's full text as HTML directly from Europe PMC.

In [None]:
# Initialize the full text client. The same instance is reused by the cells
# below and closed in the final cleanup cell.
# NOTE(review): rate_limit_delay is presumably seconds between requests - confirm.
fulltext_client = FullTextClient(rate_limit_delay=1.0)

print("üîç Testing HTML full text download...")

# Test HTML download for a known article
pmcid = "3257301"  # Known open access article
PREVIEW_CHARS = 200  # number of characters of the downloaded HTML to echo

# Download into a temporary directory so the demo leaves no files behind.
with tempfile.TemporaryDirectory() as temp_dir:
    output_path = Path(temp_dir) / f"PMC{pmcid}.html"

    try:
        # Download HTML content
        result = fulltext_client.download_html_by_pmcid(pmcid, output_path)

        if result and result.exists():
            file_size = result.stat().st_size
            print(f"‚úÖ HTML downloaded successfully: {result.name}")
            print(f"   File size: {file_size:,} bytes")

            # Read only the preview instead of loading the whole document
            # into memory just to slice off its first characters.
            with open(result, 'r', encoding='utf-8') as f:
                content = f.read(PREVIEW_CHARS)
            print(f"   Content preview: {content}...")
        else:
            print("‚ùå HTML download failed")

    except Exception as e:
        # Broad on purpose: report any failure as a message so the
        # notebook keeps running instead of stopping on a traceback.
        print(f"‚ùå Error during HTML download: {e}")

## 2. Multi-format Availability Checking

Check which full-text formats (PDF, XML, HTML) are available for each article.

In [None]:
print("üîç Checking multi-format availability...")

# Test with multiple PMC IDs
test_pmcids = ["3257301", "1716993", "5251083"]

for pmcid in test_pmcids:
    try:
        availability = fulltext_client.check_fulltext_availability(pmcid)
        print(f"\nPMC{pmcid} availability:")
        print(f"  üìÑ PDF:  {'‚úÖ' if availability['pdf'] else '‚ùå'}")
        print(f"  üìù XML:  {'‚úÖ' if availability['xml'] else '‚ùå'}")
        print(f"  üåê HTML: {'‚úÖ' if availability['html'] else '‚ùå'}")

        # Count available formats
        available_count = sum(availability.values())
        print(f"  Total: {available_count}/3 formats available")

    except Exception as e:
        print(f"‚ùå Error checking PMC{pmcid}: {e}")

## 3. Multi-format Batch Downloads

Download multiple articles in different formats efficiently.

In [None]:
print("üì• Testing multi-format batch downloads...")

test_pmcids = ["3257301", "1716993"]

with tempfile.TemporaryDirectory() as temp_dir:
    output_dir = Path(temp_dir)

    # Test each format
    formats = ['xml', 'html']  # Skip PDF for speed in this demo

    for format_type in formats:
        print(f"\nüîÑ Downloading {format_type.upper()} files...")

        try:
            results = fulltext_client.download_fulltext_batch(
                pmcids=test_pmcids,
                format_type=format_type,
                output_dir=output_dir,
                skip_errors=True
            )

            print(f"   Results for {format_type.upper()}:")
            successful = 0
            total_size = 0

            for pmcid, file_path in results.items():
                if file_path and file_path.exists():
                    size = file_path.stat().st_size
                    total_size += size
                    successful += 1
                    print(f"   ‚úÖ PMC{pmcid}: {file_path.name} ({size:,} bytes)")
                else:
                    print(f"   ‚ùå PMC{pmcid}: Failed")

            print(f"   Summary: {successful}/{len(test_pmcids)} files, {total_size:,} bytes total")

        except Exception as e:
            print(f"‚ùå Error in {format_type} batch download: {e}")

## 4. Integrated Search and Download Workflow

The new `search_and_download_fulltext` method provides an end-to-end workflow from search to full text download.

In [None]:
print("üîç‚û°Ô∏èüì• Testing integrated search and download workflow...")

with tempfile.TemporaryDirectory() as temp_dir:
    output_dir = Path(temp_dir)

    # Search for open access articles and download XML
    try:
        print("\nüîç Searching for 'CRISPR AND open access' and downloading XML...")

        results = fulltext_client.search_and_download_fulltext(
            query="CRISPR AND open access AND pmcid",
            format_type="xml",
            max_results=3,
            output_dir=output_dir,
            only_available=True  # Only download papers where XML is actually available
        )

        print("\nüìä Search and download results:")
        print(f"   Found and processed: {len(results)} articles")

        successful = 0
        total_size = 0

        for pmcid, file_path in results.items():
            if file_path and file_path.exists():
                size = file_path.stat().st_size
                total_size += size
                successful += 1
                print(f"   ‚úÖ PMC{pmcid}: {file_path.name} ({size:,} bytes)")

                # Show XML preview
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read(200)
                    print(f"      Preview: {content.strip()[:100]}...")
            else:
                print(f"   ‚ùå PMC{pmcid}: Download failed")

        print(f"\nüìà Summary: {successful}/{len(results)} successful downloads")
        print(f"   Total size: {total_size:,} bytes")

    except Exception as e:
        print(f"‚ùå Error in integrated workflow: {e}")

## 5. HTML-specific Workflow

Demonstrate the HTML-specific download workflow with search integration.

In [None]:
print("üåê Testing HTML-specific integrated workflow...")

with tempfile.TemporaryDirectory() as temp_dir:
    output_dir = Path(temp_dir)

    try:
        print("\nüîç Searching for articles and downloading HTML content...")

        # Search and download HTML for open access articles
        results = fulltext_client.search_and_download_fulltext(
            query="vaccine AND COVID-19 AND pmcid",
            format_type="html",
            max_results=2,
            output_dir=output_dir,
            only_available=True,
        )

        print("\nüåê HTML download workflow results:")

        for pmcid, file_path in results.items():
            if file_path and file_path.exists():
                size = file_path.stat().st_size
                print(f"   ‚úÖ PMC{pmcid}: {file_path.name} ({size:,} bytes)")

                # Analyze HTML content
                with open(file_path, "r", encoding="utf-8") as f:
                    html_content = f.read()

                    # Count some HTML elements
                    title_count = html_content.count("<title")
                    p_count = html_content.count("<p")
                    link_count = html_content.count("<a href")

                    print(
                        f"HTML analysis: {title_count} titles, {p_count} paragraphs, {link_count} links"
                    )
            else:
                print(f"   ‚ùå PMC{pmcid}: HTML download failed")

        success_rate = (
            len([p for p in results.values() if p]) / len(results) * 100 if results else 0
        )
        print(f"\nüìä HTML workflow success rate: {success_rate:.1f}%")

    except Exception as e:
        print(f"‚ùå Error in HTML workflow: {e}")

## 6. Performance and Comparison

Compare the download time and file size of the XML and HTML formats for a single article.

In [None]:
print("‚ö° Performance comparison of different formats...")

test_pmcid = "3257301"  # Known fast-downloading article

with tempfile.TemporaryDirectory() as temp_dir:
    output_dir = Path(temp_dir)

    # One download method per format (PDF skipped for speed). A dispatch
    # table guarantees every format in the loop has a callable, so `result`
    # can never be left unbound by a missing branch.
    downloaders = {
        'xml': fulltext_client.download_xml_by_pmcid,
        'html': fulltext_client.download_html_by_pmcid,
    }
    # format -> {'duration': seconds, 'size': bytes, 'speed': KB/s}
    timing_results = {}

    for format_type, download in downloaders.items():
        print(f"\n‚è±Ô∏è  Testing {format_type.upper()} download speed...")

        target_path = output_dir / f"PMC{test_pmcid}.{format_type}"

        try:
            # perf_counter is a monotonic, high-resolution clock meant for
            # measuring intervals; wall-clock time.time() can jump if the
            # system clock is adjusted mid-download.
            start_time = time.perf_counter()
            result = download(test_pmcid, target_path)
            duration = time.perf_counter() - start_time

            if result and result.exists():
                size = result.stat().st_size
                # Guard against a zero-length interval on coarse clocks.
                speed = size / duration / 1024 if duration > 0 else float('inf')  # KB/s
                timing_results[format_type] = {
                    'duration': duration,
                    'size': size,
                    'speed': speed
                }
                print(f"   ‚úÖ {format_type.upper()}: {duration:.2f}s, {size:,} bytes, {speed:.1f} KB/s")
            else:
                print(f"   ‚ùå {format_type.upper()}: Download failed")

        except Exception as e:
            # A failure in one format must not stop the remaining formats.
            print(f"   ‚ùå {format_type.upper()}: Error - {e}")

    # Summary (only when at least one format downloaded successfully)
    if timing_results:
        print("\nüìä Performance Summary:")
        fastest = min(timing_results.items(), key=lambda x: x[1]['duration'])
        largest = max(timing_results.items(), key=lambda x: x[1]['size'])
        print(f"   üèÉ Fastest: {fastest[0].upper()} ({fastest[1]['duration']:.2f}s)")
        print(f"   üìè Largest: {largest[0].upper()} ({largest[1]['size']:,} bytes)")

## 7. Cleanup

Clean up resources properly.

In [None]:
# Close the shared client now that all demo cells have run.
fulltext_client.close()
print("‚úÖ FullTextClient closed successfully")

# Closing banner followed by the feature recap, printed one line at a time.
closing_lines = (
    "\nüéâ Advanced Full Text Retrieval demonstration completed!",
    "\nKey new features demonstrated:",
    "   üåê HTML content download",
    "   üîç‚û°Ô∏èüì• Integrated search-to-download workflow",
    "   üì¶ Multi-format batch processing",
    "   üéØ Smart availability filtering",
    "   ‚ö° Performance optimization",
)
for closing_line in closing_lines:
    print(closing_line)