## Import Libraries

In [None]:
from pyeuropepmc import SearchClient, FullTextClient
import os
import time
from datetime import datetime

## Search for Papers

Let's search for open-access papers, since these are the ones most likely to have XML full text available.

In [None]:
# Tunable parameters for this cell
MAX_PAPERS = 30      # number of search hits to request
PREVIEW_COUNT = 5    # how many hits to list in the preview below
TITLE_WIDTH = 80     # titles longer than this are truncated in the preview

# Initialize clients (fulltext_client is used by the download cell further down)
search_client = SearchClient()
fulltext_client = FullTextClient()

# Search for papers (open access papers are more likely to have XML)
query = "cancer immunotherapy OPEN_ACCESS:y"

print("Searching for papers...")
print(f"Query: {query}")

results = search_client.search(query, limit=MAX_PAPERS)

# Guard the nested lookup: if the response is empty or lacks these keys,
# continue with an empty list instead of raising KeyError/TypeError here.
papers = ((results or {}).get("resultList") or {}).get("result") or []

print(f"\nFound {len(papers)} papers")

# Show sample papers
for i, paper in enumerate(papers[:PREVIEW_COUNT], 1):
    # `or` also covers a title that is present but None/empty
    title = paper.get('title') or 'No title'
    # Only add an ellipsis when the title was actually cut
    if len(title) > TITLE_WIDTH:
        title = title[:TITLE_WIDTH] + "..."
    print(f"{i}. {title}")
    print(f"   DOI: {paper.get('doi', 'N/A')}")
    print(f"   PMCID: {paper.get('pmcid', 'N/A')}")
    print()

## Download XML for All Papers

Now let's download the XML content for all papers. We'll try each one and keep track of successes and failures.

In [None]:
# Create output directory (exist_ok makes the cell safe to re-run)
output_dir = "xml_downloads"
os.makedirs(output_dir, exist_ok=True)

# Pause between requests, in seconds, to be respectful to the API
REQUEST_DELAY_SECONDS = 0.5

print(f"Downloading XML for {len(papers)} papers...")
print(f"Saving to: {output_dir}/")
print()

downloaded_files = []   # paths of the XML files written successfully
failed_downloads = []   # (identifier, error message) for every failed paper

start_time = time.time()

for i, paper in enumerate(papers, 1):
    doi = paper.get('doi')
    pmcid = paper.get('pmcid')

    # Label used in progress output and as the basis of the file name.
    # Note the DOI is preferred here, while the download below prefers the PMCID.
    identifier = doi or pmcid or f"paper_{i}"
    print(f"[{i}/{len(papers)}] Downloading: {identifier}")

    try:
        # Try to download XML
        if pmcid:
            xml_content = fulltext_client.download_xml(pmcid)
        elif doi:
            xml_content = fulltext_client.download_xml_by_doi(doi)
        else:
            raise ValueError("No PMCID or DOI available")

        # Validate the payload BEFORE opening the output file, so a missing or
        # non-text response does not leave an empty .xml file behind.
        if not isinstance(xml_content, str) or not xml_content.strip():
            raise ValueError(
                f"No XML content returned (got {type(xml_content).__name__})"
            )

        # Build a filesystem-safe name: keep letters, digits, '-' and '_', and
        # replace everything else ('/', '.', ':', '<', '>', ';', spaces, ...)
        # with '_'.
        safe_identifier = "".join(
            ch if ch.isalnum() or ch in "-_" else "_" for ch in identifier
        )
        filename = f"{output_dir}/{safe_identifier}.xml"

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(xml_content)

        downloaded_files.append(filename)
        print(f"  ✓ Saved: {filename} ({len(xml_content)} chars)")

    except Exception as e:
        # Best-effort batch: record the failure and carry on with the next paper
        failed_downloads.append((identifier, str(e)))
        print(f"  ✗ Failed: {e}")

    # Small delay to be respectful to the API; skipped after the final paper
    # so it does not inflate the measured duration.
    if i < len(papers):
        time.sleep(REQUEST_DELAY_SECONDS)

end_time = time.time()
duration = end_time - start_time

# ---- Report the outcome of the batch download ----
rule = '=' * 60
n_papers = len(papers)
n_ok = len(downloaded_files)
n_failed = len(failed_downloads)

print(f"\n{rule}")
print("DOWNLOAD SUMMARY")
print(rule)
print(f"Total papers: {n_papers}")
print(f"Successful downloads: {n_ok}")
print(f"Failed downloads: {n_failed}")
print(f"Total time: {duration:.1f} seconds")
# Skip the average when there were no papers (avoids dividing by zero)
if n_papers:
    print(f"Average time per paper: {duration/n_papers:.1f} seconds")

# List at most the first 10 saved files, then a count of the remainder
shown_ok = downloaded_files[:10]
if shown_ok:
    print("\nDownloaded files:")
    for saved_path in shown_ok:
        print(f"  - {saved_path}")
    hidden_ok = n_ok - len(shown_ok)
    if hidden_ok > 0:
        print(f"  ... and {hidden_ok} more")

# List at most the first 5 failures, then a count of the remainder
shown_failed = failed_downloads[:5]
if shown_failed:
    print("\nFailed downloads:")
    for failed_id, reason in shown_failed:
        print(f"  - {failed_id}: {reason}")
    hidden_failed = n_failed - len(shown_failed)
    if hidden_failed > 0:
        print(f"  ... and {hidden_failed} more failures")

print(f"\nXML files saved in: {output_dir}/")

## Verify Downloads

Let's check that the downloads were successful by examining one of the files.

In [None]:
# Sanity-check the first downloaded file: report its size, whether the expected
# top-level XML elements are present, and show the start of the document.
if downloaded_files:
    # Check the first downloaded file
    sample_file = downloaded_files[0]
    print(f"Checking sample file: {sample_file}")

    with open(sample_file, 'r', encoding='utf-8') as f:
        content = f.read()

    print(f"File size: {len(content)} characters")

    # An opening tag may carry attributes (e.g. namespace declarations on
    # <article ...>), so a literal '<article>' substring test would miss it.
    # Match the tag name followed by '>' or by whitespace instead.
    tag_terminators = (">", " ", "\n", "\t", "\r")
    for tag in ("article", "front", "body"):
        found = any(f"<{tag}{end}" in content for end in tag_terminators)
        print(f"Contains '<{tag}>': {found}")

    # Show first 500 characters
    print("\nFirst 500 characters:")
    print("-" * 40)
    print(content[:500])
    print("-" * 40)
else:
    print("No files were downloaded successfully")

## Summary

This notebook demonstrated:

1. Searching for papers that are likely to have XML full text
2. Batch downloading XML content for multiple papers
3. Saving XML files locally with filesystem-safe filenames derived from each paper's identifier
4. Error handling for failed downloads
5. Download statistics and verification

## Next Steps

- Parse the downloaded XML files using `FullTextXMLParser`
- Convert to RDF using the `RDFMapper`
- Build a knowledge graph from multiple papers
- Analyze content across papers

The downloaded XML files can now be used as input for further processing in other notebooks like the data models demo or RDF conversion demo.