In [None]:
# !pip install lxml

In [None]:
# Download the metadata file "PMCLiteMetadata.tgz" from here: https://europepmc.org/pub/databases/pmc/PMCLiteMetadata/

In [None]:
# Unzip PMCLiteMetadata.tgz

import tarfile

# Path to the .tgz file
tgz_path = "PMCLiteMetadata.tgz"  # Update the path if needed

# Directory the archive members are unpacked into
extract_dir = "extracted_pmc_metadata"

# Extract the .tgz file
with tarfile.open(tgz_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # The archive comes from the internet, so refuse members that would
        # escape extract_dir (absolute paths, "..", links pointing outside).
        tar.extractall(path=extract_dir, filter="data")
    else:
        # NOTE(review): this interpreter predates tarfile extraction filters,
        # so members are extracted unchecked - only use a trusted archive.
        tar.extractall(path=extract_dir)
    print(f"Extracted files to: {extract_dir}")

In [None]:
import pandas as pd
import os
import tarfile
import time
import glob
import os
import re
import io
from tqdm import tqdm

# Ultra-fast version using lxml (much faster than xml.etree)
try:
    from lxml import etree
    LXML_AVAILABLE = True
    print("lxml is available - using high-performance XML parser")
except ImportError:
    LXML_AVAILABLE = False
    print("lxml not available - using standard parser (slower)")

def parse_pmc_xml_robust(xml_file, max_articles=None):
    """
    Text-based fallback parser for PMC metadata files whose XML is malformed.

    The whole file is read into memory, split on the literal '<PMC_ARTICLE>'
    opening tag, and each resulting section is handed to
    extract_article_data_regex(), so no XML parser is involved.

    Args:
        xml_file: Path of the XML file to read.
        max_articles: Optional cap on the number of sections processed;
            None (or 0) processes every section.

    Returns:
        pandas.DataFrame with one row per section that yielded data. An empty
        DataFrame is returned when reading or extraction raises.
    """

    file_size = os.path.getsize(xml_file)
    # Rough guess (about 4000 bytes per article); only used for the log line.
    estimated_articles = file_size // 4000

    if max_articles:
        estimated_articles = min(estimated_articles, max_articles)

    print(f"File: {xml_file}")
    print(f"Size: {file_size:,} bytes")
    print(f"Estimated articles: ~{estimated_articles:,}")
    print("Starting robust extraction (handles malformed XML)...")

    data = []
    article_count = 0
    start_time = time.time()

    try:
        with open(xml_file, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()

        # Split content by PMC_ARTICLE tags; element 0 is whatever precedes
        # the first article (XML prolog / root tag) and is skipped below.
        article_sections = content.split('<PMC_ARTICLE>')
        del content  # the sections hold the text now; release the big string

        section_count = len(article_sections) - 1
        print(f"Found {section_count} article sections")

        # The exact amount of work is known here, so size the bar from the
        # real section count (capped by max_articles) rather than the
        # byte-size estimate, which could be smaller than the real count.
        progress_total = min(section_count, max_articles) if max_articles else section_count

        # The context manager closes the bar on success and on error alike.
        with tqdm(total=progress_total, desc="Extracting articles", unit="articles") as pbar:
            for section in article_sections[1:]:
                if max_articles and article_count >= max_articles:
                    break

                # Extract data using regex patterns instead of XML parsing
                article_data = extract_article_data_regex(section)
                if article_data:
                    data.append(article_data)

                article_count += 1
                pbar.update(1)

                # Refresh the rate/valid counters every 1000 articles
                if article_count % 1000 == 0:
                    elapsed = time.time() - start_time
                    rate = article_count / elapsed if elapsed > 0 else 0
                    pbar.set_postfix({
                        'Rate': f'{rate:.1f}/s',
                        'Valid': len(data)
                    })

    except Exception as e:
        # Best effort by design: report the problem and let the caller treat
        # the empty frame as "nothing extracted".
        print(f"Error processing file: {e}")
        return pd.DataFrame()

    elapsed_time = time.time() - start_time
    print(f"\nProcessing completed in {elapsed_time/60:.2f} minutes")
    print(f"Processed {article_count:,} articles")
    print(f"Extracted {len(data):,} valid articles")
    if elapsed_time > 0:
        print(f"Rate: {article_count/elapsed_time:.1f} articles/second")

    return pd.DataFrame(data)

def extract_article_data_regex(section):
    """
    Build one metadata record from the raw text of a single article section.

    Every scalar field is looked up with extract_tag_content() and the author
    string comes from extract_authors_regex(); missing values become ''.

    Returns:
        dict keyed by output column name, or None if any lookup raises.
    """
    # (output column, XML tag) pairs, in the column order of the result.
    column_tags = (
        ('article_id', 'id'),
        ('pmid', 'pmid'),
        ('pmcid', 'pmcid'),
        ('doi', 'DOI'),
        ('title', 'title'),
        ('journal_title', 'JournalTitle'),
        ('pub_year', 'PubYear'),
        ('journal_volume', 'JournalVolume'),
        ('issue', 'Issue'),
        ('page_info', 'PageInfo'),
        ('pub_type', 'PubType'),
        ('is_open_access', 'IsOpenAccess'),
    )

    try:
        record = {}
        for column, tag in column_tags:
            record[column] = extract_tag_content(section, tag) or ''
        record['authors'] = extract_authors_regex(section) or ''
        return record
    except Exception:
        return None

def extract_tag_content(text, tag_name):
    """Return the stripped text of the first <tag_name> element found in text.

    Args:
        text: Raw (possibly malformed) XML text to search.
        tag_name: Literal element name. It is regex-escaped, so characters
            such as '.' only match themselves.

    Returns:
        The stripped content of the first properly closed element (content may
        span lines). If none is closed, the stripped text between the first
        opening tag and the next '<' on the same line. None if the opening tag
        is not found at all.
    """
    tag = re.escape(tag_name)

    closed = re.search(f'<{tag}>(.*?)</{tag}>', text, re.DOTALL)
    if closed:
        return closed.group(1).strip()

    # Fallback for an element whose closing tag is missing: take everything up
    # to the next '<'. The lazy group stops at the first '<', so the captured
    # text can never contain one.
    unterminated = re.search(f'<{tag}>(.*?)<', text)
    if unterminated:
        return unterminated.group(1).strip()

    return None

def extract_authors_regex(section):
    """Collect the authors of one article section into a '; '-joined string.

    The AuthorList body is located with the first of several increasingly
    lenient patterns that matches; each <Author> inside it contributes either
    its CollectiveName or a "LastName, Initials" label. Returns '' when no
    author list content is found.
    """
    # Tried in order; the first pattern that matches wins.
    list_patterns = (
        r'<AuthorList[^>]*>(.*?)(?=</AuthorList>)',
        r'<AuthorList[^>]*>(.*?)(?=<Journal)',
        r'<AuthorList[^>]*>(.*?)(?=<[A-Z][a-z])',
    )
    attempts = (re.search(candidate, section, re.DOTALL) for candidate in list_patterns)
    list_match = next((hit for hit in attempts if hit), None)

    if list_match is None or not list_match.group(1):
        return ''

    names = []
    author_bodies = re.findall(r'<Author[^>]*>(.*?)(?=</Author>)', list_match.group(1), re.DOTALL)

    for body in author_bodies:
        surname = extract_tag_content(body, 'LastName') or ''
        initials = extract_tag_content(body, 'Initials') or ''
        group_name = extract_tag_content(body, 'CollectiveName') or ''

        if group_name:
            # Collective names may carry escaped markup characters
            names.append(
                group_name.replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
            )
            continue

        if not (surname or initials):
            continue

        label = f"{surname}, {initials}".strip(', ')
        # Personal names are only added once
        if label and label not in names:
            names.append(label)

    return '; '.join(names)

class CleaningFileWrapper:
    """Binary file wrapper that repairs common XML defects while being read.

    Each read() returns cleaned bytes:
      * control bytes that are not allowed in XML 1.0 are removed,
      * '&' that does not start a predefined or numeric entity becomes '&amp;',
      * '<' that does not start something shaped like a tag becomes '&lt;',
      * a '</PMC_ARTICLE>' is inserted in front of a '<PMC_ARTICLE>' that opens
        while the previous article is still open.

    A tag or entity that is cut in half by the end of a chunk is held back and
    prepended to the next chunk, so the result does not depend on chunk size.
    Because substitutions lengthen the data and tails are carried over, a
    read(size) call may return more or fewer than `size` bytes; b'' is only
    returned at end of file.

    Counters (chars_cleaned, ampersands_fixed, lt_fixed, articles_closed) are
    updated as data is read and summarised by close(). gt_fixed is kept for
    compatibility and is never incremented.
    """

    # Longest unterminated tag/entity tail (in bytes) carried to the next read.
    _MAX_HOLD = 4096

    def __init__(self, file_path):
        """Open file_path in binary mode and compile the cleaning patterns."""
        self.file_path = file_path
        self.file = open(file_path, 'rb')  # Open in binary mode

        # C0 control bytes other than tab/LF/CR, plus DEL. Bytes 0x80-0x9F are
        # deliberately NOT listed: at byte level they are ordinary UTF-8
        # continuation bytes (e.g. the en dash is E2 80 93) and removing them
        # corrupts multibyte characters.
        self.invalid_chars_regex = re.compile(rb'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')

        # '&' not followed by a valid entity reference
        # (&amp; &lt; &gt; &quot; &apos; &#number; &#xhex;)
        self.ampersand_regex = re.compile(rb'&(?!(?:amp|lt|gt|quot|apos|#(?:\d+|x[0-9a-fA-F]+));)')

        # '<' not followed by something shaped like a tag. [^<>] (rather than
        # [^>]) stops a stray '<' in text such as "B<FIT and ..." from being
        # accepted as a tag just because a later element supplies a '>'.
        self.unescaped_lt_regex = re.compile(rb'<(?![/?]?[a-zA-Z_][a-zA-Z0-9_:-]*(?:\s[^<>]*)?>)')
        # Kept for backward compatibility; not used by read().
        self.unescaped_gt_regex = re.compile(rb'(?<![a-zA-Z0-9_:-])>(?![^<]*</)')

        # A tag or entity that has started but not yet ended at end of data.
        self._partial_tail_regex = re.compile(
            rb'(?:<[/?]?(?:[a-zA-Z_][a-zA-Z0-9_:-]*(?:\s[^<>]*)?)?'
            rb'|&[#0-9a-zA-Z]{0,10})\Z'
        )
        self._article_tag_regex = re.compile(rb'</?PMC_ARTICLE>')

        self._pending = b''          # held-back tail of the previous chunk
        self._article_open = False   # True between <PMC_ARTICLE> and </PMC_ARTICLE>

        self.chars_cleaned = 0
        self.ampersands_fixed = 0
        self.lt_fixed = 0
        self.gt_fixed = 0
        self.articles_closed = 0

    def _safe_cut(self, data):
        """Return the index up to which data can be cleaned without guessing.

        Only the last _MAX_HOLD bytes are inspected, which bounds how much can
        be held back.
        """
        tail_start = max(0, len(data) - self._MAX_HOLD)
        partial = self._partial_tail_regex.search(data, tail_start)
        return partial.start() if partial else len(data)

    def _close_open_article(self, match):
        """re.sub callback: track article nesting and add missing closers."""
        tag = match.group(0)
        if tag == b'</PMC_ARTICLE>':
            self._article_open = False
            return tag
        if self._article_open:
            # A new article starts although the previous one was never closed.
            self.articles_closed += 1
            return b'</PMC_ARTICLE>' + tag
        self._article_open = True
        return tag

    def _repair_markup(self, data):
        """Escape stray '&' and '<' and balance PMC_ARTICLE tags in data."""
        data, amp_count = self.ampersand_regex.subn(b'&amp;', data)
        self.ampersands_fixed += amp_count

        data, lt_count = self.unescaped_lt_regex.subn(b'&lt;', data)
        self.lt_fixed += lt_count

        return self._article_tag_regex.sub(self._close_open_article, data)

    def read(self, size=-1):
        """Read up to roughly `size` raw bytes and return them cleaned.

        Returns b'' only once the underlying file is exhausted and no
        held-back tail remains.
        """
        while True:
            raw = self.file.read(size)
            if not raw:
                # End of file: whatever is still pending cannot be completed.
                data, self._pending = self._pending, b''
                return self._repair_markup(data) if data else raw

            stripped = self.invalid_chars_regex.sub(b'', raw)
            self.chars_cleaned += len(raw) - len(stripped)

            data = self._pending + stripped
            cut = self._safe_cut(data)
            self._pending = data[cut:]
            if cut:
                return self._repair_markup(data[:cut])
            # Everything so far is an incomplete tag/entity (or was stripped);
            # returning b'' would look like EOF, so read more first.

    def close(self):
        """Close the file and print a summary of the repairs that were made."""
        self.file.close()
        if self.chars_cleaned > 0:
            print(f"Cleaned {self.chars_cleaned:,} invalid XML characters on-the-fly")
        if self.ampersands_fixed > 0:
            print(f"Fixed {self.ampersands_fixed:,} unescaped ampersands on-the-fly")
        if self.lt_fixed > 0:
            print(f"Fixed {self.lt_fixed:,} unescaped < characters on-the-fly")
        if self.articles_closed > 0:
            print(f"Inserted {self.articles_closed:,} missing </PMC_ARTICLE> tags on-the-fly")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

def extract_article_data_lxml(elem):
    """
    Turn one parsed PMC_ARTICLE element into a flat metadata record.

    Scalar fields are read with findtext() (missing elements give ''), and the
    authors under AuthorList are joined with '; ' using either the
    CollectiveName or a "LastName, Initials" label.

    Returns:
        dict keyed by output column name, or None if extraction raises.
    """
    # (output column, element path) pairs, in the column order of the result.
    column_paths = (
        ('article_id', './/id'),
        ('pmid', './/pmid'),
        ('pmcid', './/pmcid'),
        ('doi', './/DOI'),
        ('title', './/title'),
        ('journal_title', './/JournalTitle'),
        ('pub_year', './/PubYear'),
        ('journal_volume', './/JournalVolume'),
        ('issue', './/Issue'),
        ('page_info', './/PageInfo'),
        ('pub_type', './/PubType'),
        ('is_open_access', './/IsOpenAccess'),
    )

    try:
        record = {column: elem.findtext(path, default='') for column, path in column_paths}

        names = []
        author_list = elem.find('.//AuthorList')
        author_nodes = [] if author_list is None else author_list.findall('.//Author')

        for node in author_nodes:
            surname = node.findtext('.//LastName', default='')
            initials = node.findtext('.//Initials', default='')
            group_name = node.findtext('.//CollectiveName', default='')

            if group_name:
                # Collective names may still carry escaped markup characters
                names.append(
                    group_name.replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
                )
                continue

            if surname or initials:
                label = f"{surname}, {initials}".strip(', ')
                if label:  # skip labels that collapse to nothing
                    names.append(label)

        record['authors'] = '; '.join(names)
        return record
    except Exception:
        # Signal "no usable record" to the caller
        return None

def _print_xml_error_context(xml_file, error, context_lines=10, context_chars=50):
    """Print the raw file lines around the position named in a parser error.

    The line (and optionally column) number is taken from the error message
    text ("line N", "column M"). Nothing is printed when the message carries
    no line number.

    NOTE(review): the parser reads the file through CleaningFileWrapper, which
    adds and removes bytes, so the reported position may not line up exactly
    with the raw file shown here.
    """
    error_str = str(error)
    line_match = re.search(r'line (\d+)', error_str)
    if not line_match:
        return

    error_line = int(line_match.group(1))
    print(f"\nExtracting XML content around line {error_line}...")

    first_line = error_line - context_lines
    last_line = error_line + context_lines
    window = []  # (line number, raw text) pairs inside the context window

    with open(xml_file, 'r', encoding='utf-8', errors='replace') as f:
        for current_line, line in enumerate(f, 1):
            if current_line > last_line:
                break
            if current_line >= first_line:
                window.append((current_line, line))

    print("\nProblematic XML content:")
    print("=" * 80)
    for number, line in window:
        print(f"Line {number}: {repr(line)}")
    print("=" * 80)

    # Look the error line up by its number: it is not at a fixed index in the
    # window when the error is near the start of the file.
    column_match = re.search(r'column (\d+)', error_str)
    line_text = next((line for number, line in window if number == error_line), None)
    if column_match is None or line_text is None:
        return

    column = int(column_match.group(1))
    print(f"\nSpecific problematic area around column {column}:")
    start_pos = max(0, column - context_chars)
    end_pos = min(len(line_text), column + context_chars)
    print(f"Context: {repr(line_text[start_pos:end_pos])}")
    if column < len(line_text):
        print(f"Character at column {column}: {repr(line_text[column])}")


# Ultra-fast parser with on-the-fly character cleaning
def parse_pmc_xml_lxml_safe(xml_file, max_articles=None):
    """
    Stream-parse a PMC metadata XML file with lxml, cleaning it while reading.

    The file is read through CleaningFileWrapper and every completed
    PMC_ARTICLE element is converted with extract_article_data_lxml(), then
    released to keep memory flat.

    Args:
        xml_file: Path of the XML file to parse.
        max_articles: Optional cap on the number of articles processed;
            None (or 0) processes the whole file.

    Returns:
        pandas.DataFrame with one row per extracted article. An empty
        DataFrame is returned when lxml is not installed or when parsing
        raises (rows collected before the error are discarded); in the latter
        case the file content around the reported position is printed.
    """

    if not LXML_AVAILABLE:
        print("lxml not available, falling back to standard parser")
        return pd.DataFrame()

    file_size = os.path.getsize(xml_file)
    # Rough guess (about 4000 bytes per article) used for the progress total.
    estimated_articles = file_size // 4000

    if max_articles:
        estimated_articles = min(estimated_articles, max_articles)

    print(f"File: {xml_file}")
    print(f"Size: {file_size:,} bytes")
    print(f"Estimated articles: ~{estimated_articles:,}")
    print("Starting extraction with lxml and on-the-fly cleaning...")

    data = []
    article_count = 0
    start_time = time.time()

    try:
        # Use cleaning file wrapper for on-the-fly character cleaning
        with CleaningFileWrapper(xml_file) as clean_file:
            # Only completed elements are needed, so request 'end' events only.
            context = etree.iterparse(clean_file, events=('end',), tag='PMC_ARTICLE')

            # The context manager closes the bar on success and on error alike.
            with tqdm(total=estimated_articles, desc="Extracting articles", unit="articles") as pbar:
                for _event, elem in context:
                    if max_articles and article_count >= max_articles:
                        break

                    # Extract article data using lxml methods
                    article_data = extract_article_data_lxml(elem)
                    if article_data:
                        data.append(article_data)

                    article_count += 1
                    pbar.update(1)

                    # Refresh the rate/valid counters every 1000 articles
                    if article_count % 1000 == 0:
                        elapsed = time.time() - start_time
                        rate = article_count / elapsed if elapsed > 0 else 0
                        pbar.set_postfix({
                            'Rate': f'{rate:.1f}/s',
                            'Valid': len(data)
                        })

                    # Clear processed element to save memory
                    elem.clear()
                    # Also drop already-processed siblings from the parent
                    while elem.getprevious() is not None:
                        del elem.getparent()[0]

    except Exception as e:
        print(f"Error parsing XML: {e}")

        # Diagnostics are best effort and must never mask the original error.
        try:
            _print_xml_error_context(xml_file, e)
        except Exception as debug_e:
            print(f"Could not extract problematic XML content: {debug_e}")

        return pd.DataFrame()

    elapsed_time = time.time() - start_time
    print(f"\nProcessing completed in {elapsed_time/60:.2f} minutes")
    print(f"Processed {article_count:,} articles")
    print(f"Extracted {len(data):,} valid articles")
    if elapsed_time > 0:
        print(f"Rate: {article_count/elapsed_time:.1f} articles/second")

    return pd.DataFrame(data)

def process_xml_directory(xml_dir, output_dir, max_articles_per_file=None):
    """
    Convert every '*.xml' file in xml_dir into a parquet file in output_dir.

    Files are handled in sorted name order. A file whose parquet output
    already exists is skipped. Each remaining file is parsed with
    parse_pmc_xml_lxml_safe(); if that yields no rows, parse_pmc_xml_robust()
    is tried. A summary is printed at the end.

    Args:
        xml_dir: Directory that is searched (non-recursively) for XML files.
        output_dir: Directory for the parquet files; created if missing.
        max_articles_per_file: Optional per-file article cap passed on to the
            parsers.

    Returns:
        None. Errors for individual files are printed and counted, not raised.
    """

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # glob order is arbitrary; sort so runs (and the "first file" sample
    # below) are reproducible.
    xml_files = sorted(glob.glob(os.path.join(xml_dir, "*.xml")))

    if not xml_files:
        print(f"No XML files found in directory: {xml_dir}")
        return

    print(f"Found {len(xml_files)} XML files in {xml_dir}")
    print(f"Output directory: {output_dir}")
    print("=" * 80)

    successful_files = 0
    failed_files = 0
    skipped_files = 0
    total_articles = 0

    for i, xml_file_path in enumerate(xml_files, 1):
        filename = os.path.basename(xml_file_path)
        base_name = os.path.splitext(filename)[0]
        output_path = os.path.join(output_dir, f"{base_name}.parquet")

        print(f"\n[{i}/{len(xml_files)}] Processing: {filename}")

        # Skip if output file already exists
        if os.path.exists(output_path):
            print(f"Output file already exists, skipping: {output_path}")
            skipped_files += 1
            continue

        try:
            # Try the XML parser first, fall back to robust parser if it fails
            print("Attempting XML-based parsing...")
            df_pmc = parse_pmc_xml_lxml_safe(xml_file_path, max_articles=max_articles_per_file)

            # If XML parsing failed, try the robust parser
            if df_pmc.empty:
                print("XML parsing failed due to malformed XML. Trying robust text-based parser...")
                df_pmc = parse_pmc_xml_robust(xml_file_path, max_articles=max_articles_per_file)

            if not df_pmc.empty:
                # Write to a temporary name and rename when complete, so an
                # interrupted write never leaves a truncated parquet file that
                # a later run would skip as "already exists".
                tmp_output_path = output_path + ".tmp"
                try:
                    df_pmc.to_parquet(tmp_output_path, compression='snappy')
                    os.replace(tmp_output_path, output_path)
                finally:
                    if os.path.exists(tmp_output_path):
                        os.remove(tmp_output_path)

                print(f"✓ Saved {len(df_pmc):,} articles to: {output_path}")
                print(f"  File size: {os.path.getsize(output_path):,} bytes")

                successful_files += 1
                total_articles += len(df_pmc)

                # Show some sample data for the first file
                if i == 1:
                    print("\nSample data from first file:")
                    print(f"Columns: {list(df_pmc.columns)}")
                    print(f"Shape: {df_pmc.shape}")
                    sample_df = df_pmc[df_pmc['doi'] != ''].head(2)
                    if not sample_df.empty:
                        print("Sample entries with DOI:")
                        for col in ['pmid', 'pmcid', 'doi', 'title', 'journal_title']:
                            if col in sample_df.columns:
                                print(f"  {col}: {sample_df[col].iloc[0]}")
            else:
                print(f"✗ No articles extracted from: {filename}")
                failed_files += 1

        except Exception as e:
            # Keep going: one bad file must not stop the whole batch.
            print(f"✗ Error processing {filename}: {e}")
            failed_files += 1

    print("\n" + "=" * 80)
    print("PROCESSING SUMMARY")
    print("=" * 80)
    print(f"Total XML files found: {len(xml_files)}")
    print(f"Successfully processed: {successful_files}")
    print(f"Skipped (output already existed): {skipped_files}")
    print(f"Failed to process: {failed_files}")
    print(f"Total articles extracted: {total_articles:,}")
    print(f"Output directory: {output_dir}")

    if successful_files > 0:
        # List all created parquet files
        parquet_files = glob.glob(os.path.join(output_dir, "*.parquet"))
        print(f"\nCreated parquet files ({len(parquet_files)}):")
        for pf in sorted(parquet_files):
            size = os.path.getsize(pf)
            print(f"  {os.path.basename(pf)} ({size:,} bytes)")

# Where the extracted XML lives and where the parquet files are written
xml_directory = "extracted_pmc_metadata/out"
output_directory = "pmc_parquet_files"

# Convert every XML file in xml_directory (no per-file article cap)
process_xml_directory(
    xml_dir=xml_directory,
    output_dir=output_directory,
    max_articles_per_file=None,
)

In [None]:
import pandas as pd
import glob
import os

# Get all parquet files in the folder (sorted so the row order of the combined
# frame is the same on every run)
parquet_files = sorted(glob.glob("pmc_parquet_files/*.parquet"))
print(f"Found {len(parquet_files)} parquet files")

# List to store processed dataframes
processed_dfs = []

# Process each parquet file
for file_path in parquet_files:
    print(f"Processing {os.path.basename(file_path)}...")

    # Read the parquet file
    df = pd.read_parquet(file_path)

    # Drop rows where doi is NaN
    df = df.dropna(subset=['doi'])

    # Drop rows where doi is empty string
    df = df[df['doi'].str.strip() != '']

    print(f"  - Rows after cleaning: {len(df)}")
    processed_dfs.append(df)

if not processed_dfs:
    # pd.concat([]) raises "No objects to concatenate"; say what is wrong instead.
    print("No parquet files to combine - run the XML conversion cell first")
else:
    # Concatenate all processed dataframes
    print("Concatenating all dataframes...")
    df_combined = pd.concat(processed_dfs, ignore_index=True)
    print(f"Total combined rows: {len(df_combined)}")

    # Keep only the columns needed downstream
    metadata_df = df_combined[['pmcid','article_id','authors','title','journal_title','pub_year']].copy()

    # Convert pub_year to int16 if it's numeric (saves space)
    # NOTE(review): the extraction functions in this notebook store pub_year
    # as text, so this branch is presumably never taken and the column is
    # written as strings - confirm whether a numeric year is wanted before
    # changing the output schema.
    if metadata_df['pub_year'].dtype in ['float64', 'int64']:
        metadata_df['pub_year'] = metadata_df['pub_year'].astype('int16')

    # Use gzip compression for size reduction
    metadata_df.to_parquet("PMC_article_metadata.parquet",
                          compression='gzip',
                          index=False)

    print(f"Saved {len(metadata_df)} rows to compressed parquet file")