In [None]:
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

# MedRAG Corpus Statistics

This notebook computes corpus statistics for the 4 MedRAG sources: PubMed, Wikipedia, Textbooks, and StatPearls.

**Measurements:**
- **#Doc.**: Number of unique documents 
- **#Snippets**: Number of text chunks
- **Avg. L**: Average snippet length in characters

In [None]:
# Paths
BASE_CORPUS_DIR = Path('/data/wang/junh/githubs/MedRAG/src/data/corpus')
HF_CACHE_DIR = Path('/data/wang/junh/githubs/MedRAG/src/data/hf_cache')

print('Corpus directory:', BASE_CORPUS_DIR)
print('HF cache directory:', HF_CACHE_DIR)
print('Corpus exists:', BASE_CORPUS_DIR.exists())
print('HF cache exists:', HF_CACHE_DIR.exists())

In [None]:
def get_text_content(obj):
    """Extract text content from a JSON object."""
    for field in ['content', 'contents', 'text', 'body']:
        if field in obj and isinstance(obj[field], str):
            return obj[field]
    return ""

def get_document_id(obj):
    """Extract document ID from a JSON object."""
    for field in ['PMID', 'pmid', 'paper_id', 'paperId', 'document_id', 'doc_id']:
        if field in obj:
            return str(obj[field])
    return None

def process_corpus(corpus_path):
    """Process a single corpus and return statistics."""
    corpus_name = corpus_path.name
    chunk_dir = corpus_path / 'chunk'
    
    if not chunk_dir.exists():
        print(f"No chunk directory found for {corpus_name}")
        return None
    
    chunk_files = list(chunk_dir.glob('*.jsonl'))
    if not chunk_files:
        print(f"No JSONL files found for {corpus_name}")
        return None
    
    total_snippets = 0
    total_chars = 0
    unique_docs = set()
    
    print(f"Processing {corpus_name} ({len(chunk_files)} files)...")
    
    for file_path in tqdm(chunk_files, desc=f"{corpus_name}"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    
                    try:
                        obj = json.loads(line)
                        total_snippets += 1
                        
                        # Get text content and length
                        text = get_text_content(obj)
                        total_chars += len(text)
                        
                        # Get document ID
                        doc_id = get_document_id(obj)
                        if doc_id:
                            unique_docs.add(doc_id)
                            
                    except json.JSONDecodeError:
                        continue
                        
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue
    
    avg_length = total_chars / total_snippets if total_snippets > 0 else 0
    n_docs = len(unique_docs) if unique_docs else total_snippets
    
    return {
        'Corpus': corpus_name,
        '#Doc.': n_docs,
        '#Snippets': total_snippets,
        'Avg. L': round(avg_length)
    }

In [5]:
# Find available corpora
corpus_names = ['pubmed', 'wikipedia', 'textbooks', 'statpearls']
results = []

for corpus_name in corpus_names:
    corpus_path = BASE_CORPUS_DIR / corpus_name
    if corpus_path.exists():
        stats = process_corpus(corpus_path)
        if stats:
            results.append(stats)
    else:
        print(f"Corpus not found: {corpus_path}")

# Create DataFrame
if results:
    df = pd.DataFrame(results)
    
    # Format numbers for better display
    df['#Doc. (M)'] = (df['#Doc.'] / 1_000_000).round(1)
    df['#Snippets (M)'] = (df['#Snippets'] / 1_000_000).round(1)
    
    # Add MedCorp total if available
    if 'medcorp_count' in globals() and medcorp_count:
        medcorp_row = {
            'Corpus': 'MedCorp (Total)',
            '#Doc.': medcorp_count,
            '#Snippets': medcorp_count,
            'Avg. L': '-',
            '#Doc. (M)': round(medcorp_count / 1_000_000, 1),
            '#Snippets (M)': round(medcorp_count / 1_000_000, 1)
        }
        df = pd.concat([df, pd.DataFrame([medcorp_row])], ignore_index=True)
    
    # Display the main table
    display_df = df[['Corpus', '#Doc. (M)', '#Snippets (M)', 'Avg. L']].copy()
    print("\nMedRAG Corpus Statistics:")
    print("=" * 50)
    display(display_df)
    
    # Also show raw numbers
    print("\nRaw Numbers:")
    print("=" * 30)
    display(df[['Corpus', '#Doc.', '#Snippets', 'Avg. L']])
else:
    print("No corpus data found!")

Processing pubmed (1166 files)...


pubmed:   0%|          | 0/1166 [00:00<?, ?it/s]

pubmed: 100%|██████████| 1166/1166 [03:51<00:00,  5.04it/s]
pubmed: 100%|██████████| 1166/1166 [03:51<00:00,  5.04it/s]


Processing wikipedia (646 files)...


wikipedia: 100%|██████████| 646/646 [02:53<00:00,  3.72it/s]
wikipedia: 100%|██████████| 646/646 [02:53<00:00,  3.72it/s]


Processing textbooks (18 files)...


textbooks: 100%|██████████| 18/18 [00:00<00:00, 22.70it/s]



Processing statpearls (9625 files)...


statpearls: 100%|██████████| 9625/9625 [00:05<00:00, 1848.76it/s]
statpearls: 100%|██████████| 9625/9625 [00:05<00:00, 1848.76it/s]



MedRAG Corpus Statistics:


Unnamed: 0,Corpus,#Doc. (M),#Snippets (M),Avg. L
0,pubmed,23.9,23.9,1309
1,wikipedia,29.9,29.9,682
2,textbooks,0.1,0.1,777
3,statpearls,0.4,0.4,516
4,MedCorp (Total),54.3,54.3,-



Raw Numbers:


Unnamed: 0,Corpus,#Doc.,#Snippets,Avg. L
0,pubmed,23895135,23898701,1309
1,wikipedia,29913202,29913202,682
2,textbooks,125847,125847,777
3,statpearls,352155,352155,516
4,MedCorp (Total),54289905,54289905,-


## Check MedCorp Mapping File

In [3]:
def count_medcorp_entries():
    """Count entries in MedCorp_id2text.json without loading the entire file."""
    medcorp_file = BASE_CORPUS_DIR / 'MedCorp_id2text.json'
    
    if not medcorp_file.exists():
        print(f"MedCorp file not found: {medcorp_file}")
        return None
    
    print(f"Counting entries in {medcorp_file.name}...")
    print(f"File size: {medcorp_file.stat().st_size / (1024*1024*1024):.1f} GB")
    
    # First, show an example entry by reading the first few lines
    print("\nReading example entry...")
    try:
        with open(medcorp_file, 'r', encoding='utf-8') as f:
            # Read first few KB to find a complete entry
            sample_text = f.read(10000)  # Read 10KB
            
            # Parse the format: "id": {"nested": "json"}
            if '"' in sample_text and ':' in sample_text:
                # Find first complete entry
                lines = sample_text.split('\n')
                for line in lines[:10]:  # Check first 10 lines
                    line = line.strip().rstrip(',')  # Remove trailing comma
                    if '"' in line and ':' in line and '{' in line:
                        # Try to extract ID and object
                        try:
                            # Look for pattern: "id": {object}
                            if '": {' in line:
                                id_part = line.split('": {')[0].strip('"').strip()
                                # Find the matching closing brace for this entry
                                brace_count = 0
                                start_idx = line.find('": {') + 3
                                obj_start = start_idx
                                for i, char in enumerate(line[start_idx:], start_idx):
                                    if char == '{':
                                        brace_count += 1
                                    elif char == '}':
                                        brace_count -= 1
                                        if brace_count == 0:
                                            # Found complete object
                                            obj_text = line[obj_start:i+1]
                                            try:
                                                obj = json.loads(obj_text)
                                                print(f"Example entry:")
                                                print(f"  ID: {id_part}")
                                                print(f"  Title: {obj.get('title', 'N/A')[:100]}...")
                                                print(f"  Content length: {len(obj.get('content', ''))} chars")
                                                print(f"  PMID: {obj.get('PMID', 'N/A')}")
                                                break
                                            except:
                                                continue
                        except:
                            continue
                    if 'Example entry:' in locals():
                        break
                        
                # Fallback: show raw format
                if 'Example entry:' not in str(locals()):
                    first_line = sample_text.split('\n')[0][:200]
                    print(f"Raw format example: {first_line}...")
                    
    except Exception as e:
        print(f"Could not read example entry: {e}")
    
    print("\nCounting all entries...")
    count = 0
    buffer_size = 1024 * 1024  # 1MB buffer for faster reading
    
    try:
        with open(medcorp_file, 'r', encoding='utf-8') as f:
            # Read in chunks for better performance
            in_string = False
            escape_next = False
            found_first_brace = False
            brace_depth = 0
            
            while True:
                chunk = f.read(buffer_size)
                if not chunk:
                    break
                
                for char in chunk:
                    if not found_first_brace:
                        if char == '{':
                            found_first_brace = True
                            brace_depth = 1
                        continue
                    
                    if escape_next:
                        escape_next = False
                        continue
                        
                    if char == '\\':
                        escape_next = True
                        continue
                        
                    if char == '"' and not escape_next:
                        in_string = not in_string
                        continue
                    
                    if not in_string:
                        if char == '{':
                            brace_depth += 1
                        elif char == '}':
                            brace_depth -= 1
                        elif char == ',' and brace_depth == 1:
                            # Found a top-level comma, indicating end of an entry
                            count += 1
                
                # Show progress every million entries
                if count > 0 and count % 1000000 == 0:
                    print(f"Processed ~{count/1000000:.1f}M entries so far...")
                
    except Exception as e:
        print(f"Error counting entries: {e}")
        return None
    
    # Add 1 for the last entry (no comma after it)
    if count > 0:
        count += 1
    
    return count

# Count MedCorp entries
medcorp_count = count_medcorp_entries()
if medcorp_count:
    print(f"\nMedCorp mapping file contains: {medcorp_count:,} entries")
    print(f"MedCorp entries (millions): {medcorp_count / 1_000_000:.1f}M")
else:
    print("Could not count MedCorp entries")

Counting entries in MedCorp_id2text.json...
File size: 57.1 GB

Reading example entry...
Example entry:
  ID: {"pubmed23n0001_0
  Title: [Biochemical studies on camomile components/III. In vitro studies about the antipeptic activity of (...
  Content length: 395 chars
  PMID: 21
Raw format example: {"pubmed23n0001_0": {"id": "pubmed23n0001_0", "title": "[Biochemical studies on camomile components/III. In vitro studies about the antipeptic activity of (--)-alpha-bisabolol (author's transl)].", "c...

Counting all entries...

MedCorp mapping file contains: 54,289,905 entries
MedCorp entries (millions): 54.3M

MedCorp mapping file contains: 54,289,905 entries
MedCorp entries (millions): 54.3M
