In [1]:
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

# MedRAG Corpus Statistics

This notebook computes corpus statistics for the 4 MedRAG sources: PubMed, Wikipedia, Textbooks, and StatPearls.

**Measurements:**
- **#Doc.**: Number of unique documents 
- **#Snippets**: Number of text chunks
- **Avg. L**: Average snippet length in characters

In [2]:
# Paths
BASE_CORPUS_DIR = Path('/data/wang/junh/githubs/mirage_medrag/MedRAG/src/data/corpus')
HF_CACHE_DIR = Path('/data/wang/junh/githubs/mirage_medrag/MedRAG/src/data/hf_cache')

print('Corpus directory:', BASE_CORPUS_DIR)
print('HF cache directory:', HF_CACHE_DIR)
print('Corpus exists:', BASE_CORPUS_DIR.exists())
print('HF cache exists:', HF_CACHE_DIR.exists())

Corpus directory: /data/wang/junh/githubs/mirage_medrag/MedRAG/src/data/corpus
HF cache directory: /data/wang/junh/githubs/mirage_medrag/MedRAG/src/data/hf_cache
Corpus exists: True
HF cache exists: True


In [3]:
def get_text_content(obj):
    """Extract text content from a JSON object."""
    for field in ['content', 'contents', 'text', 'body']:
        if field in obj and isinstance(obj[field], str):
            return obj[field]
    return ""

def get_document_id(obj):
    """Extract document ID from a JSON object."""
    for field in ['PMID', 'pmid', 'paper_id', 'paperId', 'document_id', 'doc_id']:
        if field in obj:
            return str(obj[field])
    return None

def process_corpus(corpus_path):
    """Process a single corpus and return statistics."""
    corpus_name = corpus_path.name
    chunk_dir = corpus_path / 'chunk'
    
    if not chunk_dir.exists():
        print(f"No chunk directory found for {corpus_name}")
        return None
    
    chunk_files = list(chunk_dir.glob('*.jsonl'))
    if not chunk_files:
        print(f"No JSONL files found for {corpus_name}")
        return None
    
    total_snippets = 0
    total_chars = 0
    unique_docs = set()
    
    print(f"Processing {corpus_name} ({len(chunk_files)} files)...")
    
    for file_path in tqdm(chunk_files, desc=f"{corpus_name}"):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    
                    try:
                        obj = json.loads(line)
                        total_snippets += 1
                        
                        # Get text content and length
                        text = get_text_content(obj)
                        total_chars += len(text)
                        
                        # Get document ID
                        doc_id = get_document_id(obj)
                        if doc_id:
                            unique_docs.add(doc_id)
                            
                    except json.JSONDecodeError:
                        continue
                        
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue
    
    avg_length = total_chars / total_snippets if total_snippets > 0 else 0
    n_docs = len(unique_docs) if unique_docs else total_snippets
    
    return {
        'Corpus': corpus_name,
        '#Doc.': n_docs,
        '#Snippets': total_snippets,
        'Avg. L': round(avg_length)
    }

In [4]:
# Find available corpora
corpus_names = ['pubmed', 'wikipedia', 'textbooks', 'statpearls']
results = []

for corpus_name in corpus_names:
    corpus_path = BASE_CORPUS_DIR / corpus_name
    if corpus_path.exists():
        stats = process_corpus(corpus_path)
        if stats:
            results.append(stats)
    else:
        print(f"Corpus not found: {corpus_path}")

# Create DataFrame
if results:
    df = pd.DataFrame(results)
    
    # Format numbers for better display
    df['#Doc. (M)'] = (df['#Doc.'] / 1_000_000).round(1)
    df['#Snippets (M)'] = (df['#Snippets'] / 1_000_000).round(1)
    
    # Add MedCorp total if available
    if 'medcorp_count' in globals() and medcorp_count:
        medcorp_row = {
            'Corpus': 'MedCorp (Total)',
            '#Doc.': medcorp_count,
            '#Snippets': medcorp_count,
            'Avg. L': '-',
            '#Doc. (M)': round(medcorp_count / 1_000_000, 1),
            '#Snippets (M)': round(medcorp_count / 1_000_000, 1)
        }
        df = pd.concat([df, pd.DataFrame([medcorp_row])], ignore_index=True)
    
    # Display the main table
    display_df = df[['Corpus', '#Doc. (M)', '#Snippets (M)', 'Avg. L']].copy()
    print("\nMedRAG Corpus Statistics:")
    print("=" * 50)
    display(display_df)
    
    # Also show raw numbers
    print("\nRaw Numbers:")
    print("=" * 30)
    display(df[['Corpus', '#Doc.', '#Snippets', 'Avg. L']])
else:
    print("No corpus data found!")

Processing pubmed (1166 files)...


pubmed:   0%|          | 0/1166 [00:00<?, ?it/s]

pubmed: 100%|██████████| 1166/1166 [03:57<00:00,  4.90it/s]


Processing wikipedia (646 files)...


wikipedia: 100%|██████████| 646/646 [03:04<00:00,  3.51it/s]


Processing textbooks (18 files)...


textbooks: 100%|██████████| 18/18 [00:00<00:00, 22.59it/s]


Processing statpearls (9625 files)...


statpearls: 100%|██████████| 9625/9625 [00:04<00:00, 2048.07it/s]


MedRAG Corpus Statistics:





Unnamed: 0,Corpus,#Doc. (M),#Snippets (M),Avg. L
0,pubmed,23.9,23.9,1309
1,wikipedia,29.9,29.9,682
2,textbooks,0.1,0.1,777
3,statpearls,0.4,0.4,516



Raw Numbers:


Unnamed: 0,Corpus,#Doc.,#Snippets,Avg. L
0,pubmed,23895135,23898701,1309
1,wikipedia,29913202,29913202,682
2,textbooks,125847,125847,777
3,statpearls,352155,352155,516


## Common Elements:
1. **Required JSON keys**: id, title, content, contents
2. **contents field**: concat(title, content) - title + ". " + content
3. **Whitespace normalization**: re.sub("\s+", " ", text)
4. **Helper functions**: ends_with_ending_punctuation(), concat()
5. **Target chunk size**: ~1000 characters (with 200 overlap for long texts)
6. **Output format**: JSONL (one JSON object per line)
7. **Directory structure**: corpus/{source}/chunk/*.jsonl

===============================================================================
                    UMLS DATA SOURCE CREATION SUMMARY
===============================================================================

## ANALYSIS COMPLETED:

### 1. Chunk Size Analysis from Existing Corpora:
   - PubMed: 395-1897 chars (no chunking - abstracts are naturally short)
   - Wikipedia: 559-900 chars (chunked with RecursiveCharacterTextSplitter)
   - Textbooks: 935-970 chars (chunked with RecursiveCharacterTextSplitter)
   - StatPearls: 533-634 chars (smart merging to keep <1000 chars)
   
   Target: ~1000 characters per chunk with 200 char overlap

### 2. UMLS Communities Data Analysis:
   - Total communities: 35,797
   - Valid summaries: 34,620 (96.7%)
   - Invalid (blank/error): 1,177 (3.3%)
   - Summary length stats:
     * Mean: 1,230 chars
     * Median: 952 chars
     * 95th percentile: 3,061 chars
     * Max: 20,541 chars
   - Summaries >1000 chars: 16,235 (46.9% of valid)
   
   Decision: YES, chunking is needed for long summaries

### 3. General Data Processing Strategy (from 4 existing sources):
   ✓ Common JSON structure: {id, title, content, contents}
   ✓ contents = concat(title, content) with smart punctuation
   ✓ Whitespace normalization: re.sub("\s+", " ", text)
   ✓ Target chunk size: 1000 chars with 200 overlap
   ✓ Output format: JSONL (one JSON per line)
   ✓ Directory: corpus/{source}/chunk/*.jsonl
   ✓ Multiple files per source for organization

### 4. UMLS Processing Strategy Implemented:
   ✓ Group by run (35 runs total) - one JSONL file per run
   ✓ Filter invalid summaries (errors, blank, "too large")
   ✓ Hierarchical titles: "UMLS -- Run X -- Level Y -- Community Z"
   ✓ Chunk long summaries (>1000 chars) with RecursiveCharacterTextSplitter
   ✓ Keep short summaries as-is (like PubMed approach)
   ✓ Unique IDs: "UMLS_RX_LY_CZ" or "UMLS_RX_LY_CZ_chunkN"

## RESULTS:

✓ Files created: 35 JSONL files (umls_run00.jsonl through umls_run34.jsonl)
✓ Total size: 123 MB
✓ Valid communities processed: 34,620
✓ Invalid communities skipped: 1,177
✓ Communities chunked: 16,235 (46.9%)
✓ Total chunks created: 62,340

Average chunks per file: 1,781
Chunking rate: 46.9% (communities that needed splitting)