In [None]:
"""
Enhanced WHO Medical Knowledge Extractor
==========================================

This advanced system transforms the basic scraper into a comprehensive medical knowledge
processing engine with improved performance, intelligence, and reliability.

Key Improvements:
1. Asynchronous processing for better performance
2. Advanced NLP for better content understanding
3. Robust error handling with retry mechanisms
4. Configuration management for flexibility
5. Database integration for persistent storage
6. Enhanced data validation and quality assurance
7. Modular design for maintainability
8. Advanced medical content classification
"""

import asyncio
import aiohttp
import json
import re
import sqlite3
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass, asdict
from pathlib import Path
from urllib.parse import urljoin, urlparse
import hashlib
from contextlib import asynccontextmanager
import time
import yaml
from abc import ABC, abstractmethod

# Enhanced imports for NLP and data processing
try:
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False
    print("NLTK not available. Install with: pip install nltk")

try:
    from bs4 import BeautifulSoup, NavigableString
    BS4_AVAILABLE = True
except ImportError:
    BS4_AVAILABLE = False
    print("BeautifulSoup not available. Install with: pip install beautifulsoup4")

# Configure comprehensive logging.
# Records at INFO level and above go to two handlers: a file handler writing
# 'medical_scraper.log' and a stream handler (console).
# NOTE: this runs at import time, so importing this module configures logging
# and creates/opens the log file as a side effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('medical_scraper.log'),
        logging.StreamHandler()
    ]
)
# Module-level logger used by the classes defined in this file.
logger = logging.getLogger(__name__)

# Configuration Management
@dataclass
class ScrapingConfig:
    """Configuration class for scraping parameters - think of this as your control panel.

    ``output_formats`` may be left as ``None``; ``__post_init__`` then replaces
    it with ``['json', 'sqlite', 'csv']``.
    """
    max_concurrent_requests: int = 5  # How many pages to process simultaneously
    request_delay: float = 2.0        # Respectful delay between requests (passed to asyncio.sleep)
    max_retries: int = 3              # How many times to retry failed requests
    timeout: int = 30                 # Request timeout in seconds
    max_content_length: int = 1000000 # Maximum page size to process; longer pages are truncated
    enable_nlp: bool = True           # Whether to use advanced text processing
    cache_duration: int = 24          # Cache validity in hours
    output_formats: Optional[List[str]] = None  # Supported output formats; None means "use the defaults"
    
    def __post_init__(self):
        """Substitute the default output formats when none were supplied."""
        if self.output_formats is None:
            # Built here rather than as a field default so each instance
            # gets its own list object.
            self.output_formats = ['json', 'sqlite', 'csv']

# Data Models for Better Structure
@dataclass
class MedicalEntity:
    """Represents a single piece of medical information with metadata.

    Instances are produced by ``MedicalContentProcessor`` and persisted by
    ``MedicalDataManager`` (one row per entity).
    """
    content: str            # The extracted sentence text
    confidence: float       # How confident we are in this extraction (0-1)
    source_section: str     # Title of the page section this came from
    extraction_method: str  # How this was extracted, e.g. "keyword_matching" or "nlp_analysis"
    
@dataclass
class DiseaseInformation:
    """Comprehensive disease information model - like a medical record.

    Aggregates everything extracted for one disease page: free-text overview,
    categorised ``MedicalEntity`` lists, statistics and provenance metadata.
    """
    name: str                           # Disease name; stored in a UNIQUE database column
    overview: str                       # Free-text overview of the disease
    symptoms: List[MedicalEntity]       # Entities categorised as symptoms
    risk_factors: List[MedicalEntity]   # Entities categorised as risk factors
    prevention: List[MedicalEntity]     # Entities categorised as prevention
    diagnosis: List[MedicalEntity]      # Entities categorised as diagnosis
    treatment: List[MedicalEntity]      # Entities categorised as treatment
    statistics: Dict[str, Any]          # stat type -> values; only list values are persisted by save_disease
    severity: str
    prevalence: float
    key_facts: List[str]                # Not persisted: get_disease_by_name returns an empty list here
    source_metadata: Dict[str, Any]     # Provenance; the 'source_url' key is read when saving
    last_updated: datetime
    data_quality_score: float  # Overall quality assessment (0-1)

# Advanced Content Processor using NLP
class MedicalContentProcessor:
    """
    Advanced text processing engine that understands medical content better.
    Think of this as a medical student who's learned to identify different
    types of medical information more accurately.
    """
    
    def __init__(self, enable_nlp: bool = True):
        self.enable_nlp = enable_nlp
        self.medical_keywords = self._load_medical_keywords()
        
        if enable_nlp and NLTK_AVAILABLE:
            self._initialize_nltk()
    
    def _initialize_nltk(self):
        """Ensure the NLTK data sets are present, then build the NLP helpers.

        If any of the three resources cannot be located, all three packages
        are downloaded. Afterwards the lemmatizer and the English stop-word
        set are stored on the instance.
        """
        # (lookup path, download package name)
        required_resources = (
            ('tokenizers/punkt', 'punkt'),
            ('corpora/stopwords', 'stopwords'),
            ('corpora/wordnet', 'wordnet'),
        )

        try:
            for lookup_path, _ in required_resources:
                nltk.data.find(lookup_path)
        except LookupError:
            # One missing resource triggers a download of the whole set.
            for _, package in required_resources:
                nltk.download(package)

        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
    
    def _load_medical_keywords(self) -> Dict[str, List[str]]:
        """
        Load comprehensive medical keyword dictionaries.
        This is like giving our system a medical vocabulary.
        """
        return {
            'symptoms': [
                'symptom', 'sign', 'manifestation', 'present', 'experience', 'feel',
                'pain', 'ache', 'fever', 'headache', 'fatigue', 'weakness', 'nausea',
                'vomiting', 'diarrhea', 'cough', 'shortness of breath', 'chest pain',
                'swelling', 'inflammation', 'rash', 'itching', 'dizziness', 'confusion'
            ],
            'risk_factors': [
                'risk factor', 'risk', 'factor', 'cause', 'associated', 'increase',
                'likely', 'predispose', 'contribute', 'age', 'gender', 'genetics',
                'lifestyle', 'smoking', 'alcohol', 'obesity', 'diabetes', 'hypertension'
            ],
            'prevention': [
                'prevent', 'prevention', 'avoid', 'reduce', 'lifestyle', 'diet',
                'exercise', 'vaccination', 'immunization', 'screening', 'early detection',
                'healthy eating', 'physical activity', 'weight management', 'quit smoking'
            ],
            'diagnosis': [
                'diagnosis', 'diagnostic', 'test', 'testing', 'screen', 'screening',
                'detect', 'detection', 'measure', 'measurement', 'examine', 'examination',
                'blood test', 'imaging', 'x-ray', 'ct scan', 'mri', 'biopsy', 'laboratory'
            ],
            'treatment': [
                'treatment', 'treat', 'therapy', 'therapeutic', 'manage', 'management',
                'medication', 'medicine', 'drug', 'surgery', 'surgical', 'intervention',
                'procedure', 'rehabilitation', 'care', 'antibiotic', 'antiviral'
            ]
        }
    
    def extract_medical_entities(self, text: str, section_title: str = "") -> List[MedicalEntity]:
        """
        Extract medical entities with confidence scores.
        This method acts like a medical expert reading through text and
        identifying important medical information with varying degrees of certainty.
        """
        if not text.strip():
            return []
        
        entities = []
        
        # First, try keyword-based extraction (fast and reliable)
        keyword_entities = self._extract_by_keywords(text, section_title)
        entities.extend(keyword_entities)
        
        # Then, use NLP for more sophisticated extraction if available
        if self.enable_nlp and NLTK_AVAILABLE:
            nlp_entities = self._extract_by_nlp(text, section_title)
            entities.extend(nlp_entities)
        
        # Remove duplicates and merge similar content
        entities = self._deduplicate_entities(entities)
        
        return entities
    
    def _extract_by_keywords(self, text: str, section_title: str) -> List[MedicalEntity]:
        """Extract entities using keyword matching - like a medical keyword dictionary"""
        entities = []
        sentences = self._split_into_sentences(text)
        
        for sentence in sentences:
            sentence_lower = sentence.lower()
            
            # Determine category based on keywords and section title
            category = self._classify_sentence(sentence_lower, section_title)
            if not category:
                continue
            
            # Calculate confidence based on keyword density and sentence quality
            confidence = self._calculate_keyword_confidence(sentence_lower, category)
            
            if confidence > 0.3:  # Only include reasonably confident extractions
                entities.append(MedicalEntity(
                    content=sentence.strip(),
                    confidence=confidence,
                    source_section=section_title,
                    extraction_method="keyword_matching"
                ))
        
        return entities
    
    def _extract_by_nlp(self, text: str, section_title: str) -> List[MedicalEntity]:
        """
        NLTK-based extraction pass.

        Sentences come from ``sent_tokenize``; only those with 5 to 50
        whitespace-separated words are analysed. A sentence is kept when
        feature-based classification yields a category and the resulting
        confidence exceeds 0.4. Any exception is logged as a warning and
        whatever was collected so far is returned.
        """
        found = []

        try:
            for candidate in sent_tokenize(text):
                word_total = len(candidate.split())
                if word_total < 5 or word_total > 50:
                    continue

                sentence_features = self._extract_sentence_features(candidate)
                label = self._classify_by_features(sentence_features, section_title)
                if not label:
                    continue

                score = self._calculate_nlp_confidence(sentence_features, label)
                # Stricter cut-off than the keyword pass.
                if score > 0.4:
                    found.append(
                        MedicalEntity(
                            content=candidate.strip(),
                            confidence=score,
                            source_section=section_title,
                            extraction_method="nlp_analysis",
                        )
                    )

        except Exception as e:
            logger.warning(f"NLP extraction failed: {e}")

        return found
    
    def _split_into_sentences(self, text: str) -> List[str]:
        """Smart sentence splitting that handles medical text peculiarities"""
        # Handle common medical abbreviations that contain periods
        text = re.sub(r'\b(Dr|Mr|Mrs|Ms|Ph\.D|M\.D|etc)\.', r'\1<PERIOD>', text)
        
        # Split by sentence endings
        sentences = re.split(r'[.!?]+\s+', text)
        
        # Restore abbreviations
        sentences = [s.replace('<PERIOD>', '.') for s in sentences]
        
        return [s.strip() for s in sentences if s.strip()]
    
    def _classify_sentence(self, sentence: str, section_title: str) -> Optional[str]:
        """
        Classify a sentence into medical categories.
        This works like a medical student learning to categorize information.
        """
        section_lower = section_title.lower()
        
        # First, check if section title gives us a strong hint
        for category, keywords in self.medical_keywords.items():
            if any(keyword in section_lower for keyword in keywords[:3]):  # Check main keywords
                return category
        
        # Then, analyze sentence content
        best_category = None
        best_score = 0
        
        for category, keywords in self.medical_keywords.items():
            score = sum(1 for keyword in keywords if keyword in sentence)
            if score > best_score and score >= 2:  # Need at least 2 matching keywords
                best_score = score
                best_category = category
        
        return best_category
    
    def _calculate_keyword_confidence(self, sentence: str, category: str) -> float:
        """
        Calculate confidence based on keyword matching.
        More medical keywords = higher confidence, like a medical expert
        being more certain when they see familiar patterns.
        """
        keywords = self.medical_keywords.get(category, [])
        matches = sum(1 for keyword in keywords if keyword in sentence)
        
        # Base confidence from keyword density
        keyword_confidence = min(matches / 3.0, 1.0)  # Max at 3 keywords
        
        # Adjust based on sentence quality indicators
        quality_factors = {
            'appropriate_length': 0.8 if 10 <= len(sentence.split()) <= 30 else 0.5,
            'has_medical_terms': 0.9 if any(term in sentence for term in 
                ['patient', 'treatment', 'diagnosis', 'symptom', 'disease']) else 0.7,
            'not_too_general': 0.8 if len(sentence) > 20 else 0.5
        }
        
        quality_score = sum(quality_factors.values()) / len(quality_factors)
        
        return keyword_confidence * quality_score
    
    def _extract_sentence_features(self, sentence: str) -> Dict[str, Any]:
        """Extract linguistic and medical features from a sentence.

        Returns a dict with the keys 'word_count', 'medical_word_ratio',
        'has_numbers', 'has_percentages', 'sentence_complexity',
        'contains_action_words' and 'medical_entity_count'. Both ratios are
        0.0 when tokenisation yields no tokens.
        """
        lowered = sentence.lower()
        words = word_tokenize(lowered)
        total = len(words)

        # The stop-word set only exists after NLTK initialisation; fall back
        # to "no stop words" instead of raising AttributeError.
        stop_words = getattr(self, 'stop_words', ())

        if total:
            medical_word_ratio = sum(1 for word in words if self._is_medical_word(word)) / total
            sentence_complexity = sum(1 for word in words if word not in stop_words) / total
        else:
            # Guard against division by zero for empty/blank sentences.
            medical_word_ratio = 0.0
            sentence_complexity = 0.0

        action_words = ('cause', 'prevent', 'treat', 'diagnose', 'manage')

        return {
            'word_count': total,
            'medical_word_ratio': medical_word_ratio,
            'has_numbers': bool(re.search(r'\d+', sentence)),
            'has_percentages': bool(re.search(r'\d+%', sentence)),
            'sentence_complexity': sentence_complexity,
            'contains_action_words': any(word in lowered for word in action_words),
            # The text is lower-cased, so the unit must be spelled 'mmhg'
            # here; the previous mixed-case 'mmHg' could never match.
            'medical_entity_count': len(re.findall(
                r'\b(?:mg|ml|mmhg|bpm|temperature|blood pressure)\b', lowered))
        }
    
    def _is_medical_word(self, word: str) -> bool:
        """Check if a word is likely medical terminology"""
        medical_suffixes = ['itis', 'osis', 'emia', 'pathy', 'therapy', 'gram', 'scopy']
        medical_prefixes = ['anti', 'hyper', 'hypo', 'pre', 'post', 'inter', 'intra']
        
        return (word in [item for sublist in self.medical_keywords.values() for item in sublist] or
                any(word.endswith(suffix) for suffix in medical_suffixes) or
                any(word.startswith(prefix) for prefix in medical_prefixes))
    
    def _classify_by_features(self, features: Dict[str, Any], section_title: str) -> Optional[str]:
        """Classify based on extracted features using simple rules"""
        # This is a simplified classifier - in production, you might use machine learning
        
        if 'symptom' in section_title.lower() or 'sign' in section_title.lower():
            return 'symptoms'
        elif 'risk' in section_title.lower() or 'factor' in section_title.lower():
            return 'risk_factors'
        elif 'prevent' in section_title.lower():
            return 'prevention'
        elif 'diagnos' in section_title.lower() or 'test' in section_title.lower():
            return 'diagnosis'
        elif 'treat' in section_title.lower() or 'therap' in section_title.lower():
            return 'treatment'
        
        return None
    
    def _calculate_nlp_confidence(self, features: Dict[str, Any], category: str) -> float:
        """Calculate confidence based on NLP features"""
        base_confidence = 0.5
        
        # Adjust based on features
        if features['medical_word_ratio'] > 0.2:
            base_confidence += 0.2
        if features['has_numbers']:
            base_confidence += 0.1
        if features['contains_action_words']:
            base_confidence += 0.1
        if 5 <= features['word_count'] <= 25:
            base_confidence += 0.1
        
        return min(base_confidence, 1.0)
    
    def _deduplicate_entities(self, entities: List[MedicalEntity]) -> List[MedicalEntity]:
        """
        Remove duplicate and very similar entities.
        This is like an editor reviewing medical notes and removing redundant information.
        """
        if not entities:
            return []
        
        unique_entities = []
        seen_content = set()
        
        # Sort by confidence (highest first)
        entities.sort(key=lambda x: x.confidence, reverse=True)
        
        for entity in entities:
            # Simple deduplication based on content similarity
            content_hash = hashlib.md5(entity.content.lower().encode()).hexdigest()
            
            if content_hash not in seen_content:
                seen_content.add(content_hash)
                unique_entities.append(entity)
        
        return unique_entities

# Enhanced Database Manager
class MedicalDataManager:
    """
    Manages persistent storage of medical data.
    Think of this as a medical records system that keeps track of
    all the diseases and their information over time.
    """
    
    def __init__(self, db_path: str = "medical_knowledge.db"):
        """Remember the database location and make sure the schema exists.

        Args:
            db_path: Path handed to ``sqlite3.connect`` by every method.
        """
        self.db_path = db_path
        # Creates the tables if they are missing (CREATE TABLE IF NOT EXISTS).
        self.init_database()
    
    def init_database(self):
        """Initialize database schema"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()
            
            # Main diseases table
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS diseases (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT UNIQUE NOT NULL,
                    overview TEXT,
                    severity TEXT,
                    prevalence REAL,
                    data_quality_score REAL,
                    source_url TEXT,
                    last_updated TIMESTAMP,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')
            
            # Medical entities table (symptoms, treatments, etc.)
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS medical_entities (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    disease_id INTEGER,
                    entity_type TEXT,  -- symptoms, risk_factors, etc.
                    content TEXT,
                    confidence REAL,
                    source_section TEXT,
                    extraction_method TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (disease_id) REFERENCES diseases (id)
                )
            ''')
            
            # Statistics table
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS disease_statistics (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    disease_id INTEGER,
                    stat_type TEXT,
                    stat_value TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (disease_id) REFERENCES diseases (id)
                )
            ''')
            
            conn.commit()
    
    def save_disease(self, disease_info: DiseaseInformation) -> int:
        """Save disease information to database"""
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()
            
            # Insert or update main disease record
            cursor.execute('''
                INSERT OR REPLACE INTO diseases 
                (name, overview, severity, prevalence, data_quality_score, source_url, last_updated)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', (
                disease_info.name,
                disease_info.overview,
                disease_info.severity,
                disease_info.prevalence,
                disease_info.data_quality_score,
                disease_info.source_metadata.get('source_url'),
                disease_info.last_updated
            ))
            
            disease_id = cursor.lastrowid
            
            # Clear existing entities for this disease
            cursor.execute('DELETE FROM medical_entities WHERE disease_id = ?', (disease_id,))
            cursor.execute('DELETE FROM disease_statistics WHERE disease_id = ?', (disease_id,))
            
            # Insert medical entities
            entity_types = ['symptoms', 'risk_factors', 'prevention', 'diagnosis', 'treatment']
            for entity_type in entity_types:
                entities = getattr(disease_info, entity_type, [])
                for entity in entities:
                    cursor.execute('''
                        INSERT INTO medical_entities 
                        (disease_id, entity_type, content, confidence, source_section, extraction_method)
                        VALUES (?, ?, ?, ?, ?, ?)
                    ''', (
                        disease_id, entity_type, entity.content, entity.confidence,
                        entity.source_section, entity.extraction_method
                    ))
            
            # Insert statistics
            for stat_type, stat_values in disease_info.statistics.items():
                if isinstance(stat_values, list):
                    for value in stat_values:
                        cursor.execute('''
                            INSERT INTO disease_statistics (disease_id, stat_type, stat_value)
                            VALUES (?, ?, ?)
                        ''', (disease_id, stat_type, str(value)))
            
            conn.commit()
            return disease_id
    
    def get_disease_by_name(self, name: str) -> Optional[DiseaseInformation]:
        """Retrieve disease information by name"""
        with sqlite3.connect(self.db_path) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()
            
            # Get main disease info
            cursor.execute('SELECT * FROM diseases WHERE name = ?', (name,))
            disease_row = cursor.fetchone()
            
            if not disease_row:
                return None
            
            # Get medical entities
            entities_by_type = {}
            for entity_type in ['symptoms', 'risk_factors', 'prevention', 'diagnosis', 'treatment']:
                cursor.execute('''
                    SELECT * FROM medical_entities 
                    WHERE disease_id = ? AND entity_type = ?
                ''', (disease_row['id'], entity_type))
                
                entities = []
                for row in cursor.fetchall():
                    entities.append(MedicalEntity(
                        content=row['content'],
                        confidence=row['confidence'],
                        source_section=row['source_section'],
                        extraction_method=row['extraction_method']
                    ))
                entities_by_type[entity_type] = entities
            
            # Get statistics
            cursor.execute('SELECT * FROM disease_statistics WHERE disease_id = ?', (disease_row['id'],))
            statistics = {}
            for row in cursor.fetchall():
                stat_type = row['stat_type']
                if stat_type not in statistics:
                    statistics[stat_type] = []
                statistics[stat_type].append(row['stat_value'])
            
            return DiseaseInformation(
                name=disease_row['name'],
                overview=disease_row['overview'] or '',
                symptoms=entities_by_type.get('symptoms', []),
                risk_factors=entities_by_type.get('risk_factors', []),
                prevention=entities_by_type.get('prevention', []),
                diagnosis=entities_by_type.get('diagnosis', []),
                treatment=entities_by_type.get('treatment', []),
                statistics=statistics,
                severity=disease_row['severity'],
                prevalence=disease_row['prevalence'],
                key_facts=[],  # Would need separate table for this
                source_metadata={'source_url': disease_row['source_url']},
                last_updated=datetime.fromisoformat(disease_row['last_updated']) if disease_row['last_updated'] else datetime.now(),
                data_quality_score=disease_row['data_quality_score']
            )

# Enhanced Asynchronous Scraper
class EnhancedWHOScraper:
    """
    The main scraping engine with advanced capabilities.
    This is like upgrading from a single medical researcher to a whole
    team of researchers working efficiently together.
    """
    
    def __init__(self, config: Optional[ScrapingConfig] = None):
        """Wire up the scraper's collaborators.

        Args:
            config: Scraping settings; a default ``ScrapingConfig`` is used
                when omitted.
        """
        self.config = config or ScrapingConfig()
        self.content_processor = MedicalContentProcessor(self.config.enable_nlp)
        # Uses MedicalDataManager's default database path; constructing it
        # initialises the database schema.
        self.data_manager = MedicalDataManager()
        # The HTTP session is created in __aenter__ and closed in __aexit__.
        self.session = None
        # URLs that scrape_urls has already handled; they are skipped on later calls.
        self.processed_urls = set()
        
    async def __aenter__(self):
        """Open the shared aiohttp session and return the scraper itself.

        The connection limit and the total timeout both come from the
        configuration; every request carries a fixed User-Agent header.
        """
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        self.session = aiohttp.ClientSession(
            connector=aiohttp.TCPConnector(limit=self.config.max_concurrent_requests),
            timeout=aiohttp.ClientTimeout(total=self.config.timeout),
            headers=request_headers,
        )
        return self
    
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit - cleans up the HTTP session"""
        if self.session:
            await self.session.close()
    
    async def scrape_urls(self, urls: List[str]) -> Dict[str, DiseaseInformation]:
        """
        Scrape multiple URLs concurrently with intelligent rate limiting.
        This is like coordinating a team of researchers to work on different
        diseases simultaneously while being respectful to the WHO servers.
        """
        semaphore = asyncio.Semaphore(self.config.max_concurrent_requests)
        results = {}
        
        # Create tasks for all URLs
        tasks = []
        for url in urls:
            if url not in self.processed_urls:
                task = self._scrape_with_semaphore(semaphore, url)
                tasks.append((url, task))
        
        # Process tasks and collect results
        for url, task in tasks:
            try:
                disease_info = await task
                if disease_info:
                    results[disease_info.name] = disease_info
                    # Save to database immediately
                    self.data_manager.save_disease(disease_info)
                    logger.info(f"Successfully processed: {disease_info.name}")
                self.processed_urls.add(url)
            except Exception as e:
                logger.error(f"Failed to process {url}: {e}")
        
        return results
    
    async def _scrape_with_semaphore(self, semaphore: asyncio.Semaphore, url: str) -> Optional[DiseaseInformation]:
        """Scrape a single URL with rate limiting"""
        async with semaphore:
            # Add respectful delay
            await asyncio.sleep(self.config.request_delay)
            return await self._scrape_single_url(url)
    
    async def _scrape_single_url(self, url: str) -> Optional[DiseaseInformation]:
        """
        Scrape a single URL with retry logic and comprehensive error handling.
        This method embodies the resilience of a dedicated researcher who
        doesn't give up easily when faced with technical difficulties.
        """
        for attempt in range(self.config.max_retries):
            try:
                logger.info(f"Scraping attempt {attempt + 1} for: {url}")
                
                async with self.session.get(url) as response:
                    if response.status != 200:
                        logger.warning(f"HTTP {response.status} for {url}")
                        continue
                    
                    content = await response.text()
                    
                    if len(content) > self.config.max_content_length:
                        logger.warning(f"Content too large for {url}, truncating")
                        content = content[:self.config.max_content_length]
                    
                    return await self._process_content(content, url)
            
            except asyncio.TimeoutError:
                logger.warning(f"Timeout for {url}, attempt {attempt + 1}")
            except Exception as e:
                logger.error(f"Error scraping {url}, attempt {attempt + 1}: {e}")
            
            if attempt < self.config.max_retries - 1:
                # Exponential backoff
                await asyncio.sleep(2 ** attempt)
        
        logger.error(f"Failed to scrape {url} after {self.config.max_retries} attempts")
        return None
    
    async def _process_content(self, content: str, url: str) -> Optional[DiseaseInformation]:
        """
        Turn a page's HTML into a ``DiseaseInformation`` record.

        Args:
            content: HTML text of the page.
            url: Source URL, recorded in the metadata and used in log messages.

        Returns:
            The assembled record, or None when BeautifulSoup is unavailable,
            no disease name could be determined, or processing raised.
        """
        try:
            if not BS4_AVAILABLE:
                logger.error("BeautifulSoup not available for content processing")
                return None

            soup = BeautifulSoup(content, 'html.parser')

            # Extract basic information
            disease_name = self._extract_disease_name(soup)
            overview = self._extract_overview(soup)

            # _extract_disease_name falls back to the "Unknown Disease"
            # placeholder instead of an empty value. Treat the placeholder as
            # "no name" too: otherwise this guard never fires and every
            # unnamed page is stored under the same unique name.
            if not disease_name or disease_name == "Unknown Disease":
                logger.warning(f"Could not extract disease name from {url}")
                return None

            # Extract content sections with better structure detection
            sections = self._extract_content_sections(soup)

            # Process each section and group the entities by category
            medical_entities = {}
            for section_title, section_content in sections.items():
                entities = self.content_processor.extract_medical_entities(
                    section_content, section_title
                )

                for entity in entities:
                    category = self._determine_entity_category(entity, section_title)
                    medical_entities.setdefault(category, []).append(entity)

            # Extract statistics and metadata
            statistics = self._extract_statistics(soup)
            severity = self._determine_severity(soup, medical_entities)
            prevalence = self._calculate_prevalence(statistics, disease_name)

            # Calculate overall data quality score
            quality_score = self._calculate_quality_score(
                medical_entities, statistics, overview, sections
            )

            # One timestamp so scraped_date and last_updated agree exactly.
            processed_at = datetime.now()

            return DiseaseInformation(
                name=disease_name,
                overview=overview,
                symptoms=medical_entities.get('symptoms', []),
                risk_factors=medical_entities.get('risk_factors', []),
                prevention=medical_entities.get('prevention', []),
                diagnosis=medical_entities.get('diagnosis', []),
                treatment=medical_entities.get('treatment', []),
                statistics=statistics,
                severity=severity,
                prevalence=prevalence,
                key_facts=self._extract_key_facts(soup),
                source_metadata={
                    'source_url': url,
                    'scraped_date': processed_at,
                    'source': 'World Health Organization (WHO)',
                    'processing_version': '2.0'
                },
                last_updated=processed_at,
                data_quality_score=quality_score
            )

        except Exception as e:
            # Best-effort: a page that cannot be processed is logged and skipped.
            logger.error(f"Error processing content from {url}: {e}")
            return None
    
    def _extract_disease_name(self, soup: BeautifulSoup) -> str:
        """Extract disease name with improved accuracy"""
        # Try multiple methods to find the disease name
        candidates = []
        
        # Method 1: Main heading
        h1 = soup.find('h1')
        if h1:
            candidates.append(h1.get_text().strip())
        
        # Method 2: Title tag
        title = soup.find('title')
        if title:
            title_text = title.get_text().strip()
            # Remove common WHO prefixes/suffixes
            title_text = re.sub(r'.*WHO.*?[-–]', '', title_text).strip()
            candidates.append(title_text)
        
        # Method 3: Look for fact sheet patterns
        fact_sheet_pattern = re.compile(r'fact sheet.*?[-–]\s*(.+)', re.IGNORECASE)
        for candidate in candidates:
            match = fact_sheet_pattern.search(candidate)
            if match:
                return match.group(1).strip()
        
        # Return the first non-empty candidate
        for candidate in candidates:
            if candidate and len(candidate) > 3:
                return candidate
        
        return "Unknown Disease"
    
    def _extract_overview(self, soup: BeautifulSoup) -> str:
        """
        Build a short overview (at most 500 characters) for the disease.

        For each overview-style keyword, the first matching <h2>/<h3> heading
        is located and the text of its following siblings is collected until
        the next h1/h2/h3. Only fragments longer than 20 characters are kept.
        If no such section yields text, the first of the page's first three
        paragraphs that exceeds 100 characters is used instead.

        Returns:
            The overview text, or an empty string when nothing suitable exists.
        """
        section_breaks = ('h1', 'h2', 'h3')

        for keyword in ('overview', 'introduction', 'about', 'definition'):
            heading = soup.find(['h2', 'h3'], string=re.compile(keyword, re.I))
            if not heading:
                continue

            pieces = []
            node = heading.next_sibling
            while node:
                # The next heading marks the end of this section
                if getattr(node, 'name', None) in section_breaks:
                    break
                if hasattr(node, 'get_text'):
                    snippet = node.get_text().strip()
                    if len(snippet) > 20:
                        pieces.append(snippet)
                node = node.next_sibling

            if pieces:
                return ' '.join(pieces)[:500]  # Cap the overview length

        # Fallback: first substantial paragraph among the first three
        for paragraph in soup.find_all('p')[:3]:
            body = paragraph.get_text().strip()
            if len(body) > 100:
                return body[:500]

        return ""
    
    def _extract_content_sections(self, soup: BeautifulSoup) -> Dict[str, str]:
        """
        Map each h2/h3/h4 heading's text to the text that follows it.

        For every heading, sibling nodes are walked until a heading of the
        same or a higher level (h1-h4) is reached. Text fragments longer than
        10 characters are joined with single spaces. Headings with empty text
        or no collected content are left out; a later heading with the same
        text replaces an earlier one.
        """
        heading_levels = {'h1': 1, 'h2': 2, 'h3': 3, 'h4': 4}
        grouped = {}

        for heading in soup.find_all(['h2', 'h3', 'h4']):
            title = heading.get_text().strip()
            if not title:
                continue

            own_level = int(heading.name[1])
            fragments = []
            node = heading.next_sibling

            while node:
                # A heading at the same or a higher level closes the section
                level = heading_levels.get(getattr(node, 'name', None))
                if level is not None and level <= own_level:
                    break

                if hasattr(node, 'get_text'):
                    snippet = node.get_text().strip()
                    if len(snippet) > 10:  # Skip very short fragments
                        fragments.append(snippet)

                node = node.next_sibling

            if fragments:
                grouped[title] = ' '.join(fragments)

        return grouped
    
    def _determine_entity_category(self, entity: MedicalEntity, section_title: str) -> str:
        """Determine the category of a medical entity with improved logic"""
        section_lower = section_title.lower()
        
        # Use section title as primary indicator
        if any(word in section_lower for word in ['symptom', 'sign', 'manifest']):
            return 'symptoms'
        elif any(word in section_lower for word in ['risk', 'factor', 'cause']):
            return 'risk_factors'
        elif any(word in section_lower for word in ['prevent', 'avoid']):
            return 'prevention'
        elif any(word in section_lower for word in ['diagnos', 'test', 'detect']):
            return 'diagnosis'
        elif any(word in section_lower for word in ['treat', 'therap', 'manage']):
            return 'treatment'
        
        # Fallback to content analysis
        content_lower = entity.content.lower()
        if any(word in content_lower for word in ['experience', 'feel', 'pain', 'ache']):
            return 'symptoms'
        elif any(word in content_lower for word in ['increase risk', 'more likely']):
            return 'risk_factors'
        elif any(word in content_lower for word in ['prevent', 'avoid', 'reduce risk']):
            return 'prevention'
        elif any(word in content_lower for word in ['test', 'diagnose', 'detect']):
            return 'diagnosis'
        elif any(word in content_lower for word in ['treat', 'medication', 'therapy']):
            return 'treatment'
        
        return 'general'  # Default category
    
    def _extract_statistics(self, soup: BeautifulSoup) -> Dict[str, List[str]]:
        """Enhanced statistics extraction with better pattern recognition"""
        statistics = {}
        text = soup.get_text()
        
        # Enhanced patterns for different types of statistics
        patterns = {
            'deaths': r'(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:million|thousand)?\s*(?:people\s*)?(?:die|death|mortality)',
            'affected': r'(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:million|billion|thousand)?\s*(?:people\s*)?(?:affected|living with|have)',
            'percentage': r'(\d+(?:\.\d+)?)\s*%',
            'prevalence': r'prevalence.*?(\d+(?:\.\d+)?\s*%)',
            'incidence': r'incidence.*?(\d+(?:,\d{3})*)',
            'cost': r'\$(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:billion|million|thousand)?'
        }
        
        for stat_type, pattern in patterns.items():
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                statistics[stat_type] = matches[:5]  # Limit to 5 matches per type
        
        return statistics
    
    def _determine_severity(self, soup: BeautifulSoup, medical_entities: Dict[str, List[MedicalEntity]]) -> str:
        """Enhanced severity determination using multiple indicators"""
        content = soup.get_text().lower()
        
        # High severity indicators with weights
        high_indicators = {
            'death': 3, 'fatal': 3, 'mortality': 2, 'life-threatening': 3,
            'emergency': 2, 'critical': 2, 'severe': 2, 'cancer': 2
        }
        
        # Moderate severity indicators
        moderate_indicators = {
            'chronic': 1, 'manage': 1, 'control': 1, 'treatment': 1,
            'medication': 1, 'therapy': 1, 'hospital': 2
        }
        
        # Calculate severity score
        high_score = sum(weight for word, weight in high_indicators.items() if word in content)
        moderate_score = sum(weight for word, weight in moderate_indicators.items() if word in content)
        
        # Consider entity confidence scores
        entity_severity = 0
        for entities in medical_entities.values():
            for entity in entities:
                if any(word in entity.content.lower() for word in high_indicators.keys()):
                    entity_severity += entity.confidence * 2
                elif any(word in entity.content.lower() for word in moderate_indicators.keys()):
                    entity_severity += entity.confidence
        
        total_score = high_score + entity_severity
        
        if total_score >= 5:
            return "High"
        elif total_score >= 2 or moderate_score >= 3:
            return "Moderate"
        else:
            return "Low"
    
    def _calculate_prevalence(self, statistics: Dict[str, List[str]], disease_name: str) -> float:
        """Calculate disease prevalence using multiple data sources"""
        # Try to extract from percentage statistics first
        if 'percentage' in statistics:
            try:
                # Find the most reasonable prevalence percentage
                percentages = [float(p.replace('%', '')) for p in statistics['percentage']]
                # Filter out unrealistic values
                reasonable_percentages = [p for p in percentages if 0.001 <= p <= 50]
                if reasonable_percentages:
                    return reasonable_percentages[0] / 100
            except ValueError:
                pass
        
        # Try to calculate from affected population
        if 'affected' in statistics:
            try:
                affected = statistics['affected'][0]
                # Simple estimation based on global population
                if 'million' in affected.lower():
                    millions = float(re.search(r'(\d+(?:\.\d+)?)', affected).group(1))
                    return millions / 8000  # Rough global population
            except (ValueError, AttributeError):
                pass
        
        # Disease-specific default estimates based on medical knowledge
        disease_lower = disease_name.lower()
        default_prevalences = {
            'diabetes': 0.11, 'hypertension': 0.22, 'depression': 0.05,
            'cancer': 0.03, 'tuberculosis': 0.01, 'hiv': 0.01,
            'hepatitis': 0.02, 'covid': 0.073
        }
        
        for disease, prevalence in default_prevalences.items():
            if disease in disease_lower:
                return prevalence
        
        return 0.05  # Default 5% prevalence
    
    def _extract_key_facts(self, soup: BeautifulSoup) -> List[str]:
        """
        Collect up to eight key facts from the page.

        If an <h2>/<h3> heading matching "key fact(s)" exists, the items of
        the next <ul>/<ol> after it are used (items longer than 10
        characters). When that yields nothing, the first five items of each
        of the page's first two lists are used instead (items longer than 20
        characters).
        """
        def long_items(items, min_length):
            # Stripped text of each <li>, keeping only those above min_length
            texts = (item.get_text().strip() for item in items)
            return [text for text in texts if len(text) > min_length]

        facts = []

        # Preferred source: the list that follows an explicit "Key facts" heading
        heading = soup.find(['h2', 'h3'], string=re.compile(r'key facts?', re.I))
        if heading:
            fact_list = heading.find_next(['ul', 'ol'])
            if fact_list:
                facts.extend(long_items(fact_list.find_all('li'), 10))

        # Fallback source: leading items of the first two lists on the page
        if not facts:
            for candidate_list in soup.find_all(['ul', 'ol'])[:2]:
                facts.extend(long_items(candidate_list.find_all('li')[:5], 20))

        return facts[:8]  # Limit to 8 key facts
    
    def _calculate_quality_score(self, medical_entities: Dict[str, List[MedicalEntity]], 
                                statistics: Dict[str, List[str]], overview: str, 
                                sections: Dict[str, str]) -> float:
        """
        Calculate overall data quality score.
        This is like a medical expert reviewing the completeness and
        reliability of the extracted information.
        """
        score = 0.0
        max_score = 10.0
        
        # Content completeness (40% of score)
        required_sections = ['symptoms', 'treatment', 'diagnosis']
        found_sections = sum(1 for section in required_sections if medical_entities.get(section))
        score += (found_sections / len(required_sections)) * 4.0
        
        # Entity confidence (30% of score)
        if medical_entities:
            all_entities = [entity for entities in medical_entities.values() for entity in entities]
            if all_entities:
                avg_confidence = sum(entity.confidence for entity in all_entities) / len(all_entities)
                score += avg_confidence * 3.0
        
        # Statistics presence (15% of score)
        if statistics:
            score += min(len(statistics) / 3.0, 1.0) * 1.5
        
        # Overview quality (10% of score)
        if overview and len(overview) > 50:
            score += 1.0
        
        # Section diversity (5% of score)
        if len(sections) >= 5:
            score += 0.5
        
        return min(score / max_score, 1.0)

# Main execution function
async def main():
    """
    Run the full extraction pipeline and print a summary report.

    Steps: build the scraping configuration, read the target URLs from
    ``link-disease.txt`` (lines must start with "http"; blank lines and
    "#" comments are ignored), scrape them with EnhancedWHOScraper, then
    print one report entry per disease. Returns early, after logging an
    error, if the URL file is missing or cannot be read.
    """
    # Conservative settings so the WHO servers are not hammered
    config = ScrapingConfig(
        max_concurrent_requests=3,
        request_delay=2.0,
        enable_nlp=NLTK_AVAILABLE,
        max_retries=3
    )

    # Load the list of pages to scrape
    source_file = Path('link-disease.txt')
    try:
        if not source_file.exists():
            logger.error("link-disease.txt not found")
            return

        with open(source_file, 'r', encoding='utf-8') as handle:
            stripped_lines = (raw.strip() for raw in handle)
            urls = [
                entry for entry in stripped_lines
                if entry and not entry.startswith('#') and entry.startswith('http')
            ]

        logger.info(f"Loaded {len(urls)} URLs for processing")

    except Exception as e:
        logger.error(f"Error reading URLs: {e}")
        return

    # Scrape everything inside the scraper's managed context
    async with EnhancedWHOScraper(config) as scraper:
        logger.info("Starting enhanced medical knowledge extraction...")

        results = await scraper.scrape_urls(urls)

        logger.info(f"Successfully processed {len(results)} diseases")

        # Summary report
        rule = "=" * 80
        print("\n" + rule)
        print("ENHANCED MEDICAL KNOWLEDGE EXTRACTION REPORT")
        print(rule)

        # (report label, DiseaseInformation attribute) for the entity counts
        entity_rows = (
            ("Symptoms", "symptoms"),
            ("Risk Factors", "risk_factors"),
            ("Prevention", "prevention"),
            ("Diagnosis", "diagnosis"),
            ("Treatment", "treatment"),
        )

        for disease_name, info in results.items():
            print(f"\nDisease: {disease_name}")
            print(f"Quality Score: {info.data_quality_score:.2f}")
            print(f"Severity: {info.severity}")
            print(f"Prevalence: {info.prevalence:.3f}")
            print("Entities Extracted:")
            for label, attribute in entity_rows:
                print(f"  - {label}: {len(getattr(info, attribute))}")

            if info.symptoms:
                print(f"  Top Symptom: {info.symptoms[0].content[:100]}...")

        print("\nData saved to medical_knowledge.db")
        print(rule)

# Script entry point: start the async pipeline only when this file is executed
# directly, not when it is imported as a module.
# NOTE(review): asyncio.run() raises RuntimeError if an event loop is already
# running (e.g. inside a Jupyter kernel) — confirm how this file is executed.
if __name__ == "__main__":
    asyncio.run(main())