In [2]:
import requests
from bs4 import BeautifulSoup
import json
import re
from datetime import datetime
from urllib.parse import urljoin, urlparse
import time
import logging

# Set up logging: configure the root logger to emit INFO-and-above records as
# "<timestamp> - <level> - <message>", and create the module-level logger that
# the scraper class and main() write to.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class WHOMedicalScraper:
    def __init__(self):
        """Create the HTTP session and an empty knowledge base.

        The session sends a desktop-browser ``User-Agent`` header with every
        request. ``knowledge_base`` starts empty and is filled by
        ``scrape_who_factsheet``.
        """
        browser_agent = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        )
        http_session = requests.Session()
        http_session.headers['User-Agent'] = browser_agent

        self.session = http_session
        self.knowledge_base = {}
        
    def scrape_who_factsheet(self, url):
        """Scrape a WHO fact sheet and extract structured medical information.

        The page is downloaded with the shared session (10 second timeout),
        parsed with BeautifulSoup and handed to ``_extract_disease_info``.
        The result is stored in ``self.knowledge_base`` under the lower-cased
        disease name with spaces replaced by underscores.

        Args:
            url: Address of the fact-sheet page to download.

        Returns:
            The extracted disease data dict, or ``None`` when downloading or
            parsing fails (the error is logged rather than raised).
        """
        try:
            logger.info(f"Scraping: {url}")
            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract disease information
            disease_data = self._extract_disease_info(soup, url)

            # 'name' is always present in disease_data (as '' when no title
            # was found), so a plain .get() default would never apply and the
            # page would be stored under the empty key. Fall back explicitly.
            disease_name = disease_data.get('name') or 'Unknown'
            kb_key = disease_name.lower().replace(' ', '_')
            self.knowledge_base[kb_key] = disease_data

            logger.info(f"Successfully scraped: {disease_name}")
            return disease_data

        except Exception as e:
            # Best-effort boundary: one bad page must not abort a batch run.
            logger.error(f"Error scraping {url}: {str(e)}")
            return None
    
    def _extract_disease_info(self, soup, url):
        """Build the structured record for one parsed fact-sheet page.

        Args:
            soup: Parsed page (BeautifulSoup document).
            url: Address the page was fetched from; used for metadata and for
                resolving relative links.

        Returns:
            A dict holding metadata, the disease name, key facts, the
            categorized medical fields, statistics, related links and the raw
            content sections. ``who_response`` is always left empty here.
        """
        # The page heading is preferred; the <title> tag is the fallback.
        heading = soup.find('h1') or soup.find('title')
        sections = self._extract_content_sections(soup)

        # Key order is kept stable because the record is later dumped to JSON.
        record = {
            'metadata': self._extract_metadata(soup, url),
            'name': self._clean_text(heading.get_text()) if heading else '',
            'overview': '',
            'key_facts': self._extract_key_facts(soup),
            'symptoms': [],
            'risk_factors': [],
            'prevention': [],
            'diagnosis': [],
            'treatment': [],
            'statistics': self._extract_statistics(soup),
            'who_response': '',
            'related_links': self._extract_links(soup, url),
            'content_sections': sections,
        }

        # Overwrite the empty medical fields with whatever the section
        # headings allowed us to categorize.
        record.update(self._parse_medical_content(sections))
        return record
    
    def _extract_metadata(self, soup, url):
        """Extract page metadata"""
        metadata = {
            'source_url': url,
            'scraped_date': datetime.now().isoformat(),
            'source': 'World Health Organization (WHO)',
            'last_updated': None,
            'language': 'en',
            'page_title': '',
            'meta_description': '',
            'canonical_url': url
        }
        
        # Extract page title
        title_tag = soup.find('title')
        if title_tag:
            metadata['page_title'] = self._clean_text(title_tag.get_text())
        
        # Extract meta description
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc:
            metadata['meta_description'] = meta_desc.get('content', '')
        
        # Look for last updated date
        date_patterns = [
            r'(\d{1,2}\s+\w+\s+\d{4})',  # 12 April 2023
            r'(\w+\s+\d{1,2},\s+\d{4})',  # April 12, 2023
            r'(\d{4}-\d{2}-\d{2})'  # 2023-04-12
        ]
        
        page_text = soup.get_text()
        for pattern in date_patterns:
            matches = re.findall(pattern, page_text)
            if matches:
                metadata['last_updated'] = matches[0]
                break
        
        return metadata
    
    def _extract_key_facts(self, soup):
        """Extract key facts from the page"""
        key_facts = []
        
        # Look for key facts section
        key_facts_section = soup.find(['div', 'section'], string=re.compile(r'key facts?', re.I))
        if not key_facts_section:
            key_facts_section = soup.find(['h2', 'h3'], string=re.compile(r'key facts?', re.I))
            if key_facts_section:
                key_facts_section = key_facts_section.find_next(['div', 'ul', 'ol'])
        
        if key_facts_section:
            # Extract from lists
            for li in key_facts_section.find_all('li'):
                fact = self._clean_text(li.get_text())
                if fact and len(fact) > 10:  # Filter out very short items
                    key_facts.append(fact)
        
        # If no specific key facts section, extract from first few bullet points
        if not key_facts:
            all_lists = soup.find_all(['ul', 'ol'])[:3]  # First 3 lists
            for ul in all_lists:
                for li in ul.find_all('li')[:5]:  # First 5 items per list
                    fact = self._clean_text(li.get_text())
                    if fact and len(fact) > 10:
                        key_facts.append(fact)
        
        return key_facts[:10]  # Limit to top 10 key facts
    
    def _extract_content_sections(self, soup):
        """Extract main content sections with headings"""
        sections = {}
        
        # Find all headings (h2, h3, h4)
        headings = soup.find_all(['h2', 'h3', 'h4'])
        
        for heading in headings:
            heading_text = self._clean_text(heading.get_text())
            if not heading_text:
                continue
                
            # Get content between this heading and the next
            content_elements = []
            current = heading.next_sibling
            
            while current:
                if hasattr(current, 'name') and current.name in ['h2', 'h3', 'h4']:
                    break
                if hasattr(current, 'get_text'):
                    text = self._clean_text(current.get_text())
                    if text and len(text) > 20:  # Only meaningful content
                        content_elements.append(text)
                current = current.next_sibling
            
            if content_elements:
                sections[heading_text] = content_elements
        
        return sections
    
    def _parse_medical_content(self, content_sections):
        """Parse medical information from content sections"""
        medical_info = {
            'symptoms': [],
            'risk_factors': [],
            'prevention': [],
            'diagnosis': [],
            'treatment': [],
            'overview': ''
        }
        
        # Keywords to identify different types of medical information
        symptom_keywords = ['symptom', 'sign', 'manifest', 'present', 'appear', 'experience']
        risk_keywords = ['risk', 'factor', 'cause', 'associate', 'increase', 'likely']
        prevention_keywords = ['prevent', 'avoid', 'reduce', 'lifestyle', 'diet', 'exercise']
        diagnosis_keywords = ['diagnos', 'test', 'screen', 'detect', 'measure', 'check']
        treatment_keywords = ['treat', 'manage', 'therap', 'medic', 'drug', 'intervention']
        
        for section_title, content_list in content_sections.items():
            section_title_lower = section_title.lower()
            combined_content = ' '.join(content_list)
            
            # Categorize based on section title
            if any(keyword in section_title_lower for keyword in symptom_keywords):
                medical_info['symptoms'].extend(self._extract_list_items(combined_content))
            elif any(keyword in section_title_lower for keyword in risk_keywords):
                medical_info['risk_factors'].extend(self._extract_list_items(combined_content))
            elif any(keyword in section_title_lower for keyword in prevention_keywords):
                medical_info['prevention'].extend(self._extract_list_items(combined_content))
            elif any(keyword in section_title_lower for keyword in diagnosis_keywords):
                medical_info['diagnosis'].extend(self._extract_list_items(combined_content))
            elif any(keyword in section_title_lower for keyword in treatment_keywords):
                medical_info['treatment'].extend(self._extract_list_items(combined_content))
            elif 'overview' in section_title_lower or 'introduction' in section_title_lower:
                medical_info['overview'] = combined_content
        
        return medical_info
    
    def _extract_list_items(self, text):
        """Extract list items from text (sentences that could be symptoms, risk factors, etc.)"""
        items = []
        
        # Split by common delimiters
        sentences = re.split(r'[.;:](?:\s|$)', text)
        
        for sentence in sentences:
            cleaned = self._clean_text(sentence)
            if cleaned and 10 < len(cleaned) < 200:  # Reasonable length
                items.append(cleaned)
        
        return items[:10]  # Limit number of items
    
    def _extract_statistics(self, soup):
        """Extract numerical statistics from the page"""
        statistics = {}
        text = soup.get_text()
        
        # Common statistical patterns
        stat_patterns = [
            (r'(\d+(?:\.\d+)?)\s*million', 'millions'),
            (r'(\d+(?:\.\d+)?)\s*billion', 'billions'),
            (r'(\d+(?:\.\d+)?)\s*%', 'percentage'),
            (r'(\d+(?:,\d{3})*)', 'numbers'),
        ]
        
        for pattern, stat_type in stat_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                statistics[stat_type] = matches[:5]  # Top 5 matches
        
        return statistics
    
    def _extract_links(self, soup, base_url):
        """Extract relevant links from the page"""
        links = []
        
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            link_text = self._clean_text(a_tag.get_text())
            
            # Convert relative URLs to absolute
            full_url = urljoin(base_url, href)
            
            # Filter for relevant links (WHO, medical resources)
            if any(domain in full_url for domain in ['who.int', 'nih.gov', 'cdc.gov']) and link_text:
                links.append({
                    'url': full_url,
                    'text': link_text
                })
        
        return links[:10]  # Limit number of links
    
    def _clean_text(self, text):
        """Clean and normalize text"""
        if not text:
            return ""
        
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text.strip())
        
        # Remove special characters that might interfere
        text = re.sub(r'[^\w\s\-.,;:()\[\]{}]', '', text)
        
        return text
    
    def scrape_multiple_urls(self, urls, delay=2):
        """Scrape multiple WHO fact sheets"""
        results = {}
        
        for i, url in enumerate(urls):
            logger.info(f"Processing URL {i+1}/{len(urls)}: {url}")
            
            result = self.scrape_who_factsheet(url)
            if result:
                disease_name = result.get('name', f'disease_{i}')
                results[disease_name] = result
            
            # Add delay between requests to be respectful
            if i < len(urls) - 1:
                time.sleep(delay)
        
        return results
    
    def format_for_chatbot(self):
        """Format the knowledge base for chatbot use"""
        chatbot_data = []
        
        for disease_key, disease_data in self.knowledge_base.items():
            # Extract severity from content or statistics
            severity = self._determine_severity(disease_data)
            
            # Extract prevalence from statistics
            prevalence = self._extract_prevalence(disease_data)
            
            # Generate advice from prevention and treatment info
            advice = self._generate_advice(disease_data)
            
            formatted_disease = {
                "name": disease_data.get('name', '').strip(),
                "symptoms": disease_data.get('symptoms', [])[:10],  # Limit to 10
                "tests": disease_data.get('diagnosis', [])[:5],  # Limit to 5
                "treatments": disease_data.get('treatment', [])[:5],  # Limit to 5
                "severity": severity,
                "advice": advice,
                "prevalence": prevalence
            }
            
            chatbot_data.append(formatted_disease)
        
        return chatbot_data
    
    def _determine_severity(self, disease_data):
        """Determine disease severity based on content"""
        name = disease_data.get('name', '').lower()
        content = ' '.join([
            ' '.join(disease_data.get('symptoms', [])),
            ' '.join(disease_data.get('key_facts', [])),
            disease_data.get('overview', '')
        ]).lower()
        
        # High severity indicators
        high_indicators = ['death', 'fatal', 'mortality', 'life-threatening', 'emergency', 'critical']
        # Moderate severity indicators  
        moderate_indicators = ['chronic', 'manage', 'control', 'treatment', 'medication']
        # Low severity indicators
        low_indicators = ['mild', 'minor', 'temporary', 'self-limiting']
        
        if any(indicator in content for indicator in high_indicators):
            return "High"
        elif any(indicator in content for indicator in moderate_indicators):
            return "Moderate"
        elif any(indicator in content for indicator in low_indicators):
            return "Low"
        else:
            return "Moderate"  # Default
    
    def _extract_prevalence(self, disease_data):
        """Extract prevalence from statistics"""
        stats = disease_data.get('statistics', {})
        
        # Look for percentages first
        if 'percentage' in stats and stats['percentage']:
            try:
                # Take the first percentage and convert to decimal
                pct = float(stats['percentage'][0].replace('%', ''))
                return round(pct / 100, 3)
            except:
                pass
        
        # Look for millions/billions
        if 'millions' in stats and stats['millions']:
            try:
                millions = float(stats['millions'][0])
                # Rough global prevalence calculation (assuming world population ~8 billion)
                return round(millions / 8000, 3)
            except:
                pass
        
        # Default prevalence based on disease type
        name = disease_data.get('name', '').lower()
        if 'diabetes' in name:
            return 0.11  # ~11% global prevalence
        elif 'hypertension' in name or 'blood pressure' in name:
            return 0.22  # ~22% global prevalence
        elif 'cancer' in name:
            return 0.05  # ~5% lifetime risk
        else:
            return 0.10  # Default 10%
    
    def _generate_advice(self, disease_data):
        """Generate advice from prevention and treatment information"""
        advice_parts = []
        
        # From prevention
        prevention = disease_data.get('prevention', [])
        if prevention:
            advice_parts.extend(prevention[:2])  # First 2 prevention tips
        
        # From treatment
        treatment = disease_data.get('treatment', [])
        if treatment:
            advice_parts.extend(treatment[:1])  # First treatment option
        
        # Default advice if nothing found
        if not advice_parts:
            advice_parts = ["Consult a healthcare professional for proper diagnosis and treatment."]
        
        return ' '.join(advice_parts)[:200]  # Limit length
    
    def save_knowledge_base(self, filename='who_medical_knowledge_base.json'):
        """Write the raw knowledge base to ``filename`` as indented UTF-8 JSON.

        Returns:
            ``True`` on success; ``False`` if anything went wrong (the error
            is logged, not raised).
        """
        try:
            with open(filename, mode='w', encoding='utf-8') as out_file:
                json.dump(self.knowledge_base, out_file, ensure_ascii=False, indent=2)
            logger.info(f"Knowledge base saved to {filename}")
        except Exception as err:
            logger.error(f"Error saving knowledge base: {str(err)}")
            return False
        return True
    
    def save_chatbot_format(self, filename='chatbot_medical_data.json'):
        """Format the knowledge base for the chatbot and write it to ``filename``.

        Returns:
            The list of chatbot records that was written, or ``[]`` if
            formatting or writing failed (the error is logged, not raised).
        """
        try:
            payload = self.format_for_chatbot()
            with open(filename, mode='w', encoding='utf-8') as out_file:
                json.dump(payload, out_file, ensure_ascii=False, indent=2)
            logger.info(f"Chatbot data saved to {filename}")
        except Exception as err:
            logger.error(f"Error saving chatbot data: {str(err)}")
            return []
        return payload
    
    def get_chatbot_questions(self, disease_name):
        """Generate potential chatbot questions based on extracted information"""
        if disease_name not in self.knowledge_base:
            return []
        
        disease_data = self.knowledge_base[disease_name]
        questions = []
        
        # Generate questions based on symptoms
        if disease_data.get('symptoms'):
            questions.append({
                'category': 'symptoms',
                'question': f"Are you experiencing any of these symptoms related to {disease_data['name']}?",
                'options': disease_data['symptoms'][:5],
                'type': 'multiple_choice'
            })
        
        # Generate questions based on risk factors
        if disease_data.get('risk_factors'):
            questions.append({
                'category': 'risk_factors',
                'question': f"Do any of these risk factors apply to you?",
                'options': disease_data['risk_factors'][:5],
                'type': 'multiple_choice'
            })
        
        return questions

# Main execution
def main():
    """Scrape every URL listed in link-disease.txt and report the results.

    Steps: read the URL list (blank lines and ``#`` comment lines are
    skipped), keep only http(s) URLs, scrape them, save the raw knowledge
    base and the chatbot-formatted data as JSON, then print the chatbot data
    and a per-disease report to stdout. Returns early, after logging and
    printing an error, when the URL file is missing, unreadable or contains
    no valid URL.
    """
    # Read URLs from link-disease.txt
    urls = []
    try:
        with open('link-disease.txt', 'r', encoding='utf-8') as f:
            # Strip before testing for '#', so indented comment lines are
            # skipped as well.
            stripped_lines = (line.strip() for line in f)
            urls = [line for line in stripped_lines if line and not line.startswith('#')]
        logger.info(f"Loaded {len(urls)} URLs from link-disease.txt")
    except FileNotFoundError:
        logger.error("Error: link-disease.txt not found")
        print("Error: link-disease.txt not found. Please ensure the file exists in the current directory.")
        return
    except Exception as e:
        logger.error(f"Error reading link-disease.txt: {str(e)}")
        print(f"Error reading link-disease.txt: {str(e)}")
        return

    # Validate URLs: require a real scheme prefix, not just text that
    # happens to start with "http".
    valid_urls = [url for url in urls if url.startswith(('http://', 'https://'))]
    if not valid_urls:
        logger.error("No valid URLs found in link-disease.txt")
        print("Error: No valid URLs found in link-disease.txt. Please check the file contents.")
        return

    # Initialize scraper
    scraper = WHOMedicalScraper()

    # Scrape all URLs
    logger.info("Starting WHO medical fact sheet scraping...")
    results = scraper.scrape_multiple_urls(valid_urls)

    # Save knowledge base
    scraper.save_knowledge_base('who_medical_knowledge_base.json')

    # Save and print chatbot formatted data
    chatbot_data = scraper.save_chatbot_format('chatbot_medical_data.json')

    # Print formatted data for chatbot
    print("\n=== CHATBOT FORMATTED DATA ===")
    print(json.dumps(chatbot_data, indent=2, ensure_ascii=False))

    def print_numbered(title, items, limit):
        # Print a section title followed by the first ``limit`` items, numbered from 1.
        print(f"\n{title}:")
        for position, item in enumerate(items[:limit], 1):
            print(f"  {position}. {item}")

    # (section title, record key, maximum number of items shown)
    list_sections = (
        ('KEY FACTS', 'key_facts', 5),
        ('SYMPTOMS', 'symptoms', 8),
        ('RISK FACTORS', 'risk_factors', 5),
        ('PREVENTION', 'prevention', 5),
        ('DIAGNOSIS/TESTS', 'diagnosis', 5),
        ('TREATMENT', 'treatment', 5),
    )
    separator = '=' * 50

    # Print detailed information for each disease
    print("\n=== DETAILED EXTRACTED DATA ===")
    for disease_name, data in results.items():
        print(f"\n{separator}")
        print(f"DISEASE: {disease_name}")
        print(separator)

        # 'overview' is always present (as '' when nothing was found), so use
        # ``or`` to make the fallback text reachable, and only add an
        # ellipsis when the text was actually cut.
        overview = data.get('overview') or 'Not available'
        print("\nOVERVIEW:")
        print(overview[:300] + ('...' if len(overview) > 300 else ''))

        for title, key, limit in list_sections:
            print_numbered(title, data.get(key, []), limit)

        print("\nSTATISTICS:")
        stats = data.get('statistics', {})
        for stat_type, values in stats.items():
            print(f"  {stat_type.title()}: {', '.join(str(value) for value in values[:3])}")

        print("\nMETADATA:")
        metadata = data.get('metadata', {})
        print(f"  Source: {metadata.get('source', 'N/A')}")
        print(f"  URL: {metadata.get('source_url', 'N/A')}")
        print(f"  Last Updated: {metadata.get('last_updated', 'N/A')}")
        print(f"  Scraped: {metadata.get('scraped_date', 'N/A')}")

    print(f"\n{separator}")
    print("SUMMARY:")
    print(f"Total diseases processed: {len(scraper.knowledge_base)}")
    print("Files saved:")
    print("  - who_medical_knowledge_base.json (raw data)")
    print("  - chatbot_medical_data.json (formatted for chatbot)")
    print(separator)

# Run the scraper only when this code is the entry point, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

2025-05-26 09:40:11,568 - INFO - Loaded 8 URLs from link-disease.txt
2025-05-26 09:40:11,568 - INFO - Starting WHO medical fact sheet scraping...
2025-05-26 09:40:11,568 - INFO - Processing URL 1/8: https://www.who.int/news-room/fact-sheets/detail/depression
2025-05-26 09:40:11,568 - INFO - Scraping: https://www.who.int/news-room/fact-sheets/detail/depression
2025-05-26 09:40:13,833 - INFO - Successfully scraped: Depressive disorder (depression)
2025-05-26 09:40:15,842 - INFO - Processing URL 2/8: https://www.who.int/news-room/fact-sheets/detail/hiv-aids
2025-05-26 09:40:15,842 - INFO - Scraping: https://www.who.int/news-room/fact-sheets/detail/hiv-aids
2025-05-26 09:40:17,713 - INFO - Successfully scraped: HIV and AIDS
2025-05-26 09:40:19,724 - INFO - Processing URL 3/8: http://who.int/news-room/fact-sheets/detail/tuberculosis
2025-05-26 09:40:19,724 - INFO - Scraping: http://who.int/news-room/fact-sheets/detail/tuberculosis
2025-05-26 09:40:21,193 - INFO - Successfully scraped: Tuber


=== CHATBOT FORMATTED DATA ===
[
  {
    "name": "Depressive disorder (depression)",
    "symptoms": [
      "During a depressive episode, a person experiences a depressed mood (feeling sad, irritable, empty)",
      "They may feel a loss of pleasure or interest in activities",
      "A depressive episode is different from regular mood fluctuations",
      "They last most of the day, nearly every day, for at least two weeks",
      "Other symptoms are also present, which may include",
      "Depression can cause difficulties in all aspects of life, including in the community and at home, work and school",
      "A depressive episode can be categorized as mild, moderate, or severe depending on the number and severity of symptoms, as well as the impact on the individuals functioning",
      "There are different patterns of depressive episodes including",
      "single episode depressive disorder, meaning the persons first and only episode;recurrent depressive disorder, meaning the perso