In [58]:
# webscraping libraries
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np 

In [59]:
base_url = "https://icd.who.int/browse10/2010/en#/"

chapters = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 
            'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'XVII', 'XVIII', 
            'XIX', 'XX', 'XXI', 'XXII']


**Objective**

We want to extract specific data from the website's structure:

* **Chapter Title (Broad diagnosis category)**: This is the text content of the first `<h2>` element on the page. 
* **ICD Code (First 3 characters/digits)**:  Located within a `dl` element with the class "BlockList". More specifically, it's within the text content of an `a` element nested under a `li` with class "BlockList1 elements".
* **Block Title (Description of the code category)**:  Also found within the same `dl` element (class "BlockList"). It's the `title` attribute of the same `a` element that contains the ICD Code. 

**HTML Structure to Target**

```html
<dl class="Blocklist">
    <ul>
        <li class="BlockList1 elements">
            <a element title="Block title (string title of the code a category)" > Block code (first 3 letters/digits of ICD-code)</a>
        </li>
        </ul>
</dl>
```

**Key Points**

* The Chapter Title provides a high-level overview of the medical diagnoses covered in that section.
* The ICD Codes are standardized codes used for classifying diseases and health conditions. We're only interested in the first 3 characters/digits for categorization purposes.
* The Block Title gives a more detailed description of what that specific ICD code represents.


In [60]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [61]:
driver = webdriver.Chrome()
data = []

for chapter in chapters:
    # go to each url
    driver.get(base_url + chapter)

    # wait for the dynamic content to load (adjust the timeout as needed)
    wait = WebDriverWait(driver, 10)  # Wait up to 10 seconds
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'BlockList')))

    # turn page source into bs4 soup
    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'html.parser')

    #print(soup.prettify())

    # find chapter title
    chapter_h2 = soup.find('div', class_='Chapter').h2
    chapter_title = list(chapter_h2.stripped_strings)[1]

    # isolate the block code and block titles for each diagnosis under the chapter
    block_list = soup.find('dl', class_='BlockList').ul
    for li in block_list.find_all('li'):
        block_code = li.a.text
        block_title = li.a['title']
    
        data.append((chapter_title, block_code, block_title))

driver.close()

In [62]:
import pandas as pd
import numpy as np

In [63]:
data = [list(row) for row in data]

df = pd.DataFrame(data, columns = ['category', 'block_code', 'block_title'])

In [64]:
df.category.unique()

array(['Certain infectious and parasitic diseases',
       'Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism',
       'Endocrine, nutritional and metabolic diseases',
       'Mental and behavioural disorders',
       'Diseases of the nervous system', 'Diseases of the eye and adnexa',
       'Diseases of the ear and mastoid process',
       'Diseases of the circulatory system',
       'Diseases of the respiratory system',
       'Diseases of the skin and subcutaneous tissue',
       'Diseases of the musculoskeletal system and connective tissue',
       'Pregnancy, childbirth and the puerperium',
       'Certain conditions originating in the perinatal period'],
      dtype=object)

In [65]:
df

Unnamed: 0,category,block_code,block_title
0,Certain infectious and parasitic diseases,A00-A09,Intestinal infectious diseases
1,Certain infectious and parasitic diseases,A15-A19,Tuberculosis
2,Certain infectious and parasitic diseases,A20-A28,Certain zoonotic bacterial diseases
3,Certain infectious and parasitic diseases,A30-A49,Other bacterial diseases
4,Certain infectious and parasitic diseases,A50-A64,Infections with a predominantly sexual mode of...
...,...,...,...
293,Certain conditions originating in the perinata...,P50-P61,Haemorrhagic and haematological disorders of f...
294,Certain conditions originating in the perinata...,P70-P74,Transitory endocrine and metabolic disorders s...
295,Certain conditions originating in the perinata...,P75-P78,Digestive system disorders of fetus and newborn
296,Certain conditions originating in the perinata...,P80-P83,Conditions involving the integument and temper...


# Searching for statistics based on Broad Disease Category

In [67]:
# we can prompt tune here

system_content = (
        "You are a medical research assistant specializing in epidemiology and public health statistics. "
        "Your task is to provide accurate, up-to-date numerical data on disease categories, focusing on prevalence, incidence, mortality, and economic impact. "
        "Use reliable sources such as WHO, CDC, NIH, and peer-reviewed medical journals. "
        "Present data in percentages, rates per 100,000 population, or absolute numbers as specified. "
        "If exact data for a specific point is not available, provide the closest relevant estimate and note this in your response. "
        "At the end of your response, provide a summary of key data points in the specified format."
    )

user_content_template = (
        "Provide the most recent statistics for the '{category}' category. Focus only on the following data points:\n"
        "1. Worldwide prevalence (percentage of global population affected)\n"
        "2. US prevalence (percentage of US population affected)\n"
        "3. Global incidence (new cases per 100,000 population per year)\n"
        "4. US incidence (new cases per 100,000 population per year)\n"
        "5. Global mortality (total deaths per year)\n"
        "6. US mortality (total deaths per year)\n"
        "7. Annual global healthcare costs (in billions of USD)\n"
        "Include brief notes on data sources or limitations for each point.\n\n"
        "At the end of your response, summarize these data points in the following format:\n"
        "('worldwide_prevalence_percent': value)\n"
        "('us_prevalence_percent': value)\n"
        "('global_incidence_per_100k': value)\n"
        "('us_incidence_per_100k': value)\n"
        "('global_mortality_per_year': value)\n"
        "('us_mortality_per_year': value)\n"
        "('annual_healthcare_costs_usd_billions': value)\n"
        "Use 0 for any value that is unavailable or not applicable, and explain in your notes."
)

In [70]:
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv()

YOUR_API_KEY = os.environ['PERPLEXITY_API_KEY']

messages = [
    {
        "role": "system",
        "content": system_content,
    },
    {
        "role": "user",
        "content": user_content_template.format(category='Diseases of the circulatory system'),
    },
]

client = OpenAI(api_key=YOUR_API_KEY, base_url="https://api.perplexity.ai")

# chat completion without streaming
response = client.chat.completions.create(
    model="llama-3.1-sonar-large-128k-online",
    messages=messages,
)
print(response)

ChatCompletion(id='e5558a89-6e4a-42e6-9e33-055157a9ea25', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="### Statistics for Diseases of the Circulatory System\n\n1. **Worldwide Prevalence (Percentage of Global Population Affected)**\n   - **Data:** Not directly available from the provided sources. However, it is known that cardiovascular diseases (CVDs) are a significant health issue globally, but specific prevalence percentages are not provided in the sources.\n   - **Note:** The WHO fact sheet provides mortality and impact data but does not specify the prevalence percentage of the global population affected by CVDs.\n\n2. **US Prevalence (Percentage of US Population Affected)**\n   - **Data:** 4.9% of adults in the US have ever been diagnosed with coronary heart disease.\n   - **Note:** This figure is specific to coronary heart disease and does not encompass all circulatory system diseases.\n\n3. **Global Incidence (New Cases per 

In [71]:
print(response.choices[0].message.content)

### Statistics for Diseases of the Circulatory System

1. **Worldwide Prevalence (Percentage of Global Population Affected)**
   - **Data:** Not directly available from the provided sources. However, it is known that cardiovascular diseases (CVDs) are a significant health issue globally, but specific prevalence percentages are not provided in the sources.
   - **Note:** The WHO fact sheet provides mortality and impact data but does not specify the prevalence percentage of the global population affected by CVDs.

2. **US Prevalence (Percentage of US Population Affected)**
   - **Data:** 4.9% of adults in the US have ever been diagnosed with coronary heart disease.
   - **Note:** This figure is specific to coronary heart disease and does not encompass all circulatory system diseases.

3. **Global Incidence (New Cases per 100,000 Population per Year)**
   - **Data:** Not available from the provided sources. Incidence rates for circulatory diseases are not specified in the sources.
   - **

In [72]:
def extract_data(response):
    # Find the data section at the end of the response
    data_section = response.split("At the end of your response, summarize key numerical data points")[-1]
    
    # Use regex to find all ('column_name': value) pairs
    pattern = r"\('([^']+)':\s*([^)]+)\)"
    matches = re.findall(pattern, data_section)
    
    # Convert matches to a dictionary
    data_dict = {key: value for key, value in matches}
    
    return data_dict

def update_dataframe(df, category, data):
    # If the DataFrame is empty, initialize it with columns
    if df.empty:
        df = pd.DataFrame(columns=['disease_category'] + list(data.keys()))
    
    # Create a new row with the current data
    new_row = pd.DataFrame({'disease_category': [category], **{k: [v] for k, v in data.items()}})
    
    # Append the new row to the DataFrame
    df = pd.concat([df, new_row], ignore_index=True)
    
    return df

In [73]:
df = pd.DataFrame()

data = extract_data(response.choices[0].message.content)

new_df = update_dataframe(df, 'Diseases of the Eye and Adnexa', data)

In [74]:
new_df

Unnamed: 0,disease_category,worldwide_prevalence_percent,us_prevalence_percent,global_incidence_per_100k,us_incidence_per_100k,global_mortality_per_year,us_mortality_per_year,annual_healthcare_costs_usd_billions
0,Diseases of the Eye and Adnexa,0,4.9,0,0,17900000,702880,0


## Putting it all together

In [75]:
def collect_disease_data(categories, model = "llama-3.1-sonar-large-128k-online"):
    load_dotenv()
    YOUR_API_KEY = os.environ['PERPLEXITY_API_KEY']
    client = OpenAI(api_key=YOUR_API_KEY, base_url="https://api.perplexity.ai")

    df = pd.DataFrame()

    system_content = (
            "You are a medical research assistant specializing in epidemiology and public health statistics. "
            "Your task is to provide accurate, up-to-date numerical data on disease categories, focusing on prevalence, incidence, mortality, and economic impact. "
            "Use reliable sources such as WHO, CDC, NIH, and peer-reviewed medical journals. "
            "Present data in percentages, rates per 100,000 population, or absolute numbers as specified. "
            "If exact data for a specific point is not available, provide the closest relevant estimate and note this in your response. "
            "At the end of your response, provide a summary of key data points in the specified format."
        )
    
    user_content_template = (
            "Provide the most recent statistics for the '{category}' category. Focus only on the following data points:\n"
            "1. Worldwide prevalence (percentage of global population affected)\n"
            "2. US prevalence (percentage of US population affected)\n"
            "3. Global incidence (new cases per 100,000 population per year)\n"
            "4. US incidence (new cases per 100,000 population per year)\n"
            "5. Global mortality (total deaths per year)\n"
            "6. US mortality (total deaths per year)\n"
            "7. Annual global healthcare costs (in billions of USD)\n"
            "Include brief notes on data sources or limitations for each point.\n\n"
            "At the end of your response, summarize these data points in the following format:\n"
            "('worldwide_prevalence_percent': value)\n"
            "('us_prevalence_percent': value)\n"
            "('global_incidence_per_100k': value)\n"
            "('us_incidence_per_100k': value)\n"
            "('global_mortality_per_year': value)\n"
            "('us_mortality_per_year': value)\n"
            "('annual_healthcare_costs_usd_billions': value)\n"
            "Use 0 for any value that is unavailable or not applicable, and explain in your notes."
    )

    def extract_data(response):
        data_section = response.split("At the end of your response, summarize key numerical data points")[-1]
        pattern = r"\('([^']+)':\s*([^)]+)\)"
        matches = re.findall(pattern, data_section)
        data_dict = {key: value for key, value in matches}
        return data_dict

    def update_dataframe(df, category, data):
        if df.empty:
            df = pd.DataFrame(columns=['disease_category'] + list(data.keys()))
        new_row = pd.DataFrame({'disease_category': [category], **{k: [v] for k, v in data.items()}})
        df = pd.concat([df, new_row], ignore_index=True)
        return df

    for category in categories:
        messages = [
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content_template.format(category=category)}
        ]

        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
            )
            
            response_text = response.choices[0].message.content
            data = extract_data(response_text)
            df = update_dataframe(df, category, data)
            print(f"Data collected for: {category}")
        except Exception as e:
            print(f"Error collecting data for {category}: {str(e)}")

    return df

In [76]:
categories = [
    'Certain infectious and parasitic diseases',
    'Neoplasms',
    'Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism',
    'Diseases of the nervous system',
    'Diseases of the eye and adnexa',
    'Diseases of the digestive system',
    'Diseases of the skin and subcutaneous tissue',
    'Diseases of the musculoskeletal system and connective tissue',
    'Diseases of the genitourinary system'
]

In [77]:
result_df = collect_disease_data(categories)

result_df

Data collected for: Certain infectious and parasitic diseases
Data collected for: Neoplasms
Data collected for: Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism
Data collected for: Diseases of the nervous system
Data collected for: Diseases of the eye and adnexa
Data collected for: Diseases of the digestive system
Data collected for: Diseases of the skin and subcutaneous tissue
Data collected for: Diseases of the musculoskeletal system and connective tissue
Data collected for: Diseases of the genitourinary system


Unnamed: 0,disease_category,worldwide_prevalence_percent,us_prevalence_percent,global_incidence_per_100k,us_incidence_per_100k,global_mortality_per_year,us_mortality_per_year,annual_healthcare_costs_usd_billions
0,Certain infectious and parasitic diseases,0.0,0.0,0.0,0,3500000,75000,0
1,Neoplasms,6.0,0.0,178.9,0,10000000,0,0
2,Diseases of the blood and blood-forming organs...,0.0,0.0,0.0,0,0,0,0
3,Diseases of the nervous system,43.1,0.0,0.0,0,11000000,0,0
4,Diseases of the eye and adnexa,0.0,0.0,0.0,0,0,0,0
5,Diseases of the digestive system,3.0,19.5,0.0,0,2560000,245921,0
6,Diseases of the skin and subcutaneous tissue,38.0,0.0,5930.0,0,98522,0,0
7,Diseases of the musculoskeletal system and con...,22.0,0.0,0.0,0,83100,0,0
8,Diseases of the genitourinary system,0.0,0.0,0.0,0,0,0,0
