In [1]:
# webscraping libraries
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np 
import json

In [2]:
base_url = "https://icd.who.int/browse10/2010/en#/"

chapters = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 
            'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'XVII', 'XVIII', 
            'XIX', 'XX', 'XXI', 'XXII']


**Objective**

We want to extract specific data from the website's structure: https://icd.who.int/browse10/2010/en#/XXII

* **Chapter Title (Broad diagnosis category)**: This is the text content of the first `<h2>` element on the page. 
* **ICD Code (First 3 characters/digits)**:  Located within a `dl` element with the class "BlockList". More specifically, it's within the text content of an `a` element nested under a `li` with class "BlockList1 elements".
* **Block Title (Description of the code category)**:  Also found within the same `dl` element (class "BlockList"). It's the `title` attribute of the same `a` element that contains the ICD Code. 

**HTML Structure to Target**

```html
<dl class="Blocklist">
    <ul>
        <li class="BlockList1 elements">
            <a element title="Block title (string title of the code a category)" > Block code (first 3 letters/digits of ICD-code)</a>
        </li>
        </ul>
</dl>
```

**Key Points**

* The Chapter Title provides a high-level overview of the medical diagnoses covered in that section.
* The ICD Codes are standardized codes used for classifying diseases and health conditions. We're only interested in the first 3 characters/digits for categorization purposes.
* The Block Title gives a more detailed description of what that specific ICD code represents.


In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [4]:
driver = webdriver.Chrome()
data = []

for chapter in chapters:
    # go to each url
    driver.get(base_url + chapter)

    # wait for the dynamic content to load (adjust the timeout as needed)
    wait = WebDriverWait(driver, 10)  # Wait up to 10 seconds
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'BlockList')))

    # turn page source into bs4 soup
    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'html.parser')

    #print(soup.prettify())

    # find chapter title
    chapter_h2 = soup.find('div', class_='Chapter').h2
    chapter_title = list(chapter_h2.stripped_strings)[1]

    # isolate the block code and block titles for each diagnosis under the chapter
    block_list = soup.find('dl', class_='BlockList').ul
    for li in block_list.find_all('li'):
        block_code = li.a.text
        block_title = li.a['title']
    
        data.append((chapter_title, block_code, block_title))

driver.close()

In [5]:
import pandas as pd
import numpy as np

In [6]:
data = [list(row) for row in data]

df = pd.DataFrame(data, columns = ['category', 'block_code', 'block_title'])

In [7]:
df.category.unique()

array(['Certain infectious and parasitic diseases',
       'Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism',
       'Endocrine, nutritional and metabolic diseases',
       'Mental and behavioural disorders',
       'Diseases of the nervous system', 'Diseases of the eye and adnexa',
       'Diseases of the ear and mastoid process',
       'Diseases of the circulatory system',
       'Diseases of the respiratory system',
       'Diseases of the digestive system',
       'Diseases of the genitourinary system',
       'Pregnancy, childbirth and the puerperium'], dtype=object)

In [12]:
df

Unnamed: 0,category,block_code,block_title
0,Certain infectious and parasitic diseases,A00-A09,Intestinal infectious diseases
1,Certain infectious and parasitic diseases,A15-A19,Tuberculosis
2,Certain infectious and parasitic diseases,A20-A28,Certain zoonotic bacterial diseases
3,Certain infectious and parasitic diseases,A30-A49,Other bacterial diseases
4,Certain infectious and parasitic diseases,A50-A64,Infections with a predominantly sexual mode of...
...,...,...,...
265,"Pregnancy, childbirth and the puerperium",O30-O48,Maternal care related to the fetus and amnioti...
266,"Pregnancy, childbirth and the puerperium",O60-O75,Complications of labour and delivery
267,"Pregnancy, childbirth and the puerperium",O80-O84,Delivery
268,"Pregnancy, childbirth and the puerperium",O85-O92,Complications predominantly related to the pue...


In [14]:
df.category.nunique()

12

In [15]:
df['block_title'].sample(3)

34                                 Other viral diseases
73    Viral infections characterized by skin and muc...
67    Infections with a predominantly sexual mode of...
Name: block_title, dtype: object

#### Lets save the df to a csv file

In [16]:
df.to_csv("disease_categories.csv")

# Searching for statistics based on Broad Disease Category

In [17]:
# we can prompt tune here

system_content = (
    "You are a medical research assistant specializing in epidemiology and public health statistics. "
    "Your task is to provide accurate, up-to-date numerical data on disease categories, focusing on prevalence, incidence, mortality, and economic impact. "
    "Use reliable sources such as WHO, CDC, NIH, and peer-reviewed medical journals. "
    "Present data in percentages, rates per 100,000 population, or absolute numbers as specified. "
    "If exact data for a specific point is not available, provide the closest relevant estimate. "
    "You must respond only with valid, RFC8259 compliant JSON. Do not include any explanations or text outside of the JSON structure."
)

user_content_template = (
    "Provide the most recent statistics for the '{category}' category in JSON format. Include the following data points:\n"
    "1. Worldwide prevalence (percentage of global population affected)\n"
    "2. US prevalence (percentage of US population affected)\n"
    "3. Global incidence (new cases per 100,000 population per year)\n"
    "4. US incidence (new cases per 100,000 population per year)\n"
    "5. Global mortality (total deaths per year)\n"
    "6. US mortality (total deaths per year)\n"
    "7. Annual global healthcare costs (in billions of USD)\n"
    "Use 0 for any value that is unavailable or not applicable.\n\n"
    "Respond with a JSON object in the following format:\n"
    "{{\n"
    '  "category": "{{category}}",\n'
    '  "worldwide_prevalence_percent": 0,\n'
    '  "us_prevalence_percent": 0,\n'
    '  "global_incidence_per_100k": 0,\n'
    '  "us_incidence_per_100k": 0,\n'
    '  "global_mortality_per_year": 0,\n'
    '  "us_mortality_per_year": 0,\n'
    '  "annual_healthcare_costs_usd_billions": 0\n'
    "}}"
)

In [18]:
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv()

YOUR_API_KEY = os.environ['PERPLEXITY_API_KEY']

messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant. Please respond with a JSON object.",
    },
    {
        "role": "user",
        "content": user_content_template.format(category='Diseases of the circulatory system'),
    },
]

client = OpenAI(api_key=YOUR_API_KEY, base_url="https://api.perplexity.ai")

# chat completion with JSON response format
response = client.chat.completions.create(
    model="llama-3.1-sonar-small-128k-online",
    messages=messages,
)
print(response)

ChatCompletion(id='7f249503-594c-4e60-a11b-3d9b145543f3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='```json\n{\n  "category": "Diseases of the circulatory system",\n  "worldwide_prevalence_percent": 0,\n  "us_prevalence_percent": 0,\n  "global_incidence_per_100k": 0,\n  "us_incidence_per_100k": 0,\n  "global_mortality_per_year": 0,\n  "us_mortality_per_year": 702880,\n  "annual_healthcare_costs_usd_billions": 252.2\n}\n```\n\nNote: The global prevalence, global incidence, and global mortality rates are not provided in the sources. Therefore, these values are set to 0. The US mortality rate and annual healthcare costs are based on the data from the CDC.', refusal=None, role='assistant', function_call=None, tool_calls=None), delta={'role': 'assistant', 'content': ''})], created=1726600677, model='llama-3.1-sonar-small-128k-online', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(complet

In [26]:
print(response.choices[0].message.content)

```json
{
  "category": "Diseases of the circulatory system",
  "worldwide_prevalence_percent": 0,
  "us_prevalence_percent": 0,
  "global_incidence_per_100k": 0,
  "us_incidence_per_100k": 0,
  "global_mortality_per_year": 0,
  "us_mortality_per_year": 702880,
  "annual_healthcare_costs_usd_billions": 252.2
}
```

Note: The global prevalence, global incidence, and global mortality rates are not provided in the sources. Therefore, these values are set to 0. The US mortality rate and annual healthcare costs are based on the data from the CDC.


In [33]:
import json

def extract_data(response):
    # Extract the JSON content from the response
    try:
        # Use regex to find the JSON object within the response content
        json_str = re.search(r'\{.*\}', response.choices[0].message.content, re.DOTALL).group(0)
        data = json.loads(json_str)
    except (json.JSONDecodeError, AttributeError):
        print("Error decoding JSON response")
        data = {}
    return data

def update_dataframe(df, category, data):
    # If the DataFrame is empty, initialize it with columns
    if df.empty:
        df = pd.DataFrame(columns=['disease_category'] + list(data.keys()))
    
    # Create a new row with the current data
    new_row = pd.DataFrame({'disease_category': [category], **{k: [v] for k, v in data.items()}})
    
    # Append the new row to the DataFrame
    df = pd.concat([df, new_row], ignore_index=True)
    
    return df

In [34]:
df = pd.DataFrame()

data = extract_data(response)
new_df = update_dataframe(df, 'Diseases of the Eye and Adnexa', data)


In [35]:
new_df

Unnamed: 0,disease_category,category,worldwide_prevalence_percent,us_prevalence_percent,global_incidence_per_100k,us_incidence_per_100k,global_mortality_per_year,us_mortality_per_year,annual_healthcare_costs_usd_billions
0,Diseases of the Eye and Adnexa,Diseases of the circulatory system,0,0,0,0,0,702880,252.2


## Putting it all together

In [54]:
def collect_disease_data(categories, model="llama-3.1-sonar-large-128k-online"):
    load_dotenv()
    YOUR_API_KEY = os.environ['PERPLEXITY_API_KEY']
    client = OpenAI(api_key=YOUR_API_KEY, base_url="https://api.perplexity.ai")

    df = pd.DataFrame()

    system_content = (
        "You are a medical research assistant specializing in epidemiology and public health statistics. "
        "Your task is to provide accurate, up-to-date numerical data on disease categories, focusing on prevalence, incidence, mortality, and economic impact. "
        "Use reliable sources such as WHO, CDC, NIH, and peer-reviewed medical journals. "
        "Present data in percentages, rates per 100,000 population, or absolute numbers as specified. "
        "If exact data for a specific point is not available, provide the closest relevant estimate. "
        "You must respond only with valid, RFC8259 compliant JSON. Do not include any explanations or text outside of the JSON structure."
    )

    user_content_template = (
        "Provide the most recent statistics for the '{category}' category in JSON format. Include the following data points:\n"
        "1. Worldwide prevalence (percentage of global population affected)\n"
        "2. US prevalence (percentage of US population affected)\n"
        "3. Global incidence (new cases per 100,000 population per year)\n"
        "4. US incidence (new cases per 100,000 population per year)\n"
        "5. Global mortality (total deaths per year)\n"
        "6. US mortality (total deaths per year)\n"
        "7. Annual global healthcare costs (in billions of USD)\n"
        "Use 0 for any value that is unavailable or not applicable.\n\n"
        "Respond with a JSON object in the following format:\n"
        "{{\n"
        '  "category": "{category}",\n'
        '  "worldwide_prevalence_percent": value,\n'
        '  "us_prevalence_percent": value,\n'
        '  "global_incidence_per_100k": value,\n'
        '  "us_incidence_per_100k": value,\n'
        '  "global_mortality_per_year": value,\n'
        '  "us_mortality_per_year": value,\n'
        '  "annual_healthcare_costs_usd_billions": value\n'
        "}}"
    )

    def extract_data(response):
        # Directly parse the JSON content from the response
        try:
            json_str = re.search(
                r'\{.*\}', response.choices[0].message.content, re.DOTALL).group(0)
            data = json.loads(json_str)
        except json.JSONDecodeError:
            print("Error decoding JSON response")
            data = {}
        return data

    def update_dataframe(df, category, data):
        if df.empty:
            df = pd.DataFrame(columns=['disease_category'] + list(data.keys()))
        new_row = pd.DataFrame(
            {'disease_category': [category], **{k: [v] for k, v in data.items()}})
        df = pd.concat([df, new_row], ignore_index=True)
        return df

    for category in categories:
        messages = [
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content_template.format(
                category=category)}
        ]

        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
            )

            data = extract_data(response)
            df = update_dataframe(df, category, data)
            print(f"Data collected for: {category}")
        except Exception as e:
            print(f"Error collecting data for {category}: {str(e)}")

    return df

In [55]:
categories = [
    'Certain infectious and parasitic diseases',
    'Neoplasms',
    'Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism',
    'Diseases of the nervous system',
    'Diseases of the eye and adnexa',
    'Diseases of the digestive system',
    'Diseases of the skin and subcutaneous tissue',
    'Diseases of the musculoskeletal system and connective tissue',
    'Diseases of the genitourinary system'
]

# Updated categories with more search-friendly names
renamed_categories = [
    'Infectious diseases',
    'Cancers (Neoplasms)',
    'Blood disorders and immune disorders',
    'Neurological disorders',
    'Eye diseases',
    'Digestive diseases',
    'Skin diseases',
    'Musculoskeletal disorders',
    'Genitourinary disorders'
]

In [57]:
result_df = collect_disease_data(renamed_categories, model="llama-3.1-sonar-huge-128k-online")

result_df

Data collected for: Infectious diseases
Data collected for: Cancers (Neoplasms)
Data collected for: Blood disorders and immune disorders
Data collected for: Neurological disorders
Data collected for: Eye diseases
Data collected for: Digestive diseases
Data collected for: Skin diseases
Data collected for: Musculoskeletal disorders
Data collected for: Genitourinary disorders


Unnamed: 0,disease_category,category,worldwide_prevalence_percent,us_prevalence_percent,global_incidence_per_100k,us_incidence_per_100k,global_mortality_per_year,us_mortality_per_year,annual_healthcare_costs_usd_billions
0,Infectious diseases,Infectious diseases,0.0,0.0,0.0,0.0,0,0,0.0
1,Cancers (Neoplasms),Cancers (Neoplasms),0.0,0.0,186.5,440.5,9700000,611720,208.9
2,Blood disorders and immune disorders,Blood disorders and immune disorders,10.0,15.625,0.0,0.0,0,0,0.0
3,Neurological disorders,Neurological disorders,43.1,0.0,0.0,0.0,11100000,0,0.0
4,Eye diseases,Eye diseases,28.5,0.0,0.0,0.0,0,0,411.0
5,Digestive diseases,Digestive diseases,0.0,0.0,0.0,0.0,2560000,0,0.0
6,Skin diseases,Skin diseases,37.5,0.0,6251.19,0.0,0,0,0.0
7,Musculoskeletal disorders,Musculoskeletal disorders,6.32,40.96,0.0,0.0,83100,0,0.0
8,Genitourinary disorders,Genitourinary disorders,0.0,0.0,0.0,0.0,0,0,0.0


In [78]:
# Export to CSV
result_df.to_csv('disease_data.csv', index=False)