## MediPal -- Data Source ETL pipeline

In [None]:
import re
import json
from pathlib import Path
import time
import random
from urllib.parse import urljoin
import requests
import warnings
import langid
from bs4 import BeautifulSoup

### In this section, I build a data pipeline that scrapes medicine information from a data source. At this stage, I use only one public source.

The pipeline crawls the website from the base page down to its child pages, going three levels deep to reach the drug data.

1. The process visits the drug information page to collect the URLs of the index pages for drug names beginning with A to Z.

2. Then, the process visits each index page in turn to collect every drug's URL and saves them to a list.

3. Finally, it visits each drug's URL and extracts the drug's information.

In [99]:
# Shared configuration used by every request in this notebook.

# Root of the site being crawled.
base_source_url = "https://medlineplus.gov"

# Entry page of the crawl, resolved against the site root.
start_source_url = urljoin(base_source_url, "druginformation.html")

# HTTP request headers sent with every request.
header = {
    # A browser-like User-Agent helps avoid basic bot filters.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

In [None]:
# Collect the URLs of the per-letter (A to Z) drug index pages.
def get_az_drug_index_urls():
    """
    Fetch the start page and return the absolute URLs of the A-Z drug index pages.

    Only links inside the <div id="az-section2"> element are considered, and
    only those whose href ends in "druginfo/drug_<capital letter>a.html".

    Returns:
        A list of absolute URLs; empty when the container div is not found.

    Raises:
        requests.HTTPError: if the start page responds with an error status.
    """
    response = requests.get(start_source_url, headers=header, timeout=20)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # All the index links live inside this section of the page.
    container = page.find("div", id="az-section2")
    if not container:
        return []

    # Keep only the hrefs that point at a per-letter index page.
    index_link = re.compile(r"druginfo/drug_[A-Z]a\.html$")
    return [
        urljoin(base_source_url, anchor["href"])
        for anchor in container.find_all("a", href=True)
        if index_link.search(anchor["href"])
    ]

In [83]:
# Fetch the per-letter index page URLs from the start page (performs a network request).
a_to_z_urls = get_az_drug_index_urls()

In [85]:
# Preview the first three index page URLs.
a_to_z_urls[:3]

['https://medlineplus.gov/druginfo/drug_Aa.html',
 'https://medlineplus.gov/druginfo/drug_Ba.html',
 'https://medlineplus.gov/druginfo/drug_Ca.html']

In [91]:
def extract_drug_urls(a_to_z_urls):
    """
    Visit each index page and gather the URLs of the individual drug pages.

    For every page, the links inside <ul id="index"> are resolved against the
    page's own URL. A page that fails for any reason is reported with print()
    and skipped, so one bad page does not stop the crawl.

    Args:
        a_to_z_urls: Iterable of index page URLs to visit.

    Returns:
        A list of absolute drug page URLs, in the order they were found.
    """
    collected = []

    for url in a_to_z_urls:
        try:
            response = requests.get(url, headers=header, timeout=20)
            response.raise_for_status()
            listing = BeautifulSoup(response.text, "html.parser").find("ul", id="index")

            if listing:
                collected.extend(
                    urljoin(url, link["href"])
                    for link in listing.find_all("a", href=True)
                )
                # Random pause between pages that yielded links.
                time.sleep(random.uniform(1, 3))
        except Exception as e:
            print(f"Failed to process {url}: {e}")

    return collected

In [92]:
# Crawl every index page to collect the individual drug page URLs.
all_drug_urls = extract_drug_urls(a_to_z_urls)

In [93]:
# Number of drug page URLs collected.
len(all_drug_urls)

7391

In [94]:
# Preview the first three drug page URLs.
all_drug_urls[:3]

['https://medlineplus.gov/druginfo/meds/a606008.html',
 'https://medlineplus.gov/druginfo/meds/a601105.html',
 'https://medlineplus.gov/druginfo/meds/a607073.html']

In [None]:
def _parse_drug_page(html, url, item_id):
    """
    Build one drug record from the HTML of a drug page.

    Args:
        html: Page markup as text.
        url: Address the markup was fetched from; stored in the record.
        item_id: Value stored under the record's "id" key.

    Returns:
        A dict with keys "id", "drug_name", "pronunciation", "url" and
        "subtitles". "subtitles" is a list of {"title", "content"} dicts where
        "content" is the raw HTML of the section body.
    """
    soup = BeautifulSoup(html, "html.parser")

    # drug name
    h1 = soup.find("h1", class_="with-also")
    drug_name = h1.get_text(strip=True) if h1 else ""

    # pronunciation
    pron_el = soup.find("span", id="d-pronunciation")
    pronunciation = pron_el.get_text(strip=True) if pron_el else ""

    # sections
    subtitles = []
    for head in soup.find_all("div", class_="section-header"):
        title_div = head.find("div", class_="section-title")
        h2 = title_div.find("h2") if title_div else None
        title_text = h2.get_text(strip=True) if h2 else ""
        if not title_text:
            # A header without a title cannot be stored as a section; skip it
            # instead of failing on a missing <h2>.
            continue
        if "Brand names" in title_text:
            # Sections from "Brand names" onwards are not collected.
            break

        # Prefer the body the header's button explicitly points to, and fall
        # back to the next sibling section body.
        body_div = None
        btn = head.find("button", attrs={"aria-controls": True})
        if btn:
            body_id = btn.get("aria-controls")
            if body_id:
                body_div = soup.find("div", id=body_id, class_="section-body")
        if body_div is None:
            body_div = head.find_next_sibling("div", class_="section-body")

        if body_div:
            subtitles.append({
                "title": title_text,
                "content": str(body_div),
            })

    return {
        "id": item_id,
        "drug_name": drug_name,
        "pronunciation": pronunciation,
        "url": url,
        "subtitles": subtitles,  # list of {title, content}
    }


def scrape_all_drugs(all_drug_urls, max_attempts=3):
    """
    Visit each URL, scrape data, random-sleep between requests, and return a list of dicts.

    Each URL is requested up to max_attempts times. A request that fails
    (connection error, timeout, or HTTP error status) is retried after a
    pause that grows with the attempt number. If every attempt fails, the URL
    is reported with print() and skipped, so a single bad page does not throw
    away the results already collected.

    Args:
        all_drug_urls: Iterable of drug page URLs.
        max_attempts: Maximum number of requests per URL (default 3).

    Returns:
        A list of drug records (see _parse_drug_page). A record's "id" is the
        1-based position of its URL in all_drug_urls, so skipped URLs leave
        gaps in the ids.
    """
    out = []
    for i, url in enumerate(all_drug_urls, start=1):
        for attempt in range(1, max_attempts + 1):
            try:
                resp = requests.get(url, headers=header, timeout=20)
                resp.raise_for_status()
            except requests.RequestException as e:
                if attempt == max_attempts:
                    print(f"Failed to process {url}: {e}")
                # Back off a little longer after each failed attempt.
                time.sleep(random.uniform(1, 3) * attempt)
                continue

            out.append(_parse_drug_page(resp.text, url, i))
            # random delay to avoid anti-bot policies
            time.sleep(random.uniform(1, 3))
            break  # success -> exit retry loop
    return out

In [None]:
# Scrape every collected drug page and report how many records were returned.
data = scrape_all_drugs(all_drug_urls)
print(f"Total scraped: {len(data)}")

Total scraped: 7391


##### We finally got 7391 drugs' info.
##### The pipeline took 5 hours to crawl, because I made it sleep a random 1 - 3 seconds after extracting each drug to imitate human behavior.
##### Now I have to save it locally so that I don't need to scrape again.

In [113]:
# Build the path from components so the separator is right on every OS;
# a literal backslash is a path separator only on Windows.
out_path = Path("datasets") / "medlineplus_drugs.json"
out_path.parent.mkdir(parents=True, exist_ok=True)  # make folder if needed

# ensure_ascii=False writes non-ASCII characters as-is into the UTF-8 file.
with out_path.open("w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)