In [1]:
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import string
import time
import random

In [2]:
def get_driver():
    """Build a Chrome WebDriver with the command-line flags used by the scrapers.

    Returns:
        A new ``webdriver.Chrome`` instance. The caller owns it and must call
        ``quit()`` when done.
    """
    chrome_flags = (
        "--start-maximized",  # open the browser window maximized
        # Meant to suppress certificate errors.
        # NOTE(review): Chrome's "-spki-list" variant normally expects a list of
        # key hashes; confirm whether plain "--ignore-certificate-errors" was intended.
        "--ignore-certificate-errors-spki-list",
        "--ignore-ssl-errors",  # meant to suppress SSL errors
    )

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    return webdriver.Chrome(options=chrome_options)

In [3]:
# The letters 'a'..'z'; scrape_healthhub_links appends each one to the A-Z base URL.
lowercase_letters = [*string.ascii_lowercase]
print(lowercase_letters)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [4]:
def scrape_healthhub_links():
    """Collect the title and URL of every link on the HealthHub A-Z index pages.

    One page is visited per entry in the module-level ``lowercase_letters``
    (``https://www.healthhub.sg/a-z/<letter>``). Every ``<a>`` tag found inside
    the index container of that page becomes one row.

    Returns:
        pandas.DataFrame: columns ``title``, ``text`` and ``link``. ``text`` is
        always the empty string here; it is filled in later by
        ``scrape_healthhub_text``.

    Raises:
        selenium.common.exceptions.TimeoutException: raised by
            ``WebDriverWait.until`` when the index container is not present
            within 15 seconds. The browser is still closed in that case.
    """
    columns = ['title', 'text', 'link']
    base_url = 'https://www.healthhub.sg/a-z/'  # one index page per letter
    # Absolute XPath of the container that holds all article links on an index page.
    index_xpath = '/html/body/div/div[2]/div/div/div[1]/div/div/div/div[1]/div/div[1]/div/section/div/div[2]/div[2]/div[1]'

    # Rows are accumulated in a plain list and turned into a DataFrame once at
    # the end; concatenating a one-row frame per link is quadratic.
    records = []

    driver = get_driver()
    try:
        wait = WebDriverWait(driver, 15)  # explicit wait for the index container
        # The implicit wait is a session-wide setting, so it only needs to be set once.
        # NOTE(review): Selenium's docs advise against mixing implicit and explicit
        # waits; it is kept to preserve the existing timing of find_elements below.
        driver.implicitly_wait(15)

        for letter in lowercase_letters:
            driver.get(base_url + letter)

            containers = wait.until(
                EC.presence_of_all_elements_located((By.XPATH, index_xpath))
            )
            for container in containers:
                for a_tag in container.find_elements(By.TAG_NAME, 'a'):
                    records.append([a_tag.text, '', a_tag.get_attribute('href')])

            # Randomised pause (3-13 s) between index pages.
            time.sleep(3 + 10 * random.random())
    finally:
        # Always release the browser, even if a page load or wait failed.
        driver.quit()

    return pd.DataFrame(records, columns=columns)

In [7]:
def _text_is_missing(value):
    """Return True when a ``text`` cell holds no scraped content (NaN/None or empty)."""
    # bool(float('nan')) is True, so NaN must be checked explicitly.
    return bool(pd.isna(value)) or not value


def scrape_healthhub_text(df):
    """Fill the ``text`` column of ``df`` by visiting each row's ``link``.

    Rows whose ``text`` is already populated are skipped, so an interrupted run
    can be resumed from the snapshot CSV. ``df`` is modified in place and also
    returned.

    Args:
        df: pandas.DataFrame with at least the columns ``text`` and ``link``.
            Empty ``text`` cells may be ``''`` or NaN (the latter is what
            ``pd.read_csv`` produces for empty fields).

    Returns:
        pandas.DataFrame: the same ``df`` object with ``text`` filled in.

    Raises:
        selenium.common.exceptions.TimeoutException: raised by
            ``WebDriverWait.until`` when the summary or body element is not
            present within 15 seconds. Progress up to that article has already
            been written to ``healthhub_links_with_text.csv`` and the browser
            is closed.
    """
    # After a CSV round trip an all-empty text column is float64 NaN; make it an
    # object column so scraped strings can be stored without a dtype conflict.
    df['text'] = df['text'].astype(object)

    pending = df['text'].map(_text_is_missing)
    if not pending.any():
        # Nothing left to scrape: do not launch a browser at all.
        return df

    summary_xpath = '/html/body/div/div[2]/div/div/section[2]/div/div/div[2]/div/p'
    body_xpath = '/html/body/div/div[2]/div/div/section[2]/div/div/div[2]/div/div[1]'

    driver = get_driver()
    try:
        wait = WebDriverWait(driver, 15)  # explicit wait for article elements
        # The implicit wait is a session-wide setting, so it only needs to be set once.
        driver.implicitly_wait(15)

        for index, link in df.loc[pending, 'link'].items():
            driver.get(link)

            text = ''

            title = wait.until(EC.presence_of_element_located((By.XPATH, summary_xpath)))
            text += title.text + '\n'

            body = wait.until(EC.presence_of_element_located((By.XPATH, body_xpath)))
            text += body.text + '\n'

            df.at[index, 'text'] = text

            # Snapshot after every article so a crash loses at most one page.
            df.to_csv('healthhub_links_with_text.csv', index=False)

            # Randomised pause (5-8 s) between articles.
            time.sleep(5 + 3 * random.random())
    finally:
        # Always release the browser, even if a page load or wait failed.
        driver.quit()

    return df
    
    

In [5]:
##########################################################################
# Only run if you need to remake the csv file containing links but no text
##########################################################################

#first_df = scrape_healthhub_links() #only contains title and link of each article
#first_df.to_csv('healthhub_links_no_text.csv', index=False) #save to csv

In [6]:
# Resume from the snapshot that scrape_healthhub_text writes after every article
# when it exists; on a first run fall back to the links-only CSV instead of
# requiring a manual comment toggle.
try:
    first_df = pd.read_csv('healthhub_links_with_text.csv')
except FileNotFoundError:
    first_df = pd.read_csv('healthhub_links_no_text.csv')

In [8]:
##########################################################################
# Only run if you need to remake the csv file containing links with text
##########################################################################

#final_df = scrape_healthhub_text(first_df)
#final_df.to_csv('healthhub_data.csv', index=False) #save to csv