In [2]:
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


import datetime
from datetime import timedelta
from dateutil import tz
import time
import random
import json
import os

import undetected_chromedriver as uc

In [3]:
# Snapshot date: the previous calendar day in Singapore time, as YYYYMMDD.
_sg_now = datetime.datetime.now(tz=tz.gettz('Asia/Singapore'))
cur_date = (_sg_now - timedelta(days=1)).strftime('%Y%m%d')

# Raw-data folders for this snapshot: one for the symbol -> URL map,
# one for the per-fund JSON files.
urls_output_dir = f'../data/raw/{cur_date}/urls'
funds_output_dir = f'../data/raw/{cur_date}/funds'

# Create both folders up front; already-existing folders are left alone.
for _raw_dir in (urls_output_dir, funds_output_dir):
    os.makedirs(_raw_dir, exist_ok=True)

In [4]:
# Stealth-patched Chrome driver used for scraping the fund pages.
def create_driver(debug=True):
    """Start an undetected-chromedriver Chrome session and apply selenium-stealth.

    Args:
        debug: When truthy (the default) Chrome is started with a visible
            window. When falsy, ``--headless`` is added to the options.

    Returns:
        The ``uc.Chrome`` driver instance after ``stealth`` has been applied.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--start-maximized")
    if not debug:
        options.add_argument("--headless")

    driver = uc.Chrome(options=options)

    # Override the browser properties selenium-stealth exposes so the session
    # reports these fixed language/vendor/platform/WebGL values.
    stealth(
        driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )
    return driver

# Scrape Fund URL

In [187]:
def scrape_fund_url(output_dir):
    """Collect every fund's symbol and detail-page URL from the product list.

    Walks the paginated table at the Vanguard "all products" list, reading the
    first ``span`` text (symbol) and first ``a`` href (URL) of each table row,
    and clicks the next-page button until its icon is marked ``disabled``.
    On success the mapping is written to ``<output_dir>/funds_url.json``.

    Failures are best-effort: the last symbol read, the page number and the
    exception are printed, and no file is written. The browser is always
    closed, whether the scrape succeeds or fails.

    Args:
        output_dir: Existing directory that receives ``funds_url.json``.

    Returns:
        None.
    """
    url = 'https://investor.vanguard.com/investment-products/list/all'
    fund_urls = {}
    page_count = 0
    # Pre-bound so the failure report below cannot itself raise NameError when
    # the error happens before the first row has been read.
    symbol = None

    driver = create_driver()
    try:
        driver.get(url)
        time.sleep(random.uniform(3,5))

        while True:
            page_count += 1
            table = driver.find_element(By.XPATH,"//div[@class='col-md-9 col-sm-12']")
            table_body = table.find_element(By.TAG_NAME,'tbody')
            for row in table_body.find_elements(By.TAG_NAME,'tr'):
                symbol = row.find_element(By.TAG_NAME,'span').text
                fund_urls[symbol] = row.find_element(By.TAG_NAME,'a').get_attribute('href')

            button = driver.find_element(By.ID,'next-page-btn')
            # Property access, not a no-op: reading it scrolls the button into view.
            button.location_once_scrolled_into_view
            time.sleep(random.uniform(1,2))

            # The last page is detected by the button icon's class being exactly 'disabled'.
            if button.find_element(By.TAG_NAME,'vui-icon').get_attribute('class') == 'disabled':
                break
            button.click()
            time.sleep(random.uniform(1,2))

        output_file = output_dir+'/funds_url.json'
        with open(output_file,'w') as f:
            json.dump(fund_urls,f)
    except Exception as e:
        # Report where the scrape stopped and why, instead of hiding the cause.
        print(symbol,page_count)
        print(e)
    finally:
        # Always release the Chrome process, even when the scrape failed.
        driver.quit()
        



In [188]:
# Run the URL scrape, targeting this snapshot's urls folder.
scrape_fund_url(urls_output_dir)

# Scrape Fund Details

In [5]:
def _body_rows(table):
    """Return the ``tr`` rows inside ``table``'s ``tbody``, or ``[]`` if either is missing."""
    if not table:
        return []
    tbody = table.find('tbody')
    return tbody.find_all('tr') if tbody else []


def _extract_name(html_page_source, fund_detail):
    """Store the Dashboard heading text as ``fund_detail['name']`` when present."""
    dashboard = html_page_source.find(name='section',attrs={'id':'Dashboard'})
    if not dashboard:
        return
    heading = dashboard.find(name='h1',attrs={'class':'fund-name rps-display-two'})
    if heading:
        fund_detail['name'] = heading.text


def _extract_overview(html_page_source, fund_detail):
    """Fill the overview fields (key facts, risk, YTD returns, summary, minimum, expense ratio)."""
    overview = html_page_source.find(name='div',attrs={'id':'overview_section'})
    if not overview:
        return

    key_fact_table = overview.find(name='table',attrs={'class':'table key-fact-table'})
    if key_fact_table:
        fund_detail['key_fact_table'] = {}
        for row in _body_rows(key_fact_table):
            header, cell = row.find('th'), row.find('td')
            # Rows lacking either cell are skipped instead of aborting the scrape.
            if header and cell:
                fund_detail['key_fact_table'][header.text] = cell.text

    # NOTE(review): this matches the full class string including "active4";
    # presumably other risk levels use a different suffix and would not match
    # - confirm against the live markup.
    reward_scale = overview.find(name='div',attrs={'class':"reward--scale__item reward--scale__item--active4 ng-star-inserted"})
    if reward_scale:
        span_element = reward_scale.find(name='span',attrs={'aria-label':"The fund risk level is a"})
        next_sibling = span_element.find_next_sibling() if span_element else None
        if next_sibling:
            fund_detail['risk_level'] = next_sibling.text

    ppf_container = overview.find(name='div',attrs={'class':'col-lg-4 col-sm-12 ppf-container'})
    if ppf_container:
        # Each return figure is optional; only those found are recorded.
        for tag_id, field in (
            ('ytdPercent', 'ytd_returns'),
            ('ytdPercentMarket', 'ytd_market_returns'),
            ('ytdPercentNAV', 'ytd_nav_returns'),
        ):
            figure = ppf_container.find(name='h4',attrs={'data-rpa-tag-id':tag_id})
            if figure:
                fund_detail[field] = figure.text

    product_summary = overview.find(name='div',attrs={'data-rpa-tag-id':'productSummary'})
    if product_summary:
        paragraph = product_summary.find('p')
        if paragraph:
            fund_detail['product_summary'] = paragraph.text
        else:
            # Without a paragraph, the summary is the concatenated list-item text.
            bullets = product_summary.find_all('li')
            if bullets:
                fund_detail['product_summary'] = "".join(li.text for li in bullets)

    min_inv = overview.find(name='h4',attrs={'data-rpa-tag-id':'minInvestment'})
    if min_inv:
        fund_detail['min_investment'] = min_inv.text

    exp_ratio = overview.find(name='h4',attrs={'data-rpa-tag-id':'expenseRatio'})
    if exp_ratio:
        fund_detail['exp_ratio'] = exp_ratio.text


def _extract_performance(html_page_source, fund_detail):
    """Store the performance-summary table as ``{row header: [cell texts]}`` plus a fixed 'index' row."""
    perf = html_page_source.find(name='div',attrs={'id':'performance-fees_section'})
    if not perf:
        return
    perf_table = perf.find(name='table',attrs={'aria-label':'Performance summary'})
    if not perf_table:
        return

    fund_detail['perf_table'] = {'index':['Month-end','3-Month total','YTD','1-yr','3-yr','5-yr','10-yr','Since inception']}
    for row in _body_rows(perf_table):
        header = row.find('th')
        if header:
            fund_detail['perf_table'][header.text] = [td.text for td in row.find_all('td')]


def _append_price_rows(page, dates, daily_prices):
    """Append every (date, price) pair of the historical prices table in ``page`` to the two lists."""
    price = page.find(name='div',attrs={'id':'price_section'})
    table = price.find(name='table',attrs={'aria-label':'Historical prices table'}) if price else None
    for tr in _body_rows(table):
        date_cell = tr.find(name='td',attrs={'data-rpa-tag-id':'historicalDate'})
        price_cell = tr.find(name='td',attrs={'data-rpa-tag-id':'historicalPrice'})
        # Only complete rows are kept so the two lists stay index-aligned.
        if date_cell and price_cell:
            dates.append(date_cell.text)
            daily_prices.append(price_cell.text)


def _scrape_historical_prices(html_page_source, fund_detail, driver):
    """Drive the price widget through every listbox option and collect dates/prices."""
    fund_detail['historical_price_table'] = {}
    if not html_page_source.find(name='div',attrs={'id':'mat-tab-label-0-3'}):
        return

    tab_1_year = driver.find_element(By.ID,'mat-tab-label-0-3')
    # Property access with a side effect: scrolls the element into view.
    tab_1_year.location_once_scrolled_into_view
    time.sleep(random.uniform(1,2))
    tab_1_year.click()

    time.sleep(random.uniform(2,4))
    # Switch the widget to its table view.
    radio_button = driver.find_element(By.XPATH,"//input[@type='radio' and @aria-label='table']")
    radio_button.click()
    time.sleep(random.uniform(2,4))

    listbox_location = driver.find_element(By.XPATH,"//button[@tabindex='0' and @aria-haspopup='listbox']")
    dates = []
    daily_prices = []
    listbox_location.click()
    time.sleep(random.uniform(0,0.5))

    overlay_container = driver.find_element(By.XPATH,"//div[@class='cdk-overlay-container']")
    driver.find_element(By.XPATH,"//table[@aria-label='Historical prices table']").location_once_scrolled_into_view
    time.sleep(random.uniform(1,2))
    # NOTE(review): an XPath starting with "//" is evaluated from the document
    # root even when called on an element, so this is not scoped to the overlay.
    listbox = overlay_container.find_element(By.XPATH,"//div[@role='listbox' and @tabindex='0']")
    options = listbox.find_elements(By.TAG_NAME,'vui-option')
    last_index = len(options)-1
    for index,option in enumerate(options):
        option.location_once_scrolled_into_view
        time.sleep(random.uniform(0.5,1))
        option.click()
        time.sleep(random.uniform(0.5,1))
        # Re-parse the live DOM: the table content changes with the selected option.
        _append_price_rows(BeautifulSoup(driver.page_source,'html.parser'), dates, daily_prices)

        if index == last_index:
            break
        # Selecting an option closes the listbox; reopen it for the next one.
        listbox_location.click()
        time.sleep(random.uniform(0.5,1))

    fund_detail['historical_price_table']['date'] = dates
    fund_detail['historical_price_table']['price'] = daily_prices


def _extract_fundamentals(html_page_source, fund_detail, symbol):
    """Split the characteristics table into fundamentals / fund / benchmark columns."""
    fundamental_table = html_page_source.find(name='div',attrs={'id':'characteristics-tabset'})
    fund_detail['portfolio_fundamental_table'] = {}
    if not fundamental_table:
        return

    fundamental_list = []
    fund_list = []
    benchmark_list = []
    # The first row is skipped (treated as the header row).
    for tr in fundamental_table.find_all('tr')[1:]:
        for index,td in enumerate(tr.find_all('td')):
            if index==0:
                fundamental_list.append(td.text)
            elif index==1:
                fund_list.append(td.text)
            else:
                benchmark_list.append(td.text)
    fund_detail['portfolio_fundamental_table']['fundamentals'] = fundamental_list
    fund_detail['portfolio_fundamental_table'][symbol] = fund_list
    fund_detail['portfolio_fundamental_table']['benchmark'] = benchmark_list


def _scrape_weighted_exposures(html_page_source, fund_detail, symbol, driver):
    """Click through each weighting tab and store its allocation table by tab label."""
    fund_detail['weighted_exposure_table'] = {}
    if not html_page_source.find(name='vui-tab-group',attrs={'id':'weighting-prices-tabset'}):
        return

    tab_list = driver.find_element(By.ID,'weighting-prices-tabset').find_element(By.CSS_SELECTOR,'.vui-tabs-container.vui-tab-not-contained-style').find_elements(By.TAG_NAME,'button')
    driver.find_element(By.XPATH,"//exemplar-weighting-exposure").location_once_scrolled_into_view
    time.sleep(random.uniform(1,2))
    for button in tab_list:
        label = BeautifulSoup(button.get_attribute('outerHTML'),'html.parser').find('div')
        if label is None:
            # No label means no key to store the table under; skip this tab.
            continue
        exposure_type = label.text
        button.click()
        time.sleep(random.uniform(1,2))
        updated_page_source = BeautifulSoup(driver.page_source,'html.parser')
        asset_allocation_table = updated_page_source.find(name='mat-table',attrs={'id':'assetAllocationTable'})
        if asset_allocation_table is None:
            continue

        exposure_type_list = []
        symbol_list = []
        benchmark_list = []
        weight_difference_list = []
        # Route each cell's text to its column by the span's data-rpa-tag-id.
        columns = {
            "weightedExposuresTab": exposure_type_list,
            'symbol_LongPct': symbol_list,
            'benchmark_ShortPct': benchmark_list,
            'weightPct': weight_difference_list,
        }
        for row in asset_allocation_table.find_all(name='mat-row',attrs={'role':'row'}):
            for cell in row.find_all(name='mat-cell'):
                span = cell.find('span')
                if not span:
                    continue
                column = columns.get(span.get('data-rpa-tag-id'))
                if column is not None:
                    column.append(span.text)

        fund_detail['weighted_exposure_table'][exposure_type] = {}
        fund_detail['weighted_exposure_table'][exposure_type][exposure_type] = exposure_type_list
        fund_detail['weighted_exposure_table'][exposure_type][symbol] = symbol_list
        fund_detail['weighted_exposure_table'][exposure_type]["Benchmark"] = benchmark_list
        fund_detail['weighted_exposure_table'][exposure_type]['+/-Weight'] = weight_difference_list


def extract_fund_details(html_page_source,fund_detail,symbol,driver):
    """Populate ``fund_detail`` with everything scraped from one fund page.

    Static sections (name, overview, performance, fundamentals) are read from
    the already-parsed ``html_page_source``. The historical-price and
    weighted-exposure sections are interactive, so ``driver`` is used to click
    through them and the live page source is re-parsed after each click.
    Any section missing from the page is skipped; elements missing inside a
    section are skipped rather than raising.

    Args:
        html_page_source: Parsed page exposing ``find``/``find_all`` (a
            BeautifulSoup document of the fund page).
        fund_detail: Dict to fill in; it is mutated in place.
        symbol: Fund symbol, used as the column key for fund-specific values.
        driver: Selenium driver currently showing the same fund page. It is
            only touched when the page contains the interactive sections.

    Returns:
        The same ``fund_detail`` dict. The keys ``historical_price_table``,
        ``portfolio_fundamental_table`` and ``weighted_exposure_table`` are
        always present (empty dicts when the section is absent).
    """
    _extract_name(html_page_source, fund_detail)
    _extract_overview(html_page_source, fund_detail)
    _extract_performance(html_page_source, fund_detail)
    _scrape_historical_prices(html_page_source, fund_detail, driver)
    _extract_fundamentals(html_page_source, fund_detail, symbol)
    _scrape_weighted_exposures(html_page_source, fund_detail, symbol, driver)
    return fund_detail


In [None]:
def extract_name_and_risk(html_page_source,fund_detail,symbol,driver):
    """Fill in only the fund name and risk level from a parsed fund page.

    Args:
        html_page_source: Parsed page exposing ``find`` (a BeautifulSoup
            document of the fund page).
        fund_detail: Dict to fill in; it is mutated in place.
        symbol: Unused here; kept so the signature matches
            ``extract_fund_details``.
        driver: Unused here; kept for the same reason.

    Returns:
        The same ``fund_detail`` dict, with ``name`` and/or ``risk_level``
        set when the corresponding elements exist on the page.
    """
    dashboard = html_page_source.find(name='section',attrs={'id':'Dashboard'})
    if dashboard:
        heading = dashboard.find(name='h1',attrs={'class':'fund-name rps-display-two'})
        # A Dashboard section without the heading is skipped instead of raising.
        if heading:
            fund_detail['name'] = heading.text

    overview = html_page_source.find(name='div',attrs={'id':'overview_section'})
    if overview:
        # NOTE(review): this matches the full class string including "active4";
        # presumably other risk levels use a different suffix and would not
        # match - confirm against the live markup.
        reward_scale = overview.find(name='div',attrs={'class':"reward--scale__item reward--scale__item--active4 ng-star-inserted"})
        if reward_scale:
            span_element = reward_scale.find(name='span',attrs={'aria-label':"The fund risk level is a"})
            # The risk value is the element right after the labelled span.
            next_sibling = span_element.find_next_sibling() if span_element else None
            if next_sibling:
                fund_detail['risk_level'] = next_sibling.text

    return fund_detail

In [62]:

# Symbol -> URL map to iterate over.
# NOTE(review): pinned to the 20231212 snapshot rather than this run's
# urls_output_dir - confirm this is intentional before a fresh run.
FUNDS_URL_FILE = '../data/raw/20231212/urls/funds_url.json'

# Index of the first fund to scrape. Raise it to resume after a failure
# (the failing index is printed by the error handler below).
START_INDEX = 0

with open(FUNDS_URL_FILE,'r') as f:
    funds_url = json.load(f)

try:
    for index,(symbol,url) in enumerate(funds_url.items()):
        if index < START_INDEX:
            continue
        # A fresh browser is started for every fund.
        driver = create_driver()
        try:
            fund_detail = {}
            driver.get(url)
            time.sleep(random.uniform(1,1.5))
            soup = BeautifulSoup(driver.page_source,'html.parser')
            fund_detail = extract_fund_details(soup,fund_detail,symbol,driver)
        finally:
            # Close the browser even when scraping this fund failed, so a
            # failure does not leave a Chrome process behind.
            driver.quit()
        with open(f"{funds_output_dir}/{symbol}.json",'w') as f:
            json.dump(fund_detail,f)
except Exception as e:
    # The first failure stops the run; the printed index is the resume point.
    print(f"ERROR --> {symbol}, Index --> {index}")
    print(e)