#### ETF Issuer EDA

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import numpy as np
import json
import datetime
import sys


class etf_web_data:
    """
    Blueprint to define how each issuer's website is scraped.
    - Initial scope is to scrape 80% of assets from issuers' websites
    - Made possible as ETFs require this transparency to be publicly available
    - Subsequently will look at other use cases
    - Make note - do all websites require selenium or can html suffice?
    - The only manual maintenance should be a list of all issuer websites - but maybe even this can be sourced?

    The instance owns a Chrome WebDriver session. Release it with ``close()``
    or by using the instance as a context manager
    (``with etf_web_data(debug=False) as etf: ...``).
    """

    # Chromedriver binary to use for each supported ``sys.platform`` value.
    _DRIVER_PATHS = {
        'win32': 'drivers/chromedriver_win.exe',
        'darwin': 'drivers/chromedriver_mac',
    }

    def __init__(self, debug=True):
        """
        Load the session configuration files and start the Chrome driver.

        debug: when truthy the browser GUI is shown and a notice is printed;
               when falsy Chrome is started with ``--headless``.

        Raises RuntimeError when no chromedriver is configured for the
        current platform. Errors from reading ``web_cookies.txt`` /
        ``issuer_websites.txt`` propagate before any browser is launched.
        """
        # Initiate default options for the instance
        chrome_options = Options()

        # Used to toggle GUI Browser interface
        self.debug = debug
        if not self.debug:
            chrome_options.add_argument('--headless')    # When debugging is off - no GUI needed
        else:
            print('Debugging Mode ON')

        # Selecting driver based on OS; fail loudly instead of leaving
        # ``browser_path`` undefined on an unsupported platform.
        try:
            self.browser_path = self._DRIVER_PATHS[sys.platform]
        except KeyError:
            raise RuntimeError(
                'No chromedriver configured for platform ' + repr(sys.platform)
            ) from None

        # Read both configuration files BEFORE launching Chrome so that a
        # missing or malformed file cannot leave an orphaned browser process.
        # Used to define the correct cookies for the session
        with open('web_cookies.txt', 'r') as fp:
            self.cookies = json.load(fp)

        # Current list of all identified issuer websites (kept as the raw file text)
        with open('issuer_websites.txt', 'r') as wb:
            self.websites = wb.read()

        # Initiating the web driver
        self.driver = webdriver.Chrome(options=chrome_options, executable_path=self.browser_path)

        try:
            # navigate to start page
            self.driver.get('https://www.google.co.uk/')

            # add cookies to the session
            for cookie in self.cookies:
                self.driver.add_cookie(cookie)
        except Exception:
            # Do not leak the browser if session setup fails half-way.
            self.driver.quit()
            raise

    def __enter__(self):
        """Allow ``with etf_web_data(...) as etf:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser when leaving the ``with`` block; never suppresses errors."""
        self.close()
        return False

    def close(self):
        """Quit the browser session if one is open. Safe to call more than once."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
            self.driver = None

    @staticmethod
    def _as_list(values):
        """Normalise ``None`` / a single string / any iterable into a list."""
        if values is None:
            return []
        if isinstance(values, str):
            return [values]
        return list(values)

    def pBDP(self, isin=None, datapoint=None, FX=None):
        """
        Python Data Point alternative to BDP.
        Simply a selector for the various BDP implementations of each site.
        Not implemented yet; returns None.
        """
        pass

    def update_isins(self, many=True):
        """
        Fetching a current list of ISINs from all issuers' websites.
        Yet to be implemented; maybe worth collecting all data from these files directly?
        Returns None for now.
        """
        pass

    def pBDH(self, isin=None, datapoint=None, start_date=None, end_date=None, FX=None):
        """
        Python Historical Data Point alternative to BDH.
        Designed to behave similarly.
        - Handling an array of ISINs and datapoints (?)
        Not implemented yet; returns None.
        """
        pass

    def pjBDP(self, isins=None, datapoints=None):
        """
        Return datapoints scraped from the JPM site for one or more ISINs.
        Being built out at present to explore exactly how this class should be structured.

        isins:      a single ISIN string or an iterable of them.
        datapoints: a single ``data-testid`` name or an iterable of them.

        Returns a DataFrame with columns ISIN, DATAPOINT, VALUE, SOURCE and
        SOURCE_DATE, one row per (isin, datapoint) pair, indexed from 1.
        When either argument is None or empty, an empty frame with the same
        columns is returned and the browser is not touched.
        """
        # convert single queries to lists so works w/ vectorisation framework
        isins = self._as_list(isins)
        datapoints = self._as_list(datapoints)

        website = 'https://am.jpmorgan.com/'
        rows = []

        if isins and datapoints:
            self.driver.get(website)
            time.sleep(2)

            for isin in isins:
                # One search per ISIN: every datapoint is read from the same
                # page, so repeating the search per datapoint only costs time.
                search = self.driver.find_element_by_id('searchbox')
                search.clear()    # avoid appending to a previous query
                search.send_keys(isin)
                time.sleep(1)
                search.send_keys(Keys.RETURN)
                time.sleep(3)

                for datapoint in datapoints:
                    # Quote the attribute value so names that are not plain
                    # CSS identifiers still form a valid selector.
                    element = self.driver.find_element_by_css_selector(
                        '[data-testid="' + datapoint + '"]'
                    )
                    rows.append([isin, datapoint, element.get_attribute('innerHTML')])

        # Build the frame in one go rather than growing it row by row.
        df = pd.DataFrame(rows, columns=['ISIN', 'DATAPOINT', 'VALUE'])
        # Keep the historical 1-based row labels.
        df.index = pd.RangeIndex(start=1, stop=len(df) + 1)
        df['SOURCE'] = website
        df['SOURCE_DATE'] = datetime.datetime.today()
        return df
    


In [7]:
# Start a scraping session; debug=False adds Chrome's --headless flag, so no browser window is shown.
jp = etf_web_data(debug=False)


Debugging Mode ON


In [8]:
# Look up the element whose data-testid is 'isin' for a single ISIN on the JPM site;
# the call returns a DataFrame with one row per (ISIN, datapoint) pair.
jp.pjBDP(isins='IE00BD9MMF62', datapoints='isin')

Unnamed: 0,ISIN,DATAPOINT,VALUE,SOURCE,SOURCE_DATE
1,IE00BD9MMF62,isin,IE00BD9MMF62,https://am.jpmorgan.com/,2019-12-28 10:35:39.031771
