In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each cell
# is executed, so edits to imported project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import os
import time
import yaml
import datetime
import pickle
from multiprocessing import Queue, Pool

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

import sys
# Put the parent directory on the import path (presumably where config.py lives - TODO confirm)
sys.path.append('..')
from config import CFG
# Rebind the name CFG to the object returned by calling the imported CFG;
# the Crawler class below reads CFG.URL2 and CFG.WINDOW_SIZE from it.
CFG=CFG()

# NOTE(review): this silences every warning for the rest of the session, including deprecation notices.
import warnings; warnings.simplefilter('ignore')

In [11]:
class Crawler():
    """
    Drives a Chrome browser through Selenium to page through a results table
    and turn every page into a pandas DataFrame.

    Page numbers are read from one queue and the scraped DataFrames are written
    to another, so both queues are passed into the methods that need them.
    """

    def __init__(self, *chrome_options, url=None, window_size=None):
        """
        Constructor function that loads any options and connects to the webpage

        Args:
            chrome_options: Chrome command-line switches, each one is handed to
                ChromeOptions.add_argument
            url: The url to connect to, CFG.URL2 when omitted
            window_size: (width, height) of the window to be opened,
                CFG.WINDOW_SIZE when omitted
        """
        print("starting selenium")
        # Defined before anything can fail so that __del__ always finds the attribute
        self.driver = None

        # The config defaults are resolved at call time rather than when the class
        # cell is executed, so re-running the config cell is enough to change them.
        self.url = CFG.URL2 if url is None else url
        self.window_size = CFG.WINDOW_SIZE if window_size is None else window_size

        # Only build a ChromeOptions object when switches were actually passed in
        self.opt = bool(chrome_options)
        if self.opt:
            self.options = webdriver.ChromeOptions()
            for option in chrome_options:
                self.options.add_argument(option)
            self.driver = webdriver.Chrome(options=self.options)
        else:
            self.driver = webdriver.Chrome()

        self.driver.get(self.url)
        # Reported only after the page request has returned
        print("Connected to website")
        self.driver.set_window_size(self.window_size[0], self.window_size[1])

    def load_webpage(self):
        """
        Loads the results tab by switching frames and clicking on the results tab,
        also contains an intermediate try/except block to look for an error msg, all using explicit waits.

        Raises:
            TimeoutException: If the iframe or the results tab does not become available in time
        """
        print("waiting to switch frames")
        # Need to switch frames to enable interaction with the loaded javascript
        WebDriverWait(self.driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.CLASS_NAME, "iframe-class")))
        print("switched frames, waiting to click results")

        # checking to see if the react error message is present, if it is, refresh and if not, carry on
        try:
            WebDriverWait(self.driver, 20).until(EC.presence_of_element_located((By.XPATH, '//*[@id="react-entry-point"]/div')))
            WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/div')))
        except TimeoutException:
            # Neither wait found the error elements in time, nothing to recover from
            pass
        else:
            # NOTE(review): a refresh presumably puts the driver back on the top-level
            # document, so the tab lookup below may time out and rely on connect() to
            # retry - confirm whether the frame has to be re-entered here.
            self.refresh_page("React error message found")

        # waiting to click results tab
        WebDriverWait(self.driver, 20).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "#tab_container > li:nth-child(3) > a"))
            ).click()
        print("selected results tab")

    def connect(self, max_tries=2):
        """
        Implements a counted try/except loop to load the webpage and refresh it in case of a timeout error.
        It also refreshes the page if there is any other exception thrown.

        The driver is only shut down when every attempt has failed; a load that
        succeeds on the final attempt leaves the browser open.

        Args:
            max_tries: The max number of loops to try and load the webpage
        """
        counter = 0
        while True:
            try:
                self.load_webpage()
                # Leave before the attempt counter is touched, success never quits the driver
                return
            except TimeoutException as e:
                error = e
            except Exception as e:
                print("Exception, trying again")
                error = e

            counter += 1
            if counter == max_tries:
                # No further attempt follows, so report the error instead of refreshing
                print(f"{error}")
                print("Too many attempts, exiting")
                self.driver.quit()
                return
            self.refresh_page(error)

    def refresh_page(self, error=None):
        """
        Reports why a reload is happening and refreshes the current page

        Args:
            error: The exception or message that triggered the refresh, optional
        """
        reason = "Refreshing page" if error is None else error
        print(f"{reason}, trying again")
        self.driver.refresh()

    def __del__(self):
        """
        Shuts the browser down when the crawler is garbage collected
        """
        driver = getattr(self, "driver", None)
        if driver is None:
            # __init__ failed before a browser was started
            return
        try:
            driver.quit()
        except Exception:
            # Best effort only: the session may already be closed or the interpreter
            # may be shutting down, and errors raised in __del__ cannot be handled anyway.
            pass

    def get_page_count(self):
        """
        Finds the container holding the last page number and returns it as a string
        """
        pages = self.driver.find_element(By.CLASS_NAME,
            'last-page'
        ).text
        print(f"Page count: {pages}")
        return str(pages)

    def load_queue(self, data_queue):
        """
        Loads a queue with every page number from 1 up to the value obtained from get_page_count
        Args:
            data_queue: An instance of multiprocessing.Queue
        """
        pages = self.get_page_count()
        for page in range(1, int(pages) + 1):
            data_queue.put(page)
        print("Loaded queue")

    def get_from_queue(self, data_queue, timeout=1):
        """
        Gets the next item from the queue and returns it as a string together with an "empty" flag
        Args:
            data_queue: An instance of multiprocessing.Queue
            timeout: Seconds to wait for an item before the queue is treated as empty
        Returns:
            (page, empty): (str(item), False) when an item was available,
                (None, True) when nothing arrived within the timeout
        """
        # Imported locally, multiprocessing.Queue signals exhaustion with queue.Empty
        from queue import Empty

        try:
            # A short timeout instead of a plain get(), which would block forever
            page = data_queue.get(timeout=timeout)
        except Empty:
            print("Queue is empty")
            return None, True
        return str(page), False

    def change_page(self, data_queue):
        """
        Gets page number from get_from_queue and enters it into the page box
        Args:
            data_queue: An instance of multiprocessing.Queue
        Returns:
            True when the queue was empty and nothing was typed, False otherwise
        """
        page, empty = self.get_from_queue(data_queue)
        if empty:
            return True
        page_box = self.driver.find_element(By.CLASS_NAME, "current-page")
        # NOTE(review): the box is not cleared and no Enter key is sent - confirm the
        # widget navigates on typed input alone.
        page_box.send_keys(page)
        return False

    def make_df(self):
        """
        Parses the first <table> of the current page source into a pd.DataFrame

        Raises:
            ValueError: If the page source contains no table
        """
        # Imported locally, pandas expects a file-like object rather than literal html
        from io import StringIO

        page_source = self.driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        table = soup.find('table')
        if table is None:
            raise ValueError("No table found on the current page")
        return pd.read_html(StringIO(str(table)))[0]

    def collect_data(self, data_queue, product_queue):
        """
        Collects the data from the webpage after changing pages, compiles it into a pd.DataFrame,
        then puts it into the product queue. Stops as soon as the data queue is empty.
        Args:
            data_queue: Queue holding the page numbers that are still to be visited
            product_queue: Queue receiving one pd.DataFrame per visited page
        """
        print("Starting data collection")
        while True:
            if self.change_page(data_queue):
                # Nothing left to visit, do not scrape the current page a second time
                break
            # NOTE(review): the table is read right after typing the page number,
            # confirm the page has finished updating by then.
            product_queue.put(self.make_df())
        

In [5]:
# Presumably the page-number queue handed to Crawler methods as `data_queue` - TODO confirm
DQ = Queue()
# Presumably the DataFrame queue handed to Crawler.collect_data as `product_queue` - TODO confirm
PQ = Queue()

In [12]:
# No Chrome switches are passed; the url and window size fall back to CFG.URL2 and CFG.WINDOW_SIZE
crawler = Crawler()

starting selenium
Connected to website


In [13]:
# Switch into the iframe and open the results tab, using the default max_tries=2
crawler.connect()

waiting to switch frames
switched frames, waiting to click results
Exception, trying again
Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=109.0.5414.120)
Stacktrace:
Backtrace:
	(No symbol) [0x00F2F243]
	(No symbol) [0x00EB7FD1]
	(No symbol) [0x00DAD04D]
	(No symbol) [0x00D92D7A]
	(No symbol) [0x00DFBE7B]
	(No symbol) [0x00E0C196]
	(No symbol) [0x00DF8386]
	(No symbol) [0x00DD163C]
	(No symbol) [0x00DD269D]
	GetHandleVerifier [0x011C9A22+2655074]
	GetHandleVerifier [0x011BCA24+2601828]
	GetHandleVerifier [0x00FD8C0A+619850]
	GetHandleVerifier [0x00FD7830+614768]
	(No symbol) [0x00EC05FC]
	(No symbol) [0x00EC5968]
	(No symbol) [0x00EC5A55]
	(No symbol) [0x00ED051B]
	BaseThreadInitThunk [0x767BFA29+25]
	RtlGetAppContainerNamedObjectPath [0x77487A7E+286]
	RtlGetAppContainerNamedObjectPath [0x77487A4E+238]
, trying again


Exception ignored in: <function Service.__del__ at 0x00000223FCCD08B0>
Traceback (most recent call last):
  File "c:\Users\broug\mambaforge\envs\drugs\lib\site-packages\selenium\webdriver\common\service.py", line 183, in __del__
    self.stop()
  File "c:\Users\broug\mambaforge\envs\drugs\lib\site-packages\selenium\webdriver\common\service.py", line 149, in stop
    self.send_remote_shutdown_command()
  File "c:\Users\broug\mambaforge\envs\drugs\lib\site-packages\selenium\webdriver\common\service.py", line 128, in send_remote_shutdown_command
    request.urlopen(f"{self.service_url}/shutdown")
  File "c:\Users\broug\mambaforge\envs\drugs\lib\urllib\request.py", line 216, in urlopen
    return opener.open(url, data, timeout)
  File "c:\Users\broug\mambaforge\envs\drugs\lib\urllib\request.py", line 519, in open
    response = self._open(req, data)
  File "c:\Users\broug\mambaforge\envs\drugs\lib\urllib\request.py", line 536, in _open
    result = self._call_chain(self.handle_open, protoc

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=109.0.5414.120)
Stacktrace:
Backtrace:
	(No symbol) [0x00F2F243]
	(No symbol) [0x00EB7FD1]
	(No symbol) [0x00DAD04D]
	(No symbol) [0x00D92D7A]
	(No symbol) [0x00DFBE7B]
	(No symbol) [0x00E0C196]
	(No symbol) [0x00DF8386]
	(No symbol) [0x00DD163C]
	(No symbol) [0x00DD269D]
	GetHandleVerifier [0x011C9A22+2655074]
	GetHandleVerifier [0x011BCA24+2601828]
	GetHandleVerifier [0x00FD8C0A+619850]
	GetHandleVerifier [0x00FD7830+614768]
	(No symbol) [0x00EC05FC]
	(No symbol) [0x00EC5968]
	(No symbol) [0x00EC5A55]
	(No symbol) [0x00ED051B]
	BaseThreadInitThunk [0x767BFA29+25]
	RtlGetAppContainerNamedObjectPath [0x77487A7E+286]
	RtlGetAppContainerNamedObjectPath [0x77487A4E+238]


In [None]:
# load_queue requires the queue to fill; DQ is the queue created earlier in this notebook
crawler.load_queue(DQ)

In [None]:
# change_page requires the queue to read the next page number from
crawler.change_page(DQ)