In [1]:
%load_ext autoreload
%autoreload 2

In [12]:
import pandas as pd
import os
import time
import datetime
import pickle
from multiprocessing import Queue, Pool, Process

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

import sys
sys.path.append('..')
from config import CFG
CFG=CFG()

import warnings; warnings.simplefilter('ignore')

In [13]:
class Crawler(object):
    url=CFG.URL2
    window_size=CFG.WINDOW_SIZE
    chrome_options = CFG.CHROME_OPTS
    options = webdriver.ChromeOptions()
    if bool(chrome_options) is not None:
        for opt in chrome_options:
            options.add_argument(opt)
    
    def start(self):
        """
        Constructor function that loads any options and connects to the webpage

        Args:
            chrome_options: A list of chrome options to be passed to the webdriver
            url: The url to connect to
            window_size: The size of the window to be opened
        """
        # sourcery skip: remove-pass-body
        print("starting selenium")
        if bool(self.chrome_options) is not None:
            self.driver = webdriver.Chrome(options=self.options)
        else:
            self.driver = webdriver.Chrome()

        print("Connected to website")
        self.driver.get(self.url)
        self.driver.set_window_size(self.window_size[0], self.window_size[1])

    def load_webpage(self):
        """
        Loads the results tab by switching frames and clicking on the results tab,
        also contains an intermediate try/except block to look for an error msg, all using explicit waits.        
        """
        print("waiting to switch frames")
        # Need to switch frames to enable interaction with the loaded javascript
        WebDriverWait(self.driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.CLASS_NAME, "iframe-class")))
        print("switched frames, waiting to click results")

        # checking to see if the react error message is present, if it is, refresh and if not, pass
        try:
            WebDriverWait(self.driver, 20).until(EC.presence_of_element_located((By.XPATH, '//*[@id="react-entry-point"]/div')))
            WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/div'))) 
        except TimeoutException as e:
            pass
        else:
            self.refresh_page()

        # waiting to click results tab
        WebDriverWait(self.driver, 20).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "#tab_container > li:nth-child(3) > a"))
            ).click()
        print("selected results tab")

    def connect(self, max_tries=2):
        """
        Implements a counted try/except loop to load the webpage and refresh it in case of a timeout error.
        It also refreshes the page if there is any other exception thrown.
        
        Args:
            max_tries: The max number of loops to try and load the webpage
        """
        self.start()
        loaded = False
        counter = 0
        while not loaded:
            try:
                self.load_webpage()
                loaded = True
            except TimeoutException as e:
                self.refresh_page(e)
            except Exception as e:
                print("Exception, trying again")
                self.refresh_page(e)
            counter += 1
            if counter == max_tries:
                print("Too many attempts, exiting")
                self.driver.quit()
                break

    def refresh_page(self, error):
        print(f"{error}, trying again")
        self.driver.refresh()

    def __del__(self):
        self.driver.quit()
        

In [14]:
class PageLoader(Crawler):
    def get_page_count(self):
        """
        Finds the container holding the last page number and returns it as a string
        """
        pages = self.driver.find_element(By.CLASS_NAME,
            'last-page'
        ).text
        print(f"Page count: {pages}")
        return str(pages)

    def load_queue(self, data_queue):
        """
        Loads an instance of multiprocessing.Queue with the listified output obtained from get_page_count
        Args:
            queue: An instance of multiprocessing.Queue
        """
        pages = self.get_page_count()
        [data_queue.put(x) for x in range(1, int(pages)+1)]
        data_queue.put("END")
        print("Loaded queue")

    def run(self, data_queue):
        self.connect()
        self.load_queue(data_queue)
        self.driver.quit()

In [15]:
class Worker(Crawler):
    def get_from_queue(self, data_queue):
        """
        Gets the next item from the queue and returns it as a string, if the queue is empty, it returns None
        Args:
            queue: An instance of multiprocessing.Queue
        """
        try:
            page = data_queue.get()
        except Exception:
            data_queue.empty()
            print("Queue is empty")
        return str(page)

    def change_page(self, data_queue):
        """
        Gets page number from get_from_queue and enters it into the page box
        Args:
            queue: An instance of multiprocessing.Queue
        """
        page = self.get_from_queue(data_queue)
        page_box = self.driver.find_element(By.CLASS_NAME, "current-page")
        page_box.send_keys(page)
        return page

    def make_df(self):
        page_source = self.driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        table = soup.find('table')
        return pd.read_html(str(table))[0]

    def collect_data(self, data_queue, product_queue):
        """
        Collects the data from the webpage after changing pages, compiles it into a pd.DataFrame,
        then puts it into the product queue
        """
        print("Starting data collection")
        while True:
            page = self.change_page(data_queue)
            if  page == "END":
                print("Finished collecting data")
                break
            df = self.make_df()
            product_queue.put(df)

    def run(self, data_queue, product_queue):
        """
        Runs the collect_data function
        """
        self.connect()
        self.collect_data(data_queue, product_queue)

In [16]:
def workers_run(data_queue, product_queue):
    """
    Creates a list of Worker objects and runs them
    """
    worker = Worker()
    worker.connect()
    worker.collect_data()

In [17]:
def get_df(product_queue):
    counter = 0
    dfs = []
    big_dfs = []
    time.sleep(180)
    while not product_queue.empty():
        dfs.append(product_queue.get())
        counter += 1
        if counter % 250:
            print(f"Processing {counter}th df")
            big_dfs.append(pd.concat(dfs, ignore_index=True))
            dfs = []
    return pd.concat(big_dfs, ignore_index=True)

def run():
    DQ, PQ = Queue(), Queue()
    
    loader = PageLoader()
    loader.run(DQ)

    workers = []
    for _ in range(2):
        worker = Process(target=workers_run, args=(DQ, PQ), daemon=True)
        worker.start()
        workers.append(worker)

    df = get_df(PQ)

    for worker in workers:
        worker.join()

    return df

In [18]:
df = run()

starting selenium
Connected to website
waiting to switch frames
switched frames, waiting to click results
Message: 
Stacktrace:
Backtrace:
	(No symbol) [0x00F2F243]
	(No symbol) [0x00EB7FD1]
	(No symbol) [0x00DAD04D]
	(No symbol) [0x00DDC0B0]
	(No symbol) [0x00DDC22B]
	(No symbol) [0x00E0E612]
	(No symbol) [0x00DF85D4]
	(No symbol) [0x00E0C9EB]
	(No symbol) [0x00DF8386]
	(No symbol) [0x00DD163C]
	(No symbol) [0x00DD269D]
	GetHandleVerifier [0x011C9A22+2655074]
	GetHandleVerifier [0x011BCA24+2601828]
	GetHandleVerifier [0x00FD8C0A+619850]
	GetHandleVerifier [0x00FD7830+614768]
	(No symbol) [0x00EC05FC]
	(No symbol) [0x00EC5968]
	(No symbol) [0x00EC5A55]
	(No symbol) [0x00ED051B]
	BaseThreadInitThunk [0x767BFA29+25]
	RtlGetAppContainerNamedObjectPath [0x77487A7E+286]
	RtlGetAppContainerNamedObjectPath [0x77487A4E+238]
, trying again
waiting to switch frames
switched frames, waiting to click results
Message: 
Stacktrace:
Backtrace:
	(No symbol) [0x00F2F243]
	(No symbol) [0x00EB7FD1]
	(No 

MaxRetryError: HTTPConnectionPool(host='localhost', port=60149): Max retries exceeded with url: /session/5fb07a8c6ca49f6d1e805ea748735bb6/element (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000029AED20AF20>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))