In [117]:

from selenium import webdriver
import time
from webdriver_manager.chrome import ChromeDriverManager
from utils import Supported_Platforms_and_Pages

import logging

_logger = logging.getLogger(__name__)

# Override Selenium Tools, to avoid be caught easily
# This should happen for every request ever sent
class BotParser:

    def __init__(self, url, page_number=1, platform="Coursera", topic="DataScience"):
        if page_number == 1:
            raise NotImplementedError("Page Number of 1 is not supported")
        if platform not in Supported_Platforms_and_Pages:
            raise NotImplementedError(f"The platform {platform} is not supported")
        if topic not in Supported_Platforms_and_Pages[platform]:
            raise NotImplementedError(f"The topic {topic} is not supported")
        self.url = url
        self.page_number = page_number
        self.platform = platform
        self.topic = topic
        self.sleep_duration = sleep_duration
        
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')

        self.driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
    
    def avoid_getting_caught(self):
        time.sleep(self.sleep_duration)

    def hit_url(self):
        self.driver.get(self.url)
        self.avoid_getting_caught()

    def get_url(self):
        return self.url

    def get_page_source(self):
        self.hit_url()
        return self.driver.page_source

    def shut_down(self):
        self.driver.close()
        del self.driver
    
    def get_course_info(self):
        elements = self.driver.find_elements_by_class_name("ais-InfiniteHits-item")
        response = list()
        for element in elements:
            horizontal_class = element.find_elements_by_class_name('horizontal-box')
            assert len(horizontal_class) == 3
            course_name = horizontal_class[0].text
            partner = horizontal_class[1].text
            certificate_type = horizontal_class[2].text
            product_info = element.find_elements_by_class_name('rc-ProductInfo')
            assert len(product_info) == 1
            difficulty = product_info[0].text
            element.click()
            self.avoid_getting_caught()
            time.sleep(3)
            course_url = self.driver.current_url
            response.append({
                "course_name": course_name,
                "partner": partner,
                "certificate_type": certificate_type,
                "link": course_url
            })
            self.hit_url()
            self.avoid_getting_caught()
        return response

In [118]:
bot = BotParser(url="https://www.coursera.org/browse/data-science?page=2", page_number=2)

[WDM] - Current google-chrome version is 84.0.4147
[WDM] - Get LATEST driver version for 84.0.4147
[WDM] - Driver [/Users/1025948/.wdm/drivers/chromedriver/mac64/84.0.4147.30/chromedriver] found in cache


 




In [119]:
bot.hit_url()

In [120]:
bot.get_course_info()

[{'course_name': 'IBM Data Science',
  'partner': 'IBM',
  'certificate_type': 'PROFESSIONAL CERTIFICATE',
  'link': 'https://www.coursera.org/professional-certificates/ibm-data-science'}]

In [87]:
elements[0].find_element_by_class_name('card-info vertical-box')
#elements[0].find_element_by_class_name('horizontal-box')

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".card-info vertical-box"}
  (Session info: chrome=84.0.4147.135)


In [56]:
elements[0].find_element_by_id('card-info vertical-box')

StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=84.0.4147.135)


In [46]:
with open('my_file.html', 'w') as f:
    f.write(bot.driver.page_source)

In [44]:
bot.driver.file_detector_context(file_detector_class="ais-InfiniteHits-item")

<function selenium.webdriver.remote.webdriver.WebDriver.file_detector_context(self, file_detector_class, *args, **kwargs)>

In [39]:
dir(bot.driver)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_file_detector',
 '_is_remote',
 '_mobile',
 '_switch_to',
 '_unwrap_value',
 '_web_element_cls',
 '_wrap_value',
 'add_cookie',
 'application_cache',
 'back',
 'capabilities',
 'close',
 'command_executor',
 'create_options',
 'create_web_element',
 'current_url',
 'current_window_handle',
 'delete_all_cookies',
 'delete_cookie',
 'desired_capabilities',
 'error_handler',
 'execute',
 'execute_async_script',
 'execute_cdp_cmd',
 'execute_script',
 'file_detector',
 'file_detector_context',
 'find_element',
 'find_element_by_class_name',
 'find_element_by_css_selector',
 'find_element_by_id',
 

In [34]:
bot.driver.find_elements_by_class_name("ais-InfiniteHits-item")[0]

<selenium.webdriver.remote.webelement.WebElement (session="693eff09abce62aeee84cc48c92de552", element="945c6687-3dad-46e6-a85a-4ffbbff0bc94")>

In [18]:
bot.driver.find_elements_by_class_name("ais-InfiniteHits-item")[0].click()

In [22]:
with open("my_file.html", 'w') as f:
    f.write(bot.driver.page_source)

In [12]:
dir(bot.driver.find_elements_by_class_name("ais-InfiniteHits-item")[0])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_execute',
 '_id',
 '_parent',
 '_upload',
 '_w3c',
 'clear',
 'click',
 'find_element',
 'find_element_by_class_name',
 'find_element_by_css_selector',
 'find_element_by_id',
 'find_element_by_link_text',
 'find_element_by_name',
 'find_element_by_partial_link_text',
 'find_element_by_tag_name',
 'find_element_by_xpath',
 'find_elements',
 'find_elements_by_class_name',
 'find_elements_by_css_selector',
 'find_elements_by_id',
 'find_elements_by_link_text',
 'find_elements_by_name',
 'find_elements_by_partial_link_text',
 'find_elements_by_tag_name',
 'find_elements_by_xpath',
 'get_attribute',
 'get_property',
 'id',
 

In [None]:
parser = HtmlParser(response=response, page_number=2)
parser.crawl_page()

In [None]:
parser.soup

In [None]:
response

In [29]:
from selenium import webdriver

driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)

[WDM] - Current google-chrome version is 84.0.4147
[WDM] - Get LATEST driver version for 84.0.4147
[WDM] - Driver [/Users/1025948/.wdm/drivers/chromedriver/mac64/84.0.4147.30/chromedriver] found in cache


 


  import sys
