## Selenium Introduction

    # importing the selenium webdriver
    from selenium import webdriver

    # to locate elements "By" a strategy (css selector, id, tag etc.)
    from selenium.webdriver.common.by import By

    # importing common keys (keyboard keys such as ENTER, ALT etc.)
    from selenium.webdriver.common.keys import Keys

    # to identify the path of the Chrome webdriver on the disk
    from shutil import which

    # to define certain options for the webdriver settings
    from selenium.webdriver.chrome.options import Options

    # determines the path of chromedriver (which() returns None if it is not on the PATH)
    driver_path = which("chromedriver")

    # to launch the webdriver instance as headless
    # chrome_options = Options()
    # chrome_options.add_argument("--headless")

    # instantiates an instance of Chrome using the chromedriver
    # (pass options=chrome_options as well to apply the options defined above)
    # NOTE(review): newer Selenium releases expect a Service object instead of
    # executable_path -- confirm against the installed Selenium version.
    driver = webdriver.Chrome(executable_path=driver_path)  # (executable_path=driver_path, options=chrome_options)

    # try/finally guarantees the browser is shut down even if a step below raises
    try:
        driver.get("https://www.duckduckgo.com")  # gets the website from the link

        # To select a particular element using a css selector
        input_box = driver.find_element(By.CSS_SELECTOR, "#search_form_input_homepage")

        # To select multiple elements (selected by a single css selector) use
        # driver.find_elements(By.CSS_SELECTOR, "css selector"), which returns a list.
        # Remember that position starts from 0 similar to index numbers.

        # To type
        input_box.send_keys("My User Agent")

        # Selecting the search button and clicking it.
        # src_btn = driver.find_element(By.CSS_SELECTOR, "#search_button_homepage")
        # src_btn.click()

        # Pressing Enter after typing
        input_box.send_keys(Keys.ENTER)

        # the html source code of the page
        resp = driver.page_source

        print(resp)
    finally:
        # quit() ends the whole browser session and the chromedriver process;
        # close() would only close the current window
        driver.quit()

For more information, read the documentation at <https://selenium-python.readthedocs.io/>.

## Setting up scrapy-selenium

In the settings.py file append the following lines.

    from shutil import which

    # Browser that scrapy-selenium should drive, and where its driver binary lives
    SELENIUM_DRIVER_NAME = 'chrome'
    SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')

    # Command-line arguments handed to the browser
    # (presumably '--headless' can be listed here to run without a window -- confirm)
    SELENIUM_DRIVER_ARGUMENTS = ['--incognito']

    # Downloader middlewares: the number is the middleware's order value.
    # NOTE(review): a value of None should switch that middleware off, per
    # Scrapy's settings convention -- confirm in the Scrapy documentation.
    # Replace <folder_name> with the name of the project package.
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_selenium.SeleniumMiddleware': 800,
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        '<folder_name>.middlewares.UserAgentRotatorMiddleware': 400,
    }

## Using the scrapy-selenium middleware

First, create a `.py` file inside the `spiders` folder, then add the following code to it:

    # import the time module
    import time

    import scrapy

    # to use the itemloaders
    """
    Item Loaders provide a convenient mechanism for populating scraped items. Even though items can be populated directly, Item Loaders provide a much more convenient API for populating them from a scraping process, by automating some common tasks, like, parsing the raw extracted data before assigning it to a field.
    """
    from scrapy.loader import ItemLoader

    # to convert a string to a selector object
    from scrapy.selector import Selector

    # To send request using selenium (instead of scrapy)
    from scrapy_selenium import SeleniumRequest

    # TO handle common Exceptions
    # if the selected element we want to interact with (say a search box or a button) is not interactable
    from selenium.common.exceptions import ElementNotInteractableException
    # if the selected element is not present in the response received by the selenium request
    from selenium.common.exceptions import NoSuchElementException

    # to select element "By" any of the properties (say class, id, tag etc.)
    from selenium.webdriver.common.by import By

    # to import common keys (keyboard keys such as, ENTER, ALT etc.)
    from selenium.webdriver.common.keys import Keys

    # to use conditions for an operation or particular action such as to wait for a fixed amount of time until a particular element(s) is clickable or is located in the page source. 
    """
    Some common EC methods are --
    """
    # EC.presence_of_element_located((By.CSS_SELECTOR, "css selector"))
    # EC.presence_of_all_elements_located((By.CSS_SELECTOR, "css selector"))
    # EC.element_to_be_clickable((By.ID, 'id'))

    """ (Remember, the arguments to these methods must be supplied in a tuple) """

    from selenium.webdriver.support import expected_conditions as EC

    # to define how much time to wait before selenium initiates a particular action
    from selenium.webdriver.support.ui import WebDriverWait

    # to import the items class and use the created fields
    from ..items import __Item_Class_Name__

    # to identify the path of the Chrome webdriver on the disk
    from shutil import which

    # importing the selenium webdriver (required by the webdriver.Chrome call below)
    from selenium import webdriver

    # to define certain options for the webdriver settings
    from selenium.webdriver.chrome.options import Options

    # determines the path of chromedriver (which() returns None if it is not on the PATH)
    driver_path = which("chromedriver")

    # to launch the webdriver instance as headless
    # chrome_options = Options()
    # chrome_options.add_argument("--headless")

    # instantiates an instance of Chrome using the chromedriver
    # (pass options=chrome_options as well to apply the options defined above)
    driver = webdriver.Chrome(executable_path=driver_path)  # (executable_path=driver_path, options=chrome_options)

    class NameSpider(scrapy.Spider):
        """Spider template whose first request is sent through Selenium."""

        name = 'Name'

        def start_requests(self):
            """Yield the initial SeleniumRequest for the start URL."""
            # condition handed to the request as wait_until (arguments go in a tuple)
            clickable = EC.element_to_be_clickable((By.ID, 'id'))
            request = SeleniumRequest(
                url="https://www.abcde.net",
                callback=self.__callback_function__,
                wait_time=10,
                wait_until=clickable,
            )
            yield request

        def __callback_function__(self, response):
            """Handle the response of the request issued in start_requests()."""
            # get the driver stored in the request's meta
            driver = response.request.meta["driver"]
            # Code

#### Normalizing space

    # Import Regex
    import re

    # equivalent of the normalize-space function used in xpath

    def normalize_space(string):
        """Return *string* without leading/trailing whitespace and with every
        internal run of whitespace collapsed into a single space."""
        return re.sub(r'\s+', ' ', string.strip())

#### Navigating to a page

We can navigate to a page just by extracting the link and using SeleniumRequest/driver.get(). But bear in mind that a page loaded with driver.get() can only be queried with Selenium commands. To use Scrapy commands (such as .css() or .xpath()) on it, first convert the page source (driver.page_source) to a selector object using the scrapy.selector.Selector() function.

#### Handling infinite scrolling

    # JavaScript snippets reused on every pass of the loop
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    read_height = "return document.body.scrollHeight"

    last_height = driver.execute_script(read_height)
    still_growing = True
    while still_growing:
        # Scrolling down to the bottom
        driver.execute_script(scroll_to_bottom)

        # Waiting for the page to load new content
        time.sleep(5)

        # Stop once scrolling no longer increases the page height
        new_height = driver.execute_script(read_height)
        still_growing = new_height != last_height
        last_height = new_height


### Populating the Item Fields


        # First we use the Selector() function to convert the selenium response object to a selector object since selenium response doesn't work with scrapy commands.

        self.html = driver.page_source
        response = Selector(text=self.html)

        # say we select some elements/items using a css/xpath selector.
        xs = response.css("css selector")
        for x in xs:

            # item - The item instance to populate using subsequent calls to add_xpath()/add_css()/add_value()
            # selector - The selector object to extract data from, when using the add_xpath(), add_css(), replace_xpath(), or replace_css()

            loader = ItemLoader(item=ItemClassName(), selector=)

            # Now to populate the Item Fields 

            x_value = x.css("css selector")

            loader.add_value(field_name, value, *processors, **kw)
            loader.add_css(field_name, css, *processors, **kw)
            loader.add_xpath(field_name, xpath, *processors, **kw)

            # ItemLoader.load_item() - Populate the item with the data collected so far, and return it. The data collected is first passed through the output processors to get the final value to assign to each item field.

            yield loader.load_item()


### The idea of Input and Output processors

An Item Loader contains one input processor and one output processor for each (item) field. The input processor processes the extracted data as soon as it’s received (through the add_xpath(), add_css() or add_value() methods) and the result of the input processor is collected and kept inside the ItemLoader. <u>That's why input processors are usually used with the add_xpath(), add_css() or add_value() methods while populating the item fields.</u> After collecting all data, the ItemLoader.load_item() method is called to populate and get the populated item object. That’s when the output processor is called with the data previously collected (and processed using the input processor). The result of the output processor is the final value that gets assigned to the item. <u>And it is more common to declare the output processors in the field metadata, as they usually depend only on the field.</u>

The processors are just callable objects, which are called with the data to be parsed and return a parsed value. So any function can be used as an input or output processor. Both input and output processors must receive an iterable as their first argument. The output of those functions can be anything.

### Available built-in processors

    from itemloaders import processors

- <b>processors.MapCompose(*functions, **default_loader_context):</b> each input value is passed to the first function, and the result of that function is passed to the second function, and so on, until the last function returns the output value of this processor. By default, process stops on "None" value. This behaviour can be changed by passing keyword argument "stop_on_none=False". This processor provides a convenient way to compose functions that only work with single values (instead of iterables). For this reason the MapCompose processor is typically used as input processor, since data is often extracted using the extract() method of parsel selectors, which returns a list of unicode strings.

- <b>processors.TakeFirst:</b> Returns the first non-null/non-empty value from the values received, so it’s typically used as an output processor for single-valued fields.

There are some other built-in processors available as well. To learn about them, visit <https://itemloaders.readthedocs.io/en/latest/built-in-processors.html#built-in-processors>.