**Introduction**

This is a playground for developing the static website scraper. Functions developed here will eventually be packaged into .py files.


**Progress**

Done:
    1. get response function
    2. element finder and extractor (parser)
    3. file saver

To-do:
    4. pagination
    5. rotate agent
    6. rotate IP
    
Long-term:
    - support authentication 
    - support cookie


In [1]:
# # Make sure you install the required libraries

# !pip3 install --upgrade multipledispatch # library for overloading functions
# !pip3 install --upgrade requests # library for making request for the static websites
# !pip3 install --upgrade soupsieve  # library to support css selector in beautifulsoup
# !pip3 install --upgrade beautifulsoup4 # a parser that balances between efficiency and leniency
# !pip3 install --upgrade --user lxml # a more efficient parser
# !pip3 install --upgrade html5lib # a parser that acts like a browser, most lenient

In [2]:
# Key libraries
import re
import os
import inspect

from multipledispatch import dispatch

import requests
import bs4

In [3]:
# These functions help us understand the variables that exist in the environment,
# which is useful for creating a natural language interface for data analysis

def get_local_variables(ignore_underscore = True):
    """Get the name and definition of the local variables of the caller.
    
    Parameters
    ----------
    ignore_underscore : boolean (optional, default = True)
        Whether or not the variables starting with "_" need to be filtered out.
    
    Returns
    ----------
    dictionary
        A mapping between name and definition of the caller's local variables.
        The mapping is a copy, so changing it does not affect the caller.
                
    """
    # f_back is the frame that called this function; copy its locals into a plain dict
    callers_local_vars = dict(inspect.currentframe().f_back.f_locals.items())
    if ignore_underscore:
        callers_local_vars = {
            name: value
            for name, value in callers_local_vars.items()
            if not name.startswith('_')
        }
    return callers_local_vars
def retrieve_name(var):
    """Look up the name under which an object is bound. # Reference https://stackoverflow.com/a/40536047.
    
    The call stack is scanned from the outermost frame inwards, and the first
    local name found that refers to the very same object (identity, not equality) wins.
    
    Parameters
    ----------
    var: object 
        Variable to get the name of.
        
    Returns
    ----------
    string
        Name of the variable passed (None when no frame holds the object).
        
    """
    for frame_info in reversed(inspect.stack()):
        frame_locals = frame_info.frame.f_locals
        matches = [name for name, value in frame_locals.items() if value is var]
        if matches:
            return matches[0]
    return None
        
def get_attributes(obj, ignore_underscore = True):
    """Get a list of valid attributes of the object.
    
    Parameters
    ----------
    obj : object
        The object whose attributes are listed (as reported by dir).
    ignore_underscore : boolean (optional, default = True)
        Whether or not the attributes starting with "_" need to be filtered out.
    
    Returns
    ----------
    list
        A list of valid attributes of the object.
                
    """
    attributes = dir(obj)
    if ignore_underscore:
        attributes = [name for name in attributes if not name.startswith('_')]
    return attributes

def print_attributes_and_values(obj, ignore_underscore = True):
    """Print the valid attributes of the object and their corresponding values.
    
    Each attribute is printed as "<variable name>.<attribute>", followed by the
    first 60 characters of its value and a separator line.
    
    Parameters
    ----------
    obj : object
        The object to inspect.
    ignore_underscore : boolean (optional, default = True)
        Whether or not the attributes starting with "_" need to be filtered out.
    
    Returns
    ----------
    None
                
    """
    obj_name = retrieve_name(obj)
    attributes = get_attributes(obj, ignore_underscore = ignore_underscore)
    for attr in attributes:
        print(obj_name+'.'+attr)
        # Read the value from the object itself rather than eval-ing "<name>.<attr>":
        # the name may only exist in a caller's frame and would not resolve here.
        print(' '*4 + str(getattr(obj, attr))[:60])
        print('-'*70)

## Feature 1: get response function

The `requests.get` function takes the following parameters:

 - url – URL for the new  Request object.
 - params – (optional) Dictionary of GET Parameters to send with the Request.
 - headers – (optional) Dictionary of HTTP Headers to send with the Request.
 - cookies – (optional) CookieJar object to send with the  Request.
 - auth – (optional) AuthObject to enable Basic HTTP Auth.
 - timeout – (optional) Float describing the timeout of the request.
 
Note that the requests made by this function could be recognized as robotic access by some websites. To bypass screening by those websites, additional specifications for headers and proxies are required. This additional setup will be implemented in future versions.

In [4]:
@dispatch(str)
def get_response(url, verbose = True, timeout = 30):
    """Get the response of the HTTP GET request for the target url.
    
    Errors are reported by printing an [Error] message; they are not raised.
    
    Parameters
    ----------
    url: string
        The url to the website that needs to be scraped. 
    verbose: boolean (optional, default = True)
        Whether or not [Success] message should be printed.
    timeout: float (optional, default = 30)
        Seconds to wait for the server before giving up. Without a timeout
        a stalled server would block the call indefinitely.
    
    Returns
    ----------
    response object
        The response of the request, or an empty response object when the request failed.
        
    """
    try:
        response = requests.get(url, timeout = timeout)
        response.raise_for_status() # Raise Exception when response was not successful
    except requests.exceptions.HTTPError as http_err:
        print('[Error] HTTP error occurred: '+str(http_err))
        return requests.models.Response() # Return empty response
    except Exception as err:
        # Deliberately broad: connection failures, timeouts, malformed urls, etc. are all reported the same way
        print('[Error] Other error occurred: '+str(err))
        return requests.models.Response() # Return empty response
    else:
        if verbose:
            print('[Success] The website at "'+url+'" is collected succesfully.')
        return response

@dispatch(list)
def get_response(urls, verbose = True):
    """Get the responses of the HTTP GET requests for the target urls. 
    
    Parameters
    ----------
    urls: list of string
        The urls to the websites that need to be scraped. 
    verbose: boolean (optional, default = True)
        Whether or not [Success] message should be printed for each url.
    
    Returns
    ----------
    list of response object
        One response per url, in the same order as the input.
        
    """
    # Forward verbose so that verbose = False also silences the per-url [Success] messages
    return [get_response(url, verbose = verbose) for url in urls]

#### Success and failure examples

In [5]:
# Success example: a reachable site
response = get_response('http://google.com')

[Success] The website at "http://google.com" is collected succesfully.


In [6]:
# Failure example: the error is printed by get_response instead of being raised
response = get_response('http://somewebsitethatdontexist.com')

[Error] Other error occurred: HTTPConnectionPool(host='somewebsitethatdontexist.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x10d042a20>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))


In [7]:
# List example: every url is requested in turn, one response per url
response_list = get_response(['http://google.com','https://baidu.com','https://anotherwebsitethatdontexist.com'])

[Success] The website at "http://google.com" is collected succesfully.
[Success] The website at "https://baidu.com" is collected succesfully.
[Error] Other error occurred: HTTPSConnectionPool(host='anotherwebsitethatdontexist.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x10d073e10>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))


#### Example to use throughout this notebook

In [8]:
# Target site requested by the example cells below
url = 'https://digitalcollections.nypl.org'

In [9]:
# Fetch the example page; this response is what later cells parse
response = get_response(url)

[Success] The website at "https://digitalcollections.nypl.org" is collected succesfully.


## Feature 2: parse response function

Once the content of the website is collected, the next step is to parse the page. After parsing, we can find the elements we want, then extract and clean their values. For this step, there are many choices of libraries, some examples are: 

 - BeautifulSoup
 - Scrapy
 - Lxml
 - AdvancedHTMLParser
 
In this pipeline, we will explore `BeautifulSoup` and `Scrapy` for parsing, because these two are relatively more popular and have rich resources. Between these two, BeautifulSoup is more user-friendly, while Scrapy is more efficient and scalable.

The other two libraries listed above are good choices in their specific areas, so keep them in view:
 - Lxml has rich features for processing XML and HTML and is quite efficient (BeautifulSoup actually supports using Lxml parser among other parsers).
 - AdvancedHTMLParser offers functions similar to those in native JavaScript and supports complex operations on HTML.
  
References:
 - https://smartproxy.com/blog/scrapy-vs-beautifulsoup (Use cases comparison and Pros&Cons)
 - https://tomassetti.me/parsing-html (Common libraries in different programming languages)
 - https://medium.com/analytics-vidhya/scrapy-vs-selenium-vs-beautiful-soup-for-web-scraping-24008b6c87b8 (Great comparison article that includes Selenium, which is the popular choice for dynamic website scraping)

### Beautiful Soup

References:   
 - https://www.datacamp.com/community/tutorials/amazon-web-scraping-using-beautifulsoup (Showed how to write element finding logic in hierarchy)
 - https://stackabuse.com/guide-to-parsing-html-with-beautifulsoup-in-python (Nice illustrations, browse_and_scrape combines pagination with parsing) 
 - https://www.crummy.com/software/BeautifulSoup/bs4/doc (Long but detailed description of BS4 usage)
 - https://www.crummy.com/software/BeautifulSoup (The "Hall of Fame" section has some high-profile projects, worth having a look)
 
 
An interesting side note: here is a quote from the BS project page:

> You can tell it "Find all the links", or "Find all the links of class externalLink", or "Find all the links whose urls match "foo.com", or "Find the table heading that's got bold text, then give me that text."

But actually, you cannot directly ask BS these natural language questions. You need to write code that follows the syntax of the BS4 library, which is similar to, but not quite the same as, natural language. **Programming with natural language** is one of the directions worth pursuing in the future, as it further lowers the bar for utilizing web scraping and related technologies.

- find_all
- select
- encodings (https://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit)

In [21]:
def get_soup(response, default_parser = 'lxml'):
    """Get the beautiful soup object of the response object or filepath or html string.
    
    Parameters
    ----------
    response: requests.models.Response, string
        The response object or filepath or html string. 
    default_parser: string (optional, default = lxml)
        Which parser to use when parsing the response.
    
    Returns
    ----------
    bs4.BeautifulSoup
        The parsed tree, or None when the input could not be parsed
        (an [Error] message is printed in that case).
        
    """
    if isinstance(response, requests.models.Response):
        return bs4.BeautifulSoup(response.content, default_parser)
    if isinstance(response, str) and os.path.exists(response):
        # Read bytes so that BeautifulSoup detects the document encoding itself
        # instead of depending on the platform's default text encoding
        with open(response, 'rb') as file_handler:
            return bs4.BeautifulSoup(file_handler, default_parser)
    try:
        return bs4.BeautifulSoup(response, default_parser)
    except Exception as err:
        print('[Error] The response object you provided cannot be turned into beautiful soup object: '+str(err))
        # Explicit None: previously the function fell through to an unbound local here
        return None

In [11]:
def save_html(html_object, path = './TEMP.html'):
    """Save the response or soup object as a HTML file at the path provided.
    
    Problems (unsupported object type, file cannot be written) are reported by
    printing an [Error] message; they are not raised.
    
    Parameters
    ----------
    html_object: requests.models.Response, bs4.BeautifulSoup, bs4.element.Tag
        The response or soup object. 
    path: string (optional, default = ./TEMP.html)
        The path at which the HTML file will be saved.
    
    Returns
    ----------
    None
        
    """
    if isinstance(html_object, requests.models.Response):
        html_text = html_object.text
    elif isinstance(html_object, (bs4.BeautifulSoup,bs4.element.Tag)):
        html_text = str(html_object.prettify())
    else:
        # Without this branch html_text would be unbound and the write below would crash
        print('[Error] The object you provided is neither a response nor a soup object: '+str(type(html_object)))
        return
    try:
        # Explicit encoding so that non-ASCII pages are saved the same way on every platform
        with open(path,'w',encoding='utf-8') as f:
            f.write(html_text)
    except Exception as err:
        print('[Error] The HTML file cannot be saved at "'+str(path)+'": '+str(err))

In [16]:
def is_readable_content(content):
    """Tell whether the content is a readable piece of the parsed tree.
    
    Readable means a Tag or a NavigableString that is none of: CData, Comment, Declaration,
    Doctype, ProcessingInstruction, ResultSet, Script, Stylesheet, XMLFormatter.
    
    Parameters
    ----------
    content: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    boolean
        
    """
    element = bs4.element
    # Anything that is neither a tag nor a string is never readable
    if not isinstance(content, (element.Tag, element.NavigableString)):
        return False
    # Types that are excluded even though they may pass the check above
    non_readable_types = (
        element.CData,
        element.Comment,
        element.Declaration,
        element.Doctype,
        element.ProcessingInstruction,
        element.ResultSet,
        element.Script,
        element.Stylesheet,
        element.XMLFormatter,
    )
    return not isinstance(content, non_readable_types)

def get_contents(element):
    """Collect the direct children of the element that are non-empty and readable.
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    list of bs4.element
        The children, in document order, whose string form is not blank
        and which pass is_readable_content.
        
    """
    readable_children = []
    for child in element.contents:
        # Skip whitespace-only nodes before running the type check
        if str(child).strip() == '':
            continue
        if is_readable_content(child):
            readable_children.append(child)
    return readable_children

def get_contents_names(element):
    """List the names of the non-empty and readable contents/children of the element.
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    list of string
        The .name of every child returned by get_contents, in the same order.
        
    """
    names = []
    for child in get_contents(element):
        names.append(child.name)
    return names

In [17]:
def elevate_to_nearest_tag(element):
    """Return the element itself if it is a Tag, or its parent if it is a string node.
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    bs4.element.Tag
        The nearest tag, or None (after printing an [Error] message) when no Tag was reached.
        
    """
    # A string node is replaced by the node that contains it
    candidate = element.parent if isinstance(element, bs4.element.NavigableString) else element
    if not isinstance(candidate, bs4.element.Tag):
        print('[Error] Element is still not Tag after getting the parent.')
        return None
    return candidate

In [322]:
# Dispatch on Tag rather than str: a str cannot take item assignment, and the
# elements produced by the parser are Tags, which the str overload never matched.
@dispatch(bs4.element.Tag)
def highlight_element(element, highlight_style = "background-color: rgba(255,0,0,0.5); border: 3px dotted yellow"):
    """Highlight a single element by overwriting its inline style attribute.
    
    Parameters
    ----------
    element: bs4.element.Tag
        The tag to highlight; it is modified in place.
    highlight_style: string (optional)
        The CSS declarations written into the "style" attribute.
    
    Returns
    ----------
    None
        
    """
    element['style'] = highlight_style

@dispatch(list)
def highlight_element(elements, highlight_style = "background-color: rgba(255,0,0,0.5); border: 3px dotted yellow; "):
    """Highlight every element of the list by overwriting its inline style attribute.
    
    Parameters
    ----------
    elements: list
        The elements to highlight; each one is modified in place.
    highlight_style: string (optional)
        The CSS declarations written into the "style" attribute.
    
    Returns
    ----------
    None
        
    """
    for tag in elements:
        tag['style'] = highlight_style

In [19]:
# def find_like(string):   

In [22]:
# Parse the example response with the default parser of get_soup
soup = get_soup(response)

In [23]:
# Example of a displayed text (not referenced by the other cells shown in this notebook)
string = "Brooklyn: 3rd Avenue - St. Mark's Place"

# Another candidate text: 'RECENTLY DIGITIZED ITEMS'

In [338]:
def give_unique_sample_element(soup):
    """Ask the user for a displayed text until it matches exactly one string in the soup.
    
    The text is matched literally (case-insensitive, anywhere inside a string).
    After every attempt that does not yield exactly one match, the matches of
    that attempt are listed and the user is asked again.
    
    Parameters
    ----------
    soup: bs4.BeautifulSoup
        The parsed tree to search in.
    
    Returns
    ----------
    bs4.element.Tag
        The tag that contains the uniquely matched string.
        
    """
    attempt_count = 0
    matched_elements = []
    while len(matched_elements)!=1:
        if attempt_count>0:
            print('\nThere are '+str(len(matched_elements))+' matched elements given your last input. They are:\n'+'\t\n'.join([str(matched_element)[:100] for matched_element in matched_elements])+'\n\n')
        displayed_text = input('What is the displayed text for one of the elements you want to scrape:\n')
        # Escape the input: the user types text as displayed, so characters such as
        # "(", "?" or "." must be matched literally instead of being read as regex syntax
        pattern = re.compile(re.escape(displayed_text), re.IGNORECASE)
        # "string" is the current name of the argument formerly called "text"
        matched_elements = soup.find_all(string = pattern)
        attempt_count += 1
    sample_element = elevate_to_nearest_tag(matched_elements[0])
    print('\nUnique match is found:\n'+str(sample_element)+'\n\n')
    return sample_element

In [323]:
def get_self_index(element):
    """Return the 1-based position of the element among its siblings of the same type.
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    int
        One plus the number of preceding siblings whose name equals the element's name.
        
    """
    tag_name = element.name
    preceding_of_same_type = sum(1 for sibling in element.previous_siblings if sibling.name == tag_name)
    # css selector starts indexing with 1 instead of 0
    return preceding_of_same_type + 1

# Reference: https://stackoverflow.com/a/32263260 (basic structure inspiration)
# Reference: https://csswizardry.com/2012/05/keep-your-css-selectors-short (tips to improve efficiency)

def describe_element_in_css(node):
    """Describe a single element as one segment of a CSS selector.
    
    The segment is the tag name followed by its classes (".a.b") and/or its id ("#x").
    Attributes are read in document order and reading stops at the id, which is
    treated as unique. Without an id, ":nth-of-type(n)" is appended when the
    element is not the first sibling of its type.
    
    Parameters
    ----------
    node: bs4.element.Tag
        The element to describe.
    
    Returns
    ----------
    string
        The selector segment for the element.
        
    """
    enough_to_be_unique = False
    node_attrs_string = ''
    for k, v in node.attrs.items():
        if k == 'id':
            node_attrs_string += '#' + v
            enough_to_be_unique = True
            break
        elif k == 'class':
            # class is normally a list, but is a plain string when the parser does not
            # split multi-valued attributes; blank entries would yield an invalid "tag."
            classes = v.split() if isinstance(v, str) else [c for c in v if c]
            if classes:
                node_attrs_string += '.' + '.'.join(classes)

    element_part = node.name + node_attrs_string

    if not enough_to_be_unique:
        index = get_self_index(node)
        if index > 1:
            # get_self_index counts siblings of the same type only, which is what
            # :nth-of-type counts; :nth-child would count siblings of every type
            element_part = '%s:nth-of-type(%s)' % (element_part, index)

    return element_part

def get_css_path(node):
    """Build a CSS selector path that leads from an ancestor down to the node.
    
    Ancestors are prepended one by one, joined with " > ". Climbing stops when
    the body element is reached or once the first segment of the path contains an id.
    
    Parameters
    ----------
    node: bs4.element.Tag
        The element to build the path for.
    
    Returns
    ----------
    string
        The CSS selector path.
        
    """
    segments = [describe_element_in_css(node)]
    for ancestor in node.parents:
        reached_body = ancestor.name == 'body'
        anchored_by_id = '#' in segments[0]
        if reached_body or anchored_by_id:
            break
        segments.insert(0, describe_element_in_css(ancestor))
    return ' > '.join(segments)

# Dispatch on PageElement (the common base of Tag and NavigableString) rather than
# str: the list overload passes Tags, which the str overload never matched.
@dispatch(bs4.element.PageElement)
def extract_text(element):
    """Extract the text of a single element, without surrounding whitespace.
    
    Parameters
    ----------
    element: bs4.element.PageElement
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    string
        The stripped text of the element.
        
    """
    return element.text.strip()

@dispatch(list)
def extract_text(elements):
    """Extract the text of every element in the list.
    
    Parameters
    ----------
    elements: list
        The elements to extract the text from.
    
    Returns
    ----------
    list
        One extract_text result per element, in the same order.
        
    """
    texts = []
    for item in elements:
        texts.append(extract_text(item))
    return texts

def elevate_css_path(path):
    """Drop the last segment of a CSS selector path, i.e. move one level up.
    
    Parameters
    ----------
    path: string
        A CSS selector path whose segments are separated by ">".
    
    Returns
    ----------
    string
        The path without its last segment, stripped of surrounding whitespace.
        A path without ">" is returned unchanged.
        
    """
    if '>' not in path:
        return path
    # Everything before the last ">" is the parent path
    parent_path, _last_segment = path.rsplit('>', 1)
    return parent_path.strip()

In [339]:
# Interactive step: keeps asking for a displayed text until exactly one string matches
# André Fashion Illustrations
sample_element = give_unique_sample_element(soup)

What is the displayed text for one of the elements you want to scrape:
collection of photographs 

There are 6 matched elements given your last input. They are:

  var bg_images = [{"title":"Penn Station, Interior, Manhattan.","name":"482603.jpg","collection_id	
Collection of photographs of New York City	
Collection of photographs of New York City, New York State and more by Max Hubacher	
Collection of photographs of New York City, 1931-1942	
Collection of photographs of East River and Hudson River piers, Manhattan	
Collection of photographs of New York City


What is the displayed text for one of the elements you want to scrape:
COLLECTION OF PHOTOGRAPHS OF NEW YORK CITY, NEW YORK

Unique match is found:
<h5>Collection of photographs of New York City, New York State and more by Max Hubacher</h5>




In [340]:
# Build the CSS selector path of the sample element
path = get_css_path(sample_element)

In [329]:
# Drop the last segment of the path so that it selects one level higher
path = elevate_css_path(path)

In [330]:
# highlight_element changes the selected elements in place and has no return statement,
# so highlighted_soup is None; the highlighted page is the soup object saved on the next line.
highlighted_soup = highlight_element(soup.select(path))
save_html(soup)

### Scrapy

References:   
 - https://docs.scrapy.org/en/latest/intro/tutorial.html
 - https://www.jitsejan.com/using-scrapy-in-jupyter-notebook.html

## Feature 3: file saver

In [None]:
# # filetype = url.split('.')[-1]
# filetype = 'html'
# filename = re.sub(r'^https?://','',url).replace('.','-').replace('/','_') + '.' + filetype

# with open(filename,'w') as f:
#     f.write(response.text)

<br>
<br>
<br>

## Resources for To-do

In [None]:
# ################################################################################
# # Reference: How to fake and rotate User Agents using Python 3
# # https://www.scrapehero.com/how-to-fake-and-rotate-user-agents-using-python-3/

# import requests
# import random 
# from collections import OrderedDict

# # This data was created by using the curl method explained above
# headers_list = [
#     # Firefox 77 Mac
#      {
#         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
#         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
#         "Accept-Language": "en-US,en;q=0.5",
#         "Referer": "https://www.google.com/",
#         "DNT": "1",
#         "Connection": "keep-alive",
#         "Upgrade-Insecure-Requests": "1"
#     },
#     # Firefox 77 Windows
#     {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
#         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
#         "Accept-Language": "en-US,en;q=0.5",
#         "Accept-Encoding": "gzip, deflate, br",
#         "Referer": "https://www.google.com/",
#         "DNT": "1",
#         "Connection": "keep-alive",
#         "Upgrade-Insecure-Requests": "1"
#     },
#     # Chrome 83 Mac
#     {
#         "Connection": "keep-alive",
#         "DNT": "1",
#         "Upgrade-Insecure-Requests": "1",
#         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
#         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
#         "Sec-Fetch-Site": "none",
#         "Sec-Fetch-Mode": "navigate",
#         "Sec-Fetch-Dest": "document",
#         "Referer": "https://www.google.com/",
#         "Accept-Encoding": "gzip, deflate, br",
#         "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8"
#     },
#     # Chrome 83 Windows 
#     {
#         "Connection": "keep-alive",
#         "Upgrade-Insecure-Requests": "1",
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
#         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
#         "Sec-Fetch-Site": "same-origin",
#         "Sec-Fetch-Mode": "navigate",
#         "Sec-Fetch-User": "?1",
#         "Sec-Fetch-Dest": "document",
#         "Referer": "https://www.google.com/",
#         "Accept-Encoding": "gzip, deflate, br",
#         "Accept-Language": "en-US,en;q=0.9"
#     }
# ]
# # Create ordered dict from Headers above
# ordered_headers_list = []
# for headers in headers_list:
#     h = OrderedDict()
#     for header,value in headers.items():
#         h[header]=value
#     ordered_headers_list.append(h)
    
    
# url = 'https://httpbin.org/headers'

# for i in range(1,4):
#     #Pick a random browser headers
#     headers = random.choice(headers_list)
#     #Create a request session
#     r = requests.Session()
#     r.headers = headers
    
#     response = r.get(url)
#     print("Request #%d\nUser-Agent Sent:%s\n\nHeaders Recevied by HTTPBin:"%(i,headers['User-Agent']))
#     print(response.json())
#     print("-------------------")

# ################################################################################
# # Reference: How To Rotate Proxies and change IP Addresses using Python 3
# # https://www.scrapehero.com/how-to-rotate-proxies-and-ip-addresses-using-python-3/    
    
# from lxml.html import fromstring
# import requests
# from itertools import cycle
# import traceback

# def get_proxies():
#     url = 'https://free-proxy-list.net/'
#     response = requests.get(url)
#     parser = fromstring(response.text)
#     proxies = set()
#     for i in parser.xpath('//tbody/tr')[:10]:
#         if i.xpath('.//td[7][contains(text(),"yes")]'):
#             proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
#             proxies.add(proxy)
#     return proxies


# #If you are copy pasting proxy ips, put in the list below
# #proxies = ['121.129.127.209:80', '124.41.215.238:45169', '185.93.3.123:8080', '194.182.64.67:3128', '106.0.38.174:8080', '163.172.175.210:3128', '13.92.196.150:8080']
# proxies = get_proxies()
# proxy_pool = cycle(proxies)

# url = 'https://httpbin.org/ip'
# for i in range(1,11):
#     #Get a proxy from the pool
#     proxy = next(proxy_pool)
#     print("Request #%d"%i)
#     try:
#         response = requests.get(url,proxies={"http": proxy, "https": proxy})
#         print(response.json())
#     except:
#         #Most free proxies will often get connection errors. You will have to retry the entire request using another proxy for it to work. 
#         #We will just skip retries as its beyond the scope of this tutorial and we are only downloading a single url 
#         print("Skipping. Connnection error")