**Introduction**

This is a playground for developing the static website scraper. Functions developed here will eventually be packaged into .py files.


**Progress**

Done:
    1. get response function
    2. element finder and extractor (parser)
    3. file saver

To-do:
    4. pagination
    5. rotate agent
    6. rotate IP
    
Long-term:
    - support authentication 
    - support cookies


In [1]:
# # Make sure you install the required libraries
# !pip3 install --upgrade requests

In [2]:
# Key libraries
import requests
from requests.exceptions import HTTPError

import re

In [3]:
# These functions help us understand the variables that exist in the environment,
# which is useful for creating a natural language interface for data analysis

import inspect
def get_local_variables(ignore_underscore = True):
    """Get the name and definition of the caller's local variables.
    
    Parameters
    ----------
    ignore_underscore : boolean (optional, default = True)
        Whether or not the variables starting with "_" need to be filtered out.
    
    Returns
    ----------
    dictionary
        A mapping between name and definition of the local variables of the
        frame that called this function. The mapping is a copy, so modifying
        it does not affect the caller.
                
    """
    # f_back is the frame of whoever called this function; copy its locals so
    # the filtering below never touches the caller's real namespace.
    callers_local_vars = dict(inspect.currentframe().f_back.f_locals.items())
    if not ignore_underscore:
        return callers_local_vars
    return {
        name: value
        for name, value in callers_local_vars.items()
        if not name.startswith('_')
    }
def retrieve_name(var):
    """Find the name a variable is bound to somewhere in the call stack.
    
    Reference: https://stackoverflow.com/a/40536047.
    
    Parameters
    ----------
    var: object 
        Variable to get the name of.
        
    Returns
    ----------
    string
        The first local name, in the outermost stack frame that has one,
        which refers to the very same object as `var` (identity, not
        equality). None if no frame holds such a name.
        
    """
    call_stack = inspect.stack()
    # inspect.stack() lists the current frame first, so walk it backwards to
    # search the outermost caller before the inner frames.
    for frame_info in call_stack[::-1]:
        bound_names = [
            name
            for name, value in frame_info.frame.f_locals.items()
            if value is var
        ]
        if bound_names:
            return bound_names[0]
    return None
        
def get_attributes(obj, ignore_underscore = True):
    """Get a list of valid attributes of the object.
    
    Parameters
    ----------
    obj : object
        The object whose attributes are listed (via `dir`).
    ignore_underscore : boolean (optional, default = True)
        Whether or not the attributes starting with "_" need to be filtered out.
    
    Returns
    ----------
    list
        A list of valid attribute names of the object.
                
    """
    attribute_names = dir(obj)
    if ignore_underscore:
        # Drop private and dunder names only when the caller asks for it.
        attribute_names = [x for x in attribute_names if not x.startswith('_')]
    return attribute_names

def print_attributes_and_values(obj, ignore_underscore = True):
    """Print the valid attributes of the object and their corresponding values.
    
    For every attribute, three lines are printed: "<name>.<attribute>", the
    first 60 characters of the attribute's string form (indented by four
    spaces), and a 70-character dashed separator.
    
    Parameters
    ----------
    obj : object
        The object whose attributes and values are printed.
    ignore_underscore : boolean (optional, default = True)
        Whether or not the variables starting with "_" need to be filtered out.
    
    Returns
    ----------
    None
                
    """
    obj_name = retrieve_name(obj)
    attributes = get_attributes(obj, ignore_underscore = ignore_underscore)
    for attr in attributes:
        print(obj_name+'.'+attr)
        # Read the value straight from the object instead of eval-ing the
        # "name.attr" string: the name found by retrieve_name may live in a
        # caller's local scope, which eval inside this function cannot see.
        print(' '*4 + str(getattr(obj, attr))[:60])
        print('-'*70)

## Feature 1: get response function

The `requests.get` function takes the following parameters:

 - url – URL for the new  Request object.
 - params – (optional) Dictionary of GET Parameters to send with the Request.
 - headers – (optional) Dictionary of HTTP Headers to send with the Request.
 - cookies – (optional) CookieJar object to send with the  Request.
 - auth – (optional) AuthObject to enable Basic HTTP Auth.
 - timeout – (optional) Float describing the timeout of the request.
 
Note that requests made by this function could be recognized as robotic access by some websites. To bypass screening by those websites, additional specifications for headers and proxies are required. This additional setup will be implemented in future versions.

In [4]:
from multipledispatch import dispatch

@dispatch(str)
def get_response(url, verbose = True, timeout = None):
    """Get the response of the HTTP GET request for the target url.
    
    Errors are reported by printing an [Error] message rather than by
    raising, so the caller always receives a response object.
    
    Parameters
    ----------
    url: string
        The url to the website that needs to be scraped. 
    verbose: boolean (optional, default = True)
        Whether or not [Success] message should be printed.
    timeout: float or tuple (optional, default = None)
        Passed through to `requests.get` as its `timeout` argument. The
        default None keeps the previous behavior of waiting without a limit;
        pass it as a keyword argument to bound how long a request may block.
    
    Returns
    ----------
    response object
        The response of the request when it succeeded, otherwise a newly
        constructed, empty `requests.models.Response()`.
        
    """
    try:
        response = requests.get(url, timeout = timeout)
        response.raise_for_status() # Raise Exception when response was not successful
    except HTTPError as http_err:
        print('[Error] HTTP error occurred: '+str(http_err))
        return requests.models.Response() # Return empty response
    except Exception as err:
        # Deliberately broad: connection failures, timeouts, malformed urls,
        # etc. are all reported the same way instead of being raised.
        print('[Error] Other error occurred: '+str(err))
        return requests.models.Response() # Return empty response
    else:
        if verbose:
            print('[Success] The website at "'+url+'" is collected succesfully.')
        return response

@dispatch(list)
def get_response(urls, verbose = True):
    """Get the responses of the HTTP GET requests for the target urls. 
    
    Parameters
    ----------
    urls: list of string
        The urls to the websites that need to be scraped. 
    verbose: boolean (optional, default = True)
        Whether or not [Success] message should be printed. It is forwarded
        to the request made for every url.
    
    Returns
    ----------
    list of response object
        One response per url, in the same order as `urls`. An empty list is
        returned for an empty `urls`.
        
    """
    # verbose is passed by keyword so that only the url takes part in the
    # type-based dispatch to the single-url version.
    return [get_response(url, verbose = verbose) for url in urls]

#### Success and failure examples

In [5]:
# Success example: a single url given as a string
response = get_response('http://google.com')

[Success] The website at "http://google.com" is collected succesfully.


In [6]:
# Failure example: the host cannot be resolved, so an [Error] message is printed
response = get_response('http://asdasdfasdfa.com')

[Error] Other error occurred: HTTPConnectionPool(host='asdasdfasdfa.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x10a502198>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))


In [7]:
# List example: one response is collected per url; the last url fails
response_list = get_response(['http://google.com','https://baidu.com','https://xasdfxxas.com'])

[Success] The website at "http://google.com" is collected succesfully.
[Success] The website at "https://baidu.com" is collected succesfully.
[Error] Other error occurred: HTTPSConnectionPool(host='xasdfxxas.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x10a50e6d8>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))


#### Example to use throughout this notebook

In [8]:
# Target website used as the running example in this notebook
url = 'http://cityrecord.engineering.nyu.edu'

In [9]:
# Collect the example page
response = get_response(url)

[Success] The website at "http://cityrecord.engineering.nyu.edu" is collected succesfully.


## Feature 2: parse response function

Once the content of the website is collected, the next step is to parse the page. After parsing, we can find the elements we want, then extract and clean their values. For this step, there are many choices of libraries; some examples are: 

 - BeautifulSoup
 - Scrapy
 - Lxml
 - AdvancedHTMLParser
 
In this pipeline, we will explore `BeautifulSoup` and `Scrapy` for parsing, because these two are relatively more popular and have rich resources. Between these two, BeautifulSoup is more user-friendly, while Scrapy is more efficient and scalable.

The other two libraries listed above are good choices in their specific areas, so keep them in view:
 - Lxml has rich features for processing XML and HTML and is quite efficient.
 - AdvancedHTMLParser offers functions similar to those in native JavaScript and supports complex operations on HTML.
  
References:
 - https://smartproxy.com/blog/scrapy-vs-beautifulsoup (Use cases comparison and Pros&Cons)
 - https://tomassetti.me/parsing-html (Common libraries in different programming languages)

### Beautiful Soup

### Scrapy

References:   
 - https://docs.scrapy.org/en/latest/intro/tutorial.html
 - https://www.jitsejan.com/using-scrapy-in-jupyter-notebook.html

## Feature 3: file saver

In [10]:
# # filetype = url.split('.')[-1]
# filetype = 'html'
# filename = re.sub(r'^https?://','',url).replace('.','-').replace('/','_') + '.' + filetype

# with open(filename,'w') as f:
#     f.write(response.text)

<br>
<br>
<br>

## Resources for To-do

In [11]:
# ################################################################################
# # Reference: How to fake and rotate User Agents using Python 3
# # https://www.scrapehero.com/how-to-fake-and-rotate-user-agents-using-python-3/

# import requests
# import random 
# from collections import OrderedDict

# # This data was created by using the curl method explained above
# headers_list = [
#     # Firefox 77 Mac
#      {
#         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
#         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
#         "Accept-Language": "en-US,en;q=0.5",
#         "Referer": "https://www.google.com/",
#         "DNT": "1",
#         "Connection": "keep-alive",
#         "Upgrade-Insecure-Requests": "1"
#     },
#     # Firefox 77 Windows
#     {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
#         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
#         "Accept-Language": "en-US,en;q=0.5",
#         "Accept-Encoding": "gzip, deflate, br",
#         "Referer": "https://www.google.com/",
#         "DNT": "1",
#         "Connection": "keep-alive",
#         "Upgrade-Insecure-Requests": "1"
#     },
#     # Chrome 83 Mac
#     {
#         "Connection": "keep-alive",
#         "DNT": "1",
#         "Upgrade-Insecure-Requests": "1",
#         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
#         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
#         "Sec-Fetch-Site": "none",
#         "Sec-Fetch-Mode": "navigate",
#         "Sec-Fetch-Dest": "document",
#         "Referer": "https://www.google.com/",
#         "Accept-Encoding": "gzip, deflate, br",
#         "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8"
#     },
#     # Chrome 83 Windows 
#     {
#         "Connection": "keep-alive",
#         "Upgrade-Insecure-Requests": "1",
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
#         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
#         "Sec-Fetch-Site": "same-origin",
#         "Sec-Fetch-Mode": "navigate",
#         "Sec-Fetch-User": "?1",
#         "Sec-Fetch-Dest": "document",
#         "Referer": "https://www.google.com/",
#         "Accept-Encoding": "gzip, deflate, br",
#         "Accept-Language": "en-US,en;q=0.9"
#     }
# ]
# # Create ordered dict from Headers above
# ordered_headers_list = []
# for headers in headers_list:
#     h = OrderedDict()
#     for header,value in headers.items():
#         h[header]=value
#     ordered_headers_list.append(h)
    
    
# url = 'https://httpbin.org/headers'

# for i in range(1,4):
#     #Pick a random browser headers
#     headers = random.choice(headers_list)
#     #Create a request session
#     r = requests.Session()
#     r.headers = headers
    
#     response = r.get(url)
#     print("Request #%d\nUser-Agent Sent:%s\n\nHeaders Recevied by HTTPBin:"%(i,headers['User-Agent']))
#     print(response.json())
#     print("-------------------")

# ################################################################################
# # Reference: How To Rotate Proxies and change IP Addresses using Python 3
# # https://www.scrapehero.com/how-to-rotate-proxies-and-ip-addresses-using-python-3/    
    
# from lxml.html import fromstring
# import requests
# from itertools import cycle
# import traceback

# def get_proxies():
#     url = 'https://free-proxy-list.net/'
#     response = requests.get(url)
#     parser = fromstring(response.text)
#     proxies = set()
#     for i in parser.xpath('//tbody/tr')[:10]:
#         if i.xpath('.//td[7][contains(text(),"yes")]'):
#             proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
#             proxies.add(proxy)
#     return proxies


# #If you are copy pasting proxy ips, put in the list below
# #proxies = ['121.129.127.209:80', '124.41.215.238:45169', '185.93.3.123:8080', '194.182.64.67:3128', '106.0.38.174:8080', '163.172.175.210:3128', '13.92.196.150:8080']
# proxies = get_proxies()
# proxy_pool = cycle(proxies)

# url = 'https://httpbin.org/ip'
# for i in range(1,11):
#     #Get a proxy from the pool
#     proxy = next(proxy_pool)
#     print("Request #%d"%i)
#     try:
#         response = requests.get(url,proxies={"http": proxy, "https": proxy})
#         print(response.json())
#     except:
#         #Most free proxies will often get connection errors. You will have to retry the entire request using another proxy for it to work. 
#         #We will just skip retries as it's beyond the scope of this tutorial and we are only downloading a single url 
#         print("Skipping. Connnection error")