**Introduction**

This is a development environment for the static website scraper. Functions developed here will eventually be packaged into .py files and called from other notebooks or Python scripts.


**Progress**

Done:
    1. get response
    2. find and extract content
    3. save file

To-do:
    4. pagination
    
Long-term:
    - handle special encoding
    - rotate agent
    - rotate IP
    - support authentication 
    - support cookie


In [1]:
# # Make sure you install the required libraries

# !pip3 install --upgrade requests # library for making request for the static websites
# !pip3 install --upgrade soupsieve  # library to support css selector in beautifulsoup
# !pip3 install --upgrade beautifulsoup4 # a parser that balances between efficiency and leniency
# !pip3 install --upgrade --user lxml # a more efficient parser
# !pip3 install --upgrade html5lib # a parser that acts like a browser, most lenient

In [2]:
# Key libraries
import re
import os
import json
import inspect
import pandas as pd
import numpy as np
pd.options.display.max_colwidth = 200

import requests
import bs4

In [3]:
# These functions help us understand the variables that exist in the environment
# which is useful for creating natural language interface for data analysis

def get_local_variables(ignore_underscore = True):
    """Get the name and definition of the local variables of the caller.
    
    Parameters
    ----------
    ignore_underscore : boolean (optional, default = True)
        Whether or not the variables starting with "_" need to be filtered out.
    
    Returns
    ----------
    dictionary
        A mapping between name and definition of the caller's local variables.
        The mapping is a copy, so changing it does not affect the caller.
                
    """
    # f_back is the frame of whoever called this function; copy its locals so
    # that the filtering below never touches the caller's real namespace.
    callers_local_vars = dict(inspect.currentframe().f_back.f_locals)
    if ignore_underscore:
        callers_local_vars = {name: value for name, value in callers_local_vars.items() if not name.startswith('_')}
    return callers_local_vars
def retrieve_name(var):
    """Look up the variable name under which an object is bound. # Reference https://stackoverflow.com/a/40536047.
    
    The call stack is walked from the outermost frame towards the innermost one,
    and the first local name that refers to the very same object (identity, not
    equality) is reported.
    
    Parameters
    ----------
    var: object 
        Variable to get the name of.
        
    Returns
    ----------
    string
        Name of the variable passed (None if no frame holds a reference to it).
        
    """
    for frame_info in reversed(inspect.stack()):
        for candidate_name, candidate_value in frame_info.frame.f_locals.items():
            if candidate_value is var:
                return candidate_name
    return None
        
def get_attributes(obj, ignore_underscore = True):
    """Get a list of valid attributes of the object.
    
    Parameters
    ----------
    obj: object
        The object whose attributes are listed (via dir()).
    ignore_underscore : boolean (optional, default = True)
        Whether or not the attributes starting with "_" need to be filtered out.
    
    Returns
    ----------
    list
        A list of valid attribute names of the object.
                
    """
    attributes = dir(obj)
    if ignore_underscore:
        attributes = [name for name in attributes if not name.startswith('_')]
    return attributes

def print_attributes_and_values(obj, ignore_underscore = True):
    """Print the valid attributes of the object and their corresponding values.
    
    Each attribute is printed as "<variable name>.<attribute>" followed by the
    first 60 characters of its value and a separator line.
    
    Parameters
    ----------
    obj: object
        The object to inspect.
    ignore_underscore : boolean (optional, default = True)
        Whether or not the variables starting with "_" need to be filtered out.
    
    Returns
    ----------
    None
                
    """
    obj_name = retrieve_name(obj)
    if obj_name is None:
        # No frame exposes a name for the object; fall back to its type name for display.
        obj_name = type(obj).__name__
    for attr in get_attributes(obj, ignore_underscore = ignore_underscore):
        print(obj_name+'.'+attr)
        # Read the value straight from the object. Evaluating the "name.attr" string
        # only works when the caller's variable happens to be a global.
        try:
            value_preview = str(getattr(obj, attr))[:60]
        except Exception as err:
            # A property may raise when read; report it and keep listing the rest.
            value_preview = '[Error] '+str(err)
        print(' '*4 + value_preview)
        print('-'*70)


def get_response(url, verbose = True, timeout = None):
    """Get the response of the HTTP GET request for the target url.
    
    Errors are reported with a printed [Error] message instead of being raised,
    in which case an empty response object is returned.
    
    Parameters
    ----------
    url: string
        The url to the website that needs to be scraped. 
    verbose: boolean (optional, default = True)
        Whether or not [Success] message should be printed.
    timeout: float or tuple (optional, default = None)
        Passed to requests.get; None keeps the previous behaviour of waiting
        indefinitely for the server.
    
    Returns
    ----------
    response object
        The response of the request, or an empty requests.models.Response()
        when the request failed.
        
    """
    try:
        response = requests.get(url, timeout = timeout)
        response.raise_for_status() # Raise Exception when response was not successful
    except requests.exceptions.HTTPError as http_err:
        print('[Error] HTTP error occurred: '+str(http_err))
        return requests.models.Response() # Return empty response
    except Exception as err:
        # Deliberate catch-all: connection problems, invalid urls, timeouts, ...
        print('[Error] Other error occurred: '+str(err))
        return requests.models.Response() # Return empty response
    else:
        if verbose:
            print('[Success] The website at "'+url+'" is collected succesfully.')
        return response

def get_responses(urls, verbose = True):
    """Get the responses of the HTTP GET requests for the target urls. 
    
    Parameters
    ----------
    urls: list of string
        The urls to the websites that need to be scraped. 
    verbose: boolean (optional, default = True)
        Whether or not [Success] message should be printed for each url.
    
    Returns
    ----------
    list of response object
        One response per url, in the same order as the urls.
        
    """
    # Forward the verbosity flag so that verbose = False silences the per-url messages.
    return [get_response(url, verbose = verbose) for url in urls]



def get_soup(response, default_parser = 'lxml'):
    """Get the beautiful soup object of the response object or filepath or html string.
    
    Parameters
    ----------
    response: requests.models.Response, string
        The response object or filepath or html string. 
    default_parser: string (optional, default = lxml)
        Which parser to use when parsing the response.
    
    Returns
    ----------
    bs4.BeautifulSoup
        The parsed tree, or None when an html string/object could not be parsed.
        
    """
    if isinstance(response, requests.models.Response):
        return bs4.BeautifulSoup(response.content, default_parser)
    if isinstance(response, str) and os.path.exists(response):
        # Hand the raw bytes to BeautifulSoup so that it detects the encoding itself
        # instead of relying on the platform's default text encoding.
        with open(response, 'rb') as file_handler:
            return bs4.BeautifulSoup(file_handler, default_parser)
    try:
        return bs4.BeautifulSoup(response, default_parser)
    except Exception as err:
        print('[Error] The response object you provided cannot be turned into beautiful soup object: '+str(err))
        # Explicit failure value; previously the function crashed on an unbound variable here.
        return None

def save_html(html_object, url , path = ''):
    """Save the response or soup object as a HTML file at the path provided.
    
    Parameters
    ----------
    html_object: requests.models.Response, bs4.BeautifulSoup, bs4.element.Tag, string
        The response or soup object (anything else is converted with str()). 
    url: string
        The url the content came from; only used to derive a file name when no path is given.
    path: string (optional, default = '')
        The path at which the HTML file will be saved. When empty, the file is saved in the
        current folder under a name derived from the url (scheme removed, "/" replaced by "_"
        and "." replaced by "-").
    
    Returns
    ----------
    None
        
    """
    if path == '':
        path = './'+re.sub('^https?://','',url).replace('/','_').replace('.','-')+'.html'
    if isinstance(html_object, requests.models.Response):
        html_text = html_object.text
    elif isinstance(html_object, (bs4.BeautifulSoup,bs4.element.Tag)):
        html_text = str(html_object.prettify())
    else:
        html_text = str(html_object)
    try:
        # Write as UTF-8 explicitly: the platform default encoding cannot represent
        # every page (e.g. Chinese text on a cp1252 system).
        with open(path, 'w', encoding = 'utf-8') as f:
            f.write(html_text)
    except Exception as err:
        print('[Error] The HTML file cannot be saved at "'+path+'": '+str(err))
    else:
        print('[Success] The HTML file is saved succesfully.')

def is_readable_content(content):
    """Tell whether the content is something a reader would see: a Tag or a plain NavigableString, as opposed to CData, Comment, Declaration, Doctype, ProcessingInstruction, ResultSet, Script, Stylesheet or XMLFormatter.
    
    Parameters
    ----------
    content: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    boolean
        True for Tag / NavigableString objects that are not one of the excluded types.
        
    """
    # First gate: anything that is neither a Tag nor a NavigableString is never readable.
    if not isinstance(content, (bs4.element.Tag, bs4.element.NavigableString)):
        return False
    # Second gate: types listed here are rejected even if they passed the first gate.
    non_readable_types = (
        bs4.element.CData,
        bs4.element.Comment,
        bs4.element.Declaration,
        bs4.element.Doctype,
        bs4.element.ProcessingInstruction,
        bs4.element.ResultSet,
        bs4.element.Script,
        bs4.element.Stylesheet,
        bs4.element.XMLFormatter,
    )
    return not isinstance(content, non_readable_types)

def get_contents(element):
    """Collect the children of the element that are both non-blank and readable.
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    list of bs4.element
        The children in document order, without whitespace-only and non-readable ones.
        
    """
    readable_children = []
    for child in element.contents:
        if str(child).strip() == '':
            continue # whitespace-only node
        if is_readable_content(child):
            readable_children.append(child)
    return readable_children

def get_contents_names(element):
    """List the names of the non-blank and readable children of the element.
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    list of string
        The .name of every child returned by get_contents, in the same order.
        
    """
    names = []
    for child in get_contents(element):
        names.append(child.name)
    return names

def elevate_till_is_tag(element):
    """Resolve an element to a Tag: a string node is replaced by its parent, a Tag is returned unchanged.
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    bs4.element.Tag
        The element itself or its parent; None (after printing an [Error] message)
        when the element is neither a NavigableString nor a Tag.
        
    """
    is_string_node = isinstance(element, bs4.element.NavigableString)
    if is_string_node:
        return element.parent
    if not isinstance(element, bs4.element.Tag):
        print('[Error] Element is still not Tag after getting the parent.')
        return None
    return element


def get_self_index(element):
    """Compute the 1-based position of the element among its siblings of the same type (same .name).
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    int
        Number of preceding siblings with the same name, plus one.
        
    """
    own_name = element.name
    same_type_before = sum(1 for sibling in element.previous_siblings if sibling.name == own_name)
    # css selector starts indexing with 1 instead of 0
    return same_type_before + 1


# Reference: https://stackoverflow.com/a/32263260 (basic structure inspiration)
# Reference: https://csswizardry.com/2012/05/keep-your-css-selectors-short (tips to improve efficiency)

def describe_part_of_css_selector(node):
    """Construct part of the css selector path.
    
    The part is "<tag>#<id>" when the node has an id that can be written as a css
    identifier. Otherwise it is "<tag>" followed by its usable classes and, when the
    node is not the first sibling of its type, a ":nth-of-type(n)" position.
    
    Parameters
    ----------
    node: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    string
        
    """
    
    # Ids/classes are only used when they can be written verbatim in a selector:
    # no leading digit and no characters that would need escaping (".", ":", "/", ...).
    identifier_pattern = r'-?[^\W\d][\w-]*'
    
    node_attrs = node.attrs
    
    node_id = node_attrs.get('id')
    if isinstance(node_id, str) and re.fullmatch(identifier_pattern, node_id):
        # An id is enough to be unique, no class or position is needed.
        return node.name + '#' + node_id
    
    node_classes = node_attrs.get('class', [])
    if isinstance(node_classes, str):
        # Some parsers keep the class attribute as one whitespace-separated string.
        node_classes = node_classes.split()
    usable_classes = [node_class for node_class in node_classes if re.fullmatch(identifier_pattern, node_class)]
    
    element_part = node.name + ''.join('.' + node_class for node_class in usable_classes)
    
    # get_self_index counts siblings of the same type only, which is what
    # :nth-of-type expects (:nth-child would count siblings of every type).
    position = get_self_index(node)
    if position > 1:
        element_part = '%s:nth-of-type(%s)' % (element_part, position)
        
    return element_part

def get_css_selector_path(node):
    """Build the full css selector path that leads to an element.
    
    Ancestors are prepended one by one until either the body element is reached or
    the left-most part of the path contains an id ("#").
    
    Parameters
    ----------
    node: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    string
        The parts joined with " > ".
        
    """
    
    parts = [describe_part_of_css_selector(node)]
    for ancestor in node.parents:
        reached_body = ancestor.name == 'body'
        if reached_body or '#' in parts[0]:
            break
        parts.insert(0, describe_part_of_css_selector(ancestor))
    separator = ' > '
    return separator.join(parts)

def elevate_css_selector_path(path):
    """Drop the last step of a css selector path, i.e. point to the element one level up.
    
    Parameters
    ----------
    path: string
        The css selector path to an BS4 element from the parsed tree.
    
    Returns
    ----------
    string
        The path without its last ">" step; the path itself when it has no ">".
        
    """
    
    if '>' not in path:
        return path
    parent_path, _last_step = path.rsplit('>', 1)
    return parent_path.strip()


from collections.abc import Iterable
def is_iterable(obj):
    """Tell whether the passed object counts as an Iterable (collections.abc).
    
    Parameters
    ----------
    obj: object
        Any object.
    
    Returns
    ----------
    boolean
        
    """
    
    iterable = isinstance(obj, Iterable)
    return iterable


def flatten_list(l):
    """Merge a list of lists into a one-layer list, keeping the original order. Only ONE level is removed: deeper nesting stays nested, so multi-layered lists need several passes.
    
    Parameters
    ----------
    l: list
        A list whose elements are themselves lists (or other iterables).
    
    Returns
    ----------
    list
        
    """
    
    flattened = []
    for sublist in l:
        flattened.extend(sublist)
    return flattened

def extract_text(element):
    """Return the text of an element with leading and trailing whitespace removed.
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    string
        
    """
    
    raw_text = element.text
    return raw_text.strip()


def get_directly_related_link(element, max_levels = 5):
    """Extract the link directly related to the element.
    
    The element itself and then its ancestors are checked for an <a> tag.
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    max_levels: int (optional, default = 5)
        How many levels to climb at most while looking for the <a> tag.
    
    Returns
    ----------
    string
        The href of the nearest <a> tag, or '' when no <a> tag is found within
        max_levels levels or the <a> tag has no href.
        
    """
    
    levels_climbed = 0
    # "element is not None" stops the climb at the top of the tree, where .parent is None.
    while element is not None and element.name != 'a' and levels_climbed < max_levels:
        element = element.parent
        levels_climbed += 1
    if element is None or element.name != 'a':
        return ''
    # Anchors without href (e.g. named anchors) yield '' instead of raising KeyError.
    return element.get('href', '')


def get_indirectly_related_links(element):
    """Extract the links indirectly related to the element (i.e. belonging to the sibling elements).
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    list of string
        The href of every <a> tag found under the element's parent; an empty
        list when the element has no parent.
        
    """
    
    parent = element.parent
    if parent is None:
        return []
    # href = True skips anchors without an href attribute, which would raise KeyError below.
    return [link['href'] for link in parent.find_all('a', href = True)]


def get_related_link(element):
    """Find the link that belongs to the element: the directly related one if there is one, otherwise the indirectly related links.
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    string or list of string
        A single link as a string (direct link, or the only indirect link);
        otherwise the list of indirect links (possibly empty).
        
    """
    
    direct_link = get_directly_related_link(element)
    if direct_link != '':
        return direct_link
    
    nearby_links = get_indirectly_related_links(element)
    return nearby_links[0] if len(nearby_links) == 1 else nearby_links

def get_longest_separator(text):
    """Find the longest run of consecutive newlines (the "separator") in the text.
    
    Parameters
    ----------
    text: string
        The text to search; anything that is not a string is treated as having no separator.
    
    Returns
    ----------
    string
        The longest newline run (the first one when several share the maximum
        length), or '' when there is none.
        
    """
    has_newline = isinstance(text, str) and '\n' in text
    if not has_newline:
        return ''
    newline_runs = re.findall(r'\n+', text, re.DOTALL)
    return max(newline_runs, key=len)

def get_longest_separator_in_list(texts):
    """Return the longest separator (formed by multiple newline) in the texts contained in the list.
    
    Parameters
    ----------
    texts: list of string
        The texts to search.
    
    Returns
    ----------
    string
        The longest separator found in any of the texts, or '' when the list
        is empty or no text contains a newline.
        
    """
    # default = '' makes an empty list yield "no separator" instead of a ValueError.
    return max((get_longest_separator(text) for text in texts), key=len, default='')


def remove_blank_element_in_list(li):
    """Build a copy of the list without its blank (empty or whitespace-only) elements.
    
    Parameters
    ----------
    li: list
        A list of strings.
    
    Returns
    ----------
    list
        The non-blank elements, unchanged and in their original order.
        
    """
    kept = []
    for item in li:
        if item.strip() != '':
            kept.append(item)
    return kept

def recursive_split(text):
    """Split the text recursively into a multi-layer list of lists, always splitting by the longest separator first.
    
    Parameters
    ----------
    text: String
        A piece of text that contains separators of different lengths.
    
    Returns
    ----------
    list (of lists)
        The nested parts; the text itself when it contains no separator.
        
    """
    separator = get_longest_separator(text)
    if separator == '':
        return text
    non_blank_parts = remove_blank_element_in_list(text.split(separator))
    nested_parts = []
    for part in non_blank_parts:
        nested_parts.append(recursive_split(part))
    return nested_parts
    
def get_unique_sample_element(soup, target_phrase = '', context_radius = 40):
    """Find and return an element based on the html structure and a target phrase, solicit additional information from user through input questions if needed.
    
    The phrase is matched literally and case-insensitively against the text nodes of the
    page. While the match is not unique, the user is asked either to pick one of several
    identical occurrences (shown with their surrounding text) or to type another phrase.
    
    Parameters
    ----------
    soup: bs4.soup
        The parsed tree of the response.
    target_phrase: string (optional, if not provided, the function will ask user to input)
        The phrase used to find the sample element.
    context_radius: int (optional, default = 40)
        How many characters to display to help user choose recurring phrases based on their contexts.
    
    Returns
    ----------
    bs4.element.Tag, pd.DataFrame or None
        The Tag holding the matched text. When the match sits inside a <script> tag, the
        data assigned on the matching script line is returned as a DataFrame instead.
        None when no unique element could be determined (an [Error] message is printed).
        
    """
        
    attempt_count = 0
    matched_elements = []
    
    if target_phrase != '':
        target_phrase = target_phrase.lower()
        # Escape the phrase so that characters such as "(", "?" or "+" are matched literally.
        matched_elements = soup.find_all(text = re.compile(re.escape(target_phrase), re.IGNORECASE))
        attempt_count += 1
    
    while len(matched_elements)!=1:
    
        if attempt_count>0:
            print('\nThere are '+str(len(matched_elements))+' matched elements given your last input. They are:\n'+'\t\n'.join([str(matched_element)[:100] for matched_element in matched_elements[:10]])+'\n\n')
        
        if len(set([str(matched_element) for matched_element in matched_elements]))==1:
            # All matches have identical text, so typing another phrase cannot tell them
            # apart: let the user choose by the text surrounding each occurrence.
            phrases_in_context = []
            whole_page_text = re.sub(r'\s+',' ',soup.text).lower()
            occurrence_count = whole_page_text.count(target_phrase)
            
            if occurrence_count == len(matched_elements):
            
                last_index = -1
                for _ in range(occurrence_count):
                    current_index = whole_page_text.index(target_phrase,last_index+1)
                    phrase_end = current_index+len(target_phrase)
                    # Clamp at 0: a negative start would make the slice wrap around
                    # and show the wrong (usually empty) leading context.
                    context_start = max(0, current_index-context_radius)
                    phrases_in_context.append(whole_page_text[context_start:current_index]+'\\\\ '+whole_page_text[current_index:phrase_end]+' //'+whole_page_text[phrase_end:phrase_end+context_radius])
                    last_index = current_index
                if len(set(phrases_in_context))==1:
                    print('[Error] There are '+str(len(phrases_in_context))+' occurences of the same target phrase on the page that have very similar contexts.\nPlease use the browser inspector tool to copy the "selector" or "Selector Path".\n')
                    return None
                else:
                    numbered_contexts = ''
                    for choice_number, phrase_in_context in enumerate(phrases_in_context, start = 1):
                        numbered_contexts += 'Choice '+str(choice_number)+':  '+phrase_in_context + '\n'
                    print('There are '+str(len(phrases_in_context))+' occurences of the same target phrase on the page,\nplease choose one based on their contexts:\n\n' + numbered_contexts + '\n')

                # Keep asking until the answer is a valid choice number.
                which_one = 0
                while which_one-1 not in range(len(phrases_in_context)):
                    which_one = input('Which choice is the element you that want to scrape: [1, 2, 3, ...]\n')
                    try:
                        which_one = int(which_one)
                    except ValueError:
                        which_one = 0
                matched_elements = [matched_elements[which_one-1]]
                
            else:
                print('[Error] The number of matched elements and the number of target phrase occurences are not the same.\nPlease use the browser inspector tool to copy the "selector" or "Selector Path".\n')
                return None
            
        else:
            target_phrase = input('What is the displayed text for one of the elements you want to scrape: '+('(Type "QUIT" to stop)' if attempt_count>3 else '')+'\n')
            if target_phrase == 'QUIT':
                print('\n[Error] It is very likely that the website is not fully collected.\n        Please try this command: scrape_where_for_what(YOUR_URL, YOUR_TARGET_PHRASE, save_as_html = True)\n        A HTML file will be created in your local folder, open it with a browser.\n        If you cannot see what you want to find on the page, please switch to dynamic scraping method.\n')
                return None
            # The phrase is compared with lower-cased page text further on, so keep it lower-cased too.
            target_phrase = target_phrase.lower()
            matched_elements = soup.find_all(text = re.compile(re.escape(target_phrase), re.IGNORECASE))
            attempt_count += 1
            
    sample_element = elevate_till_is_tag(matched_elements[0])
    if sample_element is None:
        return None
    print('\nUnique match is found:\n'+str(sample_element)[:100]+ (' ......' if len(str(sample_element))>100 else '') +'\n\n')
    
    if sample_element.name == 'script':
        # The data lives in a JS script: look for the single line "<something> = <JSON>;"
        # that contains the phrase and load the JSON part into a DataFrame.
        matched_lines = [line for line in sample_element.prettify().split('\n') if target_phrase in line.lower()]
        if len(matched_lines) != 1:
            print('[Error] There are multiple occurences of the target phrase in the JS script.\nPlease use another more unique target phrase or inspect the page source for the data in JS script.\n')
            return None
        try:
            matched_line = matched_lines[0].strip().strip(';')
            matched_data = matched_line.split('=',maxsplit=1)[1].strip()
            return pd.DataFrame(json.loads(matched_data))
        except Exception as err:
            print('[Error] The line of the JS script that contains the target phrase cannot be read as data: '+str(err)+'\nPlease use another more unique target phrase or inspect the page source for the data in JS script.\n')
            return None
    
    return sample_element

def extract_contents(soup, path):
    """Collect the texts and links of all elements that match the target path in the parsed tree.
    
    Parameters
    ----------
    soup: bs4.soup
        The parsed tree of the response. A DataFrame passed here is returned as it is.
    path: string
        The css selector path to the target elements.
    
    Returns
    ----------
    pd.DataFrame
        One row per matched element with the columns "text" and "url".
        
    """
    
    # Data that was already extracted into a DataFrame needs no further processing.
    if isinstance(soup, pd.DataFrame):
        return soup
    
    rows = []
    for target_element in soup.select(path):
        split_text = recursive_split(extract_text(target_element))
        related_link = get_related_link(target_element)
        rows.append((split_text, related_link))

    return pd.DataFrame(rows, columns = ['text','url'])

def scrape_where_for_what(url, target_phrase, go_up = 0, save_as_html = False):
    """Scrape the website for the elements that are similar to the one displaying the target phrase.
    
    The page is requested and parsed, the element showing the target phrase is located, its
    css selector path is built, and the texts and links of all elements matching that path
    are extracted.
    
    Parameters
    ----------
    url: string
        The url of the website you want to scrape.
    target_phrase: string
        The displayed text of one of the elements you want to scrape.
    go_up: int (optional, default = 0)
        How many levels to go up in order to get the amount of contents you want.
    save_as_html: boolean (optional, default = False)
        Whether or not to save the response from the website as a HTML file.
    
    Returns
    ----------
    pd.DataFrame
        The extracted contents (columns "text" and "url"), or the data found in a JS script.
        The tuple (None, None) is returned when the request failed or no unique sample
        element could be determined.
        
    """
    
    response = get_response(url)
    
    # get_response returns an empty response object (status_code is None) when the request
    # failed and has already printed the reason; there is nothing to save or parse then.
    if response.status_code is None:
        return None, None
    
    if save_as_html:
        save_html(response.text, url)
    
    soup = get_soup(response)

    sample_element = get_unique_sample_element(soup, target_phrase)
    
    if sample_element is None:
        return None, None
    
    if isinstance(sample_element, pd.DataFrame):
        print('[Success] Data is in the JS script and now extracted as a DataFrame into the variable "soup".\n')
        return sample_element
    
    # Start from the path of the sample element and widen it level by level.
    path = get_css_selector_path(sample_element)
    for _ in range(go_up):
        path = elevate_css_selector_path(path)
    
    return extract_contents(soup, path)


### Examples

# 1

In [4]:
# Example 1: the phrase is found in a regular HTML element, so the texts and links of the similar elements are returned.
scrape_where_for_what('https://digitalcollections.nypl.org', "Black Experience in Children's Book")

[Success] The website at "https://digitalcollections.nypl.org" is collected succesfully.

Unique match is found:
<h5>The Black Experience in Children's Books: Selections from Augusta Baker's Bibliographies</h5>




Unnamed: 0,text,url
0,The Black Experience in Children's Books: Selections from Augusta Baker's Bibliographies,/collections/the-black-experience-in-childrens-books-selections-from-augusta-bakers
1,Scrapbooks of New York City views,/collections/scrapbooks-of-new-york-city-views
2,Li ji ji shi: er shi wu juan,/collections/li-ji-ji-shi-er-shi-wu-juan
3,Women of distinction: remarkable in works and invincible in character,/collections/women-of-distinction-remarkable-in-works-and-invincible-in-character
4,Collection of ledgers and cash books covering the period 1891-1925,/collections/collection-of-ledgers-and-cash-books-covering-the-period-1891-1925
...,...,...
225,William Blake: Illuminated Books,/collections/william-blake-illuminated-books
226,Ise Monogatari Emaki,/collections/ise-monogatari-emaki
227,Des cleres et nobles femmes,/collections/des-cleres-et-nobles-femmes
228,Minchô shiken (The Colored Inkstone of the Ming Period),/collections/minch-shiken-the-colored-inkstone-of-the-ming-period


# 2

In [5]:
# Example 2: the phrase is found inside a <script> tag, so the data assigned in the JS script is returned as a DataFrame.
scrape_where_for_what('https://digitalcollections.nypl.org/collections/changing-new-york', "salmagundi")

[Success] The website at "https://digitalcollections.nypl.org/collections/changing-new-york" is collected succesfully.

Unique match is found:
<script type="text/javascript">

  var search_results = [{"restricted":false,"item":{"id":"510d47d9- ......


[Success] Data is in the JS script and now extracted as a DataFrame into the variable "soup".



Unnamed: 0,restricted,item
0,False,"{'id': '510d47d9-4f9d-a3d9-e040-e00a18064a99', 'title': 'Rope store, South Street and James Slip, Manhattan.', 'image_id': '482824', 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org/T2..."
1,False,"{'id': '510d47d9-4f4a-a3d9-e040-e00a18064a99', 'title': 'Automat, 977 Eighth Avenue, Manhattan.', 'image_id': '482752', 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org/sAeaJhzFT5-wk6k..."
2,False,"{'id': '510d47d9-4f85-a3d9-e040-e00a18064a99', 'title': 'Columbus Circle, Manhattan.', 'image_id': '482580', 'multi': True, 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org/6b_qtKEqTYC..."
3,False,"{'id': '510d47d9-4f00-a3d9-e040-e00a18064a99', 'title': 'Broadway and Thomas Street, Manhattan.', 'image_id': '482689', 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org/BBYhX4MbQXSp1S9..."
4,False,"{'id': '510d47d9-4f13-a3d9-e040-e00a18064a99', 'title': 'Broadway and Thomas Street, Manhattan.', 'image_id': '482706', 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org/fgkkXRSHSruvd-Y..."
...,...,...
195,False,"{'id': '510d47d9-4eb0-a3d9-e040-e00a18064a99', 'title': 'Columbia Presbyterian Medical Center, 168th Street and Broadway, from 165th Street and Riverside Drive, Manhattan.', 'image_id': '482622', ..."
196,False,"{'id': '510d47d9-4f02-a3d9-e040-e00a18064a99', 'title': 'Gay Street no. 14-16, Manhattan.', 'image_id': '482690', 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org/0-YF-9etQLOSL4VBoVjFJ..."
197,False,"{'id': '510d47df-335a-a3d9-e040-e00a18064a99', 'title': 'Riverside Drive, no. 857, at 159th Street, Manhattan.', 'image_id': '1219146', 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org..."
198,False,"{'id': '510d47d9-4f6f-a3d9-e040-e00a18064a99', 'title': 'George Washington Bridge, Riverside Drive and 179th Street, Manhattan.', 'image_id': '482785', 'sequence_number': 1, 'high_res_link': 'http..."


# 3

In [6]:
# Example 3: a page with Chinese text; only the beginning of the displayed text is given as the target phrase.
scrape_where_for_what('http://www.shh-shrhmuseum.org.cn/historymuseum/historymuseum/dc/myyp/index.html?tm=1553505375349', "清上海书业商")

[Success] The website at "http://www.shh-shrhmuseum.org.cn/historymuseum/historymuseum/dc/myyp/index.html?tm=1553505375349" is collected succesfully.

Unique match is found:
<h1>清上海书业商团旗帜</h1>




Unnamed: 0,text,url
0,清上海书业商团旗帜,/historymuseum/historymuseum/dc/myyp/2020/01/23/3419955b6e5e35b7016fd28a7f890c42.html?tm=1579785391841
1,民国徐汇公学教具--徐汇中学捐赠,/historymuseum/historymuseum/dc/myyp/2019/12/18/3419955b6e5e35b7016f17d0fb000572.html?tm=1579785391841
2,1934年上海徐家汇土山湾铸铜钟,/historymuseum/historymuseum/dc/myyp/2019/11/19/3419955b6e5e35b7016e8165260d01b8.html?tm=1579785391841
3,1907年外白渡桥落成铭牌,/historymuseum/historymuseum/dc/myyp/2019/10/16/3419955b6dc7fb5a016dd2e36ae000fd.html?tm=1579785391841


# 4 

In [7]:
# Example 4: the phrase cannot be found in the collected page, so the function keeps asking for another phrase until "QUIT" is typed.
scrape_where_for_what('https://www.aqistudy.cn/historydata/monthdata.php?city=%E4%B8%8A%E6%B5%B7', "2013-12")

[Success] The website at "https://www.aqistudy.cn/historydata/monthdata.php?city=%E4%B8%8A%E6%B5%B7" is collected succesfully.

There are 0 matched elements given your last input. They are:



What is the displayed text for one of the elements you want to scrape: 
126

There are 0 matched elements given your last input. They are:



What is the displayed text for one of the elements you want to scrape: 
136

There are 0 matched elements given your last input. They are:



What is the displayed text for one of the elements you want to scrape: 
158

There are 0 matched elements given your last input. They are:



What is the displayed text for one of the elements you want to scrape: (Type "QUIT" to stop)
QUIT

[Error] It is very likely that the website is not fully collected.
        Please try this command: scrape_where_for_what(YOUR_URL, YOUR_TARGET_PHRASE, save_as_html = True)
        A HTML file will be created in your local folder, open it with a browser.
        If you cannot see w

(None, None)

In [8]:
# Example 4 (continued): same request with save_as_html=True, so the collected page is saved as a HTML file that can be opened in a browser.
scrape_where_for_what('https://www.aqistudy.cn/historydata/monthdata.php?city=%E4%B8%8A%E6%B5%B7', "2013-12", save_as_html=True)

[Success] The website at "https://www.aqistudy.cn/historydata/monthdata.php?city=%E4%B8%8A%E6%B5%B7" is collected succesfully.
[Success] The HTML file is saved succesfully.

There are 0 matched elements given your last input. They are:



What is the displayed text for one of the elements you want to scrape: 
QUIT

[Error] It is very likely that the website is not fully collected.
        Please try this command: scrape_where_for_what(YOUR_URL, YOUR_TARGET_PHRASE, save_as_html = True)
        A HTML file will be created in your local folder, open it with a browser.
        If you cannot see what you want to find on the page, please switch to dynamic scraping method.



(None, None)

<br><br><br><br><br><br><br><br>










In [None]:
# Once the content of the website is collected, the next step is to parse the page. 
# After parsing, we can find the elements we want, then extract and clean their values. 
# For this step, there are many choices of libraries, some examples are: 

#  - BeautifulSoup
#  - Scrapy
#  - Lxml
#  - AdvancedHTMLParser
 
# In this pipeline, we will explore `BeautifulSoup`. `BeautifulSoup` and `Scrapy` are two popular scraping tools. Between these two, BeautifulSoup is more user-friendly, while Scrapy is more efficient and scalable. 
# As this pipeline is targeted at people with less technical backgrounds, we will sacrifice some efficiency for a more intuitive experience.
# If you are working on a large-scale or high-velocity scraping project, please consider Scrapy or another tool.
# The other two libraries listed above are good choices in their specific areas, so keep them in view:
#  - Lxml has rich features for processing XML and HTML and is quite efficient (BeautifulSoup actually supports using Lxml parser among other parsers).
#  - AdvancedHTMLParser offers functions similar to those in native JavaScript and supports complex operations on HTML.
  
### Choices of Parsers
# References:
#  - https://smartproxy.com/blog/scrapy-vs-beautifulsoup (Use cases comparison and Pros&Cons)
#  - https://tomassetti.me/parsing-html (Common libraries in different programming languages)
#  - https://medium.com/analytics-vidhya/scrapy-vs-selenium-vs-beautiful-soup-for-web-scraping-24008b6c87b8 (Great comparison article that includes Selenium, which is the popular choice for dynamic website scraping)

### Beautiful Soup
# References:   
#  - https://www.datacamp.com/community/tutorials/amazon-web-scraping-using-beautifulsoup (Showed how to write element finding logic in hierarchy)
#  - https://stackabuse.com/guide-to-parsing-html-with-beautifulsoup-in-python (Nice illustrations, browse_and_scrape combines pagination with parsing) 
#  - https://www.crummy.com/software/BeautifulSoup/bs4/doc (Long but detailed description of BS4 usage)
#  - https://www.crummy.com/software/BeautifulSoup (The "Hall of Fame" section has some high-profile projects, worth having a look)

In [None]:
# # $$$
# # To be implemented:

# An interesting side note, here's one quote from the BS project page:
# > You can tell it "Find all the links", or "Find all the links of class externalLink", or "Find all the links whose urls match "foo.com", or "Find the table heading that's got bold text, then give me that text."
# But actually, you CANNOT directly ask BS these natural language questions. You need to write code that follows the syntax of the BS4 library, which is similar to, but still not quite, natural language.
# **Programming with natural language** is one of the directions worth pursuing in the future, as it further lowers the bar for utilizing web scraping and related technologies.

In [None]:
# # $$$
# # To be implemented:

# Note that the requests made by the `get_response` function might be recognized as robotic access by some websites.
# To bypass screening by those websites, additional specifications on headers and proxies are required.
# This additional setup will be implemented in future versions.

# As a reference, the `get` function from the Python library requests takes the following parameters:
# - url – URL for the new  Request object.
# - params – (optional) Dictionary of GET Parameters to send with the Request.
# - headers – (optional) Dictionary of HTTP Headers to send with the Request.
# - cookies – (optional) CookieJar object to send with the  Request.
# - auth – (optional) AuthObject to enable Basic HTTP Auth.
# - timeout – (optional) Float describing the timeout of the request.

In [None]:
# # $$$
# # To be integrated:
# # When getting the response, use unicode-dammit to detect encodings in smart ways (https://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit)

In [None]:
# # $$$
# # To be implemented formally: Get next/prev sibling
# for child in get_contents(sample_element.parent): # [::-1]
#     print(child == sample_element) # if true then next is next sib or prev sib depending on how contents list is ordered

In [None]:
# # $$$
# # Open-source implementations already exist, but:
# # css_selector_to_xpath is getting a result that is too complicated
# # xpath_to_css_selector cannot handle slightly more complex xpath
# # For simple xpath-css conversion, own implementation might be more transparent and reliable

# # !pip3 install cssify
# # !pip3 install cssselect

# # Reference: https://github.com/santiycr/cssify
# from cssify import cssify
# def xpath_to_css_selector(xpath_string):
#     return cssify(xpath_string)

# # Reference: https://lxml.de/cssselect.html
# from cssselect import GenericTranslator
# def css_selector_to_xpath(css_selector_string):
#     return GenericTranslator().css_to_xpath(css_selector_string)

In [None]:
# # $$$
# # Implemented as a solution to indicate to the user which parts of the websites will be scraped
# # However, this involves opening up the newly created html file in a browser and searching for changes
# # The highlighting may not be obvious and the process is almost as complex as using the browser inspector
# # Thus this functionality is dropped and should be kept in view (KIV)

# def highlight_element(element, highlight_style = "background-color: rgba(255,0,0,0.5); border: 3px dotted yellow"):
#     element['style'] = highlight_style

# def highlight_elements(elements, highlight_style = "background-color: rgba(255,0,0,0.5); border: 3px dotted yellow; "):
#     for element in elements:
#         element['style'] = highlight_style

# # highlighted_soup = highlight_element(soup.select(path))

In [None]:
# ################################################################################
# # Reference: How to fake and rotate User Agents using Python 3
# # https://www.scrapehero.com/how-to-fake-and-rotate-user-agents-using-python-3/

# import requests
# import random 
# from collections import OrderedDict

# # This data was created by using the curl method explained above
# headers_list = [
#     # Firefox 77 Mac
#      {
#         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
#         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
#         "Accept-Language": "en-US,en;q=0.5",
#         "Referer": "https://www.google.com/",
#         "DNT": "1",
#         "Connection": "keep-alive",
#         "Upgrade-Insecure-Requests": "1"
#     },
#     # Firefox 77 Windows
#     {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
#         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
#         "Accept-Language": "en-US,en;q=0.5",
#         "Accept-Encoding": "gzip, deflate, br",
#         "Referer": "https://www.google.com/",
#         "DNT": "1",
#         "Connection": "keep-alive",
#         "Upgrade-Insecure-Requests": "1"
#     },
#     # Chrome 83 Mac
#     {
#         "Connection": "keep-alive",
#         "DNT": "1",
#         "Upgrade-Insecure-Requests": "1",
#         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
#         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
#         "Sec-Fetch-Site": "none",
#         "Sec-Fetch-Mode": "navigate",
#         "Sec-Fetch-Dest": "document",
#         "Referer": "https://www.google.com/",
#         "Accept-Encoding": "gzip, deflate, br",
#         "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8"
#     },
#     # Chrome 83 Windows 
#     {
#         "Connection": "keep-alive",
#         "Upgrade-Insecure-Requests": "1",
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
#         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
#         "Sec-Fetch-Site": "same-origin",
#         "Sec-Fetch-Mode": "navigate",
#         "Sec-Fetch-User": "?1",
#         "Sec-Fetch-Dest": "document",
#         "Referer": "https://www.google.com/",
#         "Accept-Encoding": "gzip, deflate, br",
#         "Accept-Language": "en-US,en;q=0.9"
#     }
# ]
# # Create ordered dict from Headers above
# ordered_headers_list = []
# for headers in headers_list:
#     h = OrderedDict()
#     for header,value in headers.items():
#         h[header]=value
#     ordered_headers_list.append(h)
    
    
# url = 'https://httpbin.org/headers'

# for i in range(1,4):
#     #Pick a random browser headers
#     headers = random.choice(headers_list)
#     #Create a request session
#     r = requests.Session()
#     r.headers = headers
    
#     response = r.get(url)
#     print("Request #%d\nUser-Agent Sent:%s\n\nHeaders Recevied by HTTPBin:"%(i,headers['User-Agent']))
#     print(response.json())
#     print("-------------------")

# ################################################################################
# # Reference: How To Rotate Proxies and change IP Addresses using Python 3
# # https://www.scrapehero.com/how-to-rotate-proxies-and-ip-addresses-using-python-3/    
    
# from lxml.html import fromstring
# import requests
# from itertools import cycle
# import traceback

# def get_proxies():
#     url = 'https://free-proxy-list.net/'
#     response = requests.get(url)
#     parser = fromstring(response.text)
#     proxies = set()
#     for i in parser.xpath('//tbody/tr')[:10]:
#         if i.xpath('.//td[7][contains(text(),"yes")]'):
#             proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
#             proxies.add(proxy)
#     return proxies


# #If you are copy pasting proxy ips, put in the list below
# #proxies = ['121.129.127.209:80', '124.41.215.238:45169', '185.93.3.123:8080', '194.182.64.67:3128', '106.0.38.174:8080', '163.172.175.210:3128', '13.92.196.150:8080']
# proxies = get_proxies()
# proxy_pool = cycle(proxies)

# url = 'https://httpbin.org/ip'
# for i in range(1,11):
#     #Get a proxy from the pool
#     proxy = next(proxy_pool)
#     print("Request #%d"%i)
#     try:
#         response = requests.get(url,proxies={"http": proxy, "https": proxy})
#         print(response.json())
#     except:
#         #Most free proxies will often get connection errors. You will have to retry the entire request using another proxy for it to work.
#         #We will just skip retries as it's beyond the scope of this tutorial and we are only downloading a single url
#         print("Skipping. Connnection error")