**Introduction**

This is a develop environment for the static website scraper. Functions developed here will be eventually packaged into .py files and called from other notebook or python scripts.


**Progress**

Done:
    1. get response
    2. find and extract content
    3. save file
    4. pagination and batch
    5. special encoding
    6. rotate agent
    
Long-term:

    - rotate IP
    - support authentication 
    - support cookie


In [1]:
# # Make sure you install the required libraries

# !pip3 install --upgrade requests # library for making request for the static websites
# !pip3 install --upgrade soupsieve  # library to support css selector in beautifulsoup
# !pip3 install --upgrade beautifulsoup4 # a parser that balances between efficiency and leniency
# !pip3 install --upgrade --user lxml # a more efficient parser
# !pip3 install --upgrade html5lib # a parser that acts like a browser, most lenient


# Key libraries
import re
import os
import json
import random
import inspect
import pandas as pd
import numpy as np
pd.options.display.max_colwidth = 200

import requests
import bs4

# These functions help us understand the variables that exist in the environment
# which is useful for creating natural language interface for data analysis

def get_local_variables(ignore_underscore = True):
    """Get the name and definition of the local variables.
    
    Parameters
    ----------
    ignore_underscore : boolean (optional, default = True)
        Whether or not the variables starting with "_" need to be filtered out.
    
    Returns
    ----------
    dictionary
        A mapping between name and definition of the local variables.
                
    """
    callers_local_vars = dict(inspect.currentframe().f_back.f_locals.items())
    if filter_:
        var_keys = list(callers_local_vars.keys())
        for key in var_keys:
            if key.startswith('_'):
                del callers_local_vars[key]
    return callers_local_vars
def retrieve_name(var):
    """Retrieve the name of the variable. # Reference https://stackoverflow.com/a/40536047.
    
    Parameters
    ----------
    var: object 
        Variable to get the name of.
        
    Returns
    ----------
    string
        Name of the variable passed.
        
    """
    for fi in reversed(inspect.stack()):
        names = [var_name for var_name, var_val in fi.frame.f_locals.items() if var_val is var]
        if len(names) > 0:
            return names[0]
        
def get_attributes(obj, ignore_underscore = True):
    """Get a list of valid attributes of the object.
    
    Parameters
    ----------
    ignore_underscore : boolean (optional, default = True)
        Whether or not the variables starting with "_" need to be filtered out.
    
    Returns
    ----------
    list
        A list of valid attributes of the object.
                
    """
    return [x for x in dir(obj) if not x.startswith('_')]

def print_attributes_and_values(obj, ignore_underscore = True):
    """Print the valid attributes of the object and their corresponding values.
    
    Parameters
    ----------
    ignore_underscore : boolean (optional, default = True)
        Whether or not the variables starting with "_" need to be filtered out.
    
    Returns
    ----------
    None
                
    """
    obj_name = retrieve_name(obj)
    attributes = get_attributes(obj, ignore_underscore = ignore_underscore)
    for attr in attributes:
        obj_attr_string = obj_name+'.'+attr
        print(obj_attr_string)
        print(' '*4 + str(eval(obj_attr_string))[:60])
        print('-'*70)


def get_response(url, verbose = True):
    """Get the response of the HTTP GET request for the target url.
    
    Parameters
    ----------
    url: string
        The url to the website that needs to be scraped. 
    verbose: boolean (optional, default = True)
        Whether or not [Success] message should be printed.
    
    Returns
    ----------
    response object
        
    """

    # Reference: https://www.scrapehero.com/how-to-fake-and-rotate-user-agents-using-python-3/    
    headers_list = [{'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Referer': 'https://www.google.com/', 'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1'}, {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Referer': 'https://www.google.com/', 'DNT': '1', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1'}, {'Connection': 'keep-alive', 'DNT': '1', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Sec-Fetch-Site': 'none', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Dest': 'document', 'Referer': 'https://www.google.com/'}, {'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Sec-Fetch-Site': 'same-origin', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-User': '?1', 'Sec-Fetch-Dest': 'document', 'Referer': 'https://www.google.com/'}]
    
    try:
        headers = random.choice(headers_list)
        response = requests.get(url, headers = headers)
        response.raise_for_status() # Raise Exception when response was not successful
    except requests.exceptions.HTTPError as http_err:
        print('[Error] HTTP error occurred: '+str(http_err))
        return requests.models.Response() # Return empty response
    except Exception as err:
        print('[Error] Other error occurred: '+str(err))
        return requests.models.Response() # Return empty response
    else:
        if verbose:
            print('[Success] The website at "'+url+'" is collected successfully.')
        return response

def get_responses(urls, verbose = True):
    """Get the responses of the HTTP GET requests for the target urls. 
    
    Parameters
    ----------
    urls: list of string
        The urls to the websites that need to be scraped. 
    verbose: boolean (optional, default = True)
        Whether or not [Success] message should be printed.
    
    Returns
    ----------
    list of response object
        
    """
    return [get_response(url) for url in urls]



def get_soup(response, default_parser = 'lxml'):
    """Get the beautiful soup object of the response object or filepath or html string.
    
    Parameters
    ----------
    response: requests.models.Response, string
        The response object or filepath or html string. 
    default_parser: string (optional, default = lxml)
        Which parser to use when parsing the response.
    
    Returns
    ----------
    list of response object
        
    """
    if isinstance(response, requests.models.Response):
        soup = bs4.BeautifulSoup(response.content, default_parser)
    elif isinstance(response, str) and os.path.exists(response):
        with open(response) as file_handler:
            soup = bs4.BeautifulSoup(file_handler, default_parser)
    else:
        try:
            soup = bs4.BeautifulSoup(response, default_parser)
        except Exception as err:
            print('[Error] The response object you provided cannot be turned into beautiful soup object: '+str(err))
    return soup

def save_html(html_object, url , path = ''):
    """Save the response or soup object as a HTML file at the path provided.
    
    Parameters
    ----------
    html_object: requests.models.Response, bs4.BeautifulSoup
        The response or soup object. 
    path: string (optional, default = ./TEMP.html)
        The path at which the HTML file will be saved.
    
    Returns
    ----------
    None
        
    """
    if path == '':
        path = './'+re.sub('^https?://','',url).replace('/','_').replace('.','-')+'.html'
    if isinstance(html_object, requests.models.Response):
        html_text = html_object.text
    elif isinstance(html_object, (bs4.BeautifulSoup,bs4.element.Tag)):
        html_text = str(html_object.prettify())
    else:
        html_text = str(html_object)
    try:
        with open(path,'w') as f:
            f.write(html_text)
            print('[Success] The HTML file is saved succesfully.')
    except Exception as err:
        print('[Error] The response object you provided cannot be turned into beautiful soup object: '+str(err))

def is_readable_content(content):
    """Return whether the content passed is a readable content like Tag or NavigableString; not CData, Comment, Declaration, Doctype, ProcessingInstruction, ResultSet, Script, Stylesheet, XMLFormatter.
    
    Parameters
    ----------
    content: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    boolean
        
    """
    # Types that are instances of NavigableString:  CData, Comment, Declaration, Doctype, PreformattedString, ProcessingInstruction, ResultSet, Script, Stylesheet, TemplateString, XMLFormatter
    # Types in the group above that are not String:  CData, Comment, Declaration, Doctype, ProcessingInstruction, ResultSet, Script, Stylesheet, XMLFormatter
    return isinstance(content, (bs4.element.Tag, bs4.element.NavigableString)) and not isinstance(content, (bs4.element.CData, bs4.element.Comment, bs4.element.Declaration, bs4.element.Doctype, bs4.element.ProcessingInstruction, bs4.element.ResultSet, bs4.element.Script, bs4.element.Stylesheet, bs4.element.XMLFormatter))

def get_contents(element):
    
    """Return a list of non-empty and readable contents/children of the element.
    
    Parameters
    ----------
    content: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    list of bs4.element
        
    """
    return [content for content in element.contents if str(content).strip()!='' and is_readable_content(content)]

def get_contents_names(element):
    """Return the list of names of the non-empty and readable contents/children of the element.
    
    Parameters
    ----------
    content: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    list of string
        
    """
    return [content.name for content in get_contents(element)]

def elevate_till_is_tag(element):
    """Return the nearest Tag element, if not itself, return its parent if it is a Tag element.
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    bs4.element.Tag
        
    """
    if isinstance(element, bs4.element.NavigableString):
        return element.parent
    if isinstance(element, bs4.element.Tag):
        return element
    else:
        print('[Error] Element is still not Tag after getting the parent.')
        return None


def get_self_index(element):
    """Return the index of the element among its siblings.
    
    Parameters
    ----------
    element: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    int
        
    """
    self_type = element.name
    previous_siblings_of_all_types = list(element.previous_siblings)
    previous_siblings_of_same_type = [element for element in previous_siblings_of_all_types if element.name == self_type]
    return len(previous_siblings_of_same_type) + 1 # css selector starts indexing with 1 instead of 0


# Reference: https://stackoverflow.com/a/32263260 (basic structure inspiration)
# Reference: https://csswizardry.com/2012/05/keep-your-css-selectors-short (tips to improve efficiency)

def describe_part_of_css_selector(node):
    """Construct part of the css selector path.
    
    Parameters
    ----------
    node: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    string
        
    """
    
    enough_to_be_unique = False
    
    node_type = node.name
    
    node_attrs = node.attrs
    node_attrs_string = ''
    for k,v in node_attrs.items():
        if k == 'id':
            node_attrs_string += '#' + node_attrs[k]
            enough_to_be_unique = True
            break
        elif k == 'class':
            node_attrs_string += '.'+'.'.join(node_attrs[k])

    element_part = node_type + node_attrs_string
            
    if not enough_to_be_unique:
        length = get_self_index(node)
        if (length) > 1:
            element_part = '%s:nth-of-type(%s)' % (element_part, length)
        
    return element_part

def get_css_selector_path(node):
    """Construct the whole css selector path to a certain element.
    
    Parameters
    ----------
    node: bs4.element
        An BS4 element from the parsed tree.
    
    Returns
    ----------
    string
        
    """
    
    path = [describe_part_of_css_selector(node)]
    for parent in node.parents:
        if parent.name == 'body' : # or '#' in path[0], comment out this to get more complete path to facilitate go_up
            break
        path.insert(0, describe_part_of_css_selector(parent))
    return ' > '.join(path)

def elevate_css_selector_path(path):
    """Get the css selector path to the element that is one level above the current element.
    
    Parameters
    ----------
    path: string
        The css selector path to an BS4 element from the parsed tree.
    
    Returns
    ----------
    string
        
    """
    
    return '>'.join(path.split('>')[:-1]).strip() if '>' in path else path


from collections.abc import Iterable
def is_iterable(obj):
    """Check if the passed object is iterable.
    
    Parameters
    ----------
    obj: object
    
    Returns
    ----------
    boolean
        
    """
    
    return isinstance(obj, Iterable)


def flatten_list(l):
    """Flatten a list of lists to a one-layer list (elements are in original order). Note this is NOT recursive, meaning multi-layered list of lists cannot be converted into a single-layered list in one transformation.
    
    Parameters
    ----------
    l: list
    
    Returns
    ----------
    list
        
    """
    
    return [item for sublist in l for item in sublist]

def extract_text(element):
    """Extract the textual content of an element.
    
    Parameters
    ----------
    element: bs4.element
    
    Returns
    ----------
    string
        
    """
    
    return element.getText(separator=u'\n').strip()


def get_directly_related_link(element):
    """Extract the link directly related to the element.
    
    Parameters
    ----------
    element: bs4.element
    
    Returns
    ----------
    string
        
    """
    
    count = 0
    while element.name != 'a' and count < 5:
        element = element.parent
        count += 1
    if element.name != 'a':
        return ''
    else:
        return element.get('href',default='')


def get_indirectly_related_links(element):
    """Extract the links indirectly related to the element (i.e. belonging to the sibling elements).
    
    Parameters
    ----------
    element: bs4.element
    
    Returns
    ----------
    list of string
        
    """
    
    return remove_blank_element_in_list([link.get('href',default='') for link in element.parent.find_all('a')])


def get_related_link(element):
    """Extract the link directly related to the element, if none is found, get indirectly related links.
    
    Parameters
    ----------
    element: bs4.element
    
    Returns
    ----------
    string or list of string
        
    """
    
    link = get_directly_related_link(element)
    
    if link != '':
        return link
    else:
        links = get_indirectly_related_links(element)
        if len(links) == 1 and links[0].strip() != '':
            return links[0]
        else:
            return links

def get_longest_separator(text):
    """Return the longest separator (formed by multiple newline) in the text.
    
    Parameters
    ----------
    text: string
    
    Returns
    ----------
    string
        
    """
    if isinstance(text, str) and '\n' in text:
        return max(re.findall(r'\n+', text, re.DOTALL), key=lambda x: len(x))
    else:
        return ''

def get_longest_separator_in_list(texts):
    """Return the longest separator (formed by multiple newline) in the texts contained in the list.
    
    Parameters
    ----------
    texts: list of string
    
    Returns
    ----------
    string
        
    """
    return max([get_longest_separator(text) for text in texts], key=len)


def remove_blank_element_in_list(li):
    """Return a cleaned version of the list with all blank elements removed.
    
    Parameters
    ----------
    li: list
    
    Returns
    ----------
    list
        
    """
    return [element for element in li if element.strip()!='']

def recursive_split(text):
    """Return a multi-layer list of lists resulting from a recursive split of the text (split by longer separator first).
    
    Parameters
    ----------
    text: String
        A piece of text that contains separators of different lengths.
    
    Returns
    ----------
    list (of lists)
        
    """
    longest_separator = get_longest_separator(text)
    if longest_separator == '':
        return text
    else:
        return [recursive_split(part) for part in remove_blank_element_in_list(text.split(longest_separator))]
    
def get_unique_sample_element(soup, target_phrase = '', context_radius = 40):
    """Find and return an element based on the html structure and a target phrase, solicit additional information from user through input questions if needed.
    
    Parameters
    ----------
    soup: bs4.soup
        The parsed tree of the response.
    target_phrase: string (optional, if not provided, the function will ask user to input)
        The phrase used to find the sample element.
    context_radius: int (optional, default = 40)
        How many characters to display to help user choose recurring phrases based on their contexts.
    
    Returns
    ----------
    bs4.element.Tag
        
    """
        
    attempt_count = 0
    matched_elements = []
    
    if target_phrase != '':
        target_phrase = target_phrase.lower()
        matched_elements = soup.find_all(text = re.compile(target_phrase,re.IGNORECASE))
        attempt_count += 1
    
    while len(matched_elements)!=1:
        
        #######################################################################################################
        # Situation where matched elements have the same textual content
        
        if len(set([str(matched_element) for matched_element in matched_elements]))==1:
            last_index = -1
            phrases_in_context = []
            whole_page_text = re.sub('\s+',' ',soup.text).lower()
            
            if whole_page_text.count(target_phrase) == len(matched_elements):
            
                for i in range(whole_page_text.count(target_phrase)):
                    current_index = whole_page_text.index(target_phrase,last_index+1)
                    phrases_in_context.append(whole_page_text[current_index-context_radius:current_index]+'\\\\ '+whole_page_text[current_index:current_index+len(target_phrase)]+' //'+whole_page_text[current_index+len(target_phrase):current_index+len(target_phrase)+context_radius])
                    last_index = current_index
                
                if len(set(phrases_in_context))==1:
                    print('[Error] There are '+str(len(phrases_in_context))+' occurences of the same target phrase on the page that have very similar contexts.\nPlease use the browser inspector tool to copy the "selector" or "Selector Path".\n')
                    return None
                else:
                    numbered_contexts = ''
                    for i in range(len(phrases_in_context)):
                        numbered_contexts += 'Choice '+str(i+1)+':  '+phrases_in_context[i] + '\n'
                    print('There are '+str(len(phrases_in_context))+' occurences of the same target phrase on the page,\nplease choose one based on their contexts:\n\n' + numbered_contexts + '\n')

                which_one = 0
                while which_one-1 not in range(len(phrases_in_context)):
                    which_one = input('Which choice is the element you that want to scrape: [1, 2, 3, ...]\n')
                    try:
                        which_one = int(which_one)
                    except:
                        which_one = 0
                matched_elements = [matched_elements[which_one-1]]
                
            else:
                print('[Error] The number of matched elements and the number of target phrase occurences are not the same.\nPlease use the browser inspector tool to copy the "selector" or "Selector Path".\n')
                return None
            
        #######################################################################################################
        if attempt_count > 0:
            
            if len(matched_elements) > 0 and len(matched_elements) < 5:
                
                numbered_choices = ''
                for i in range(len(matched_elements)):
                    numbered_choices += '\tChoice '+str(i+1)+':  '+str(matched_elements[i])[:80]+ '\n'

                print('\nThere are '+str(len(matched_elements))+' matched elements given your last input. They are:\n'+numbered_choices)
                
                # Choose one
                which_one = 0
                while which_one-1 not in range(len(matched_elements)):
                    which_one = input('Which choice is the element you that want to scrape: [1, 2, 3, ...]\n')
                    try:
                        which_one = int(which_one)
                    except:
                        which_one = 0
                matched_elements = [matched_elements[which_one-1]]
            
            else:
                if len(matched_elements) > 5:
                    print('\nThere are '+str(len(matched_elements))+' matched elements given your last input. They are:\n\n\t'+'\n\t'.join([str(matched_element)[:80] for matched_element in matched_elements[:10]])+'\n\nPlease be more specific in your target phrase.\n')
                if len(matched_elements) == 0:
                    print('\nNo match was found, please check for typos in the target phrase (case insensitive) or check if the website is fully collected.')            

                # Search again
                target_phrase = input('What is the displayed text for one of the elements you want to scrape: '+('(Type "QUIT" to stop)' if attempt_count>3 else '')+'\n')
                if target_phrase == 'QUIT':
                    print('\n[Error] It is likely that the website is not fully collected.\n        Please try this command: get_response_and_save_html(PUT_IN_YOUR_URL)\n        A HTML file will be created in your local folder, open it with a browser.\n        If you cannot see what you want to find on the page, please switch to dynamic scraping method.\n')
                    return None
                matched_elements = soup.find_all(text = re.compile(target_phrase,re.IGNORECASE))
        
        # Attempt count increments
        attempt_count += 1
            
    
    
    
    sample_element = matched_elements[0]
    sample_element = elevate_till_is_tag(sample_element)
    print('\nUnique match is found:\n'+str(sample_element)[:100]+ (' ......' if len(str(sample_element))>100 else '') +'\n\n')
    
    if sample_element.name == 'script':
        matched_lines = [line for line in sample_element.prettify().split('\n') if target_phrase in line.lower()]
        try:
            assert(len(matched_lines)==1)
            matched_line = matched_lines[0].strip().strip(';')
            matched_data = matched_line.split('=',maxsplit=1)[1].strip()
            data = pd.DataFrame(json.loads(matched_data))
            return data
        except:
            print('[Error] There are multiple occurences of the target phrase in the JS script.\nPlease use another more unique target phrase or inspect the page source for the data in JS script.\n')
            return None
    
    return sample_element

def extract_contents(soup, path, verbose = True):
    """Extract and return the texts and links with the target path in the parsed tree.
    
    Parameters
    ----------
    soup: bs4.soup
        The parsed tree of the response.
    path: string
        The css selector path to the target elements.
    verbose: boolean (optional, default = True)
        Whether or not to print the process message.
    
    Returns
    ----------
    pd.DataFrame
        
    """
    
    if soup is None:
        return None
    
    if isinstance(soup, pd.DataFrame):
        return soup
    
    if verbose:
        print('\nExtracting contents ...\n')
    
    if path.startswith('HEADER:'):
        tables = pd.read_html(str(soup))
        target_table = [table for table in tables if str(tuple(table.columns.tolist())) == path.replace('HEADER:','')][0]
        return target_table
    
    target_elements = soup.select(path)

    extracted_contents = pd.DataFrame([(recursive_split(extract_text(target_element)), get_related_link(target_element)) for target_element in target_elements], columns = ['text','url'])

    return extracted_contents
    
def scrape_what_from_where(target_phrase, url, go_up = 0):
    """Get the contents that are similar to the element with phrase "what" in the website "where".
    
    Parameters
    ----------
    target_phrase: string
        The displayed text of one of the elements you want to scrape.
    url: string
        The url of the website you want to scrape.
    go_up: int
        How many levels to go up in order to get the amount of contents you want.
        
    Returns
    ----------
    pd.DataFrame
        
    """
    
    response = get_response(url)
    
    soup = get_soup(response)

    # Check if the data is in a table, if so, directly return the table
    try:
        tables = pd.read_html(str(soup))
    except:
        tables = []
        
    print(len(tables))
    if len(tables)>0 and (len(set([tuple(table.columns.tolist()) for table in tables])) == len(tables)):        
        tables_containing_target_phrase = [table for table in tables if target_phrase in str(table)]
        tables_containing_target_phrase = sorted(tables_containing_target_phrase, key=lambda t: len(str(t)))
        if len(tables_containing_target_phrase)>0:
            while len(tables_containing_target_phrase)>0:
                print('\nThere are '+str(len(tables_containing_target_phrase))+' tables with the target phrase:\n')
                target_table = tables_containing_target_phrase[0]
                print(target_table)
                is_right_table = input('\nIs this table what you want to scrape? [Yes/No]\n')
                if is_right_table.lower()[0] == 'y':
                    right_header = tuple(target_table.columns.tolist())
                    print('\nThe right header is:\n\t'+str(right_header))
                    return target_table, 'HEADER:'+str(right_header)
                else:    
                    tables_containing_target_phrase.pop(0)
            if len(tables_containing_target_phrase)==0:
                print('\nThe target data is not one of the tables, moving on to other html elements.\n')

    
    # Pinpoint the sample element through dialogue
    sample_element = get_unique_sample_element(soup, target_phrase)
    if sample_element is None:
        return None, ''
    if isinstance(sample_element, pd.DataFrame):
        print('[Success] Data is in the JS script and now extracted as a DataFrame into the variable "soup".\n')
        return sample_element, ''
    
    # Build the css selector path to the sample element
    sample_path = get_css_selector_path(sample_element)
        
    
    # Go up the parse tree if needed:
    path = sample_path[:]
    for i in range(go_up):
        path = elevate_css_selector_path(path)
    
    # Extract content
    extracted_contents = extract_contents(soup, path)
    
    # If data is extracted from html path instead of from json, print the path for future use
    if path != '':
        print('\n[Success] The selector path used to extract contents is:\n\n\t'+path+'\n')
    
    return extracted_contents, path


def get_response_and_save_html(url,  path = ''):
    """Get the response of the website and save it as an HTML.
    
    Parameters
    ----------
    url: string
        The url to the website that needs to be scraped. 
    path: string (optional, default = ./TEMP.html)
        The path at which the HTML file will be saved.
    
    Returns
    ----------
    None
        
    """
    
    response = get_response(url)
    
    save_html(response.text, url, path = path)
    

def create_page_url_list(template_url, start_index, end_index, unique_first_url = None):
    page_url_list = []
    if unique_first_url is not None:
        page_url_list.append(unique_first_url)
    for i in range(start_index,end_index+1):
        page_url_list.append(template_url.replace('NUMBER',str(i)))
    return page_url_list

def merge_dataframes(dataframes):
    output_dataframe = pd.DataFrame()
    for dataframe in dataframes:
        output_dataframe = output_dataframe.append(ignore_index = True)
    return output_dataframe

def extract_path_from_pages(path, pages, save_separately = False, file_path_template = None , reporting_interval = None, verbose = False):

    number_of_pages = len(pages)
    index_width = len(str(number_of_pages+1))
    
    if reporting_interval is None:
        reporting_interval = int(number_of_pages/10)+1 if number_of_pages<1000 else int(number_of_pages/40)
        
    output_dataframe = pd.DataFrame()
    
    for i in range(number_of_pages):
        if i % reporting_interval == 0:
            print(str(i)+'/'+str(number_of_pages), end=', ')
        url = pages[i]
        
        dataframe = extract_contents(get_soup(get_response(url, verbose = verbose)), path, verbose = verbose)
        
        if save_separately: 
            if file_path_template is None:
                print('\n[Error] To save the dataframes from different pages separatorly, you need to provide a file path template.\n')
                return None
            file_path = file_path_template.replace('NUMBER', str(i).zfill(index_width))
            dataframe.to_csv(file_path, index = False)
            
        else:
            output_dataframe = output_dataframe.append(dataframe, ignore_index=True)
    
    print('\n\n[Success] Content extraction finished.\n\n')
    
    if not save_separately: 
        return output_dataframe
    else:
        return None

def get_base_url(url):
    return url.split('://')[0]+'://'+url.split('://')[1].split('/')[0]

### Examples

# 1

https://digitalcollections.nypl.org

In [2]:
extracted_contents, path = scrape_what_from_where("Children's Book", "https://digitalcollections.nypl.org")
extracted_contents

[Success] The website at "https://digitalcollections.nypl.org" is collected successfully.
0

There are 2 matched elements given your last input. They are:
	Choice 1:  The Black Experience in Children's Books: Selections from Augusta Baker's Biblio
	Choice 2:  Children's book illustrations

Which choice is the element you that want to scrape: [1, 2, 3, ...]
1

Unique match is found:
<h5>The Black Experience in Children's Books: Selections from Augusta Baker's Bibliographies</h5>



Extracting contents ...


[Success] The selector path used to extract contents is:

	div#container > div#home-lanes-main > div#collections > div.lane.inverse > div.lane-items > a.lane-item > h5



Unnamed: 0,text,url
0,The Black Experience in Children's Books: Selections from Augusta Baker's Bibliographies,/collections/the-black-experience-in-childrens-books-selections-from-augusta-bakers
1,Scrapbooks of New York City views,/collections/scrapbooks-of-new-york-city-views
2,Li ji ji shi: er shi wu juan,/collections/li-ji-ji-shi-er-shi-wu-juan
3,Women of distinction: remarkable in works and invincible in character,/collections/women-of-distinction-remarkable-in-works-and-invincible-in-character
4,Collection of ledgers and cash books covering the period 1891-1925,/collections/collection-of-ledgers-and-cash-books-covering-the-period-1891-1925
...,...,...
225,William Blake: Illuminated Books,/collections/william-blake-illuminated-books
226,Ise Monogatari Emaki,/collections/ise-monogatari-emaki
227,Des cleres et nobles femmes,/collections/des-cleres-et-nobles-femmes
228,Minchô shiken (The Colored Inkstone of the Ming Period),/collections/minch-shiken-the-colored-inkstone-of-the-ming-period


In [3]:
extracted_contents, path = scrape_what_from_where("The Black Experience in Children's Books","https://digitalcollections.nypl.org", go_up = 1)
extracted_contents

[Success] The website at "https://digitalcollections.nypl.org" is collected successfully.
0

Unique match is found:
<h5>The Black Experience in Children's Books: Selections from Augusta Baker's Bibliographies</h5>



Extracting contents ...


[Success] The selector path used to extract contents is:

	div#container > div#home-lanes-main > div#collections > div.lane.inverse > div.lane-items > a.lane-item



Unnamed: 0,text,url
0,"[The Black Experience in Children's Books: Selections from Augusta Baker's Bibliographies, 303 items]",/collections/the-black-experience-in-childrens-books-selections-from-augusta-bakers
1,"[Scrapbooks of New York City views, 3,141 items]",/collections/scrapbooks-of-new-york-city-views
2,"[Li ji ji shi: er shi wu juan, 26 items]",/collections/li-ji-ji-shi-er-shi-wu-juan
3,"[Women of distinction: remarkable in works and invincible in character, 49 items]",/collections/women-of-distinction-remarkable-in-works-and-invincible-in-character
4,"[Collection of ledgers and cash books covering the period 1891-1925, 15 items]",/collections/collection-of-ledgers-and-cash-books-covering-the-period-1891-1925
...,...,...
225,"[William Blake: Illuminated Books, 89 items]",/collections/william-blake-illuminated-books
226,"[Ise Monogatari Emaki , 109 items]",/collections/ise-monogatari-emaki
227,"[Des cleres et nobles femmes, 78 items]",/collections/des-cleres-et-nobles-femmes
228,"[Minchô shiken (The Colored Inkstone of the Ming Period), 65 items]",/collections/minch-shiken-the-colored-inkstone-of-the-ming-period


# 2

https://digitalcollections.nypl.org/collections/changing-new-york

In [6]:
extracted_contents, path = scrape_what_from_where("broome street", "https://digitalcollections.nypl.org/collections/changing-new-york")
extracted_contents

[Success] The website at "https://digitalcollections.nypl.org/collections/changing-new-york" is collected successfully.
0

There are 3 matched elements given your last input. They are:
	Choice 1:  Broome Street no. 512-514, Ma…
	Choice 2:  Broadway near Broome Street, …
	Choice 3:  Broome Street, Nos. 504-506, …

Which choice is the element you that want to scrape: [1, 2, 3, ...]
first
Which choice is the element you that want to scrape: [1, 2, 3, ...]
1

Unique match is found:
<a alt="Broome Street no. 512-514, Manhattan" class="title" href="/items/510d47d9-4fbc-a3d9-e040-e00 ......



Extracting contents ...


[Success] The selector path used to extract contents is:

	div#container > div#collection > div#collection-right > div.results-wrapper > div#results-list-wrapper > ul#results-list > li > div.description:nth-of-type(2) > a.title



Unnamed: 0,text,url
0,"Broome Street no. 512-514, Ma…",/items/510d47d9-4fbc-a3d9-e040-e00a18064a99
1,"Lamport Export Company, 507-5…",/items/510d47d9-4fbd-a3d9-e040-e00a18064a99
2,"Broadway near Broome Street, …",/items/510d47d9-4fb9-a3d9-e040-e00a18064a99
3,"Broome Street, Nos. 504-506, …",/items/510d47d9-4fbb-a3d9-e040-e00a18064a99
4,First Avenue and East 70th St…,/items/510d47d9-4f4e-a3d9-e040-e00a18064a99
5,Ewen Avenue No. 2565 (Bar and…,/items/510d47d9-4ecb-a3d9-e040-e00a18064a99
6,Gas tank and Queensboro Bridg…,/items/510d47d9-4f4b-a3d9-e040-e00a18064a99
7,"Vanderbilt, From E. 46th Stre…",/items/510d47d9-4f42-a3d9-e040-e00a18064a99
8,"Country Store: interior, Ewen…",/items/510d47d9-4f7d-a3d9-e040-e00a18064a99
9,"Palisade Avenue No. 2505, Spu…",/items/510d47d9-4f0b-a3d9-e040-e00a18064a99


In [7]:
extracted_contents, path = scrape_what_from_where("Salmagundi Club", "https://digitalcollections.nypl.org/collections/changing-new-york")
extracted_contents

[Success] The website at "https://digitalcollections.nypl.org/collections/changing-new-york" is collected successfully.
0

Unique match is found:
<script type="text/javascript">

  var search_results = [{"restricted":false,"item":{"id":"510d47d9- ......


[Success] Data is in the JS script and now extracted as a DataFrame into the variable "soup".



Unnamed: 0,restricted,item
0,False,"{'id': '510d47d9-4f9d-a3d9-e040-e00a18064a99', 'title': 'Rope store, South Street and James Slip, Manhattan.', 'image_id': '482824', 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org/T2..."
1,False,"{'id': '510d47d9-4f4a-a3d9-e040-e00a18064a99', 'title': 'Automat, 977 Eighth Avenue, Manhattan.', 'image_id': '482752', 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org/sAeaJhzFT5-wk6k..."
2,False,"{'id': '510d47d9-4f85-a3d9-e040-e00a18064a99', 'title': 'Columbus Circle, Manhattan.', 'image_id': '482580', 'multi': True, 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org/6b_qtKEqTYC..."
3,False,"{'id': '510d47d9-4f00-a3d9-e040-e00a18064a99', 'title': 'Broadway and Thomas Street, Manhattan.', 'image_id': '482689', 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org/BBYhX4MbQXSp1S9..."
4,False,"{'id': '510d47d9-4f13-a3d9-e040-e00a18064a99', 'title': 'Broadway and Thomas Street, Manhattan.', 'image_id': '482706', 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org/fgkkXRSHSruvd-Y..."
...,...,...
195,False,"{'id': '510d47d9-4eb0-a3d9-e040-e00a18064a99', 'title': 'Columbia Presbyterian Medical Center, 168th Street and Broadway, from 165th Street and Riverside Drive, Manhattan.', 'image_id': '482622', ..."
196,False,"{'id': '510d47d9-4f02-a3d9-e040-e00a18064a99', 'title': 'Gay Street no. 14-16, Manhattan.', 'image_id': '482690', 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org/0-YF-9etQLOSL4VBoVjFJ..."
197,False,"{'id': '510d47df-335a-a3d9-e040-e00a18064a99', 'title': 'Riverside Drive, no. 857, at 159th Street, Manhattan.', 'image_id': '1219146', 'sequence_number': 1, 'high_res_link': 'http://link.nypl.org..."
198,False,"{'id': '510d47d9-4f6f-a3d9-e040-e00a18064a99', 'title': 'George Washington Bridge, Riverside Drive and 179th Street, Manhattan.', 'image_id': '482785', 'sequence_number': 1, 'high_res_link': 'http..."


# 3

https://www.aqistudy.cn/historydata/monthdata.php?city=%E4%B8%8A%E6%B5%B7

In [8]:
extracted_contents, path = scrape_what_from_where("2013-12", "https://www.aqistudy.cn/historydata/monthdata.php?city=%E4%B8%8A%E6%B5%B7")
extracted_contents

[Success] The website at "https://www.aqistudy.cn/historydata/monthdata.php?city=%E4%B8%8A%E6%B5%B7" is collected successfully.
1

No match was found, please check for typos in the target phrase (case insensitive) or check if the website is fully collected.
What is the displayed text for one of the elements you want to scrape: 
120

No match was found, please check for typos in the target phrase (case insensitive) or check if the website is fully collected.
What is the displayed text for one of the elements you want to scrape: 
121

No match was found, please check for typos in the target phrase (case insensitive) or check if the website is fully collected.
What is the displayed text for one of the elements you want to scrape: 
159

No match was found, please check for typos in the target phrase (case insensitive) or check if the website is fully collected.
What is the displayed text for one of the elements you want to scrape: (Type "QUIT" to stop)
QUIT

[Error] It is likely that the w

In [9]:
get_response_and_save_html("https://www.aqistudy.cn/historydata/monthdata.php?city=%E4%B8%8A%E6%B5%B7")

[Success] The website at "https://www.aqistudy.cn/historydata/monthdata.php?city=%E4%B8%8A%E6%B5%B7" is collected successfully.
[Success] The HTML file is saved succesfully.


# 4 

http://www.shh-shrhmuseum.org.cn/historymuseum/historymuseum/dc/myyp/index.html

In [10]:
extracted_contents, path = scrape_what_from_where("清上海书业商", "http://www.shh-shrhmuseum.org.cn/historymuseum/historymuseum/dc/myyp/index.html")
extracted_contents

[Success] The website at "http://www.shh-shrhmuseum.org.cn/historymuseum/historymuseum/dc/myyp/index.html" is collected successfully.
0

Unique match is found:
<h1>清上海书业商团旗帜</h1>



Extracting contents ...


[Success] The selector path used to extract contents is:

	div.mar-w1170:nth-of-type(4) > div.mar-w1170:nth-of-type(2) > div.act1wp.clearfix.act2con#result_list > div.act2con-div.margin-t28 > div.act1-con.fl:nth-of-type(3) > h1



Unnamed: 0,text,url
0,清上海书业商团旗帜,/historymuseum/historymuseum/dc/myyp/2020/01/23/3419955b6e5e35b7016fd28a7f890c42.html?tm=1579785391841
1,民国徐汇公学教具--徐汇中学捐赠,/historymuseum/historymuseum/dc/myyp/2019/12/18/3419955b6e5e35b7016f17d0fb000572.html?tm=1579785391841
2,1934年上海徐家汇土山湾铸铜钟,/historymuseum/historymuseum/dc/myyp/2019/11/19/3419955b6e5e35b7016e8165260d01b8.html?tm=1579785391841
3,1907年外白渡桥落成铭牌,/historymuseum/historymuseum/dc/myyp/2019/10/16/3419955b6dc7fb5a016dd2e36ae000fd.html?tm=1579785391841


### Pagination Example

In [11]:
pages = create_page_url_list(template_url = 'http://www.shh-shrhmuseum.org.cn/historymuseum/historymuseum/dc/myyp/index_NUMBER.html?tm=1579785391762', start_index = 1, end_index = 4, unique_first_url = 'http://www.shh-shrhmuseum.org.cn/historymuseum/historymuseum/dc/myyp/index.html')

In [12]:
df = extract_path_from_pages(path, pages)

0/5, 1/5, 2/5, 3/5, 4/5, 

[Success] Content extraction finished.




In [13]:
base_url = get_base_url('http://www.shh-shrhmuseum.org.cn/historymuseum/historymuseum/dc/myyp/index_NUMBER.html')

In [14]:
df['url'] = base_url + df['url']

In [15]:
info_pages = df['url'].tolist()

In [16]:
url = 'http://www.shh-shrhmuseum.org.cn/historymuseum/historymuseum/dc/myyp/2020/01/23/3419955b6e5e35b7016fd28a7f890c42.html?tm=1579785391841'
extracted_contents, info_path = scrape_what_from_where('革命期间发挥了', url)
extracted_contents

[Success] The website at "http://www.shh-shrhmuseum.org.cn/historymuseum/historymuseum/dc/myyp/2020/01/23/3419955b6e5e35b7016fd28a7f890c42.html?tm=1579785391841" is collected successfully.
0

Unique match is found:
<span style="font-family:宋体; font-size:10.5pt">商团是清末上海商界的自卫武装团体，在辛亥革命期间发挥了重要的作用。</span>



Extracting contents ...


[Success] The selector path used to extract contents is:

	div.mar-w1170:nth-of-type(4) > div.class-dinfowp.clearfix:nth-of-type(2) > div.class-dinfo-right.fl:nth-of-type(2) > p > span:nth-of-type(2)



Unnamed: 0,text,url
0,商团是清末上海商界的自卫武装团体，在辛亥革命期间发挥了重要的作用。,[]


In [17]:
url = 'http://www.shh-shrhmuseum.org.cn/historymuseum/historymuseum/dc/myyp/2020/01/23/3419955b6e5e35b7016fd28a7f890c42.html?tm=1579785391841'
extracted_contents, info_path = scrape_what_from_where('革命期间发挥了', url, go_up = 2)
extracted_contents

[Success] The website at "http://www.shh-shrhmuseum.org.cn/historymuseum/historymuseum/dc/myyp/2020/01/23/3419955b6e5e35b7016fd28a7f890c42.html?tm=1579785391841" is collected successfully.
0

Unique match is found:
<span style="font-family:宋体; font-size:10.5pt">商团是清末上海商界的自卫武装团体，在辛亥革命期间发挥了重要的作用。</span>



Extracting contents ...


[Success] The selector path used to extract contents is:

	div.mar-w1170:nth-of-type(4) > div.class-dinfowp.clearfix:nth-of-type(2) > div.class-dinfo-right.fl:nth-of-type(2)



Unnamed: 0,text,url
0,"[清上海书业商团旗帜, 尺寸： 长160厘米，宽114厘米, 简介, [商团是清末上海商界的自卫武装团体，在辛亥革命期间发挥了重要的作用。, 1886, 年朱槐庐等人创立上海书业崇德公所，, 1905, 年重组为“上海书业公所”，不久组织成立上海书业商团。在上海光复之役中，书业商团参与了恢复上海全境、攻打制造局的战斗。胜利后，商团全体人员不分昼夜，认真巡逻、保境安民。], 进入高清播放器]","[javascript:;, javascript:;, /historymuseum/historymuseum/myyp_big.html?#branch=dc_myyp&collection=1]"


In [18]:
info_path = 'div.mar-w1170:nth-of-type(4) > div.class-dinfowp.clearfix:nth-of-type(2) > div.class-dinfo-right.fl:nth-of-type(2)'

In [19]:
info_df = extract_path_from_pages(info_path, info_pages)

0/17, 2/17, 4/17, 6/17, 8/17, 10/17, 12/17, 14/17, 16/17, 

[Success] Content extraction finished.




In [20]:
info_df

Unnamed: 0,text,url
0,"[清上海书业商团旗帜, 尺寸： 长160厘米，宽114厘米, 简介, [商团是清末上海商界的自卫武装团体，在辛亥革命期间发挥了重要的作用。, 1886, 年朱槐庐等人创立上海书业崇德公所，, 1905, 年重组为“上海书业公所”，不久组织成立上海书业商团。在上海光复之役中，书业商团参与了恢复上海全境、攻打制造局的战斗。胜利后，商团全体人员不分昼夜，认真巡逻、保境安民。], 进入高清播放器]","[javascript:;, javascript:;, /historymuseum/historymuseum/myyp_big.html?#branch=dc_myyp&collection=1]"
1,"[民国徐汇公学教具--徐汇中学捐赠, 尺寸： 透镜：底部直径11厘米，高35厘米。, 简介, 透镜：底部直径11厘米，高35厘米。三棱镜：底部直径9.5厘米，宽14厘米，高33厘米。蒸汽机模型：长39厘米，宽29厘米，高53厘米。三球仪：长55厘米，宽25厘米，高41厘米。徐汇公学创办于1850年，是天主教在上海开办最早的洋学堂，是中国最早按西洋办学模式设立的学校之一。初名圣依纳爵公学，吸...","[javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, /historymuseum/historymuseum/myyp_big.html?#branch=dc_myyp&collection=1]"
2,"[1934年上海徐家汇土山湾铸铜钟, 尺寸： 直径39厘米，高43厘米, 简介, [土山湾孤儿院成立于, 1864, 年，其前身为天主教士薛孔昭所设横塘育婴堂，专收教外孤儿，衣之食之，并教以工艺美术诸艺，以便长大后能有一技谋生。孤儿院内部设木工、五金、印书等工场，此教堂铜钟即为该院五金部作品，钟面人像为“圣女小德肋撒”，也即“圣女小德兰”。], 进入高清播放器]","[javascript:;, javascript:;, javascript:;, javascript:;, /historymuseum/historymuseum/myyp_big.html?#branch=dc_myyp&collection=1]"
3,"[1907年外白渡桥落成铭牌, 尺寸： 纵40厘米，横52厘米, 简介, [ 1856, 年，威尔斯公司在苏州河黄浦江交汇处建造了木结构的“威尔斯桥”。, 1875, 年工部局强行收购该桥并出资重建，建成后不再收过桥费。因其临近外滩花园，遂称“花园桥”（又称外摆渡桥或外白渡桥）。随着近代交通的迅速发展，, 1906, 年工部局拆除原木结构旧桥重建为钢桥，翌年落成，为上海第一座近代化的钢桁架...","[javascript:;, javascript:;, /historymuseum/historymuseum/myyp_big.html?#branch=dc_myyp&collection=1]"
4,"[民国 文魁斋“天晓得”招牌, 尺寸： 长84厘米，宽121厘米，厚3.8厘米, 简介, 民国初年，在广西路汉口路先后开设了两家糖食店，都取名“文魁斋”。为了争夺生意，两家都指责对方是冒牌，其中一家甚至定制了一块奇特的牌子，上有一只乌龟，指责“东首假冒”，假冒者是乌龟。当时，大舞台正门在汉口路上，正对着这两家“文魁斋”，所以就流传出一句歇后语，凡是不知究竟的事，就说“大舞台对过——天晓得”...","[javascript:;, javascript:;, /historymuseum/historymuseum/myyp_big.html?#branch=dc_myyp&collection=1]"
5,"[民国正广和汽水, 尺寸： 底部直径7.5厘米，高29.5厘米, 简介, [正广和洋行总部设在英国伦敦，, 1874, 年开始在上海、香港等地开设分公司，, 1892, 年创建上海泌乐水厂，专事生产汽水、蒸馏水、餐用矿泉水、苏打水、姜汁水、柠檬水，广受欢迎，销路很好。, 20, 世纪, 20, 年代新厂建成投产，改称正广和汽水，是中国最早的汽水饮料生产厂。], 进入高清播放器]","[javascript:;, javascript:;, /historymuseum/historymuseum/myyp_big.html?#branch=dc_myyp&collection=1]"
6,"[近代亚细亚火油公司壳牌中文铜牌, 尺寸： 直径74厘米，高100厘米, 简介, [过去中国的石油主要依赖进口，统称为“洋油”。, 1903, 年，原为竞争对手的壳牌运输贸易有限公司与荷兰皇家石油公司在伦敦成立亚细亚火油公司。, 1908, 年，在上海成立办事处。, 1917, 年公司入驻位于今中山东一路, 1, 号高七层的亚细亚大楼，人称“外滩第一楼”。这块亚细亚火油公司壳牌中文铜牌原本...","[javascript:;, javascript:;, /historymuseum/historymuseum/myyp_big.html?#branch=dc_myyp&collection=1]"
7,"[清咸丰六年 郁森盛、经正记、王永盛银饼一组, 尺寸： 直径4厘米, 简介, [郁森盛、经正记、王永盛作为清代上海最大的沙船号商，从事沿海运输，对上海港区的形成和上海城市发展做出了巨大贡献。同时，他们还开设钱庄，于咸丰六年（, 1856, ）铸造并发行了银饼，金融街称之为“上海银饼”，开创了沪上商号自铸银元流通市场的先例，也是中国现存最早以“两”为单位的银元，为清末上海经济发展作出了贡献。...","[javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, /historymuseum/historymuseum/myyp_big.html?#branch=dc_myyp&collection=1]"
8,"[近代上海美租界界碑, 尺寸： 长76.5厘米，宽68.5厘米，厚3厘米, 简介, [上海开埠后，美国传教士在虹口地区广置地皮，拓展势力。在造成既成事实后，要求上海道台划定该区域为美国租界。, 1863, 年，英美租界正式合并，称为洋泾浜北首外人租界或英美公共租界。, 1893, 年, 6, 月，上海道台与工部局划定美租界新界址并树立界石，此碑可能为当时遗物。上海开埠后，美国传教士在虹口地...","[javascript:;, javascript:;, /historymuseum/historymuseum/myyp_big.html?#branch=dc_myyp&collection=1]"
9,"[清乾隆四十五年陆锡熊父母诰命, 尺寸： 纵21厘米，横350厘米, 简介, [该诰命系乾隆四十五年（, 1780, ）朝廷封赠陆锡熊父母的文书。诰命质地为五色织锦，朵朵祥云点缀其间，雍容华贵，由满汉文合璧书写，并合于中轴，在满汉文结尾处均钤盖“制诰之宝”，是清代荣典制度的重要例证，亦是反映明清上海地区人文昌盛的重要实物。], 进入高清播放器]","[javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, javascript:;, /historymuseum/historymuseu..."


In [21]:
info_df.text[0]

['清上海书业商团旗帜',
 '尺寸： 长160厘米，宽114厘米',
 '简介',
 ['商团是清末上海商界的自卫武装团体，在辛亥革命期间发挥了重要的作用。',
  '1886',
  '年朱槐庐等人创立上海书业崇德公所，',
  '1905',
  '年重组为“上海书业公所”，不久组织成立上海书业商团。在上海光复之役中，书业商团参与了恢复上海全境、攻打制造局的战斗。胜利后，商团全体人员不分昼夜，认真巡逻、保境安民。'],
 '进入高清播放器']

# 5

In [22]:
extracted_contents, info_path = scrape_what_from_where("限大额", "http://fund.eastmoney.com/fund.html#os_0;isall_0;ft_;pt_1")
extracted_contents

[Success] The website at "http://fund.eastmoney.com/fund.html#os_0;isall_0;ft_;pt_1" is collected successfully.
3

There are 1 tables with the target phrase:

     关注  比较   序号    基金代码                 基金简称 2021-02-10         2021-02-09  \
     关注  比较   序号    基金代码                 基金简称       单位净值    累计净值       单位净值   
0   NaN NaN    1    8438      九泰行业优选混合C估值图基金吧     1.4024  1.4024     1.2246   
1   NaN NaN    2    8437      九泰行业优选混合A估值图基金吧     1.4022  1.4022     1.2244   
2   NaN NaN    3  161725  招商中证白酒指数(LOF)估值图基金吧     1.6198  3.2509     1.5439   
3   NaN NaN    4    3190    创金合信消费主题股票A估值图基金吧     3.4247  3.2593     3.2655   
4   NaN NaN    5    3191    创金合信消费主题股票C估值图基金吧     3.3633  3.1756     3.2071   
..   ..  ..  ...     ...                  ...        ...     ...        ...   
195 NaN NaN  196    9854      中加优势企业混合C估值图基金吧     1.4617  1.4617     1.4141   
196 NaN NaN  197    8277     财通资管行业精选混合估值图基金吧     1.4006  1.4006     1.3550   
197 NaN NaN  198    1382      易方达国企改革混合估值图基金吧     2

Unnamed: 0_level_0,关注,比较,序号,基金代码,基金简称,2021-02-10,2021-02-10,2021-02-09,2021-02-09,日增长值,日增长率,申购状态,赎回状态,手续费
Unnamed: 0_level_1,关注,比较,序号,基金代码,基金简称,单位净值,累计净值,单位净值,累计净值,日增长值,日增长率,申购状态,赎回状态,手续费
0,,,1,8438,九泰行业优选混合C估值图基金吧,1.4024,1.4024,1.2246,1.2246,0.1778,14.52%,限大额,开放,0.00%
1,,,2,8437,九泰行业优选混合A估值图基金吧,1.4022,1.4022,1.2244,1.2244,0.1778,14.52%,限大额,开放,0.15%
2,,,3,161725,招商中证白酒指数(LOF)估值图基金吧,1.6198,3.2509,1.5439,3.1750,0.0759,4.92%,开放,开放,0.10%
3,,,4,3190,创金合信消费主题股票A估值图基金吧,3.4247,3.2593,3.2655,3.1078,0.1592,4.88%,开放,开放,0.15%
4,,,5,3191,创金合信消费主题股票C估值图基金吧,3.3633,3.1756,3.2071,3.0281,0.1562,4.87%,开放,开放,0.00%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,,,196,9854,中加优势企业混合C估值图基金吧,1.4617,1.4617,1.4141,1.4141,0.0476,3.37%,暂停,暂停,0.00%
196,,,197,8277,财通资管行业精选混合估值图基金吧,1.4006,1.4006,1.3550,1.3550,0.0456,3.37%,开放,开放,0.15%
197,,,198,1382,易方达国企改革混合估值图基金吧,2.9200,2.9200,2.8250,2.8250,0.0950,3.36%,开放,开放,0.15%
198,,,199,609,华商新量化混合估值图基金吧,3.0580,3.6080,2.9590,3.5090,0.0990,3.35%,开放,开放,0.15%


# 6

https://www.nyc.com/arts__attractions/

In [23]:
extracted_contents, path = scrape_what_from_where("Central Park", "https://www.nyc.com/arts__attractions/")
extracted_contents

[Success] The website at "https://www.nyc.com/arts__attractions/" is collected successfully.
0

There are 3 matched elements given your last input. They are:
	Choice 1:  Central Park
	Choice 2:  New York's "flagship" park of 843 acres, 26,000 trees, and almost 9,000 benches 
	Choice 3:  Seven and a half miles of Beach on the Atlantic Ocean. Lifeguards are stationed 

Which choice is the element you that want to scrape: [1, 2, 3, ...]
1

Unique match is found:
<h2>Central Park</h2>



Extracting contents ...


[Success] The selector path used to extract contents is:

	section.tiles.tiles_2.container:nth-of-type(4) > div.row.tilesrow > article.lazy.cgreen.col-xs-12.col-sm-6.col-md-4 > header > h2



Unnamed: 0,text,url
0,Central Park,[]
1,"Cooper-Hewitt, National Design Museum",[]
2,Ellis Island Museum,[]
3,New Museum of Contemporary Art,[]
4,Outdoor Movie Guide,[]
5,Restaurant Week,[]


# 7

http://search.huochepiao.com/chezhan/shanghai

In [24]:
extracted_contents, path = scrape_what_from_where("上海 - 南京", "http://search.huochepiao.com/chezhan/shanghai")
extracted_contents

[Success] The website at "http://search.huochepiao.com/chezhan/shanghai" is collected successfully.
0

Unique match is found:
<a href="/shike_shanghai_nanjing">上海 - 南京</a>



Extracting contents ...


[Success] The selector path used to extract contents is:

	table#sys-notice-box > tr > td > table:nth-of-type(2) > tr > td > table > tr > td > a



Unnamed: 0,text,url
0,上海 - 南京,/shike_shanghai_nanjing
1,上海 - 北京,/shike_shanghai_beijing
2,上海 - 苏州,/shike_shanghai_suzhou
3,上海 - 杭州,/shike_shanghai_hangzhou
4,上海 - 郑州,/shike_shanghai_zhengzhou
5,上海 - 成都,/shike_shanghai_chengdu
6,上海 - 天津,/shike_shanghai_tianjin
7,上海 - 广州,/shike_shanghai_guangzhou
8,上海 - 合肥,/shike_shanghai_hefei
9,上海 - 无锡,/shike_shanghai_wuxi


In [25]:
base_url = get_base_url("http://search.huochepiao.com/chezhan/shanghai")

In [26]:
extracted_contents['url'] = base_url + extracted_contents['url']

In [27]:
pages = extracted_contents['url'].tolist()

In [28]:
extracted_contents, path = scrape_what_from_where('G4824', pages[0], go_up = 3)
extracted_contents

[Success] The website at "http://search.huochepiao.com/shike_shanghai_nanjing" is collected successfully.
0

Unique match is found:
<a href="/checi/G4824/G4825">G4824/G4825</a>



Extracting contents ...


[Success] The selector path used to extract contents is:

	table:nth-of-type(2) > tr > td > table > tr > td:nth-of-type(2) > table:nth-of-type(2)



Unnamed: 0,text,url
0,"[[车次, 出发站, 开车时间, 到达站, 到达时间, 用时, 里程, 硬座, 软座, 硬卧上/中/下, 软卧上/下], [G4824/G4825, [上海虹桥, 00:50, 南京南, 01:58, 0分, 295, 134.5, 229.5, 0/0/0, 0/0, 50|10|1]], [G9376/G9377, [上海, 00:58, 南京南, 02:29, 1小时31分, 311...","[http://www.guabu.com/zj_%E4%B8%8A%E6%B5%B7_%E5%8D%97%E4%BA%AC, #, #, #, #, #, #, #, #, #, #, #, /checi/G4824/G4825, http://search.huochepiao.com/chaxun/resultz.asp?txtChezhan=上海虹桥, http://search...."


In [29]:
print(' '.join(extracted_contents.text[0][0]))
print()
print(extracted_contents.text[0][1][0] + ' ' + ' '.join(extracted_contents.text[0][1][1]))
print(extracted_contents.text[0][2][0] + ' ' + ' '.join(extracted_contents.text[0][1][1]))
print(extracted_contents.text[0][3][0] + ' ' + ' '.join(extracted_contents.text[0][1][1]))

车次 出发站 开车时间 到达站 到达时间 用时 里程 硬座 软座 硬卧上/中/下 软卧上/下

G4824/G4825 上海虹桥 00:50 南京南 01:58 0分 295 134.5 229.5 0/0/0 0/0 50|10|1
G9376/G9377 上海虹桥 00:50 南京南 01:58 0分 295 134.5 229.5 0/0/0 0/0 50|10|1
K5550/K5551 上海虹桥 00:50 南京南 01:58 0分 295 134.5 229.5 0/0/0 0/0 50|10|1


In [30]:
train_schedule_df = extract_path_from_pages(path, pages)

0/20, 3/20, 6/20, 9/20, 12/20, 15/20, 18/20, 

[Success] Content extraction finished.




In [31]:
train_schedule_df

Unnamed: 0,text,url
0,"[[车次, 出发站, 开车时间, 到达站, 到达时间, 用时, 里程, 硬座, 软座, 硬卧上/中/下, 软卧上/下], [G4824/G4825, [上海虹桥, 00:50, 南京南, 01:58, 0分, 295, 134.5, 229.5, 0/0/0, 0/0, 50|10|1]], [G9376/G9377, [上海, 00:58, 南京南, 02:29, 1小时31分, 311...","[http://www.guabu.com/zj_%E4%B8%8A%E6%B5%B7_%E5%8D%97%E4%BA%AC, #, #, #, #, #, #, #, #, #, #, #, /checi/G4824/G4825, http://search.huochepiao.com/chaxun/resultz.asp?txtChezhan=上海虹桥, http://search...."
1,"[[车次, 出发站, 开车时间, 到达站, 到达时间, 用时, 里程, 硬座, 软座, 硬卧上/中/下, 软卧上/下], [G102, [上海虹桥, 06:26, 北京南, 12:29, 6小时3分, 1318, 606, 971, 0/0/0, 0/0, 386|10|1]], [G104, [上海虹桥, 06:37, 北京南, 12:33, 5小时56分, 1318, 606, 971...","[http://www.guabu.com/zj_%E4%B8%8A%E6%B5%B7_%E5%8C%97%E4%BA%AC, #, #, #, #, #, #, #, #, #, #, #, #, /checi/G102, http://search.huochepiao.com/chaxun/resultz.asp?txtChezhan=上海虹桥, http://search.huoc..."
2,"[[车次, 出发站, 开车时间, 到达站, 到达时间, 用时, 里程, 硬座, 软座, 硬卧上/中/下, 软卧上/下], [K5550/K5551, [上海, 01:11, 苏州, 02:14, 1小时3分, 84, 7, -, 41/65.5/68.5, 90.5/96.5, 71|8|1]], [K5550/K5551, [上海, 01:11, 苏州, 02:14, 1小时3分, 84...","[http://www.guabu.com/zj_%E4%B8%8A%E6%B5%B7_%E8%8B%8F%E5%B7%9E, #, #, #, #, #, #, #, #, #, #, #, /checi/K5550/K5551, http://search.huochepiao.com/chaxun/resultz.asp?txtChezhan=上海, http://search.hu..."
3,"[[车次, 出发站, 开车时间, 到达站, 到达时间, 用时, 里程, 硬座, 软座, 硬卧上/中/下, 软卧上/下], [G4571, [上海虹桥, 00:38, 杭州东, 01:23, 45分, 159, 73, 117, 0/0/0, 0/0, 38|10|1]], [K4085, [上海南, 01:02, 杭州东, 02:52, 1小时50分, 167, 24.5, -, 70.5...","[http://www.guabu.com/zj_%E4%B8%8A%E6%B5%B7_%E6%9D%AD%E5%B7%9E, #, #, #, #, #, #, #, #, #, #, #, /checi/G4571, http://search.huochepiao.com/chaxun/resultz.asp?txtChezhan=上海虹桥, http://search.huoche..."
4,"[[车次, 出发站, 开车时间, 到达站, 到达时间, 用时, 里程, 硬座, 软座, 硬卧上/中/下, 软卧上/下], [G4110/G4111, [上海虹桥, 01:12, 郑州东, 05:20, 4小时8分, 986, 482.5, 784.5, 0/0/0, 0/0, 72|10|1]], [K4168/K4169, [上海, 01:45, 郑州, 16:18, 14小时33分, ...","[http://www.guabu.com/zj_%E4%B8%8A%E6%B5%B7_%E9%83%91%E5%B7%9E, #, #, #, #, #, #, #, #, #, #, #, #, /checi/G4110/G4111, http://search.huochepiao.com/chaxun/resultz.asp?txtChezhan=上海虹桥, http://sear..."
5,"[[车次, 出发站, 开车时间, 到达站, 到达时间, 用时, 里程, 硬座, 软座, 硬卧上/中/下, 软卧上/下], [K4085, [上海南, 01:02, 成都, 13:15, 36小时13分, 2473, 263.5, -, 446.5/462.5/477.5, 704.5/735.5, 62|8|1]], [K4616/K4617, [上海, 03:40, 成都, 11:50,...","[http://www.guabu.com/zj_%E4%B8%8A%E6%B5%B7_%E6%88%90%E9%83%BD, #, #, #, #, #, #, #, #, #, #, #, #, /checi/K4085, http://search.huochepiao.com/chaxun/resultz.asp?txtChezhan=上海南, http://search.huoc..."
6,"[[车次, 出发站, 开车时间, 到达站, 到达时间, 用时, 里程, 硬座, 软座, 硬卧上/中/下, 软卧上/下], [G108, [上海虹桥, 07:22, 天津南, 12:45, 5小时23分, 1196, 551, 881, 0/0/0, 0/0, 442|10|1]], [G1229/G1232, [上海虹桥, 07:34, 天津西, 13:45, 6小时11分, 1213, ...","[http://www.guabu.com/zj_%E4%B8%8A%E6%B5%B7_%E5%A4%A9%E6%B4%A5, #, #, #, #, #, #, #, #, #, #, #, #, /checi/G108, http://search.huochepiao.com/chaxun/resultz.asp?txtChezhan=上海虹桥, http://search.huoc..."
7,"[[车次, 出发站, 开车时间, 到达站, 到达时间, 用时, 里程, 硬座, 软座, 硬卧上/中/下, 软卧上/下], [G85, [上海虹桥, 08:00, 广州南, 14:51, 6小时51分, 1790, 793, 1302.5, 0/0/0, 0/0, 480|10|1]], [G1301, [上海虹桥, 10:24, 广州南, 19:02, 8小时38分, 1790, 793,...","[http://www.guabu.com/zj_%E4%B8%8A%E6%B5%B7_%E5%B9%BF%E5%B7%9E, #, #, #, #, #, #, #, #, #, #, #, #, /checi/G85, http://search.huochepiao.com/chaxun/resultz.asp?txtChezhan=上海虹桥, http://search.huoch..."
8,"[[车次, 出发站, 开车时间, 到达站, 到达时间, 用时, 里程, 硬座, 软座, 硬卧上/中/下, 软卧上/下], [G4824/G4825, [上海虹桥, 00:50, 合肥南, 03:04, 0分, 468, 202.5, 335, 0/0/0, 0/0, 50|10|1]], [D4678/D4679, [上海虹桥, 01:17, 合肥南, 03:31, 2小时14分, 468...","[http://www.guabu.com/zj_%E4%B8%8A%E6%B5%B7_%E5%90%88%E8%82%A5, #, #, #, #, #, #, #, #, #, #, #, /checi/G4824/G4825, http://search.huochepiao.com/chaxun/resultz.asp?txtChezhan=上海虹桥, http://search...."
9,"[[车次, 出发站, 开车时间, 到达站, 到达时间, 用时, 里程, 硬座, 软座, 硬卧上/中/下, 软卧上/下], [K5550/K5551, [上海, 01:11, 无锡, 02:47, 1小时36分, 126, 10.5, -, 65.5/70.5/73.5, 99.5/105.5, 71|8|1]], [K5550/K5551, [上海, 01:11, 无锡, 02:47, 1...","[http://www.guabu.com/zj_%E4%B8%8A%E6%B5%B7_%E6%97%A0%E9%94%A1, #, #, #, #, #, #, #, #, #, #, #, /checi/K5550/K5551, http://search.huochepiao.com/chaxun/resultz.asp?txtChezhan=上海, http://search.hu..."


# 8

https://www.broadway.com/shows/tickets/

In [33]:
extracted_contents, path = scrape_what_from_where('to kill a mockingbird', 'https://www.broadway.com/shows/tickets/')
extracted_contents

[Success] The website at "https://www.broadway.com/shows/tickets/" is collected successfully.
0

Unique match is found:
<a class="link-111-111" href="/shows/to-kill-mockingbird/" translate="no">To Kill a Mockingbird</a>



Extracting contents ...


[Success] The selector path used to extract contents is:

	div:nth-of-type(4) > div.card.card--hover.card--shadow.bg-white.mtn > div.card__body > div.media > div.media-body:nth-of-type(2) > h2.futura-pt.font-22-xs.font-36-sm.lh-1.ls-negative-05.mhn.mtn.mbr-15 > a.link-111-111



Unnamed: 0,text,url
0,To Kill a Mockingbird,/shows/to-kill-mockingbird/
1,Hamilton,/shows/hamilton-broadway/
2,Wicked,/shows/wicked/
3,Moulin Rouge! The Musical,/shows/moulin-rouge-musical/
4,The Phantom of the Opera,/shows/the-phantom-of-the-opera/
5,The Lion King,/shows/the-lion-king/
6,The Book of Mormon,/shows/book-mormon/
7,Ain't Too Proud – The Life and Times of The Temptations,/shows/aint-too-proud/
8,Aladdin,/shows/aladdin-broadway/
9,Tina: The Tina Turner Musical,/shows/tina-tina-turner-musical/


In [37]:
extracted_contents, path = scrape_what_from_where('to kill a mockingbird', 'https://www.broadway.com/shows/tickets/', go_up = 2)
extracted_contents

[Success] The website at "https://www.broadway.com/shows/tickets/" is collected successfully.
0

Unique match is found:
<a class="link-111-111" href="/shows/to-kill-mockingbird/" translate="no">To Kill a Mockingbird</a>



Extracting contents ...


[Success] The selector path used to extract contents is:

	div:nth-of-type(4) > div.card.card--hover.card--shadow.bg-white.mtn > div.card__body > div.media > div.media-body:nth-of-type(2)



Unnamed: 0,text,url
0,"[[To Kill a Mockingbird, [['from ', '$69.00'], 2hrs, 35mins (1 Intermission)], [Broadway,, Plays,, Stars on Stage,, Drama,, 2019 Tony Nominees,, 2019 Tony Winners]], [[ ...","[/shows/to-kill-mockingbird/, /shows/to-kill-mockingbird/, /shows/to-kill-mockingbird/, /shows/to-kill-mockingbird/]"
1,"[[Hamilton, [['from ', '$149.00'], 2hrs, 55mins (1 Intermission)], [Broadway,, Musicals,, Award Winning,, 2016 Tony Nominees,, 2016 Tony Winners]], [[ ...","[/shows/hamilton-broadway/, /shows/hamilton-broadway/, /shows/hamilton-broadway/, /shows/hamilton-broadway/]"
2,"[[Wicked, [['from ', '$95.00'], 2hrs, 45mins (1 Intermission)], [Broadway,, Bestsellers,, Musicals,, Kid-Friendly,, Comedy,, Award Winning]], [[ ...","[/shows/wicked/, /shows/wicked/, /shows/wicked/, /shows/wicked/]"
3,"[[Moulin Rouge! The Musical, [['from ', '$59.00'], 2hrs, 45mins (1 Intermission)], [Broadway,, Musicals,, Comedy,, 2020 Tony Nominees]], [[ B...","[/shows/moulin-rouge-musical/, /shows/moulin-rouge-musical/, /shows/moulin-rouge-musical/, /shows/moulin-rouge-musical/]"
4,"[[The Phantom of the Opera, [['from ', '$29.00'], 2hrs, 30mins (1 Intermission)], [Broadway,, Musicals,, Classics,, Drama,, Award Winning]], [[ ...","[/shows/the-phantom-of-the-opera/, /shows/the-phantom-of-the-opera/, /shows/the-phantom-of-the-opera/, /shows/the-phantom-of-the-opera/]"
5,"[[The Lion King, [['from ', '$75.00'], 2hrs, 30mins (1 Intermission)], [Broadway,, Musicals,, Kid-Friendly,, Dance,, Award Winning]], [[ Buy ...","[/shows/the-lion-king/, /shows/the-lion-king/, /shows/the-lion-king/, /shows/the-lion-king/]"
6,"[[The Book of Mormon, [['from ', '$69.00'], 2hrs, 30mins (1 Intermission)], [Broadway,, Musicals,, Comedy,, Award Winning,, 2011 Tony Winners]], [[ ...","[/shows/book-mormon/, /shows/book-mormon/, /shows/book-mormon/, /shows/book-mormon/]"
7,"[[Ain't Too Proud – The Life and Times of The Temptations, [['from ', '$49.00'], 2hrs, 30mins (1 Intermission)], [Broadway,, Musicals,, 2019 Tony Nominees,, 2019 Tony Winners]], [[ ...","[/shows/aint-too-proud/, /shows/aint-too-proud/, https://checkout.broadway.com/aint-too-proud/12714/calendar/, /shows/aint-too-proud/]"
8,"[[Aladdin, [['from ', '$57.50'], 2hrs, 30mins (1 Intermission)], [Broadway,, Musicals,, Kid-Friendly,, Comedy,, Award Winning,, 2014 Tony Winners]], [[ ...","[/shows/aladdin-broadway/, /shows/aladdin-broadway/, /shows/aladdin-broadway/, /shows/aladdin-broadway/]"
9,"[[Tina: The Tina Turner Musical, [['from ', '$79.00'], 2hrs, 45mins (1 Intermission)], [Broadway,, Musicals,, 2020 Tony Nominees]], [[ Buy Ti...","[/shows/tina-tina-turner-musical/, /shows/tina-tina-turner-musical/, /shows/tina-tina-turner-musical/, /shows/tina-tina-turner-musical/]"


In [38]:
pages = create_page_url_list(template_url = 'https://www.broadway.com/shows/tickets/?page=NUMBER', start_index = 1, end_index = 3)

In [39]:
broadway_show_df = extract_path_from_pages(path, pages)

0/3, 1/3, 2/3, 

[Success] Content extraction finished.




In [40]:
broadway_show_df

Unnamed: 0,text,url
0,"[[To Kill a Mockingbird, [['from ', '$69.00'], 2hrs, 35mins (1 Intermission)], [Broadway,, Plays,, Stars on Stage,, Drama,, 2019 Tony Nominees,, 2019 Tony Winners]], [[ ...","[/shows/to-kill-mockingbird/, /shows/to-kill-mockingbird/, /shows/to-kill-mockingbird/, /shows/to-kill-mockingbird/]"
1,"[[Hamilton, [['from ', '$149.00'], 2hrs, 55mins (1 Intermission)], [Broadway,, Musicals,, Award Winning,, 2016 Tony Nominees,, 2016 Tony Winners]], [[ ...","[/shows/hamilton-broadway/, /shows/hamilton-broadway/, /shows/hamilton-broadway/, /shows/hamilton-broadway/]"
2,"[[Wicked, [['from ', '$95.00'], 2hrs, 45mins (1 Intermission)], [Broadway,, Bestsellers,, Musicals,, Kid-Friendly,, Comedy,, Award Winning]], [[ ...","[/shows/wicked/, /shows/wicked/, /shows/wicked/, /shows/wicked/]"
3,"[[Moulin Rouge! The Musical, [['from ', '$59.00'], 2hrs, 45mins (1 Intermission)], [Broadway,, Musicals,, Comedy,, 2020 Tony Nominees]], [[ B...","[/shows/moulin-rouge-musical/, /shows/moulin-rouge-musical/, /shows/moulin-rouge-musical/, /shows/moulin-rouge-musical/]"
4,"[[The Phantom of the Opera, [['from ', '$29.00'], 2hrs, 30mins (1 Intermission)], [Broadway,, Musicals,, Classics,, Drama,, Award Winning]], [[ ...","[/shows/the-phantom-of-the-opera/, /shows/the-phantom-of-the-opera/, /shows/the-phantom-of-the-opera/, /shows/the-phantom-of-the-opera/]"
5,"[[The Lion King, [['from ', '$75.00'], 2hrs, 30mins (1 Intermission)], [Broadway,, Musicals,, Kid-Friendly,, Dance,, Award Winning]], [[ Buy ...","[/shows/the-lion-king/, /shows/the-lion-king/, /shows/the-lion-king/, /shows/the-lion-king/]"
6,"[[The Book of Mormon, [['from ', '$69.00'], 2hrs, 30mins (1 Intermission)], [Broadway,, Musicals,, Comedy,, Award Winning,, 2011 Tony Winners]], [[ ...","[/shows/book-mormon/, /shows/book-mormon/, /shows/book-mormon/, /shows/book-mormon/]"
7,"[[Ain't Too Proud – The Life and Times of The Temptations, [['from ', '$49.00'], 2hrs, 30mins (1 Intermission)], [Broadway,, Musicals,, 2019 Tony Nominees,, 2019 Tony Winners]], [[ ...","[/shows/aint-too-proud/, /shows/aint-too-proud/, https://checkout.broadway.com/aint-too-proud/12714/calendar/, /shows/aint-too-proud/]"
8,"[[Aladdin, [['from ', '$57.50'], 2hrs, 30mins (1 Intermission)], [Broadway,, Musicals,, Kid-Friendly,, Comedy,, Award Winning,, 2014 Tony Winners]], [[ ...","[/shows/aladdin-broadway/, /shows/aladdin-broadway/, /shows/aladdin-broadway/, /shows/aladdin-broadway/]"
9,"[[Tina: The Tina Turner Musical, [['from ', '$79.00'], 2hrs, 45mins (1 Intermission)], [Broadway,, Musicals,, 2020 Tony Nominees]], [[ Buy Ti...","[/shows/tina-tina-turner-musical/, /shows/tina-tina-turner-musical/, /shows/tina-tina-turner-musical/, /shows/tina-tina-turner-musical/]"


<br><br><br><br><br><br><br><br><br><br>

### Notes and To-dos

In [41]:
# Once the content of the website is collected, the next step is to parse the page. 
# After parsing, we can find the elements we want, then extract and clean their values. 
# For this step, there are many choices of libraries, some examples are: 

#  - BeautifulSoup
#  - Scrapy
#  - Lxml
#  - AdvancedHTMLParser
 
# In this pipeline, we will explore `BeautifulSoup`. `BeautifulSoup` and `Scrapy` are two popular scraping tools. Between these two, BeautifulSoup is more user-friendly, while Scrapy is more efficient and scalable. 
# As this pipeline is targeted for people with less technical backgrounds, we will sacrafice some efficiency for more intuitive experience.
# If you are working on a large-scale or high-velocity scraping project, please consider Scrapy or other tool.
# The other two libraries listed above are good choices in their specific areas, so keep them in view:
#  - Lxml has rich features for processing XML and HTML and is quite efficient (BeautifulSoup actually supports using Lxml parser among other parsers).
#  - AdvancedHTMLParser has similar functions like in native JavaScript and supports complex operations on HTML.
  
### Choices of Parsers
# References:
#  - https://smartproxy.com/blog/scrapy-vs-beautifulsoup (Use cases comparison and Pros&Cons)
#  - https://tomassetti.me/parsing-html (Common libraries in different programming languages)
#  - https://medium.com/analytics-vidhya/scrapy-vs-selenium-vs-beautiful-soup-for-web-scraping-24008b6c87b8 (Great comparison article that includes Selenium, which is the popular choice for dynamic website scraping)

### Beautiful Soup
# References:   
#  - https://www.datacamp.com/community/tutorials/amazon-web-scraping-using-beautifulsoup (Showed how to write element finding logic in hierarchy)
#  - https://stackabuse.com/guide-to-parsing-html-with-beautifulsoup-in-python (Nice illustrations, browse_and_scrape combines pagination with parsing) 
#  - https://www.crummy.com/software/BeautifulSoup/bs4/doc (Long but detailed description of BS4 usage)
#  - https://www.crummy.com/software/BeautifulSoup (The "Hall of Fame" section has some high-profile projects, worth having a look)

In [None]:
# # $$$
# # To be implemented:

# An interesting side note, here's one quote from the BS project page:
# > You can tell it "Find all the links", or "Find all the links of class externalLink", or "Find all the links whose urls match "foo.com", or "Find the table heading that's got bold text, then give me that text."
# But actually, you CANNOT directly ask BS these natural language questions. You need to write codes that follow the syntax of the BS4 library, which is similar but not quite close to natural language. 
# **Programming with natural language** is one of the directions worth pursuing in the future, as it further lowers the bar for utilizing web scraping and related technologies.

In [None]:
# # $$$
# # To be implemented:

# Note the requests made by the `get_response` function might be recognized as robotic access by some website. 
# To bypass screening by those websites, additional specifications on headers and proxies are required. 
# These additional setup will be implemented in the future versions.

# As a reference, the `get` function from the Python library requests takes the following parameters:
# - url – URL for the new  Request object.
# - params – (optional) Dictionary of GET Parameters to send with the Request.
# - headers – (optional) Dictionary of HTTP Headers to send with the Request.
# - cookies – (optional) CookieJar object to send with the  Request.
# - auth – (optional) AuthObject to enable Basic HTTP Auth.
# - timeout – (optional) Float describing the timeout of the request.

In [None]:
# # $$$
# # To be integrated:
# # When getting the response, use unicode-dammit to detect encodings in smart ways (https://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit)

In [None]:
# # $$$
# # To be implemented formally: Get next/prev sibling
# for child in get_contents(sample_element.parent): # [::-1]
#     print(child == sample_element) # if true then next is next sib or prev sib depending on how contents list is ordered

In [None]:
# # $$$
# # Ready have open-source implementation, but :
# # css_selector_to_xpath is getting a result that is too complicated
# # xpath_to_css_selector cannot handle slightly more complex xpath
# # For simple xpath-css conversion, own implementation might be more transparent and reliable

# # !pip3 install cssify
# # !pip3 install cssselect

# # Reference: https://github.com/santiycr/cssify
# from cssify import cssify
# def xpath_to_css_selector(xpath_string):
#     return cssify(xpath_string)

# # Reference: https://lxml.de/cssselect.html
# from cssselect import GenericTranslator
# def css_selector_to_xpath(css_selector_string):
#     return GenericTranslator().css_to_xpath()

In [None]:
# # $$$
# # Implemented as a solution to indicate to the user which parts of the websites will be scraped
# # However, this involves opening up the newly created html file in a browser and searching for changes
# # The highlighting may not be obvious and the process is almost as complex as using broswer inspector
# # Thus this functionality is dropped and should be kept in view (KIV)

# def highlight_element(element, highlight_style = "background-color: rgba(255,0,0,0.5); border: 3px dotted yellow"):
#     element['style'] = highlight_style

# def highlight_elements(elements, highlight_style = "background-color: rgba(255,0,0,0.5); border: 3px dotted yellow; "):
#     for element in elements:
#         element['style'] = highlight_style

# # highlighted_soup = highlight_element(soup.select(path))

In [None]:
# ################################################################################
# # Reference: How To Rotate Proxies and change IP Addresses using Python 3
# # https://www.scrapehero.com/how-to-rotate-proxies-and-ip-addresses-using-python-3/    
    
# from lxml.html import fromstring
# import requests
# from itertools import cycle
# import traceback

# def get_proxies():
#     url = 'https://free-proxy-list.net/'
#     response = requests.get(url)
#     parser = fromstring(response.text)
#     proxies = set()
#     for i in parser.xpath('//tbody/tr')[:10]:
#         if i.xpath('.//td[7][contains(text(),"yes")]'):
#             proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
#             proxies.add(proxy)
#     return proxies

# #If you are copy pasting proxy ips, put in the list below
# #proxies = ['121.129.127.209:80', '124.41.215.238:45169', '185.93.3.123:8080', '194.182.64.67:3128', '106.0.38.174:8080', '163.172.175.210:3128', '13.92.196.150:8080']
# proxies = get_proxies()
# proxy_pool = cycle(proxies)

# url = 'https://httpbin.org/ip'
# for i in range(1,11):
#     #Get a proxy from the pool
#     proxy = next(proxy_pool)
#     print("Request #%d"%i)
#     try:
#         response = requests.get(url,proxies={"http": proxy, "https": proxy})
#         print(response.json())
#     except:
#         #Most free proxies will often get connection errors. You will have retry the entire request using another proxy to work. 
#         #We will just skip retries as its beyond the scope of this tutorial and we are only downloading a single url 
#         print("Skipping. Connnection error")