In [6]:
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

In [7]:
# Set up Splinter: install (or reuse a cached) chromedriver, then open a visible Chrome window
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)




Current google-chrome version is 95.0.4638
Get LATEST chromedriver version for 95.0.4638 google-chrome
Driver [/Users/liang/.wdm/drivers/chromedriver/mac64/95.0.4638.69/chromedriver] found in cache


In [8]:
# **executable_path is unpacking the dictionary we've stored the path in – think of it as unpacking a suitcase

In [9]:
# headless=False means that all of the browser's actions will be displayed in a Chrome window so we can see them

In [10]:
# Scrape the Title


In [11]:
# Visit the Quotes to Scrape site.
# Store the address in `url`, then tell the Splinter browser to navigate to it.
# The following cell reads the loaded page's HTML and parses it with BeautifulSoup.
url = 'http://quotes.toscrape.com/'
browser.visit(url)

In [4]:
# Parse the HTML
# `browser.html` is read from the Splinter browser and handed to BeautifulSoup together with
# the 'html.parser' parser name. BeautifulSoup parses the HTML text and stores it as an object
# (html_soup) whose different components can then be searched and accessed.
html = browser.html
html_soup = soup(html, 'html.parser')

In [5]:
# Scrape the Title
# Search the parsed page for an <h2> tag and keep only the text inside it.
title = html_soup.find('h2').text
# Leaving the bare name as the last line makes the notebook display its value.
title

'Top Ten tags'

In [None]:
#What we've just done in the last two lines of code is:

#We used our html_soup object we created earlier and chained find() to it to search for the <h2 /> tag.
#We've also extracted only the text within the HTML tags by adding .text to the end of the code.

In [None]:
#Scrape All of the Tags


In [None]:
# QUESTION: how do we know that we need to create a variable first, and cannot just go find the
# <a> tags directly in html_soup?
# NOTE(review): presumably because searching html_soup directly would also return any other
# <a class="tag"> links elsewhere on the page, while searching inside tag_box keeps only the
# links in the tags box -- confirm against the page's HTML.

# Scrape the top ten tags
# The first line, tag_box = html_soup.find('div', class_='tags-box'), creates a new variable tag_box,
# which stores the result of a search. In this case, we're looking for a <div> element with a
# class of tags-box, and we're searching for it in the HTML we parsed earlier and stored in html_soup.
tag_box = html_soup.find('div', class_='tags-box')
# tag_box
# The second line, tags = tag_box.find_all('a', class_='tag'), is similar to the first but more specific.
# The new "tags" variable holds the results of a find_all, and this time we're searching only
# through the element stored in tag_box to find <a> elements with a class of tag.
tags = tag_box.find_all('a', class_='tag')

# This for loop cycles through each tag in the tags variable, takes only the text of the tag
# (dropping the surrounding HTML), and prints it.
for tag in tags:
    word = tag.text
    print(word)

In [12]:
# Scrape Across Pages

# We have already created the Browser instance and navigated to the http://quotes.toscrape.com/ page
# with the visit() method. This cell does not create a new Browser; it only visits the URL again
# with the existing `browser`, so that the page loop in the next cell starts from this address.

url = 'http://quotes.toscrape.com/'
browser.visit(url)

In [15]:
# Scrape the quotes from the first five pages of the site. For every page the loop will:
#   1. Create a BeautifulSoup object from the page the browser is currently showing
#   2. Find all the quotes on the page
#   3. Print each quote from the page
#   4. Click the "Next" button at the bottom of the page to move to the following page
# range(1, last_page + 1) yields the page numbers 1 through 5.
last_page = 5
for x in range(1, last_page + 1):
    html = browser.html
    quote_soup = soup(html, 'html.parser')
    quotes = quote_soup.find_all('span', class_='text')
    for quote in quotes:
        print('page:', x, '----------')
        print(quote.text)
    # Only advance while there is still a page left to scrape. Clicking "Next" after the
    # final page would load a page that is never read, and would fail if no such link exists.
    if x < last_page:
        browser.links.find_by_partial_text('Next').click()

AttributeError: 'str' object has no attribute 'descendants'

In [None]:
#News Title and Paragraph

In [None]:
def mars_news(browser):
    """Scrape the first news headline and its teaser paragraph from the Mars news site.

    Args:
        browser: Splinter browser used to load the page; it is left open for the caller.

    Returns:
        A (news_title, news_p) tuple of strings, or (None, None) when the expected
        elements are not present in the page.
    """
    # Visit the mars nasa news site
    url = 'https://redplanetscience.com/'
    browser.visit(url)

    # Optional delay for loading the page
    browser.is_element_present_by_css('div.list_text', wait_time=1)

    # Convert the browser html to a soup object
    html = browser.html
    news_soup = soup(html, 'html.parser')

    # A missing element makes select_one()/find() return None, so the chained call
    # raises AttributeError; report that as "nothing scraped" instead of crashing.
    try:
        slide_elem = news_soup.select_one('div.list_text')

        # Use the parent element to find the title <div> and save its text as `news_title`
        news_title = slide_elem.find('div', class_='content_title').get_text()

        # Use the parent element to find the paragraph text
        news_p = slide_elem.find('div', class_='article_teaser_body').get_text()

    except AttributeError:
        return None, None

    return news_title, news_p

In [None]:
#Instead of having our title and paragraph printed within the function, we want to return them from the function so
#they can be used outside of it. We'll adjust our code to do so by deleting the standalone news_title and news_p
#lines that displayed the values, and including both variables in the return statement instead, as shown below.

In [None]:
# #When we add "browser" as a parameter of our function, we're telling Python that the function expects to be
# #given a browser when it is called; we pass in the browser variable we defined outside the function.
# #All of our scraping code utilizes an automated browser, and without this parameter, our function wouldn't work.

# The finishing touch is to add error handling to the mix. This is to address any potential errors that may occur 
#during web scraping. Errors can pop up from anywhere, but in web scraping the most common cause of an error is
#when the webpage's format has changed and the scraping code no longer matches the new HTML elements.

# We're going to add a try and except clause addressing AttributeErrors. By adding this error handling, 
#we are able to continue with our other scraping portions even if this one doesn't work.

# In our code, we're going to add the try portion right before the scraping:

In [None]:
# After adding the try portion of our error handling, we need to add the except part. After these lines,
# we'll immediately add the following:

In [None]:
# By adding try: just before scraping, we're telling Python to attempt to find these elements. If nothing goes
#     wrong, the function returns the title and paragraph as usual. If it runs into an AttributeError, however,
#     the except block runs and, instead of crashing, the function returns None in place of the title and paragraph.

In [None]:
# Featured Image


In [None]:
def featured_image(browser):
    """Return the absolute URL of the featured full-size image, or None.

    Args:
        browser: Splinter browser used to load the page and click through to the full image.

    Returns:
        The image URL as a string, or None when the full-size <img> element or its
        `src` attribute is missing.

    Note:
        The button lookup and click sit outside the try block, so an error raised
        there propagates to the caller.
    """
    # Visit URL
    url = 'https://spaceimages-mars.com'
    browser.visit(url)

    # Find and click the full image button (the second <button> found on the page)
    full_image_elem = browser.find_by_tag('button')[1]
    full_image_elem.click()

    # Parse the resulting html with soup
    html = browser.html
    img_soup = soup(html, 'html.parser')

    # find() returns None when the image is absent, which makes .get() raise AttributeError
    try:
        # Find the relative image url
        img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')

    except AttributeError:
        return None

    # An <img> without a src would otherwise produce a bogus URL ending in "None"
    if not img_url_rel:
        return None

    # Use the base url to create an absolute url
    img_url = f'https://spaceimages-mars.com/{img_url_rel}'

    return img_url

In [None]:
# #Declare and define our function.

# #Remove print statement(s) and return them instead.
# #In our Jupyter Notebook version of the code, we printed the results of our scraping by simply stating the 
# variable (e.g., after assigning data to the img_url variable, we simply put img_url on the next line to view 
#           the data). We still want to view the data output in our Python script, but we want to see it at the 
# end of our function instead of within it.

#Add error handling for AttributeError.

In [None]:
# Mars Facts


In [None]:
def mars_facts():
    """Scrape the Mars facts table and return it as an HTML string.

    Returns:
        The table rendered by DataFrame.to_html(), or None when the page cannot be
        read or its first table does not have the expected three columns.
    """
    try:
        # Use 'read_html' to scrape the facts table into a dataframe
        df = pd.read_html('https://galaxyfacts-mars.com')[0]

        # Assign columns and set index of dataframe. This stays inside the try because
        # a table whose width has changed makes the column assignment raise ValueError.
        df.columns = ['Description', 'Mars', 'Earth']
        df = df.set_index('Description')

    except Exception:
        # Exception rather than BaseException, so Ctrl-C (KeyboardInterrupt) and
        # sys.exit() (SystemExit) can still stop the program.
        return None

    # Convert dataframe into HTML format
    return df.to_html()

In [None]:
# Code for the facts table will be updated in a similar manner to the other two. This time, though, 
# we'll use a much broader except block for error handling.

# BaseException is the root of Python's exception hierarchy: every exception, built-in or user-defined, 
# derives from it, so `except BaseException` is a true catchall that also traps KeyboardInterrupt and 
# SystemExit. `except Exception` is almost as broad but lets those two through, which usually makes it 
# the safer choice for a catchall. 
# We need a broad except block here because we're using Pandas' read_html() function to pull data, instead of
# scraping with BeautifulSoup and Splinter. The data is returned a little differently and can result in errors
# other than AttributeErrors, which is what we've been addressing so far.

#As before, we've removed the print statements. Now that we know this code is working correctly, 
# we don't need to view the DataFrame that's generated.

# The code to assign columns and set the index of the DataFrame will remain the same, so the last 
# update we need to complete for this function is to add the return statement.


In [None]:
# Integrate MongoDB Into the Web App

In [None]:
# Before we make our website look pretty (you never know when NASA is looking for its new analyst), 
# we need to connect to Mongo and establish communication between our code and the database we're using. 
# We'll add this last bit of code to our scraping.py script.

# At the top of our scraping.py script, just after importing the dependencies, we'll add one more function. 
# This function differs from the others in that it will:

# Initialize the browser.
# Create a data dictionary.
# End the WebDriver and return the scraped data.
# Let's define this function as "scrape_all" and then initiate the browser.

In [None]:
def scrape_all():
    # Opening lines of scrape_all only: the cells that follow show the rest of the body
    # (the mars_news call, the data dictionary, browser.quit() and the return statement).
    # Initiate headless driver for deployment (headless=True, so no visible Chrome window)
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)


In [None]:
# While we can see the word "browser" here twice, one is the name of the variable passed into the function 
# and the other is the name of a parameter. Coding guidelines do not require that these match, even though 
# they do in our current code.

# When we were testing our code in Jupyter, headless was set as False so we could see the scraping in action. 
# Now that we are deploying our code into a usable web app, we don't need to watch the script work 
# (though it's totally okay if you still want to).

In [None]:
# When scraping, the "headless" browsing session is when a browser is run without the users seeing it at all. 
# So, when headless=True is declared as we initiate the browser, we are telling it to run in headless mode. 
# All of the scraping will still be accomplished, but behind the scenes.

In [None]:
# Next, we're going to set our news title and paragraph variables (remember, mars_news returns two
# values, which are unpacked into the two names on the left).
news_title, news_paragraph = mars_news(browser)
# This line calls our mars_news function, passing it the browser, to pull this data.

In [None]:
# Now that we have our browser ready for work, we need to create the data dictionary.
# Add the following code to our scrape_all() function:

# Run all scraping functions and store results in dictionary.
# The news values were unpacked earlier; featured_image(browser) and mars_facts() are called
# right here while the dictionary is built, and "last_modified" records when this code ran.
data = {
      "news_title": news_title,
      "news_paragraph": news_paragraph,
      "featured_image": featured_image(browser),
      "facts": mars_facts(),
      "last_modified": dt.datetime.now()
}

#This dictionary does two things: It runs all of the functions we've created—featured_image(browser), 
# for example—and it also stores all of the results. When we create the HTML template, we'll create paths 
# to the dictionary's values, which lets us present our data on our template. We're also adding the date the 
# code was run last by adding "last_modified": dt.datetime.now(). For this line to work correctly, 
#     we'll also need to add import datetime as dt to our imported dependencies at the beginning of our code.

In [None]:
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
import datetime as dt
from webdriver_manager.chrome import ChromeDriverManager


In [None]:
   # Last lines of scrape_all (indented because they belong inside the function body):
   # stop the webdriver so the automated browser is fully shut down, then return the data dictionary
   browser.quit()
   return data

In [None]:
# To finish up the function, there are two more things to do. The first is to end the WebDriver using the 
# line browser.quit(). You can quit the automated browser by physically closing it, but there's a chance it won't 
# fully quit in the background. By using code to exit the browser, you'll know that all of the processes have been 
# stopped.

# Second, the return statement needs to be added. This is the final line that will signal that the function is 
# complete, and it will be inserted directly beneath browser.quit(). We want to return the data dictionary created
# earlier, so our return statement will simply read return data.

In [None]:
if __name__ == "__main__":
    # True only when this file is executed directly as a script (not when it is imported):
    # run the full scrape and print the resulting dictionary
    print(scrape_all())

In [None]:
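# This last block of code runs only when the file is executed directly as a script (for example,
# python scraping.py); it is skipped when scraping.py is imported by another module such as the Flask app.
# The print statement will print out the results of our scraping to our terminal after executing the code.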

In [None]:
# Final code

In [None]:
# Import Splinter, BeautifulSoup, and Pandas
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
import datetime as dt
from webdriver_manager.chrome import ChromeDriverManager


def scrape_all():
    """Run every Mars scraper with one headless browser and collect the results.

    Returns:
        A dictionary with the keys "news_title", "news_paragraph", "featured_image",
        "facts" and "last_modified" (the datetime at which the scrape ran).

    Raises:
        Whatever a scraping function raises; the browser is shut down before the
        exception propagates.
    """
    # Initiate headless driver for deployment
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)

    try:
        news_title, news_paragraph = mars_news(browser)

        # Run all scraping functions and store results in a dictionary
        data = {
            "news_title": news_title,
            "news_paragraph": news_paragraph,
            "featured_image": featured_image(browser),
            "facts": mars_facts(),
            "last_modified": dt.datetime.now()
        }
    finally:
        # Stop webdriver even when a scraper fails, so no browser process is left running
        browser.quit()

    return data


def mars_news(browser):
    """Scrape the first news headline and its teaser paragraph.

    Args:
        browser: Splinter browser used to load the page; it is left open for the caller.

    Returns:
        A (title, teaser) tuple of strings, or (None, None) when an expected
        element is missing from the page.
    """
    # Scrape Mars News: load the news page
    browser.visit('https://data-class-mars.s3.amazonaws.com/Mars/index.html')

    # Optional short wait (wait_time=1) for the article list to be present
    browser.is_element_present_by_css('div.list_text', wait_time=1)

    # Hand the browser's current html to BeautifulSoup
    news_soup = soup(browser.html, 'html.parser')

    try:
        # The first 'div.list_text' is the parent of both pieces of text we want
        first_article = news_soup.select_one('div.list_text')
        headline = first_article.find('div', class_='content_title').get_text()
        teaser = first_article.find('div', class_='article_teaser_body').get_text()
    except AttributeError:
        # A lookup returned None, i.e. the page no longer matches these selectors
        return None, None
    else:
        return headline, teaser


def featured_image(browser):
    """Return the absolute URL of the featured full-size image, or None.

    Args:
        browser: Splinter browser used to load the page and click through to the full image.

    Returns:
        The image URL as a string, or None when the full-size <img> element or its
        `src` attribute is missing.

    Note:
        The button lookup and click sit outside the try block, so an error raised
        there propagates to the caller.
    """
    # Visit URL
    url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
    browser.visit(url)

    # Find and click the full image button (the second <button> found on the page)
    full_image_elem = browser.find_by_tag('button')[1]
    full_image_elem.click()

    # Parse the resulting html with soup
    html = browser.html
    img_soup = soup(html, 'html.parser')

    # find() returns None when the image is absent, which makes .get() raise AttributeError
    try:
        # Find the relative image url
        img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')

    except AttributeError:
        return None

    # An <img> without a src would otherwise produce a bogus URL ending in "None"
    if not img_url_rel:
        return None

    # Use the base url to create an absolute url
    img_url = f'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{img_url_rel}'

    return img_url

def mars_facts():
    """Scrape the Mars facts table and return it as Bootstrap-styled HTML.

    Returns:
        The table as an HTML string carrying the "table table-striped" classes, or
        None when the page cannot be read or its first table does not have the
        expected three columns.
    """
    try:
        # Use 'read_html' to scrape the facts table into a dataframe
        df = pd.read_html('https://data-class-mars-facts.s3.amazonaws.com/Mars_Facts/index.html')[0]

        # Assign columns and set index of dataframe. This stays inside the try because
        # a table whose width has changed makes the column assignment raise ValueError.
        df.columns = ['Description', 'Mars', 'Earth']
        df = df.set_index('Description')

    except Exception:
        # Exception rather than BaseException, so Ctrl-C (KeyboardInterrupt) and
        # sys.exit() (SystemExit) can still stop the program.
        return None

    # Convert dataframe into HTML format, add bootstrap
    return df.to_html(classes="table table-striped")

if __name__ == "__main__":

    # True only when this file is executed directly as a script (not when it is imported):
    # run the full scrape and print the resulting dictionary
    print(scrape_all())

In [None]:
# Note about bugs

In [None]:
# It's also a good idea at this point to run your code and check it for errors. Even though the Jupyter Notebook 
# cells have already been tested and bugs were addressed, because we made some slight updates and fine-tuned the 
# converted Python code, it's possible a new bug could have popped up.

# NOTE
# In your terminal, make sure you're in the correct directory with the ls command (if you don't see the files you've 
# been working on, then navigate to the folder you're storing them in). Make sure you have the correct environment 
# activated, then type python app.py into your terminal.

# The next message you see on your terminal should be a message that the Flask application is running on localhost. 
# Enter that address (usually http://127.0.0.1:5000/) into the address bar of your web browser.

# If you don't see that message on your terminal, you likely have a bug in your script. Thankfully, error messages 
# will help you pinpoint where and why an error is occurring.