In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

# Import pandas for .read_html() function
import pandas as pd

# add import datetime as dt
import datetime as dt

In [2]:
def scrape_all():
    """Run every Mars scraper and bundle the results into one dictionary.

    Starts a headless Chrome session through Splinter, runs the news,
    featured-image and facts scrapers, and always shuts the browser down
    afterwards, even when one of the scrapers raises.

    Returns:
        dict: the scraped values under the keys "news_title",
        "news_paragraph", "featured_image", "facts" and "last_modified"
        (the ``datetime`` at which the dictionary was built).
    """
    # Initiate headless driver for deployment
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=True)

    try:
        news_title, news_paragraph = mars_news(browser)

        # Run all scraping functions and store results in a dictionary
        data = {
            "news_title": news_title,
            "news_paragraph": news_paragraph,
            "featured_image": featured_image(browser),
            "facts": mars_facts(),
            "last_modified": dt.datetime.now()
        }
    finally:
        # Stop the webdriver on every path; without this an exception in a
        # scraper would leave the browser process running in the background.
        browser.quit()

    return data


def mars_news(browser):
    """Scrape the first article's title and teaser from the Mars news page.

    Args:
        browser: an open Splinter browser; it is navigated to the news page
            and left open for the caller to reuse.

    Returns:
        tuple: ``(news_title, news_paragraph)`` as strings, or
        ``(None, None)`` when the expected elements are not in the page.
    """
    browser.visit('https://data-class-mars.s3.amazonaws.com/Mars/index.html')

    # Optional delay (wait_time=1) so the article list can load
    browser.is_element_present_by_css('div.list_text', wait_time=1)

    # Parse whatever the browser currently shows
    page = soup(browser.html, 'html.parser')

    try:
        # Parent element of the first article; the lookups below search inside it
        first_article = page.select_one('div.list_text')
        headline = first_article.find('div', class_='content_title').get_text()
        teaser = first_article.find('div', class_='article_teaser_body').get_text()
    except AttributeError:
        # A lookup came back as None, so the chained call on it failed
        scraped = (None, None)
    else:
        scraped = (headline, teaser)

    return scraped


def featured_image(browser):
    """Return the absolute URL of the featured image on the JPL Space page.

    Args:
        browser: an open Splinter browser; it is navigated to the image page
            and left open for the caller to reuse.

    Returns:
        str or None: the full image URL, or ``None`` when the full-image
        button, the ``fancybox-image`` tag, or its ``src`` attribute cannot
        be found.
    """
    base_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/'

    # Visit URL
    browser.visit(f'{base_url}index.html')

    # The full image button is the second <button> on the page. Check the
    # count first so a changed layout yields None instead of raising.
    buttons = browser.find_by_tag('button')
    if len(buttons) < 2:
        return None
    buttons[1].click()

    # Give the lightbox image a moment to be added to the page after the click
    browser.is_element_present_by_css('img.fancybox-image', wait_time=1)

    # Parse the resulting html with soup
    img_soup = soup(browser.html, 'html.parser')

    try:
        # Find the relative image url
        img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
    except AttributeError:
        # find() returned None: the image tag is not in the page
        return None

    # .get() returns None when the tag has no src; do not build ".../None"
    if not img_url_rel:
        return None

    # Use the base url to create an absolute url
    return f'{base_url}{img_url_rel}'

def mars_facts():
    """Scrape the Mars facts table and return it as Bootstrap-styled HTML.

    Returns:
        str or None: the table rendered by ``DataFrame.to_html`` with the
        classes ``table table-striped``, or ``None`` when the page cannot be
        read or the first table does not have the expected three columns.
    """
    try:
        # Use 'read_html' to scrape the facts table into a dataframe
        df = pd.read_html('https://data-class-mars-facts.s3.amazonaws.com/Mars_Facts/index.html')[0]

        # Assign columns; raises ValueError if the table is not three columns wide
        df.columns = ['Description', 'Mars', 'Earth']

    except Exception:
        # Exception rather than BaseException, so Ctrl-C (KeyboardInterrupt)
        # and SystemExit still propagate instead of being reported as "no data".
        return None

    # Set the index without mutating df in place
    facts = df.set_index('Description')

    # Convert dataframe into HTML format, add bootstrap
    return facts.to_html(classes="table table-striped")

if __name__ == "__main__":

    # Runs when this code is executed directly (as a script, or as a notebook
    # cell, where __name__ is also "__main__") rather than imported:
    # perform a full scrape and print the resulting dictionary.
    print(scrape_all())
    
# I copied the above code from what was supposed to go in scraping.py. The code in the notes below is probably more
# reliable to go by for the challenge.



Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [C:\Users\Gary Laptop\.wdm\drivers\chromedriver\win32\92.0.4515.107\chromedriver.exe] found in cache


{'news_title': "The Launch Is Approaching for NASA's Next Mars Rover, Perseverance", 'news_paragraph': "The Red Planet's surface has been visited by eight NASA spacecraft. The ninth will be the first that includes a roundtrip ticket in its flight plan. ", 'featured_image': 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars1.jpg', 'facts': '<table border="1" class="dataframe table table-striped">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <

In [3]:
# set your executable path then set up the URL
#executable_path = {'executable_path': ChromeDriverManager().install()}
#browser = Browser('chrome', **executable_path, headless=False)


In [4]:
# assign the url and instruct the browser to visit it
# Visit the mars nasa news site
#url = 'https://redplanetscience.com'
#browser.visit(url)
# Optional delay for loading the page
#browser.is_element_present_by_css('div.list_text', wait_time=1)

# With the following line, browser.is_element_present_by_css('div.list_text', wait_time=1), we are 
# accomplishing two things.

# One is that we're searching for elements with a specific combination of tag (div) and class (list_text). As an
# example, ul.item_list would be found in HTML as <ul class="item_list">.

# Secondly, we're also telling our browser to wait one second before searching for components. 
# The optional delay is useful because sometimes dynamic pages take a little while to load, especially if they are
# image-heavy

In [5]:
# set up the HTML parser:
#html = browser.html
#news_soup = soup(html, 'html.parser')
#slide_elem = news_soup.select_one('div.list_text')

# Notice how we've assigned slide_elem as the variable to look for the <div /> tag and its descendants
# (the other tags within the <div /> element)? This is our parent element. This means that this element holds all of
# the other elements within it, and we'll reference it when we want to filter search results even further. The . is used
# for selecting classes, such as list_text, so the code 'div.list_text' pinpoints the <div /> tag with the
# class of list_text. CSS selectors are evaluated from right to left: the rightmost part of the selector is matched
# first, and the parts to its left are then used to narrow the result. With select_one, only the first matching
# element is returned: here, the first <div /> element with a class of list_text and all nested elements within it.

In [6]:
# After opening the page in a new browser, right-click to inspect and activate your DevTools. Then search for
# the HTML components you'll use to identify the title and paragraph you want
# What we will search is: class = “content_title”

# We'll want to assign the title and summary text to variables we'll reference later
# begin our scraping:
#slide_elem.find('div', class_='content_title')

# In this line of code, we chained .find onto our previously assigned variable, slide_elem. When we do this,
# we're saying, "This variable holds a ton of information, so look inside of that information to find this specific
# data." The data we're looking for is the content title, which we've specified by saying, "The specific data is in 
# a <div /> with a class of 'content_title'."

In [7]:
# The title is in that mix of HTML in our output—that's awesome! But we need to get just the text, and the extra
# HTML stuff isn't necessary. 

# Use the parent element to find the first `a` tag and save it as `news_title`
#news_title = slide_elem.find('div', class_='content_title').get_text()
#news_title

# We've added something new to our .find() method here: .get_text(). When this new method is chained onto .find(), only 
# the text of the element is returned. The code above, for example, would return only the title of the news article and not
# any of the HTML tags or elements

# Once executed, the result is the most recent title published on the website. When the website is updated and a new 
# article is posted, when our code is run again, it will return that article instead.

In [8]:
# Next we need to add the summary text.
# Use the DevTools selector tool and select the article summary (teaser), then check to see which tag is highlighted.

# We know that "article_teaser_body" is the right class name, but when we search for it, there is more than one 
# result. What now?

# That's okay. There will be many matches because there are many articles, each with a tag of <div /> and a class of
# article_teaser_body. We want to pull the first one on the list, not a specific one, so more than 10 results is fine. 
# In this case, if our scraping code is too specific, we'd pull only that article summary instead of the most recent.

# Because new articles are added to the top of the list, and we only need the most recent one, our search leads us to the
# first article.

# There are two methods used to find tags and attributes with BeautifulSoup:

# .find() is used when we want only the first class and attribute we've specified.
# .find_all() is used when we want to retrieve all of the tags and attributes.
# For example, if we were to use .find_all() instead of .find() when pulling the summary, we would retrieve all of
# the summaries on the page instead of just the first one.

# Use the parent element to find the paragraph text
#news_p = slide_elem.find('div', class_='article_teaser_body').get_text()
#news_p

# Our next step scraping code will be to gather the featured images from the Jet Propulsion Laboratory's Space 
# Images (Links to an external site.) webpage. In your Jupyter notebook, use markdown to separate the article scraping 
# from the image scraping.

### Featured Images

# change the format of the code cell to "Markdown."
# You can access the cell formatting feature by using a drop-down menu at the top of the notebook. It's currently 
# set to "Code," so click the down arrow to toggle the drop-down menu and select "Markdown" instead.
# This would normally just be a cell that says featured images and is changed to markdown, but I have these notes to remember it. No code should go in this one.

In [9]:
# Visit URL
#url = 'https://spaceimages-mars.com'
#browser.visit(url)

In [10]:
# Next, we want to click the "Full Image" button. This button will direct our browser to an image slideshow.
# Let's take a look at the button's HTML tags and attributes with the DevTools
# <button class="btn btn-outline-light"> FULL IMAGE</button>
# This is a fairly straightforward HTML tag: the <button> element has two
# classes (btn and btn-outline-light) and a string reading "FULL IMAGE".

# First, let's use the dev tools to search for all the button elements. There are 3 of them.
# Since there are only three buttons, and we want to click the full-size image button, we can go ahead and use
# the HTML tag in our code.

# Find and click the full image button
#full_image_elem = browser.find_by_tag('button')[1]
#full_image_elem.click()

In [11]:
# With the new page loaded onto our automated browser, it needs to be parsed so we can continue and scrape
# the full-size image URL

# Parse the resulting html with soup
#html = browser.html
#img_soup = soup(html, 'html.parser')

In [12]:
# Now we need to find the relative image URL. In our browser (make sure you're on the same page as the automated one),
# activate your DevTools again. This time, let's find the image link for that image. 

# We want to pull the most recently posted image for our web app
# It's important to note that the value of the src will be different every time the page is updated, so we
# can't simply record the current value—we would only pull that image each time the code is executed, instead of the most
# recent one.

# We'll use the image tag and class (<img /> and fancybox-image) to build the URL to the full-size image.

# Find the relative image url
#img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
#img_url_rel

# We've done a lot with that single line.
    # An img tag is nested within this HTML, so we've included it.
    # .get('src') pulls the link to the image.
    
# What we've done here is tell BeautifulSoup to look inside the <img /> tag for an image with a class of fancybox-image.
# Basically we're saying, "This is where the image we want lives—use the link that's inside these tags."

In [13]:
# if we copy and paste this link into a browser, it won't work. This is because it's only a partial link, as the base
# URL isn't included.

# Let's add the base URL to our code.
# Use the base URL to create an absolute URL
#img_url = f'https://spaceimages-mars.com/{img_url_rel}'
#img_url

In [14]:
# We've chosen to collect our data from Mars Facts (Links to an external site.), so let's visit the webpage to look at
# what we'll be working with. We already have a great photo and an article, so all we want from this page is the
# table. Our plan is to display it as a table on our own web app, so keeping the current HTML table format is important.

# Let's look at the webpage again, this time using our DevTools. All of the data we want is in a <table /> tag. HTML
# code used to create a table looks fairly complex, but it's really just breaking down and naming each component.

# Tables in HTML are basically made up of many smaller containers. The main container is the <table /> tag. Inside 
# the table is <tbody />, which is the body of the table—the headers, columns, and rows.

# <tr /> is the tag for each table row. Within that tag, the table data is stored in <td /> tags. This is where
# the columns are established.

# Instead of scraping each row, or the data in each <td />, we're going to scrape the entire table 
# with Pandas' .read_html() function.

# At the top of your Jupyter Notebook, add import pandas as pd to the dependencies and rerun the cell. This 
# way, we'll be able to use this new function without generating an error

# Turn the table into a dataframe basically
#df = pd.read_html('https://galaxyfacts-mars.com')[0]
#df.columns=['description', 'Mars', 'Earth']
#df.set_index('description', inplace=True)
#df

# df = pd.read_html('https://galaxyfacts-mars.com')[0] With this line, we're creating a new DataFrame
# from the HTML table. The Pandas function read_html() specifically searches for and returns a list of tables found in the
# HTML. By specifying an index of 0, we're telling Pandas to pull only the first table it encounters, or the first item
# in the list. Then, it turns the table into a DataFrame.

# df.columns=['description', 'Mars', 'Earth'] Here, we assign columns to the new DataFrame for additional clarity.

# df.set_index('description', inplace=True) By using the .set_index() function, we're turning the Description column
# into the DataFrame's index. inplace=True means that the updated index will remain in place, without having to reassign 
# the DataFrame to a new variable.

# Now, when we call the DataFrame, we're presented with a tidy, Pandas-friendly representation of the HTML table we 
# were just viewing on the website.

In [15]:
# How do we add the DataFrame to a web application? Our data is live—if the table is updated, then we want
# that change to appear in our app

# Thankfully, Pandas also has a way to easily convert our DataFrame back into HTML-ready code using
# the .to_html() function:

#df.to_html()

In [16]:
# End the session
#browser.quit()

In [17]:
### Export to Python

# we can't automate the scraping using the Jupyter Notebook. To fully automate it, it will need to be 
# converted into a .py file.

# The next step in making this an automated process is to download the current code into a Python file. It won't 
# transition over perfectly, we'll need to clean it up a bit, but it's an easier task than copying each 
# cell and pasting it over in the correct order

# The Jupyter ecosystem is an extremely versatile tool. We already know many of its great functions, such as the
# different libraries that work well with it and also how easy it is to troubleshoot code. Another feature is being 
# able to download the notebook into different formats.

# There are several formats available, but we'll focus on one by downloading to a Python file.

    # 1. While your notebook is open, navigate to the top of the page to the Files tab.
    
    # 2. From here, scroll down to the "Download as" section of the drop-down menu.
    
    # 3. Select "Python (.py)" from the next menu to download the code
    
    # 4. If you get a warning about downloading this type of file, click "Keep" to continue the download.
    
    # 5. Navigate to your Downloads folder and open the new file. A brief look at the first lines of code shows us
    # that the code wasn't the only thing to be ported over. The number of times each cell has been run is also there, 
    # for example
    
    # 6. Clean up the code by removing unnecessary blank spaces and comments. When you're done tidying up the code,
    # make sure you save it in your working folder with your notebook code as scraping.py. You can also test the script by
    # running it through your terminal.

In [18]:
# we want our code to be reused, and often, to pull the most recent data.
# Functions enable this capability by bundling our code into something that is easy for us (and once it's deployed, 
# whoever else we share the web app with) to use and reuse as needed.

# Each major scrape, such as the news title and paragraph or featured image, will be divided into a 
# self-contained, reusable function.

### News Title and Paragraph
# we will revisit the news title and paragraph code above and insert it into a function. Let's call it mars_news
# Begin the function by defining it, then indent the code as needed to adhere to function syntax

#def mars_news():

   # Visit the mars nasa news site
   #url = 'https://redplanetscience.com/'
   #browser.visit(url)

   # Optional delay for loading the page
   #browser.is_element_present_by_css('div.list_text', wait_time=1)

   # Convert the browser html to a soup object and then quit the browser
   #html = browser.html
   #news_soup = soup(html, 'html.parser')

   #slide_elem = news_soup.select_one('div.list_text')
   #slide_elem.find('div', class_='content_title')

   # Use the parent element to find the first <a> tag and save it as  `news_title`
   #news_title = slide_elem.find('div', class_='content_title').get_text()
   #news_title

   # Use the parent element to find the paragraph text
   #news_p = slide_elem.find('div', class_='article_teaser_body').get_text()
   #news_p
    
   #return mars_news()

In [19]:
# Instead of having our title and paragraph printed within the function, we want to return them from the function
# so they can be used outside of it. We'll adjust our code to do so by deleting news_title and news_p and include
# them in the return statement instead

#def mars_news():

   # Visit the mars nasa news site
   #url = 'https://redplanetscience.com/'
   #browser.visit(url)

   # Optional delay for loading the page
   #browser.is_element_present_by_css('div.list_text', wait_time=1)

   # Convert the browser html to a soup object and then quit the browser
   #html = browser.html
   #news_soup = soup(html, 'html.parser')

   #slide_elem = news_soup.select_one('div.list_text')

   # Use the parent element to find the first <a> tag and save it as `news_title`
   #news_title = slide_elem.find('div', class_='content_title').get_text()

   # Use the parent element to find the paragraph text
   #news_p = slide_elem.find('div', class_='article_teaser_body').get_text()

   #return news_title, news_p

In [20]:
# There are two things left to do. First, we need to add an argument to the function.
# The finishing touch is to add error handling to the mix. This is to address any potential errors that may
# occur during web scraping. We're going to add a try and except clause addressing AttributeErrors. By adding this error
# handling, we are able to continue with our other scraping portions even if this one doesn't work
# After adding the try portion of our error handling, we need to add the except part. By adding try: just before 
# scraping, we're telling Python to look for these elements. If there's an error, Python will continue to run the remainder
# of the code. If it runs into an AttributeError, however, instead of returning the title and paragraph, Python will
# return nothing instead.

#def mars_news(browser):

    # Scrape Mars News
    # Visit the mars nasa news site
    #url = 'https://redplanetscience.com/'
    #browser.visit(url)

    # Optional delay for loading the page
    #browser.is_element_present_by_css('div.list_text', wait_time=1)

    # Convert the browser html to a soup object and then quit the browser
    #html = browser.html
    #news_soup = soup(html, 'html.parser')

    # Add try/except for error handling
    #try:
        #slide_elem = news_soup.select_one('div.list_text')
        # Use the parent element to find the first 'a' tag and save it as 'news_title'
        #news_title = slide_elem.find('div', class_='content_title').get_text()
        # Use the parent element to find the paragraph text
        #news_p = slide_elem.find('div', class_='article_teaser_body').get_text()

    #except AttributeError:
        #return None, None

    #return news_title, news_p

In [21]:
### Featured Image

# Declare and define our function.

#def featured_image(browser):
# Remove print statement(s) and return them instead.

# In our Jupyter Notebook version of the code, we printed the results of our scraping by simply stating the 
# variable (e.g., after assigning data to the img_url variable, we simply put img_url on the next line to view the data).
# We still want to view the data output in our Python script, but we want to see it at the end of our function instead of 
# within it.

#return img_url
# Add error handling for AttributeError.

#try:
   # find the relative image url
   #img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')

#except AttributeError:
   #return None
# All together, this function should look as follows:

#def featured_image(browser):
    # Visit URL
    #url = 'https://spaceimages-mars.com'
    #browser.visit(url)

    # Find and click the full image button
    #full_image_elem = browser.find_by_tag('button')[1]
    #full_image_elem.click()

    # Parse the resulting html with soup
    #html = browser.html
    #img_soup = soup(html, 'html.parser')

    # Add try/except for error handling
    #try:
        # Find the relative image url
        #img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')

    #except AttributeError:
        #return None

    # Use the base url to create an absolute url
    #img_url = f'https://spaceimages-mars.com/{img_url_rel}'

    #return img_url

In [22]:
### Mars Facts

# A BaseException is a little bit of a catchall when it comes to error handling. It is the root of Python's exception
# hierarchy, so catching it matches every exception: the built-in ones, user-defined ones, and even interpreter
# signals such as KeyboardInterrupt and SystemExit. (In most code, "except Exception" is the safer choice because it
# leaves those signals alone.) We're using it here because we're
# using Pandas' read_html() function to pull data, instead of scraping with BeautifulSoup and Splinter. The data is
# returned a little differently and can result in errors other than AttributeErrors, which is what we've been addressing
# so far.

# Let's first define our function:

#def mars_facts():
# Next, we'll update our code by adding the try and except block.

   #try:
      # use 'read_html' to scrape the facts table into a dataframe
      #df = pd.read_html('https://galaxyfacts-mars.com')[0]
   #except BaseException:
      #return None
# As before, we've removed the print statements. Now that we know this code is working correctly, we don't need to 
# view the DataFrame that's generated.

# The code to assign columns and set the index of the DataFrame will remain the same, so the last update we need to
# complete for this function is to add the return statement.

   #return df.to_html()
# The full mars_facts function should look like this:

#def mars_facts():
    # Add try/except for error handling
    #try:
        # Use 'read_html' to scrape the facts table into a dataframe
        #df = pd.read_html('https://galaxyfacts-mars.com')[0]

    #except BaseException:
        #return None

    # Assign columns and set index of dataframe
    #df.columns=['Description', 'Mars', 'Earth']
    #df.set_index('Description', inplace=True)

    # Convert dataframe into HTML format, add bootstrap
    #return df.to_html()

In [23]:
# Now we're ready to integrate Mongo Into the Web App
# The next step is to integrate Mongo into the web app. We want the script to update the data stored in Mongo each time 
# it's run. We need to add just a little bit more code to our scraping.py script to establish the link between scraped data
# and the database.

# At the top of our scraping.py script, just after importing the dependencies, we'll add one more function. This function differs from the others in that it will:

# Initialize the browser.
# Create a data dictionary.
# End the WebDriver and return the scraped data.
# Let's define this function as "scrape_all" and then initiate the browser.

#def scrape_all():
    # Initiate headless driver for deployment
    #executable_path = {'executable_path': ChromeDriverManager().install()}
    #browser = Browser('chrome', **executable_path, headless=True)
    
    # This was put in the second cell after the dependencies. The following cells are just notes on this process and
    # the rest of the code we're adding to that cell.

In [24]:
# Next, we're going to set our news title and paragraph variables (remember, this function will return two values).
# This line of code tells Python that we'll be using our mars_news function to pull this data.

#news_title, news_paragraph = mars_news(browser)

In [25]:
# Now that we have our browser ready for work, we need to create the data dictionary. Add the following code to
# our scrape_all() function:

# Run all scraping functions and store results in dictionary
#data = {
      #"news_title": news_title,
      #"news_paragraph": news_paragraph,
      #"featured_image": featured_image(browser),
      #"facts": mars_facts(),
      #"last_modified": dt.datetime.now()
#}

# This dictionary does two things: It runs all of the functions we've created—featured_image(browser), for example—and
# it also stores all of the results. When we create the HTML template, we'll create paths to the dictionary's values, 
# which lets us present our data on our template. We're also adding the date the code was run last by 
# adding "last_modified": dt.datetime.now(). For this line to work correctly, we'll also need to add import datetime as
# dt to our imported dependencies at the beginning of our code.

In [26]:
# To finish up the function, there are two more things to do. The first is to end the WebDriver using the line
# browser.quit(). You can quit the automated browser by physically closing it, but there's a chance it won't fully quit
# in the background. By using code to exit the browser, you'll know that all of the processes have been stopped.

# Second, the return statement needs to be added. This is the final line that will signal that the function is complete,
# and it will be inserted directly beneath browser.quit(). We want to return the data dictionary created earlier, so
# our return statement will simply read return data.

# Stop webdriver and return data
#browser.quit()
#return data

In [27]:
# The last step we need to add is similar to the last code block in our app.py file.
#if __name__ == "__main__":
    # If running as script, print scraped data
    #print(scrape_all())

    ### The last few cells are in the second cell. These are just the notes for it.