In [13]:
# Standard library: urljoin resolves relative image paths against a base URL
from urllib.parse import urljoin

# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

# Import pandas for .read_html() function
import pandas as pd

In [2]:
# Let webdriver_manager locate the ChromeDriver binary, then start a Splinter-controlled Chrome session.
# headless=False keeps the automated browser window visible while the notebook runs.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)




Current google-chrome version is 92.0.4515
Get LATEST driver version for 92.0.4515
Driver [C:\Users\Gary Laptop\.wdm\drivers\chromedriver\win32\92.0.4515.107\chromedriver.exe] found in cache


In [3]:
# Assign the URL of the Mars NASA news site and instruct the browser to visit it
url = 'https://redplanetscience.com'
browser.visit(url)
# Optional delay for loading the page
browser.is_element_present_by_css('div.list_text', wait_time=1)

# With the line above, browser.is_element_present_by_css('div.list_text', wait_time=1), we are
# accomplishing two things.

# One is that we're searching for elements with a specific combination of tag (div) and class (list_text). As an
# example, ul.item_list would be found in HTML as <ul class="item_list">.

# Secondly, wait_time=1 gives the page up to one second for a matching element to appear before the check
# gives up. The optional delay is useful because sometimes dynamic pages take a little while to load, especially
# if they are image-heavy. The call returns True when the element is present, which is the value this cell displays.

True

In [4]:
# Set up the HTML parser: take the page source from the browser and hand it to BeautifulSoup
html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('div.list_text')

# Notice how we've assigned slide_elem as the variable to look for the <div /> tag and its descendants
# (the other tags within the <div /> element)? This is our parent element. This means that this element holds all of
# the other elements within it, and we'll reference it when we want to filter search results even further. The . is used
# for selecting classes, such as list_text, so the selector 'div.list_text' pinpoints a <div /> tag with the
# class of list_text. select_one returns only the first element that matches the selector, so slide_elem is the
# first <div class="list_text"> in the parsed page together with all elements nested within it.
# NOTE: if nothing matches, select_one gives back None and the .find() calls in the following cells would fail.

In [5]:
# After opening the page in a new browser, right-click to inspect and activate your DevTools. Then search for
# the HTML components you'll use to identify the title and paragraph you want.
# What we will search for is: class="content_title"

# We'll want to assign the title and summary text to variables we'll reference later.
# Begin our scraping (the result is only displayed here; it is stored in a variable in the next cell):
slide_elem.find('div', class_='content_title')

# In this line of code, we chained .find onto our previously assigned variable, slide_elem. When we do this,
# we're saying, "This variable holds a ton of information, so look inside of that information to find this specific
# data." The data we're looking for is the content title, which we've specified by saying, "The specific data is in
# a <div /> with a class of 'content_title'."

<div class="content_title">NASA's Mars 2020 Heads Into the Test Chamber</div>

In [6]:
# The title is in the mix of HTML shown in the previous output, but we need just the text; the surrounding
# HTML isn't necessary.

# Inside the parent element, locate the <div /> with a class of content_title...
title_div = slide_elem.find('div', class_='content_title')
# ...then keep only its text. .get_text() returns the text of the element without any HTML tags or elements.
news_title = title_div.get_text()
news_title

# Once executed, the result is the most recent title published on the website. When the website is updated and a new
# article is posted, running this code again will return that article's title instead.

"NASA's Mars 2020 Heads Into the Test Chamber"

In [7]:
# Next we need to add the summary text.
# Use the DevTools selector tool and select the article summary (teaser), then check to see which tag is highlighted.

# "article_teaser_body" is the right class name, but searching for it in DevTools gives more than one result.
# That's expected: there are many articles, each with a <div /> that has a class of article_teaser_body.
# We want the first one on the list, not a specific one, so many matches are fine. If our scraping code were
# too specific, we'd always pull one particular article summary instead of the most recent.

# Because new articles are added to the top of the list, and we only need the most recent one, our search leads us
# to the first article.

# There are two methods used to find tags and attributes with BeautifulSoup:
#   .find() returns only the first element matching the tag and attributes we've specified.
#   .find_all() returns every element matching the tag and attributes.
# For example, using .find_all() instead of .find() here would retrieve all of the summaries inside the parent
# element instead of just the first one.

# Inside the parent element, locate the teaser <div /> and keep only its text
teaser_div = slide_elem.find('div', class_='article_teaser_body')
news_p = teaser_div.get_text()
news_p

'In this time-lapse video taken at JPL, engineers move the Mars 2020 rover into a large vacuum chamber for testing in Mars-like environmental conditions.'

# The next step in our scraping code is to gather the featured images from the Jet Propulsion Laboratory's Space
# Images webpage. In your Jupyter notebook, use markdown to separate the article scraping
# from the image scraping.

### Featured Images

# change the format of the code cell to "Markdown."
# You can access the cell formatting feature by using a drop-down menu at the top of the notebook. It's currently 
# set to "Code," so click the down arrow to toggle the drop-down menu and select "Markdown" instead.
# This would normally just be a cell that says featured images and is changed to markdown, but I have these notes to remember it. No code should go in this one.

In [8]:
# Visit the space images site. Note that this rebinds `url`, which held the news site address until now.
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [9]:
# Next, we want to click the "Full Image" button. This button will direct our browser to an image slideshow.
# Let's take a look at the button's HTML tags and attributes with the DevTools
# <button class="btn btn-outline-light"> FULL IMAGE</button>
# This is a fairly straightforward HTML tag: the <button> element has two
# classes (btn and btn-outline-light) and a string reading "FULL IMAGE".

# First, let's use the dev tools to search for all the button elements. There are 3 of them.
# Since there are only three buttons, and we want to click the full-size image button, we can go ahead and use
# the HTML tag in our code.

# Find and click the full image button. Index [1] picks the second <button> found on the page, so this
# relies on the page's current button order and must be revisited if the layout changes.
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

In [10]:
# With the new page loaded onto our automated browser, it needs to be parsed so we can continue and scrape
# the full-size image URL

# Parse the resulting html with soup (this rebinds `html`, which previously held the news page source)
html = browser.html
img_soup = soup(html, 'html.parser')

In [11]:
# Now we need to find the relative image URL. In our browser (make sure you're on the same page as the automated one),
# activate your DevTools again. This time, let's find the image link for that image.

# We want to pull the most recently posted image for our web app.
# The value of src will be different every time the page is updated, so we can't simply record the current
# value; we would then pull that same image each time the code is executed, instead of the most recent one.

# We'll use the image tag and class (<img /> and fancybox-image) to build the URL to the full-size image.

# Look for the <img /> tag that has a class of fancybox-image: this is where the image we want lives.
full_img_tag = img_soup.find('img', class_='fancybox-image')
# .get('src') pulls the link to the image out of that tag; it is a path relative to the site.
img_url_rel = full_img_tag.get('src')
img_url_rel

'image/featured/mars2.jpg'

In [12]:
# If we copy and paste the relative link into a browser, it won't work. This is because it's only a partial link,
# as the base URL isn't included.

# Use the base URL to create an absolute URL.
# urljoin (from urllib.parse, imported in the notebook's imports cell) resolves the scraped path against the
# site root. For a plain relative path such as 'image/featured/mars2.jpg' the result is the same as gluing the
# two strings together, but it also stays correct if src ever starts with '/' or is already an absolute URL,
# where simple concatenation would produce a malformed address.
img_url = urljoin('https://spaceimages-mars.com/', img_url_rel)
img_url

'https://spaceimages-mars.com/image/featured/mars2.jpg'

In [14]:
# We've chosen to collect our data from Mars Facts, so let's visit the webpage to look at what we'll be working
# with. We already have a great photo and an article, so all we want from this page is the table. Our plan is to
# display it as a table on our own web app, so keeping the current HTML table format is important.

# Let's look at the webpage again, this time using our DevTools. All of the data we want is in a <table /> tag. HTML
# code used to create a table looks fairly complex, but it's really just breaking down and naming each component.

# Tables in HTML are basically made up of many smaller containers. The main container is the <table /> tag. Inside
# the table is <tbody />, which is the body of the table: the headers, columns, and rows.

# <tr /> is the tag for each table row. Within that tag, the table data is stored in <td /> tags. This is where
# the columns are established.

# Instead of scraping each row, or the data in each <td />, we're going to scrape the entire table
# with Pandas' .read_html() function.

# At the top of your Jupyter Notebook, add import pandas as pd to the dependencies and rerun the cell. This
# way, we'll be able to use this new function without generating an error.

# pd.read_html() searches the page for HTML tables and returns them as a list of DataFrames.
mars_tables = pd.read_html('https://galaxyfacts-mars.com')
# Index 0 pulls only the first table it encountered, i.e. the first item in that list.
df = mars_tables[0]
# Assign column names to the new DataFrame for additional clarity.
df.columns = ['description', 'Mars', 'Earth']
# Turn the description column into the DataFrame's index. set_index() returns a new DataFrame, so we
# assign the result back to df.
df = df.set_index('description')
df

# Now, when we call the DataFrame, we're presented with a tidy, Pandas-friendly representation of the HTML table we
# were just viewing on the website.

Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [15]:
# How do we add the DataFrame to a web application? Our data is live: if the table is updated, then we want
# that change to appear in our app.

# Thankfully, Pandas also has a way to easily convert our DataFrame back into HTML-ready code using
# the .to_html() function. Here the resulting HTML string is only displayed as the cell output, not stored:

df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

In [16]:
# End the session: quit the automated browser now that the scraping cells above have run.
# Any later cell that uses `browser` would need the browser setup cell to be run again first.
browser.quit()

In [None]:
### Export to Python

# We can't fully automate the scraping from within the Jupyter Notebook. To automate it, the notebook will need
# to be converted into a .py file.

# The next step in making this an automated process is to download the current code into a Python file. It won't 
# transition over perfectly, we'll need to clean it up a bit, but it's an easier task than copying each 
# cell and pasting it over in the correct order

# The Jupyter ecosystem is an extremely versatile tool. We already know many of its great functions, such as the
# different libraries that work well with it and also how easy it is to troubleshoot code. Another feature is being 
# able to download the notebook into different formats.

# There are several formats available, but we'll focus on one by downloading to a Python file.

    # 1. While your notebook is open, navigate to the File menu at the top of the page.
    
    # 2. From here, scroll down to the "Download as" section of the drop-down menu.
    
    # 3. Select "Python (.py)" from the next menu to download the code
    
    # 4. If you get a warning about downloading this type of file, click "Keep" to continue the download.
    
    # 5. Navigate to your Downloads folder and open the new file. A brief look at the first lines of code shows us
    # that the code wasn't the only thing to be ported over. The number of times each cell has been run is also there,
    # for example as a comment such as "# In[1]:" above each cell's code.
    
    # 6. Clean up the code by removing unnecessary blank spaces and comments. When you're done tidying up the code,
    # make sure you save it in your working folder with your notebook code as scraping.py. You can also test the script by
    # running it through your terminal.