In [1]:
# Set up dependencies
from bs4 import BeautifulSoup
import requests
from splinter import Browser
import pandas as pd
import os
from webdriver_manager.chrome import ChromeDriverManager
import time

In [2]:
# Launch a Splinter-controlled Chrome session.
# ChromeDriverManager().install() returns the path of a chromedriver binary
# (the [WDM] log below shows it downloading/caching one that matches the local Chrome).
driver_location = ChromeDriverManager().install()
executable_path = {'executable_path': driver_location}

# headless=False keeps the browser window visible while the notebook drives it
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389






[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Trying to download new driver from https://chromedriver.storage.googleapis.com/89.0.4389.23/chromedriver_win32.zip
[WDM] - Driver has been saved in cache [C:\Users\mthor\.wdm\drivers\chromedriver\win32\89.0.4389.23]


## NASA Mars News
This section:
1. Uses Splinter to navigate to the NASA Mars Latest News site
2. Uses BeautifulSoup to scrape the latest article's title, teaser text, and date

In [3]:
# URL of the NASA Mars news listing to be scraped
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

# Wait (up to 10 s) for the first article container to exist before snapshotting
# the DOM. The article list is presumably rendered by JavaScript after the initial
# page load (TODO confirm); reading browser.html too early would yield markup
# without any 'list_text' div. Fail here with a clear message instead of with an
# opaque AttributeError later, when the article is parsed.
if not browser.is_element_present_by_css('div.list_text', wait_time=10):
    raise RuntimeError(
        f"No 'div.list_text' article element found at {url} after waiting 10 seconds"
    )

# HTML object: the page source as currently rendered in the browser
html = browser.html

# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(html, 'html.parser')

In [4]:
# Scrape the title, teaser and date from the first list_text class div element on the page
article = soup.find('div', class_='list_text')

# soup.find returns None when nothing matches; report that explicitly rather than
# letting the next line fail with "'NoneType' object has no attribute 'find'".
if article is None:
    raise ValueError(
        "No 'div.list_text' article found in the parsed news page; "
        "the page may not have finished loading or its layout has changed"
    )

# .strip() drops the leading/trailing whitespace that the page markup leaves around
# the text (the teaser was previously stored with trailing spaces).
news_title = article.find('div', class_='content_title').text.strip()
news_teaser = article.find('div', class_='article_teaser_body').text.strip()
article_date = article.find('div', class_='list_date').text.strip()


## JPL Mars Space Images - Featured Image
This section:
1. Uses Splinter to navigate to the JPL Mars Space Images site and open the page for the latest featured Mars image
2. Uses BeautifulSoup to parse the HTML and save the URL of the full-size image

In [5]:
# Open the JPL image search, filtered to the Mars category
url = 'https://www.jpl.nasa.gov/images?search=&category=Mars'
browser.visit(url)

# Snapshot the search-results page and hand it to Beautiful Soup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# The first result card's anchor holds a site-relative path to that image's
# detail page; prefix the JPL host and navigate there
first_card = soup.find('div', class_='SearchResultCard')
image_page = first_card.find('a')['href']
full_image_page = f"https://www.jpl.nasa.gov{image_page}"
browser.visit(full_image_page)

# Re-parse the detail page and read the src of the image tagged with class "BaseImage"
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
featured_image = soup.find("img", class_="BaseImage")
featured_image_url = featured_image["src"]
print(featured_image_url)


https://d2pn8kiwq2w21t.cloudfront.net/images/jpegPIA23810.width-1024.jpg


## Mars Facts
This section:
1. Uses Pandas to parse the HTML table on the Mars space facts page and saves the Mars facts in a dataframe
2. Saves that as an HTML table string
3. Writes it out to an HTML file for backup purposes

In [6]:
# Read every HTML table on the Mars facts page; the first one is used as the facts table
url = 'https://space-facts.com/mars/'
tables = pd.read_html(url)
mars_df = tables[0]
mars_df.columns = ['Fact','Detail']

# Render the facts as an HTML table string (kept in memory for the final data dictionary)
mars_facts_table = mars_df.to_html(justify='left',index=False)

# Write the same table to disk as a backup copy
mars_df.to_html('mars_facts_tbl.html',justify='left',index=False)

# Display the frame. Only a cell's last expression is rendered, so this has to be
# the final line; placed mid-cell it was a silent no-op.
mars_df

## Mars Hemispheres
This section:
1. Uses Splinter to navigate to the Astrogeology Mars Hemispheres pages
2. Uses BeautifulSoup to parse the html and retrieve the title and url for each of the 4 hemisphere images.
    - **NOTE: this step can be a little slow to run. Make sure it has completed before proceeding to the next cell.**
3. Saves this to a Python dictionary

In [7]:
# Open the USGS Astrogeology search results for the enhanced Mars hemisphere images
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

# Parse the rendered results page with Beautiful Soup
html = browser.html
soup = BeautifulSoup(html,'html.parser')

# Each search result is wrapped in a div with class "item"
items = soup.find_all('div', class_='item')

# Collect (detail-page href, heading text) for every result in a single pass,
# then split the pairs into the two parallel lists used by the next cell
scraped_pairs = [
    (card.find('a', class_='itemLink').attrs["href"], card.find('h3').text)
    for card in items
]
link_list = [href for href, _ in scraped_pairs]
title_list = [heading for _, heading in scraped_pairs]


In [8]:
# Iterate through the list of hemisphere image links and navigate to each individual page to scrape the image url
image_list = []
for page_path in link_list:

    # Navigate to the hemisphere's detail page (link_list holds site-relative paths)
    article_page = f"https://astrogeology.usgs.gov{page_path}"
    browser.visit(article_page)

    # Parse the rendered detail page with Beautiful Soup
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # The first link inside the "downloads" div is taken as the full image url
    downloads = soup.find("div", class_="downloads")
    image_url = downloads.find('a').attrs["href"]
    image_list.append(image_url)

# Create a list of hemisphere dictionaries by pairing each title with its image url.
# Pairing by position instead of hard-coding indices 0-3 means the cell neither
# raises IndexError when fewer than four results come back nor drops extra ones.
hemisphere_image_urls = [
    {'title': title, 'img_url': img_url}
    for title, img_url in zip(title_list, image_list)
]

# An empty result means the search page yielded no items; fail loudly rather than
# passing an empty list on to the final data dictionary.
if not hemisphere_image_urls:
    raise ValueError("No hemisphere images were scraped; check that link_list and title_list were populated")


## Scraped Mars Data Dictionary
This section builds a Python dictionary containing all of the scraped Mars data, which will be inserted into a MongoDB database by my Flask application (next step).

In [10]:
# Gather everything scraped above into a single dictionary, one entry per result.
# NOTE(review): the key 'news teaser' contains a space while its siblings use
# underscores, and article_date is scraped earlier but not stored here. Both are
# left as-is because whatever consumes this dictionary may depend on the current keys; confirm.
Mars_dict = {}
Mars_dict['news_title'] = news_title
Mars_dict['news teaser'] = news_teaser
Mars_dict['featured_image_url'] = featured_image_url
Mars_dict['mars_facts_table'] = str(mars_facts_table)
Mars_dict['hemisphere_image_urls'] = hemisphere_image_urls

# Print the assembled dictionary for a quick visual check
summary_line = "Dictionary of scraped Mars data contains : " + str(Mars_dict)
print(summary_line)

Dictionary of scraped Mars data contains : {'news_title': 'NASA Ingenuity Mars Helicopter Prepares for First Flight', 'news teaser': 'Now uncocooned from its protective carbon-fiber shield, the helicopter is being readied for its next steps.  ', 'featured_image_url': 'https://d2pn8kiwq2w21t.cloudfront.net/images/jpegPIA23810.width-1024.jpg', 'mars_facts_table': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: left;">\n      <th>Fact</th>\n      <th>Detail</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n    