In [1]:
# Import BeautifulSoup
from bs4 import BeautifulSoup

In [2]:
# Import StringIO (wraps HTML/JSON text handed to the pandas readers)
from io import StringIO

# Import pandas
import pandas as pd

In [3]:
# Import Splinter and set the chromedriver path
from splinter import Browser
# Chromedriver location, handed to Splinter as a keyword argument
executable_path = dict(executable_path="/usr/local/bin/chromedriver")
# headless=False: do not run Chrome in headless mode
browser = Browser("chrome", headless=False, **executable_path)

## Scraping news from NASA

In [459]:
# Point the browser at NASA's Mars news page; `url` is rebound by later sections
url = "https://mars.nasa.gov/news/"
browser.visit(url)

In [460]:
# Take the HTML the browser currently holds and parse it with BeautifulSoup
# using the "html.parser" parser
html = browser.html
soup = BeautifulSoup(html, "html.parser")

In [461]:
# Quickest route to the top story: take the first element matching each CSS class
# straight from the live browser page and read its text
current_news_title = browser.find_by_css(".content_title").first.text
current_news_p = browser.find_by_css(".article_teaser_body").first.text

In [463]:
# Show the scraped headline and teaser, one per line
print(current_news_title, current_news_p, sep="\n")

NASA's Curiosity Rover Finds an Ancient Oasis on Mars
New evidence suggests salty, shallow ponds once dotted a Martian crater — a sign of the planet's drying climate.


In [61]:
# I misunderstood and thought we needed to scrape all articles and headlines on the page
# Below is the procedure for how I gathered them and put them into a list of dictionaries

# Headlines: text of every element with class "content_title"
news_title = [element.text for element in browser.find_by_css(".content_title")]

# Teaser paragraphs: text of every element with class "article_teaser_body"
news_p = [element.text for element in browser.find_by_css(".article_teaser_body")]

# Making a list of dictionaries, one per article

# It turns out there are a bunch of empty entries at the end of the news_title list;
# happily, the entries before the end correspond with the news paragraphs of the same index.

# zip() stops at the shorter of the two lists, so the trailing empty headlines are
# dropped without cleaning up news_title, and a length mismatch in the other
# direction cannot raise an IndexError.
mars_news = [
    {'news_title': headline, 'news_p': teaser}
    for headline, teaser in zip(news_title, news_p)
]

In [389]:
# Preview the first three article dictionaries
mars_news[:3]

[{'news_title': "NASA's Curiosity Rover Finds an Ancient Oasis on Mars",
  'news_p': "New evidence suggests salty, shallow ponds once dotted a Martian crater — a sign of the planet's drying climate."},
 {'news_title': "NASA's Mars 2020 Rover Tests Descent-Stage Separation",
  'news_p': "A crane lifts the rocket-powered descent stage away from NASA's Mars 2020 rover after technicians tested the pyrotechnic charges that separate the two spacecraft."},
 {'news_title': "NASA's Push to Save the Mars InSight Lander's Heat Probe",
  'news_p': "The scoop on the end of the spacecraft's robotic arm will be used to 'pin' the mole against the wall of its hole."}]

In [108]:
# Procedure if I wanted to zip the lists together and convert them to a single dictionary 
# where the title is the key and the article is the value

# # Zipping together the lists
# NASA_zip = zip(news_title, news_p)

# # Converting to a dictionary 
# NASA_dict = dict((x,y) for x, y in NASA_zip)

## Getting featured image from NASA JPL Mars Space Images

In [403]:
# New URL -- JPL Mars Space Images (the query string filters on category=Mars)
# This rebinds `url`, which previously held the news page address
url = "https://jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url)

In [404]:
# Re-parse the browser's current HTML: from here on `html` and `soup` describe the
# JPL page and no longer the news page
html = browser.html
soup = BeautifulSoup(html, "html.parser")

In [442]:
# Image source
# Read the inline `style` attribute of the first element with class "carousel_item";
# it holds a CSS background-image declaration containing the image path
img_source = soup.find(class_ = "carousel_item")['style']

In [443]:
# print(img_source)

background-image: url('/spaceimages/images/wallpaper/PIA16227-1920x1200.jpg');


In [450]:
# Only the text between the single quotes of url('...') is needed, so split the
# style string on single quotes and keep the second piece (index 1)
img_string = img_source.split("'")[1]

In [452]:
# print(img_string)

/spaceimages/images/wallpaper/PIA16227-1920x1200.jpg


In [453]:
# The scraped path is site-relative, so it needs the site root in front of it
base_url = "https://jpl.nasa.gov"

# Full image URL: site root followed by the scraped path
featured_image_url = base_url + img_string

In [454]:
# Check the assembled featured-image URL
print(featured_image_url)

https://jpl.nasa.gov/spaceimages/images/wallpaper/PIA16227-1920x1200.jpg


## Getting latest tweet from Mars Weather twitter account

In [185]:
# Point the browser at the Mars Weather twitter account (English-language page)
url = "https://twitter.com/marswxreport?lang=en"
browser.visit(url)

In [191]:
# Tweet text is in elements with class "TweetTextSize"; we want the top tweet, so
# that's index 0. Index the element list directly rather than reading and stripping
# the text of every tweet on the page only to keep the first one.
mars_weather = browser.find_by_css(".TweetTextSize")[0].text.strip()

In [192]:
# Check the scraped weather report text -- looks good!
print(mars_weather)

InSight sol 309 (2019-10-10) low -102.3ºC (-152.1ºF) high -26.2ºC (-15.1ºF)
winds from the SSE at 6.1 m/s (13.6 mph) gusting to 18.9 m/s (42.4 mph)
pressure at 7.20 hPa


## Scraping a table about Mars from Space Facts

In [12]:
# Point the browser at the Space Facts page for Mars
url = "https://space-facts.com/mars/"
browser.visit(url)

In [13]:
# Use Beautiful Soup to scrape table
# Re-parse the current page, then keep the first <table> element found on it
html = browser.html
soup = BeautifulSoup(html, "html.parser")
table = soup.find_all('table')[0]

In [16]:
# print(table)

In [14]:
# Use pandas to parse the scraped <table> markup; read_html returns a list of
# DataFrames. The markup is wrapped in StringIO because passing a literal HTML
# string to read_html is deprecated in recent pandas releases, while a
# file-like object is accepted by old and new versions alike.
mars_facts = pd.read_html(StringIO(str(table)))

In [17]:
# Table string looks good
# mars_facts is a list holding the parsed table, hence the surrounding [ ] in the printout
print(mars_facts)

[  Mars - Earth Comparison             Mars            Earth
0               Diameter:         6,779 km        12,742 km
1                   Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
2                  Moons:                2                1
3      Distance from Sun:   227,943,824 km   149,598,262 km
4         Length of Year:   687 Earth days      365.24 days
5            Temperature:    -153 to 20 °C      -88 to 58°C]


In [15]:
# Converting table to JSON
# mars_facts[0] is the first parsed table; orient='records' serializes it as a JSON
# array with one object per table row
mars_table = mars_facts[0].to_json(orient='records')
print(mars_table)

[{"Mars - Earth Comparison":"Diameter:","Mars":"6,779 km","Earth":"12,742 km"},{"Mars - Earth Comparison":"Mass:","Mars":"6.39 \u00d7 10^23 kg","Earth":"5.97 \u00d7 10^24 kg"},{"Mars - Earth Comparison":"Moons:","Mars":"2","Earth":"1"},{"Mars - Earth Comparison":"Distance from Sun:","Mars":"227,943,824 km","Earth":"149,598,262 km"},{"Mars - Earth Comparison":"Length of Year:","Mars":"687 Earth days","Earth":"365.24 days"},{"Mars - Earth Comparison":"Temperature:","Mars":"-153 to 20 \u00b0C","Earth":"-88 to 58\u00b0C"}]


In [24]:
# Making a dataframe that's unnecessary for the .py file but looks cool
# The JSON text is wrapped in StringIO because passing a literal JSON string to
# read_json is deprecated in recent pandas releases; a file-like object works in
# old and new versions alike.
df = pd.read_json(StringIO(mars_table))
df

Unnamed: 0,Mars - Earth Comparison,Mars,Earth
0,Diameter:,"6,779 km","12,742 km"
1,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
2,Moons:,2,1
3,Distance from Sun:,"227,943,824 km","149,598,262 km"
4,Length of Year:,687 Earth days,365.24 days
5,Temperature:,-153 to 20 °C,-88 to 58°C


In [35]:
# Just for giggles, here's a printout in ASCII
# headers='keys' labels the columns with the DataFrame's column names and
# tablefmt='psql' selects the boxed layout
# NOTE(review): this import sits mid-notebook; consider moving it to the imports at the top
from tabulate import tabulate
print( tabulate(mars_facts[0], headers='keys', tablefmt='psql') )

+----+---------------------------+-----------------+-----------------+
|    | Mars - Earth Comparison   | Mars            | Earth           |
|----+---------------------------+-----------------+-----------------|
|  0 | Diameter:                 | 6,779 km        | 12,742 km       |
|  1 | Mass:                     | 6.39 × 10^23 kg | 5.97 × 10^24 kg |
|  2 | Moons:                    | 2               | 1               |
|  3 | Distance from Sun:        | 227,943,824 km  | 149,598,262 km  |
|  4 | Length of Year:           | 687 Earth days  | 365.24 days     |
|  5 | Temperature:              | -153 to 20 °C   | -88 to 58°C     |
+----+---------------------------+-----------------+-----------------+


## Getting high-res pictures of the Mars hemispheres from USGS.gov

In [11]:
# Changing url to the usgs.gov search results for enhanced Mars hemisphere images,
# then echoing the address for reference
url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(url)
print(url)

https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars


In [340]:
# Updating Beautiful Soup: `html` and `soup` now describe the USGS search results
# page, which Steps 1 and 2 below read from
html = browser.html
soup = BeautifulSoup(html, "html.parser")

In [322]:
# splinter can click links to navigate to new pages -- but only if they're in full URL form
# NOTE: the ones we need, i.e., the Mars hemisphere links, are not clickable!

# Print the href of every <a> tag on the page (.get gives None for a tag with no href)
for link in soup.find_all('a'):
    print(link.get('href'))

https://www.usgs.gov/centers/astrogeo-sc
https://nasa.gov
https://pds-imaging.jpl.nasa.gov/
/search
/search/map/Mars/Viking/cerberus_enhanced
/search/map/Mars/Viking/cerberus_enhanced
/search/map/Mars/Viking/schiaparelli_enhanced
/search/map/Mars/Viking/schiaparelli_enhanced
/search/map/Mars/Viking/syrtis_major_enhanced
/search/map/Mars/Viking/syrtis_major_enhanced
/search/map/Mars/Viking/valles_marineris_enhanced
/search/map/Mars/Viking/valles_marineris_enhanced
http://isis.astrogeology.usgs.gov
http://planetarynames.wr.usgs.gov
https://astrogeology.usgs.gov/tools/map-a-planet-2
https://www.usgs.gov/centers/astrogeo-sc/science/cartography-and-imaging-sciences-node-nasa-planetary-data-system
https://www.usgs.gov/centers/astrogeo-sc/science/regional-planetary-image-facility-rpif
https://www.usgs.gov/centers/astrogeo-sc/science/usgsnasa-planetary-photogrammetry-guest-facility
http://pilot.wr.usgs.gov
https://www.usgs.gov/centers/astrogeo-sc/science/mrctr-gis-lab
http://astrogeology.usgs.

In [323]:
# This will hit the live link to NASA above, and splinter will click it and take the browser there
# Careful: this navigates the browser away from the search results page, while `soup`
# still holds the results page parsed earlier
# NOTE(review): newer Splinter releases appear to deprecate click_link_by_partial_href in
# favor of browser.links.find_by_partial_href(...).click() -- confirm against the installed version
browser.click_link_by_partial_href('nasa.gov')

# Unfortunately, splinter will not take the browser to the image links, even when I've added them
# to a full URL

In [None]:
# I'll need to do a number of things to get image names and links:

# 1) Set up a loop to find all the hemisphere names, pulling by h3 tag, 
#    then cleaning up the names and saving them to a list
# 2) Set up another loop to find all the paths to the pages that have a link to the larger image
# 3) Set up a third loop to take browser to the new pages and grab the image link

In [349]:
# Step 1

# I'll make a list to contain the elements; 'title' is what the project calls for
title = []

# Suffix to drop from the end of each heading
suffix = " Enhanced"

# There are four hemisphere headings to find. Query the page once, capped at the
# first four <h3> tags, instead of re-running find_all on every loop iteration.
for heading in soup.find_all('h3', limit=4):
    hemisphere = heading.text.strip()
    # Only trim when the suffix is really there, so a heading that lacks it is
    # kept intact rather than losing its last nine characters
    if hemisphere.endswith(suffix):
        hemisphere = hemisphere[:-len(suffix)]
    # Append to my list
    title.append(hemisphere)

In [350]:
# Display the cleaned-up hemisphere names
title

['Cerberus Hemisphere',
 'Schiaparelli Hemisphere',
 'Syrtis Major Hemisphere',
 'Valles Marineris Hemisphere']

In [355]:
# Step 2

# Now I'll need a list to contain the URL paths
paths = []

# It turns out that each path appears twice among the matching links, so keep only
# the first occurrence of every href, in page order. This needs no assumption
# about how many links there are or how the duplicates are interleaved, and the
# page is queried once rather than on every loop iteration.
for anchor in soup.find_all('a', class_='itemLink product-item'):
    href = anchor['href']
    if href not in paths:
        paths.append(href)

In [356]:
# Display the de-duplicated detail-page paths
paths

['/search/map/Mars/Viking/cerberus_enhanced',
 '/search/map/Mars/Viking/schiaparelli_enhanced',
 '/search/map/Mars/Viking/syrtis_major_enhanced',
 '/search/map/Mars/Viking/valles_marineris_enhanced']

In [357]:
# Step 3-A

# Site root that every relative path gets appended to
base_url = "https://astrogeology.usgs.gov"

# Full detail-page URLs: the site root followed by each entry of paths, in order
url_list = [base_url + path for path in paths]

In [359]:
# Display the full detail-page URLs
url_list

['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced']

In [360]:
# Step 3-B

# I'll make a list to hold the image urls I'm going to scrape, in url_list order
image_url = []

# I'll cycle through the URLs in my url_list, taking the browser to each and getting the image URL out
for hemisphere_url in url_list:

    browser.visit(hemisphere_url)

    # Parse the detail page under its own name: the notebook-level `html`, `soup`
    # and `url` are left untouched, so Steps 1 and 2 can be re-run after this cell
    # and still read the search results page
    hemisphere_soup = BeautifulSoup(browser.html, "html.parser")

    # Pull out the <img> tag with class "wide-image"; fail with a clear message if
    # it is missing instead of a TypeError from subscripting None
    wide_image = hemisphere_soup.find('img', class_="wide-image")
    if wide_image is None:
        raise ValueError(
            "No <img class='wide-image'> found at {}".format(hemisphere_url)
        )

    # The image src is just a path, so prepend the site root to make a full URL
    image_url.append(base_url + wide_image['src'])

In [366]:
# Display the full-size image URLs
image_url

['https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg',
 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg',
 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg',
 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg']

In [363]:
# Now I need to make dictionaries for each image, with title and image_url as keys

# Pair each hemisphere name with the image URL at the same position and store one
# dictionary per pair. zip() walks both lists together, so there is no index to
# manage and no hard-coded count of four to keep in sync with the lists.
hemisphere_image_urls = [
    {'title': name, 'image_url': link}
    for name, link in zip(title, image_url)
]


In [364]:
# Display the finished list of title / image_url dictionaries
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere',
  'image_url': 'https://astrogeology.usgs.gov/cache/images/cfa62af2557222a02478f1fcd781d445_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'image_url': 'https://astrogeology.usgs.gov/cache/images/3cdd1cbf5e0813bba925c9030d13b62e_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'image_url': 'https://astrogeology.usgs.gov/cache/images/ae209b4e408bb6c3e67b6af38168cf28_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'image_url': 'https://astrogeology.usgs.gov/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'}]