In [1]:
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

In [3]:
# Web scraping of NASA Mars News Site

url = "https://mars.nasa.gov/news/"
browser.visit(url)

In [4]:
html = browser.html
soup = BeautifulSoup(html, "html.parser")

articles = soup.find('ul', class_="item_list")

news_title = articles.find('div', class_="content_title").text

news_p = articles.find('div', class_="article_teaser_body").text

print(news_title)
print(news_p)

Martian Skies Clearing over Opportunity Rover
As the skies above Opportunity continue to clear, engineers at JPL are increasing the frequency of commands asking the solar-powered rover to communicate with Earth.


In [5]:
# Navigate to appropriate elements using splinter

url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url)

In [6]:
# Use splinter to click the full image button in order to retrieve appropriate url

browser.click_link_by_partial_text('FULL IMAGE')

In [7]:
# Use beautiful soup to read html page

html = browser.html
soup = BeautifulSoup(html, "html.parser")

# Find the button classes to retrieve url component
pic = soup.find('div', class_="buttons")

# Isolate both buttons and their 'a' tags
article_link = pic.find_all('a')

    # Use iterator to separate the 'a' tags, only take the relevant value and concatenate it with url to create
    # complete url for Splinter navigation
for link in article_link:
    if link["href"] != "#":
        new_url = "https://www.jpl.nasa.gov/" + link["href"]
        print(new_url)

https://www.jpl.nasa.gov//spaceimages/details.php?id=PIA17009 


In [8]:
# Use the retrieved url to navigate to the article page
browser.visit(new_url)

In [9]:
html = browser.html
soup = BeautifulSoup(html, "html.parser")

image = soup.find('figure', class_="lede").a["href"]

featured_image_url = "https://www.jpl.nasa.gov/" + image

print(featured_image_url)

https://www.jpl.nasa.gov//spaceimages/images/largesize/PIA17009_hires.jpg


In [10]:
# Scraping Mars weather data from the Mars weather twitter
url = "https://twitter.com/marswxreport?lang=en"
browser.visit(url)

In [11]:
# Set the html value for beautiful soup, and then parse the html page

html = browser.html
soup = BeautifulSoup(html, "html.parser")

# Get all tweets using the div class "tweet"

all_tweets = soup.find_all('div', class_="tweet")

# Iterate through all retrieved div containers

for tweet in all_tweets:

    # Find the element of the tweet that contains the text
    tweet_content = tweet.find('div', class_="js-tweet-text-container")

    # Access the text of that element
    tweet_text = tweet_content.p.text
    
    # Split the returned string so that it can be compared and the appropriate value can be retrieved
    tweet_compare = tweet_text.split(" ")
    
    # Compare the first word of every tweet, as every weather tweet begins with the word "Sol", to prevent the scraper
    # from retrieving the text of one of the twitter page's non-weather tweets
    if tweet_compare[0] == "Sol":
        mars_weather = tweet_text
        break

# Print to ensure that the appropriate value was returned
print(mars_weather)
    

Sol 2165 (2018-09-08), high -10C/14F, low -70C/-93F, pressure at 8.87 hPa, daylight 05:39-17:55


In [12]:
# Using pandas html reader to retrieve table from space facts web page
url = "http://space-facts.com/mars/"

# Retrieve all tables from the page
table = pd.read_html(url)

# Take the first (and only) table from the returned list
df = table[0]

# Rename the columns to make it look nicer
df.columns = ['Mars Planet Profile','Facts']

# Display the dataframe
df

Unnamed: 0,Mars Planet Profile,Facts
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [13]:
# Scraping the USGS Astrogeology website for Mars hemispheres pictures

url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(url)

In [14]:
# Create list for storing dictionaries

hemisphere_image_urls = []

# Visit the initial search page and collect all hemisphere titles

html = browser.html
soup = BeautifulSoup(html,"html.parser")

items = soup.find_all('div', class_="description")

# Iterate through the list and capture all relevant values

for item in items:
    
    # Get the hemisphere title
    header = item.a.h3.text
    
    # Split the header so that the "Enhanced" at the end of the string can be removed, and so the first word can be
    # used to click the relevant link
    header_split = header.split(" ")
    
    # Use a comparative to determine if the string is three or four words long
    if len(header_split) == 4:
        
        # Rebuild the string and save to a variable, excluding the word "enhanced"
        title_string = header_split[0] + " " + header_split[1] + " " + header_split[2]
        
    else:
        
        # Rebuild the string and save to a variable, excluding the word "enhanced"
        title_string = header_split[0] + " " + header_split[1]
        
    # Using the first word of the title, click to that hemisphere's page
    browser.click_link_by_partial_text(header_split[0])

    # Use Splinter & BeautifulSoup to read the webpage
    html = browser.html
    soup = BeautifulSoup(html,"html.parser")

    # Get the relevant div class containing the link to the picture
    pictures = soup.find('div', class_="downloads")

    # Retrieve the picture link and save to a variable
    picture_link = pictures.ul.li.a["href"]

    # Create a temporary dictionary that contains the title string and picture link
    temp_dict = {
        "title":title_string,
        "img_url":picture_link
    }
    
    # Append the temporary dictionary to the list created earlier
    hemisphere_image_urls.append(temp_dict)
    
    # Click back to the search page so that the next hemisphere scrape can be performed
    browser.click_link_by_partial_text("Back")
    
# Print the list to ensure successful result
print(hemisphere_image_urls)

[{'title': 'Cerberus Hemisphere', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, {'title': 'Schiaparelli Hemisphere', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}, {'title': 'Syrtis Major Hemisphere', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'}, {'title': 'Valles Marineris Hemisphere', 'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]
