In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from splinter import Browser
import time

### NASA Mars News

In [2]:
# *** Scrape the [NASA Mars News Site] ***
url_NASA = "https://mars.nasa.gov/news"
# A timeout keeps the cell from hanging forever if the site never answers.
r = requests.get(url_NASA, timeout=30)
# Stop here on a 4xx/5xx answer instead of parsing an error page as if it were the news list.
r.raise_for_status()
data = r.text  # response body as text
soup = BeautifulSoup(data, "html.parser")  # parse the HTML text into a searchable tree

In [3]:
# Collect the latest News Title and Paragraph Text; later cells read these variables.
# The first element with class "slide" is taken (per the page layout: <body> div -> <ul> -> <li class="slide">),
# then every anchor inside it is gathered.
soup_div = soup.find(class_="slide")
soup_news = soup_div.find_all("a")
# find_all returns a list, and a further find_all only works on one entry at a time;
# display the first anchor to inspect its markup.
soup_news[0]

<a href="/news/8308/a-piece-of-mars-is-going-home/">
<div class="rollover_description">
<div class="rollover_description_inner">
When it launches in 2020, NASA's next Mars rover will carry a chunk of Martian meteorite on board.
</div>
<div class="overlay_arrow">
<img alt="More" src="/assets/overlay-arrow.png"/>
</div>
</div>
<img alt="A Piece of Mars is Going Home" class="img-lazy" data-lazy="/system/news_items/list_view_images/8308_PIA22245_320.JPG" src="/assets/loading_320x240.png"/>
</a>

In [4]:
# Title of the latest story: the stripped text of the second anchor in the slide.
title_anchor = soup_news[1]
NASA_latest_t = title_anchor.get_text().strip()

NASA_latest_t

'A Piece of Mars is Going Home'

In [5]:
# Getting the paragraph, step 1: the article's URL.
#     Keep only anchors of the slide that carry an href, and read the href of the first one.
soup_p = soup_div.find_all("a", href=True)
first_link = soup_p[0]
soup_p_url = first_link["href"]
soup_p_url

'/news/8308/a-piece-of-mars-is-going-home/'

In [6]:
# Fetch the first news article and collect its <p> elements.
url = "https://mars.nasa.gov/"
# The scraped href starts with "/" (e.g. '/news/8308/...') while `url` already ends
# with one; strip the leading slash so the joined address has no double slash.
news_url = url + soup_p_url.lstrip("/")
# Request the article. The timeout keeps the cell from hanging forever, and
# raise_for_status stops on a 4xx/5xx answer rather than parsing an error page.
r = requests.get(news_url, timeout=30)
r.raise_for_status()
data = r.text
# NOTE: this rebinds `soup` from the news listing to the article page, so the
# listing cell must be re-run before re-running cells that search the listing.
soup = BeautifulSoup(data, "html.parser")

# Every <p> inside the element with class 'wysiwyg_content'.
soup_para = soup.find(class_='wysiwyg_content')
soup_para = soup_para.find_all('p')


In [7]:
# Save the stripped text of every article paragraph to a list, in page order.
NASA_latest_p = [p_tag.get_text().strip() for p_tag in soup_para]

NASA_latest_p

['A chunk of Mars will soon be returning home.',
 "A piece of a meteorite called Sayh al Uhaymir 008 (SaU008) will be carried on board NASA's Mars 2020 rover mission, now being built at the agency's Jet Propulsion Laboratory in Pasadena, California. This chunk will serve as target practice for a high-precision laser on the rover's arm.",
 "Mars 2020's goal is ambitious: collect samples from the Red Planet's surface that a future mission could potentially return to Earth. One of the rover's many tools will be a laser designed to illuminate rock features as fine as a human hair.",
 "That level of precision requires a calibration target to help tweak the laser's settings. Previous NASA rovers have included calibration targets as well. Depending on the instrument, the target material can include things like rock, metal or glass, and can often look like a painter's palette.",
 "But working on this particular instrument sparked an idea among JPL scientists: why not use an actual piece of Mar

### JPL Mars Space Images - Featured Image

In [3]:
# Open JPL's Featured Space Image page
# (https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars) in a visible
# (non-headless) Chrome window driven by splinter.
executable_path = dict(executable_path='chromedriver.exe')
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser = Browser('chrome', headless=False, **executable_path)
browser.visit(url)

In [4]:
# Use splinter to navigate the site towards the current Featured Mars Image.
#     The selector matches an <a class="fancybox"> that is a direct child ('>')
#     of an <li class="slide">; the first match is clicked.
featured_links = browser.find_by_css('li.slide>a.fancybox')
featured_links.first.click()
time.sleep(1)  # pause one second after the click


In [5]:
# Click the 'more info' button: the first <a class="button"> directly under <div class="buttons">.
# (Caution: the 'share' button is under a similar but different class.)
info_buttons = browser.find_by_css('div.buttons>a.button')
info_buttons.first.click()

In [6]:
# Capture the featured image address.
#     Both the full-size .jpg and an 800x600 version are wanted for the webpage;
#     this cell reads the full-size one from the page currently shown in the browser.
html = browser.html
soup = BeautifulSoup(html, "html.parser")

# Full-size jpg (to be linked if the image is clicked): the src attribute of the
# first element whose class is "main_image".
feat_full_img_soup = soup.find(attrs={"class": "main_image"})
feat_full_img = feat_full_img_soup.get("src")


In [7]:
# Smaller-size jpg (to be displayed on the webpage).
#     Uses splinter instead of Beautiful Soup: click the link whose href
#     contains '800x600.jpg'.
browser.click_link_by_partial_href('800x600.jpg')
#     Switch over to the second browser window (index 1), save its URL as
#     featured_image_url, then close that second window.
#     NOTE(review): presumably the click opens the image in a new window that
#     already exists when it is indexed here - confirm; there is no wait
#     between the click and the switch.
browser.windows.current = browser.windows[1]  
featured_image_url = browser.url
browser.windows[1].close()


In [9]:
# Turn the site-relative src of the full-size image into an absolute URL.
ori_url = 'https://www.jpl.nasa.gov'
# Prefix the site root only once: without this guard, re-running the cell would
# prepend ori_url to an already absolute URL and corrupt feat_full_img.
if not feat_full_img.startswith(ori_url):
    feat_full_img = ori_url + feat_full_img

feat_full_img


'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA22273_hires.jpg'

### Mars Weather

In [12]:
''' 
*** Visit the Mars Weather twitter account (https://twitter.com/marswxreport?lang=en) and scrape the latest 
Mars weather tweet from the page. Save the tweet text for the weather report as a variable called `mars_weather`. ***
'''
url = 'https://twitter.com/marswxreport?lang=en'
# The timeout keeps the cell from hanging forever; raise_for_status stops on a
# 4xx/5xx answer instead of parsing an error page as if it were the timeline.
r = requests.get(url, timeout=30)
r.raise_for_status()
data = r.text
soup = BeautifulSoup(data, 'html.parser')

# Element holding the stream of tweets, then the first tweet-text container inside it.
# Separate names are used so each variable refers to one kind of element.
tweet_stream = soup.find(class_='stream-items js-navigable-stream')
mars_tweets = tweet_stream.find(class_="js-tweet-text-container")

# Text of the <p> inside that container is the weather report.
mars_weather = mars_tweets.p.text
mars_weather

'Sol 1962 (Feb 12, 2018), Sunny, high -14C/6F, low -78C/-108F, pressure at 7.38 hPa, daylight 05:40-17:27'

### Mars Facts

In [13]:
''' 
*** Visit the Mars Facts webpage (http://space-facts.com/mars/) and use Pandas to scrape the table containing 
facts about the planet including Diameter, Mass, etc. ***
'''
facts_url = 'http://space-facts.com/mars/'
# read_html searches the page for HTML tables and returns a list of DataFrames;
# the first one is kept.
all_facts_df = pd.read_html(facts_url)[0]
all_facts_df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [14]:
# NOTE: nothing in this cell is executed as cleanup. Its whole body is a single
# string literal (the cleanup code is disabled by being wrapped in quotes), so
# evaluating the cell only echoes that string and all_facts_df is left unchanged.
''' # clean up df (may be unnecessary)
all_facts_df.rename_axis({0:"category", 1:"content"}, axis=1, inplace=True)
all_facts_df.set_index("category", inplace=True)
all_facts_df 
'''

' # clean up df (may be unnecessary)\nall_facts_df.rename_axis({0:"category", 1:"content"}, axis=1, inplace=True)\nall_facts_df.set_index("category", inplace=True)\nall_facts_df \n'

In [15]:
# Use Pandas to render the facts table as an HTML <table> string,
# leaving out the header row and the index column.
to_html_options = {"header": False, "index": False, "justify": "left"}
facts_html = all_facts_df.to_html(**to_html_options)

facts_html

'<table border="1" class="dataframe">\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

### Mars Hemispheres

In [16]:
''' 
*** Visit the USGS Astrogeology site 
(https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars) 
to obtain high resolution images for each of Mar's hemispheres.
'''
# A fresh Chrome session is started for this section; the driver settings and the
# Browser call are repeated here to prevent browser.visit() from failing.
# NOTE(review): the Browser created earlier in the notebook is not quit before
# `browser` is rebound here - consider calling browser.quit() on it first.
executable_path = dict(executable_path='chromedriver.exe')
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser = Browser('chrome', headless=False, **executable_path)
browser.visit(url)


In [17]:
# Click each of the links to the hemispheres to find the image url to the full resolution image.
#    Step 1: build the list of absolute links from the <a href> elements of the results page.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Elements matched by the class string 'itemLink product-item'.
hemi_soup = soup.find_all(class_='itemLink product-item')

# Each href is prefixed with the site root to form a full address.
url_index = 'https://astrogeology.usgs.gov'
hemi_href_ls = [url_index + anchor['href'] for anchor in hemi_soup]

# Get unique hrefs (np.unique returns them as a sorted array)
'''     I could just go to these urls separately using browser.visit(url). But I interpret the instructions 
        as saying that I need to use splinter to click on the link in the browser.     '''
hemi_href_ls = np.unique(hemi_href_ls)
hemi_href_ls

array(['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced',
       'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced',
       'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced',
       'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced'],
      dtype='<U78')

In [18]:
# NOTE: this cell is a note stored as a string literal; nothing in it is executed
# (the click call below is part of the string), and the cell simply evaluates to
# that text.
''' Caution!: It seems splinter can only click link based on the exact wording of the text
browser.click_link_by_partial_text('Cerberus Hemisphere')    #e.g. function will fail to find lower case 'cerberus'
'''

" Caution!: It seems splinter can only click link based on the exact wording of the text\nbrowser.click_link_by_partial_text('Cerberus Hemisphere')    #e.g. function will fail to find lower case 'cerberus'\n"

In [19]:
# Parse the page currently shown in the browser and gather its <h3> headers
# (these contain the hemisphere names).
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
headers_soup = soup.find_all('h3')

# Try the title clean-up on the third header: remove the " Enhanced" substring.
third_header = headers_soup[2]
test = third_header.text.replace(" Enhanced", "")
test

'Syrtis Major Hemisphere'

In [20]:
# For each header in the beautiful soup, click the link associated with it and get img_url.
# Result: hemisphere_image_urls, a list of {'title': ..., 'img_url': ...} dicts,
# one per <h3> header collected in headers_soup.
hemisphere_image_urls = []
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

for header in headers_soup:
    # start at the origin url for the Mars hemisphere section
    window = browser.windows[0]     # the first window; used below to switch back and to close the others
    browser.visit(url)
    time.sleep(2)     # wait 2 secs for browser to load 
    # getting title
    title = header.text
    title = title.replace(" Enhanced", "")     # get rid of " " + "Enhanced" for when dict is appended
    # click the link whose text contains the cleaned title
    browser.click_link_by_partial_text(title)
    time.sleep(2)     # again, wait 2 secs for browser to load
    # NOTE(review): presumably the 'Sample' link opens the full image in a second window - confirm
    browser.click_link_by_text('Sample')
    browser.windows.current = browser.windows[1]     # switch current window to the window that just opened
    img_url = browser.url     # URL of the window that is now current
    browser.windows.current = window     # switch the current window back 
    hemisphere_image_urls.append({'title':title, 'img_url':img_url})
    window.close_others()    # close all the other windows to keep browser nice and tidy!

hemisphere_image_urls

[{'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
  'title': 'Cerberus Hemisphere'},
 {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
  'title': 'Schiaparelli Hemisphere'},
 {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
  'title': 'Syrtis Major Hemisphere'},
 {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg',
  'title': 'Valles Marineris Hemisphere'}]