In [1]:
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser

### NASA Mars News

In [2]:
# Scrape the NASA Mars News site and parse its landing page.
url_NASA = "https://mars.nasa.gov/news"
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url_NASA, timeout=30)
# Stop here on an HTTP error instead of parsing an error page further down.
r.raise_for_status()
data = r.text  # response body as text
soup = BeautifulSoup(data, "html.parser")  # parsed HTML tree of the page

In [3]:
# Collect the latest news title and paragraph text so later cells can use them.
# The first element with class "slide" is used (an <li class="slide"> inside a
# <ul> in the page body).
soup_div = soup.find(class_="slide")
# find_all returns a list holding every <a> inside that slide; further
# find_all calls have to be made on one list entry at a time.
soup_news = soup_div.find_all("a")
soup_news[0]

<a href="/news/8305/tiny-crystal-shapes-get-close-look-from-mars-rover/">
<div class="rollover_description">
<div class="rollover_description_inner">
Star-shaped, tiny, dark bumps in the fine-layered bright bedrock of a Martian ridge are drawing close inspection by NASA's Curiosity Mars rover.
</div>
<div class="overlay_arrow">
<img alt="More" src="/assets/overlay-arrow.png"/>
</div>
</div>
<img alt="Tiny Crystal Shapes Get Close Look From Mars Rover" class="img-lazy" data-lazy="/system/news_items/list_view_images/8305_pia22213_320.jpg" src="/assets/loading_320x240.png"/>
</a>

In [4]:
# Title of the latest article: the text of the slide's second anchor, trimmed.
title_anchor = soup_news[1]
NASA_latest_t = title_anchor.get_text().strip()

NASA_latest_t

'Tiny Crystal Shapes Get Close Look From Mars Rover'

In [5]:
# Getting the paragraph, step 1: the article's URL.
# Only anchors that carry an href attribute are kept; the first one holds the
# site-relative link to the article.
soup_p = soup_div.find_all("a", href=True)
soup_p_url = soup_p[0].get("href")
soup_p_url

'/news/8305/tiny-crystal-shapes-get-close-look-from-mars-rover/'

In [6]:
# Fetch the first news article and collect its body paragraphs.
url = "https://mars.nasa.gov/"
# soup_p_url already starts with "/", so plain concatenation produced
# "https://mars.nasa.gov//news/...". urljoin resolves the path against the
# site root without doubling the slash.
news_url = urljoin(url, soup_p_url)
# Request the article; time out rather than hang, and fail on an HTTP error
# instead of parsing an error page.
r = requests.get(news_url, timeout=30)
r.raise_for_status()
data = r.text
soup = BeautifulSoup(data, "html.parser")

# The element with class "wysiwyg_content" is searched for the <p> tags.
soup_para = soup.find(class_='wysiwyg_content')
soup_para = soup_para.find_all('p')


In [7]:
# Trimmed text of every paragraph in the article, in document order.
NASA_latest_p = [p_tag.get_text().strip() for p_tag in soup_para]

NASA_latest_p

["Star-shaped and swallowtail-shaped tiny, dark bumps in fine-layered bright bedrock of a Martian ridge are drawing close inspection by NASA's Curiosity Mars rover.",
 'This set of shapes looks familiar to geologists who have studied gypsum crystals formed in drying lakes on Earth, but Curiosity\'s science team is considering multiple possibilities for the origin of these features on "Vera Rubin Ridge" on Mars.',
 "One uncertainty the rover's inspection may resolve is the timing of when the crystal-shaped features formed, relative to when layers of sediment accumulated around them. Another is whether the original mineral that crystallized into these shapes remains in them or was subsequently dissolved away and replaced by something else. Answers may point to evidence of a drying lake or to groundwater that flowed through the sediment after it became cemented into rock.",
 "The rover team also is investigating other clues on the same area to learn more about the Red Planet's history. Th

### JPL Mars Space Images - Featured Image

In [8]:
# Open a visible (non-headless) Chrome window through splinter and load JPL's
# Featured Space Image page:
# https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars
executable_path = {"executable_path": "chromedriver.exe"}
browser = Browser("chrome", headless=False, **executable_path)
url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url)

In [9]:
# Use splinter to navigate to the current Featured Mars Image by clicking the
# link whose text contains "FULL IMAGE".
full_image_link_text = 'FULL IMAGE'
browser.click_link_by_partial_text(full_image_link_text)


In [10]:
# Build `featured_image_url` from the page as it stands after the click.
html = browser.html
soup = BeautifulSoup(html, "html.parser")

# Take the first element with classes "button fancybox"; its
# data-fancybox-href attribute holds the site-relative image path.
feat_img_soup = soup.find_all(class_="button fancybox")
first_button = feat_img_soup[0]
feat_img = first_button.get("data-fancybox-href")

# Prefix the relative path with the site root to get an absolute URL.
ori_url = "https://www.jpl.nasa.gov"
featured_image_url = ori_url + feat_img
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA14317_ip.jpg'

### Mars Weather

In [11]:
# Visit the Mars Weather twitter account
# (https://twitter.com/marswxreport?lang=en) and scrape the latest Mars weather
# tweet from the page. The tweet text is saved in `mars_weather`.
url = 'https://twitter.com/marswxreport?lang=en'
# Time out rather than hang on a stalled connection, and fail on an HTTP error
# instead of parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
data = r.text
soup = BeautifulSoup(data, 'html.parser')

# First tweet-text container inside the tweet stream.
mars_tweets = soup.find(class_='stream-items js-navigable-stream')
mars_tweets = mars_tweets.find(class_="js-tweet-text-container")

# The tweet body is the text of the container's first <p>.
mars_weather = mars_tweets.p.text
mars_weather

'Sol 1955 (Feb 04, 2018), Sunny, high -21C/-5F, low -77C/-106F, pressure at 7.45 hPa, daylight 05:41-17:27'

### Mars Facts

In [12]:
# Visit the Mars Facts webpage (http://space-facts.com/mars/) and use pandas to
# scrape the table of facts about the planet (Diameter, Mass, etc.).
facts_url = "http://space-facts.com/mars/"
# read_html searches the page for HTML tables and returns a list of
# DataFrames; the first one is kept.
all_facts_df = pd.read_html(facts_url)[0]
all_facts_df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [13]:
# Disabled clean-up step: the whole cell is a single string literal, so none of
# the statements inside it run and all_facts_df is left untouched; the notebook
# only echoes the string back as the cell's value.
# NOTE(review): if this is ever re-enabled, check that rename_axis with a dict
# mapper is still supported by the installed pandas. DataFrame.rename(columns=...)
# is presumably the intended call — confirm.
''' # clean up df (may be unnecessary)
all_facts_df.rename_axis({0:"category", 1:"content"}, axis=1, inplace=True)
all_facts_df.set_index("category", inplace=True)
all_facts_df 
'''

' # clean up df (may be unnecessary)\nall_facts_df.rename_axis({0:"category", 1:"content"}, axis=1, inplace=True)\nall_facts_df.set_index("category", inplace=True)\nall_facts_df \n'

In [14]:
# Render the facts DataFrame as an HTML <table> string, with the column
# headers left-justified.
facts_html = all_facts_df.to_html(justify="left")

facts_html

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: left;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    <

### Mars Hemispheres

In [114]:
# Visit the USGS Astrogeology site
# (https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars)
# to obtain high resolution images for each of Mars's hemispheres.
#
# A fresh browser is launched here to prevent browser.visit() from failing.
# NOTE(review): the browser opened for the JPL section is never quit in this
# notebook, so rebinding `browser` may leave that Chrome window running — confirm.
executable_path = {"executable_path": "chromedriver.exe"}
browser = Browser("chrome", headless=False, **executable_path)
url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(url)


In [115]:
# Click each of the hemisphere links to find the URL of the full resolution
# image. First step: gather the <a href> links from the results page.
html = browser.html
soup = BeautifulSoup(html, "html.parser")

hemi_soup = soup.find_all(class_="itemLink product-item")

# Each href is prefixed with the site root to form an absolute URL.
url_index = "https://astrogeology.usgs.gov"

# Get unique hrefs: np.unique drops the duplicates and returns an array.
# These URLs could simply be opened one by one with browser.visit(url), but
# the instructions are read as requiring splinter to click each link in the
# browser.
hemi_href_ls = np.unique([url_index + anchor["href"] for anchor in hemi_soup])
hemi_href_ls

array(['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced',
       'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced',
       'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced',
       'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced'],
      dtype='<U78')

In [116]:
# Working note kept as a bare string literal: nothing inside it is executed,
# the notebook only echoes the text back as the cell's value.
''' It seems splinter can only click link based on the exact wording of the text
browser.click_link_by_partial_text('Cerberus Hemisphere')    #e.g. function will fail to find lower case 'cerberus'
'''

" It seems splinter can only click link based on the exact wording of the text\nbrowser.click_link_by_partial_text('Cerberus Hemisphere')    #e.g. function will fail to find lower case 'cerberus'\n"

In [117]:
# Parse the browser's current HTML and collect the <h3> headers, which contain
# the hemisphere names.
html = browser.html
soup = BeautifulSoup(html, "html.parser")

headers_soup = soup.find_all("h3")
# Spot check on the third header: removing " Enhanced" leaves the link text.
third_header = headers_soup[2]
test = third_header.text.replace(" Enhanced", "")
test

'Syrtis Major Hemisphere'

In [128]:
# For each header in the beautiful soup, click the link associated with it and get img_url.
# Every iteration appends one {'title': ..., 'img_url': ...} dict to hemisphere_image_urls.
hemisphere_image_urls = []
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

for header in headers_soup:
    # Start every iteration at the origin url for the Mars hemisphere section.
    window = browser.windows[0]     # the first window; kept so we can switch back to it later
    browser.visit(url)
    time.sleep(2)     # fixed 2 second pause to let the browser load
    # Getting the title: the header text doubles as the link text to click.
    title = header.text
    title = title.replace(" Enhanced", "")     # drop " Enhanced" so the stored title is just the hemisphere name
    browser.click_link_by_partial_text(title)
    time.sleep(2)     # again, wait 2 seconds for the browser to load
    browser.click_link_by_text('Sample')
    # NOTE(review): windows[1] is read right after the click with no wait; this
    # presumably relies on the 'Sample' link opening a second window — confirm.
    browser.windows.current = browser.windows[1]     # switch current window to the window that just opened
    img_url = browser.url     # URL shown by that second window
    browser.windows.current = window     # switch the current window back
    hemisphere_image_urls.append({'title':title, 'img_url':img_url})
    window.close_others()    # close all the other windows to keep the browser tidy

hemisphere_image_urls

[{'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
  'title': 'Cerberus Hemisphere'},
 {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
  'title': 'Schiaparelli Hemisphere'},
 {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
  'title': 'Syrtis Major Hemisphere'},
 {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg',
  'title': 'Valles Marineris Hemisphere'}]