# Web Scraping

In [1]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser

In [2]:
# Define executable path and browser for splinter
executable_path = {'executable_path':'/chromedriver/chromedriver.exe'}
browser = Browser('chrome',**executable_path, headless=False)

## NASA Mars News

In [3]:
# URL of page to be scraped
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

In [4]:
# Get a handle on the URL with splinter using a wait_time to allow for all results to come through
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=0.5)
browser.visit(url)

In [5]:
# Assign the splinter handle to an 'html' object and parse it with BeautifulSoup
html = browser.html
soup = bs(html,'lxml')

In [6]:
# Retrieve the most recent headlines in the page
headlines = soup.find('div', class_='list_text')
print(headlines)

<div class="list_text"><div class="list_date">July 27, 2020</div><div class="content_title"><a href="/news/8721/a-new-video-captures-the-science-of-nasas-perseverance-mars-rover/" target="_self">A New Video Captures the Science of NASA's Perseverance Mars Rover</a></div><div class="article_teaser_body">With a targeted launch date of July 30, the next robotic scientist NASA is sending to the to the Red Planet has big ambitions.</div></div>


In [7]:
# Retrieve the most recent headline title and paragraph text
news_title = headlines.find('div', class_='content_title').a.text
news_p = headlines.find('div', class_='article_teaser_body').text
print('Latest News:')
print(news_title)
print(news_p)

Latest News:
A New Video Captures the Science of NASA's Perseverance Mars Rover
With a targeted launch date of July 30, the next robotic scientist NASA is sending to the to the Red Planet has big ambitions.


## JPL Mars Space Images - Featured Images

In [8]:
# URL of page to be scraped
url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

In [9]:
# Get a handle on the URL with splinter using a wait_time to allow for all results to come through
browser.is_element_present_by_css("section.primary_media_feature", wait_time=0.5)
browser.visit(url_image)

In [10]:
# Assign the splinter handle to an 'html' object and parse it with BeautifulSoup
html_image = browser.html
image_soup = bs(html_image,'lxml')

In [11]:
# Retrieve the URL for the featured image
featured_image_url =image_soup.find('footer').find('a', class_='button')['data-fancybox-href']
featured_image_url = featured_image_url.strip('/')
featured_image_url = f'https://www.{featured_image_url}'
print(f'Featured image url: {featured_image_url}')

Featured image url: https://www.spaceimages/images/mediumsize/PIA08097_ip.jpg


## Mars Weather

In [17]:
# URL of page to be scraped
mars_twitter = 'https://twitter.com/marswxreport?lang=en'

In [18]:
# Get a handle on the URL with splinter using a wait_time to allow for all results to come through
browser.is_element_present_by_tag("article", wait_time=0.5)
browser.visit(mars_twitter)

In [19]:
# Assign the splinter handle to an 'html' object and parse it with BeautifulSoup
html_twitter = browser.html
twitter_soup = bs(html_twitter,'lxml')

In [20]:
# Retrieve the text from the most recent twit
spans = twitter_soup.find_all('span')

mw_twits = []
for span in spans:
    if (span.text and span.text.startswith('InSight')):
        mw_twits.append(span.text)
        
mars_weather = mw_twits[0]
print('Latest Twit About Mars Weather:')
print(mars_weather)

Latest Twit About Mars Weather:
InSight sol 591 (2020-07-25) low -91.2ºC (-132.2ºF) high -15.5ºC (4.2ºF)
winds from the WNW at 7.5 m/s (16.9 mph) gusting to 19.0 m/s (42.5 mph)
pressure at 7.90 hPa


## Mars Facts

In [21]:
# URL of page to be scraped
mars_facts = 'https://space-facts.com/mars/'

In [22]:
# Retrieve table data into pandas and convert into a dataframe
scraped_tables = pd.read_html(mars_facts)

In [23]:
table = scraped_tables[0]
table

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [24]:
# Convert the data into a HTML string
html_table_string = table.to_html(header=False, index=False)
html_table_string

'<table border="1" class="dataframe">\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

## Mars Hemispheres

In [25]:
# URL of page to be scraped
hem_urls = ['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced',
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced',
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced',
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced']

In [26]:
# Retrieve image URLs and hemisphere name for each hemisphere and add them to a list
hemisphere_image_urls = []

for hem in hem_urls:
    # Get a handle on the URL with splinter using a wait_time to allow for all results to come through
    browser.is_element_present_by_css("dl dd", wait_time=0.5)
    browser.visit(hem)
    
    # Assign the splinter handle to an 'html' object and parse it with BeautifulSoup
    hem_html = browser.html
    hem_soup = bs(hem_html,'lxml')
    
    # Retrieve the image URL and hemisphere title
    img_url = hem_soup.find('div', class_='content').a['href']
    title = hem_soup.find('h2', class_='title').text
    
    # Modify the title to how only the name of the Hemisphere
    title = title.replace('Enhanced',"")
    
    # Create a dictionary
    dict_hem = {"title":title, "img_url": img_url}
    
    # Add the dictionary to the hemisphere_imare_urls list
    hemisphere_image_urls.append(dict_hem)
    
print(hemisphere_image_urls)

[{'title': 'Cerberus Hemisphere ', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif'}, {'title': 'Schiaparelli Hemisphere ', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif'}, {'title': 'Syrtis Major Hemisphere ', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif'}, {'title': 'Valles Marineris Hemisphere ', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif'}]
