# Web Scraping

In [1]:
import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser

In [2]:
# Define executable path and browser for splinter
executable_path = {'executable_path':'/chromedriver/chromedriver.exe'}
browser = Browser('chrome',**executable_path, headless=False)

## NASA Mars News

In [3]:
# URL of page to be scraped
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

In [4]:
# Get a handle on the URL with splinter using a wait_time to allow for all results to come through
browser.visit(url)
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

True

In [5]:
# Assign the splinter handle to an 'html' object and parse it with BeautifulSoup
html = browser.html
soup = bs(html,'lxml')

In [6]:
# Retrieve the most recent headlines in the page
headlines = soup.find('div', class_='list_text')
print(headlines)

<div class="list_text"><div class="list_date">July 28, 2020</div><div class="content_title"><a href="/news/8723/nasas-perseverance-rover-will-carry-first-spacesuit-materials-to-mars/" target="_self">NASA's Perseverance Rover Will Carry First Spacesuit Materials to Mars</a></div><div class="article_teaser_body">In a Q&amp;A, spacesuit designer Amy Ross explains how five samples, including a piece of helmet visor, will be tested aboard the rover, which is targeting a July 30 launch. </div></div>


In [7]:
# Retrieve the most recent headline title and paragraph text
news_title = headlines.find('div', class_='content_title').a.text
news_p = headlines.find('div', class_='article_teaser_body').text
print('Latest News:')
print(news_title)
print(news_p)

Latest News:
NASA's Perseverance Rover Will Carry First Spacesuit Materials to Mars
In a Q&A, spacesuit designer Amy Ross explains how five samples, including a piece of helmet visor, will be tested aboard the rover, which is targeting a July 30 launch. 


## JPL Mars Space Images - Featured Images

In [8]:
# URL of page to be scraped
url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

In [9]:
# Get a handle on the URL with splinter using a wait_time to allow for all results to come through
browser.visit(url_image)
browser.is_element_present_by_css("section.primary_media_feature", wait_time=1)

True

In [10]:
# Assign the splinter handle to an 'html' object and parse it with BeautifulSoup
html_image = browser.html
image_soup = bs(html_image,'lxml')

In [11]:
# Retrieve the URL for the featured image
featured_image_url =image_soup.find('footer').find('a', class_='button')['data-fancybox-href']
featured_image_url = featured_image_url.strip('/')
featured_image_url = f'https://www.{featured_image_url}'
print(f'Featured image url: {featured_image_url}')

Featured image url: https://www.spaceimages/images/mediumsize/PIA16028_ip.jpg


## Mars Weather

In [12]:
# URL of page to be scraped
mars_twitter = 'https://twitter.com/marswxreport?lang=en'

In [13]:
# Get a handle on the URL with splinter using a wait_time to allow for all results to come through
browser.visit(mars_twitter)
browser.is_element_present_by_tag("span", wait_time=2)

True

In [16]:
# Assign the splinter handle to an 'html' object and parse it with BeautifulSoup
html_twitter = browser.html
twitter_soup = bs(html_twitter,'lxml')

In [17]:
# Retrieve the text from the most recent twit
spans = twitter_soup.find_all('span')
    
mw_twits = []
for span in spans:
        
    if (span.text and span.text.startswith('InSight')):
        mw_twits.append(span.text)

mars_weather = mw_twits[0]
print('Latest Twit About Mars Weather:')
print(mars_weather)
browser.cookies.delete()  # deletes all cookies

Latest Twit About Mars Weather:
InSight sol 593 (2020-07-27) low -91.8ºC (-133.2ºF) high -16.0ºC (3.3ºF)
winds from the WNW at 6.5 m/s (14.4 mph) gusting to 18.8 m/s (42.1 mph)
pressure at 7.90 hPa


## Mars Facts

In [None]:
# URL of page to be scraped
mars_facts = 'https://space-facts.com/mars/'

In [None]:
# Retrieve table data into pandas and convert into a dataframe
scraped_tables = pd.read_html(mars_facts)

In [None]:
table = scraped_tables[0]
table = table.rename(columns={0:"",1:'Value'})
table

In [None]:
# Convert the data into a HTML string
html_table_string = table.to_html(index=False)
html_table_string

## Mars Hemispheres

In [None]:
# URL of page to be scraped
hem_urls = ['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced',
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced',
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced',
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced']

In [None]:
# Retrieve image URLs and hemisphere name for each hemisphere and add them to a list
hemisphere_image_urls = []

for hem in hem_urls:
    # Get a handle on the URL with splinter using a wait_time to allow for all results to come through
    browser.visit(hem)
    browser.is_element_present_by_css("dl dd", wait_time=0.5)
    
    # Assign the splinter handle to an 'html' object and parse it with BeautifulSoup
    hem_html = browser.html
    hem_soup = bs(hem_html,'lxml')
    
    # Retrieve the image URL and hemisphere title
    img_url = hem_soup.find('div', class_='content').a['href']
    title = hem_soup.find('h2', class_='title').text
    
    # Modify the title to how only the name of the Hemisphere
    title = title.replace('Enhanced',"")
    
    # Create a dictionary
    dict_hem = {"title":title, "img_url": img_url}
    
    # Add the dictionary to the hemisphere_imare_urls list
    hemisphere_image_urls.append(dict_hem)
    
print(hemisphere_image_urls)