# Web Scraping

In [1]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser

In [2]:
# Define executable path and browser for splinter
executable_path = {'executable_path':'/chromedriver/chromedriver.exe'}
browser = Browser('chrome',**executable_path, headless=False)

## NASA Mars News

In [3]:
# URL of page to be scraped
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

In [4]:
# Get a handle on the URL with splinter using a wait_time to allow for all results to come through
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=0.5)
browser.visit(url)

In [5]:
# Assign the splinter handle to an 'html' object and parse it with BeautifulSoup
html = browser.html
soup = bs(html,'lxml')

In [6]:
# Retrieve the most recent headlines in the page
headlines = soup.find('div', class_='list_text')
print(headlines)

<div class="list_text"><div class="list_date">July 23, 2020</div><div class="content_title"><a href="/news/8719/nasa-invites-public-to-share-excitement-of-mars-2020-perseverance-rover-launch/" target="_self">NASA Invites Public to Share Excitement of Mars 2020 Perseverance Rover Launch</a></div><div class="article_teaser_body">There are lots of ways to participate in the historic event, which is targeted for July 30.</div></div>


In [7]:
# Retrieve the most recent headline title and paragraph text
news_title = headlines.find('div', class_='content_title').a.text
news_p = headlines.find('div', class_='article_teaser_body').text
print('Latest News:')
print(news_title)
print(news_p)

NASA Invites Public to Share Excitement of Mars 2020 Perseverance Rover Launch
There are lots of ways to participate in the historic event, which is targeted for July 30.


## JPL Mars Space Images - Featured Images

In [8]:
# URL of page to be scraped
url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

In [9]:
# Get a handle on the URL with splinter using a wait_time to allow for all results to come through
browser.is_element_present_by_css("section.primary_media_feature", wait_time=0.5)
browser.visit(url_image)

In [10]:
# Assign the splinter handle to an 'html' object and parse it with BeautifulSoup
html_image = browser.html
image_soup = bs(html_image,'lxml')

In [11]:
# Retrieve the URL for the featured image
featured_image_url =image_soup.find('footer').find('a', class_='button')['data-fancybox-href']
featured_image_url = featured_image_url.strip('/')
featured_image_url = f'https://www.{featured_image_url}'
print(f'Featured image url: {featured_image_url}')

Featured image url: https://www.spaceimages/images/mediumsize/PIA18851_ip.jpg


## Mars Weather

In [12]:
# URL of page to be scraped
mars_twitter = 'https://twitter.com/marswxreport?lang=en'

In [48]:
# Get a handle on the URL with splinter using a wait_time to allow for all results to come through
browser.is_element_present_by_tag("article", wait_time=0.5)
browser.visit(mars_twitter)

In [55]:
# Assign the splinter handle to an 'html' object and parse it with BeautifulSoup
html_twitter = browser.html
twitter_soup = bs(html_twitter,'lxml')

In [73]:
# Retrieve the text from the most recent twit
spans = twitter_soup.find_all('span')

mw_twits = []
for span in spans:
    if (span.text and span.text.startswith('InSight')):
        mw_twits.append(span.text)
        
mars_weather = mw_twits[0]
print('Latest Twit About Mars Weather:')
print(mars_weather)

Latest Twit About Mars Weather:
InSight sol 591 (2020-07-25) low -91.2ºC (-132.2ºF) high -15.5ºC (4.2ºF)
winds from the WNW at 7.5 m/s (16.9 mph) gusting to 19.0 m/s (42.5 mph)
pressure at 7.90 hPa


## Mars Facts