In [17]:
# Import Splinter and BeautifulSoup
import os

import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser

In [4]:
# Set the executable path used to initialize the chrome browser in splinter.
# The chromedriver location is machine-specific, so it can be overridden with the
# CHROMEDRIVER_PATH environment variable; the original path remains the default.
executable_path = {
    "executable_path": os.environ.get(
        "CHROMEDRIVER_PATH",
        "/Users/Mikayla Kurland/Desktop/Class/Web Scraping/chromedriver.exe",
    )
}

In [5]:
# Start a visible (non-headless) Chrome session through splinter,
# passing along the chromedriver location stored in `executable_path`
browser = Browser('chrome', headless=False, **executable_path)

In [6]:
# Open the NASA Mars news site in the automated browser
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

# Optional delay for loading the page.
# The selector combines tags and classes: an <li> with class `slide`
# nested inside a <ul> with class `item_list`.
# wait_time=1 gives the page up to one second for that element to show up, which helps
# because dynamic pages can take a little while to load, especially if they are image-heavy.
news_item_selector = "ul.item_list li.slide"
browser.is_element_present_by_css(news_item_selector, wait_time=1)

True

In [7]:
# Hand the browser's current page source to BeautifulSoup, then keep the first
# <li class="slide"> found inside <ul class="item_list"> as the parent element
html = browser.html
news_soup = soup(html, features='html.parser')
slide_elem = news_soup.select_one('ul.item_list li.slide')

In [None]:
#Notice how we've assigned slide_elem as the variable that holds the first <li /> tag found inside the <ul /> tag, together with its descendants (the other tags nested within it). This is our parent element.
#This means that this element holds all of the other elements within it, and we'll reference it when we want to filter search results even further.
#The . is used for selecting classes, such as item_list, so the code 'ul.item_list li.slide' pinpoints an <li /> tag with the class of slide that sits inside a <ul /> tag with a class of item_list.
#CSS selectors are read from right to left: the rightmost part (li.slide) names the element that is returned, and the parts to its left (ul.item_list) only describe where it must be nested. Because of this, when using select_one, the first matching element returned will be a <li /> element with a class of slide and all nested elements within it.

In [8]:
# Preview the <div class="content_title"> element inside the parent element
slide_elem.find("div", attrs={"class": "content_title"})

<div class="content_title"><a href="/news/8794/independent-review-indicates-nasa-prepared-for-mars-sample-return-campaign/" target="_self">Independent Review Indicates NASA Prepared for Mars Sample Return Campaign</a></div>

In [9]:
# Use the parent element to find the `content_title` div and save its text as `news_title`
title_elem = slide_elem.find("div", class_='content_title')
news_title = title_elem.get_text()
news_title

'Independent Review Indicates NASA Prepared for Mars Sample Return Campaign'

In [10]:
# From the same parent element, locate the `article_teaser_body` div
# and save its text as `news_p`
teaser_elem = slide_elem.find('div', class_="article_teaser_body")
news_p = teaser_elem.get_text()
news_p

'NASA released an independent review report Tuesday indicating the agency is well positioned for its Mars Sample Return campaign to bring pristine samples from Mars to Earth for scientific study.'

### Featured Images

In [11]:
# Visit the JPL space images page (the query string sets category=Mars)
# in the same browser session; `url` is reassigned here.
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [12]:
# Find and click the full image button:
# look up the element whose id is `full_image`, then click it
full_image_elem = browser.find_by_id('full_image')
full_image_elem.click()

In [13]:
# Give the 'more info' text up to a second to appear,
# then find the link containing that text and click it
more_info_label = 'more info'
browser.is_element_present_by_text(more_info_label, wait_time=1)
more_info_elem = browser.links.find_by_partial_text(more_info_label)
more_info_elem.click()

In [14]:
# Re-read the browser's page source after the clicks and parse it with soup
html = browser.html
img_soup = soup(html, features='html.parser')

In [15]:
# Find the relative image url:
# look inside the <figure class="lede"> tag for an <a> tag, then within that <a> tag
# for an <img> tag, and read that <img> tag's `src` attribute
lede_img = img_soup.select_one('figure.lede a img')
img_url_rel = lede_img.get("src")
img_url_rel

'/spaceimages/images/largesize/PIA23170_hires.jpg'

In [16]:
# Prefix the site's base URL to the relative image path to get an absolute URL
jpl_base_url = 'https://www.jpl.nasa.gov'
img_url = f'{jpl_base_url}{img_url_rel}'
img_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA23170_hires.jpg'

In [18]:
# Mars Facts: read the first HTML table on the page into a DataFrame,
# name its two columns, and use the `description` column as the index
mars_facts_url = 'http://space-facts.com/mars/'
df = pd.read_html(mars_facts_url)[0]
df.columns = ['description', 'value']
df = df.set_index('description')
df

Unnamed: 0_level_0,value
description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [19]:
# Render the Mars facts DataFrame as an HTML <table> string
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>value</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\

In [20]:
# End the automated browser session now that scraping is finished
browser.quit()