In [1]:
# import dependencies

import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup as bs
from pprint import pprint
import requests
import time

In [2]:

# Initialize the Splinter browser object, driving Chrome through the
# chromedriver binary named below, with a visible (non-headless) window.
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

In [3]:
## PART 1: NASA Mars News

# Open the NASA Mars news listing in the Splinter-controlled browser.
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

In [4]:
# Beautiful Soup time!

# Wait (up to 10 s) for the article list to appear before snapshotting the
# page. `soup` is a static copy of the HTML at this moment, so parsing too
# early would leave it without the `div.list_text` element that the next cell
# looks up; a sleep placed after this point cannot help.
browser.is_element_present_by_css('div.list_text', wait_time=10)

html = browser.html
soup = bs(html, 'html.parser')

In [5]:
# Find the most recent article's title and teaser paragraph.
# No sleep here: `soup` was already parsed in the previous cell, so pausing
# at this point cannot change what it contains.
latest_article = soup.find('div', class_='list_text')
if latest_article is None:
    # Fail with a clear message instead of "'NoneType' object has no attribute 'find'".
    raise ValueError(
        "No <div class='list_text'> found in the parsed page; "
        "the news list was probably not loaded when the HTML was captured."
    )

news_title_div = latest_article.find('div', class_='content_title')
print(news_title_div)

new_parag_div = soup.find('div', class_='article_teaser_body')
new_parag_div

<div class="content_title"><a href="/news/8765/ai-is-helping-scientists-discover-fresh-craters-on-mars/" target="_self">AI Is Helping Scientists Discover Fresh Craters on Mars</a></div>


<div class="article_teaser_body">It's the first time machine learning has been used to find previously unknown craters on the Red Planet.</div>

In [6]:
# Pull the plain headline text out of the title div and display it.
news_title = news_title_div.text
news_title

'AI Is Helping Scientists Discover Fresh Craters on Mars'

In [7]:
# Pull the plain summary text out of the teaser div and display it.
news_parag = new_parag_div.text
news_parag

"It's the first time machine learning has been used to find previously unknown craters on the Red Planet."

In [8]:
## PART 2: JPL Featured Space Image

# Open the JPL space-images search page with the category set to Mars.
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [9]:
# Use Splinter to find the element with id "full_image" and click it.
full_image_button = browser.find_by_id('full_image')
full_image_button.click()

In [10]:
# Use Splinter to find the "more info" button and click it.
more_info_text = 'more info'

# Not every element is available at the same moment, so poll for an element
# with this text for up to `wait_time` seconds. The boolean result is not
# used; the call only acts as a bounded wait.
browser.is_element_present_by_text(more_info_text, wait_time=1)

# The button has no id, so locate the link by (partial) text instead.
more_info_button = browser.links.find_by_partial_text(more_info_text)
more_info_button.click()

In [11]:
# Snapshot the page the browser is now showing and parse it so the image
# markup can be searched.
html = browser.html
soup = bs(markup=html, features='html.parser')


In [12]:
# Build the absolute URL of the featured image.
# The lookups are done one step at a time so that a missing element raises a
# clear error instead of "'NoneType' object has no attribute 'find'".
lede_figure = soup.find('figure', class_='lede')
main_image = lede_figure.find('img', class_='main_image') if lede_figure is not None else None
relative_img_url = main_image.get('src') if main_image is not None else None
if not relative_img_url:
    raise ValueError(
        "Featured image not found: expected <figure class='lede'> containing "
        "<img class='main_image'> with a src attribute."
    )

# `src` is a site-relative path, so prefix it with the JPL host.
feature_image_url = f'https://www.jpl.nasa.gov{relative_img_url}'
feature_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA15283_hires.jpg'

In [13]:
# PART 3: Mars Facts

# Table scraping

# Page whose HTML tables are read in the next cell.
url = 'https://space-facts.com/mars/'

In [14]:
# Scrape every HTML table at the url; pd.read_html returns them as a list of
# DataFrames.
tables = pd.read_html(url)

In [15]:
# Sanity check: confirm what kind of object read_html returned.
type(tables)

list

In [16]:
# Display the first scraped table.
tables[0]

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [17]:
# Save the first table to its own dataframe.
# Take a copy rather than an alias: the following cells relabel and re-index
# this frame, and doing that to `tables[0]` itself would make those cells fail
# when re-run without scraping again.
mars_stats_df = tables[0].copy()

In [18]:
# Give the two scraped columns readable headers, then display the frame.
column_labels = ['Description', 'Mars']
mars_stats_df.columns = column_labels
mars_stats_df

Unnamed: 0,Description,Mars
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [19]:
# Set the Description column as the index.
# https://beenje.github.io/blog/posts/parsing-html-tables-in-python-with-pandas/
# Guarded and non-inplace so the cell can be re-run: once 'Description' has
# been moved into the index, calling set_index again would raise a KeyError.
if mars_stats_df.index.name != 'Description':
    mars_stats_df = mars_stats_df.set_index('Description')
mars_stats_df

Unnamed: 0_level_0,Mars
Description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [20]:
# Write the dataframe as an HTML table to the file 'mars_facts.html'.
# Note: because a path is passed, to_html writes the file; it does not hand
# back the HTML as a string here.
mars_stats_df.to_html('mars_facts.html')

In [21]:
## Part 4: Mars Hemispheres

# Open the USGS Astrogeology search results for enhanced Mars hemisphere images.
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)


In [22]:
# Hemisphere images: the four result pages are scraped the same way, so one
# helper plus a loop replaces four copy-pasted cells.

def scrape_hemisphere_img_url(browser, link_text, wait_time=5):
    """Return the absolute URL of the image shown on one hemisphere page.

    Starting from the search-results page, click the link whose text contains
    ``link_text``, read the ``src`` of the ``img.wide-image`` element on the
    page that opens, and navigate back to the results page.

    Args:
        browser: Splinter browser currently showing the results page.
        link_text: Partial text of the result link to follow.
        wait_time: Seconds to wait for the image element to appear.

    Returns:
        'https://astrogeology.usgs.gov' followed by the image's ``src`` path.

    Raises:
        ValueError: If no ``img.wide-image`` with a ``src`` is on the page.
    """
    # Short pause so the results page is ready (also after browser.back()).
    time.sleep(1)
    browser.links.find_by_partial_text(link_text).click()
    try:
        # Give the image a chance to appear before snapshotting the HTML.
        browser.is_element_present_by_css('img.wide-image', wait_time=wait_time)
        page = bs(browser.html, 'html.parser')
        image = page.find('img', class_='wide-image')
        relative_url = image.get('src') if image is not None else None
        if not relative_url:
            raise ValueError(
                f"No 'img.wide-image' source found after following '{link_text}'."
            )
        return f'https://astrogeology.usgs.gov{relative_url}'
    finally:
        # Always go back, so the next lookup starts from the results page
        # even when this one failed.
        browser.back()


# Scrape each hemisphere in turn, keyed by its short name.
hemisphere_img_urls = {
    name: scrape_hemisphere_img_url(browser, f'{name} Hemisphere Enhanced')
    for name in ('Cerberus', 'Schiaparelli', 'Syrtis Major', 'Valles Marineris')
}

# Keep the individual names that the mars_hemis cell below relies on.
cerberus_img_url = hemisphere_img_urls['Cerberus']
schiaparelli_img_url = hemisphere_img_urls['Schiaparelli']
syrtis_img_url = hemisphere_img_urls['Syrtis Major']
valles_img_url = hemisphere_img_urls['Valles Marineris']

# Last expression, so the scraped URLs are actually displayed.
hemisphere_img_urls

In [26]:
# Hemisphere images: pair each display title with its scraped image URL and
# turn every pair into a {"title", "img_url"} record.
hemisphere_pairs = (
    ("Cerberus Hemisphere", cerberus_img_url),
    ("Schiaparelli Hemisphere", schiaparelli_img_url),
    ("Syrtis Major Hemisphere", syrtis_img_url),
    ("Valles Marineris Hemisphere", valles_img_url),
)
mars_hemis = [
    {"title": title, "img_url": img_url}
    for title, img_url in hemisphere_pairs
]



In [27]:
# Display the assembled list of hemisphere title / image-URL records.
mars_hemis

[{'title': 'Cerberus Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]