# Step 1 - Web Scraping

In [1]:
import pymongo
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser
import pandas as pd

In [2]:
# Location of the ChromeDriver binary that is passed to splinter.
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running the notebook on another machine.
executable_path = {'executable_path': 'c:/bin/chromedriver.exe'}
# Start a visible (non-headless) Chrome session; the cells below reuse `browser`.
browser = Browser('chrome', **executable_path, headless=False)

## NASA Mars News

In [3]:
# Open the NASA Mars news listing in the shared browser session.
marsNewsUrl = 'https://mars.nasa.gov/news/'
browser.visit(marsNewsUrl)

In [4]:
# Optional: retrieve the page with the requests module instead of the browser
# response = requests.get(marsNewsUrl)

# Create BeautifulSoup object; parse with 'html.parser'
# soup = bs(response.text,'html.parser')

# Examine the results, then determine the element that contains the sought info
# print(soup.prettify())

In [5]:
# Parse the HTML currently loaded in the browser.
html = browser.html
soup = bs(html, 'html.parser')

# Each <div class="list_text"> is treated as one news item.
allSlides = soup.find_all('div', class_="list_text")
rootUrl = 'https://mars.nasa.gov'

titles = []
dates = []
previews = []
links = []

# Collect the date, title, link and teaser text of every article on the page.
for slide in allSlides:
    listDate = slide.find('div', class_="list_date")
    slideTitle = slide.find('div', class_="content_title")
    title = slideTitle.find('a') if slideTitle is not None else None
    preview = slide.find('div', class_="article_teaser_body")

    # Skip malformed slides: this keeps the four lists the same length and
    # stops one missing element from aborting the whole scrape.
    if listDate is None or title is None or preview is None or not title.get('href'):
        continue

    # The anchor's href is site-relative, so prefix it with the site root.
    link = rootUrl + title['href']

    # Strip surrounding whitespace left over from the page markup.
    titles.append(title.text.strip())
    dates.append(listDate.text.strip())
    links.append(link)
    previews.append(preview.text.strip())

In [6]:
# Assemble the scraped columns into one table, one row per article.
slideData = {
    'Title': titles,
    'Date': dates,
    'Link': links,
    'Preview': previews,
}
slideDf = pd.DataFrame(slideData)
slideDf.head()

Unnamed: 0,Title,Date,Link,Preview
0,NASA Readies Perseverance Mars Rover's Earthly...,"September 4, 2020",https://mars.nasa.gov/news/8749/nasa-readies-p...,Did you know NASA's next Mars rover has a near...
1,NASA Engineers Checking InSight's Weather Sensors,"August 24, 2020",https://mars.nasa.gov/news/8744/nasa-engineers...,An electronics issue is suspected to be preven...
2,Follow NASA's Perseverance Rover in Real Time ...,"August 21, 2020",https://mars.nasa.gov/news/8742/follow-nasas-p...,A crisply rendered web application can show yo...
3,NASA Establishes Board to Initially Review Mar...,"August 14, 2020",https://mars.nasa.gov/news/8737/nasa-establish...,The board will assist with analysis of current...
4,NASA's Ingenuity Mars Helicopter Recharges Its...,"August 13, 2020",https://mars.nasa.gov/news/8736/nasas-ingenuit...,Headed to the Red Planet with the Perseverance...


In [7]:
# Latest article
# Assumes the first row of the dataframe is the most recent article.
latestArticleTitle = slideDf.loc[0, 'Title']
latestArticlePreview = slideDf.loc[0, 'Preview']

print(latestArticleTitle, latestArticlePreview, sep='\n')

NASA Readies Perseverance Mars Rover's Earthly Twin 
Did you know NASA's next Mars rover has a nearly identical sibling on Earth for testing? Even better, it's about to roll for the first time through a replica Martian landscape.


## JPL Mars Space Images - Featured Image

In [8]:
# Open the JPL space-images page, filtered to the Mars category by the query string.
MarsPicUrl = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(MarsPicUrl)

In [9]:
# Parse the HTML currently loaded in the browser with the built-in 'html.parser'.
# Note: this rebinds the notebook-wide `html` and `soup` names.
html = browser.html
soup = bs(html,'html.parser')
# Uncomment to inspect the page structure:
# print(soup.prettify())

In [10]:
# Every <a class="fancybox"> carries a site-relative image path in its
# "data-fancybox-href" attribute.
rootImgUrl = 'https://www.jpl.nasa.gov'
allImages = soup.find_all('a', class_="fancybox")

# Prefix each relative path with the site root to get absolute image URLs.
marsImg = [rootImgUrl + anchor['data-fancybox-href'] for anchor in allImages]

In [11]:
# Any image on the page is acceptable as the featured image. Index 10 is the
# preferred pick; fall back to the last image when the page lists fewer.
if not marsImg:
    raise IndexError('no "fancybox" images were found on the JPL page')
featuredIdx = min(10, len(marsImg) - 1)
featured_image_url = marsImg[featuredIdx]
print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA24076_hires.jpg


## Mars Facts

In [12]:
# Open the Mars facts page in the shared browser session.
marsFactUrl = 'https://space-facts.com/mars/'
browser.visit(marsFactUrl)

In [13]:
# Parse the facts page currently loaded in the browser.
# NOTE(review): the table-extraction cell that follows reads the URL via
# pd.read_html and does not appear to use this `soup` - confirm it is needed.
html = browser.html
soup = bs(html,'html.parser')

In [14]:
# pd.read_html returns a list of DataFrames, one per HTML table it finds at the URL.
marsFact = pd.read_html(marsFactUrl)
# Use the first table on the page.
marsFactDf = marsFact[0]
# Name the two columns (this renames them on marsFact[0] in place as well).
marsFactDf.columns = ['Fact', 'Value']
# Index by fact label so each row reads as "label -> value".
marsFactDf= marsFactDf.set_index('Fact')
marsFactDf

Unnamed: 0_level_0,Value
Fact,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


## Mars Hemispheres

In [15]:
# Open the USGS Astrogeology search results for the enhanced Mars hemisphere images.
marsHemiUrl = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(marsHemiUrl)

In [16]:
# Parse the search-results page currently loaded in the browser.
html = browser.html
soup = bs(html, 'html.parser')

# Each <div class="description"> is one search result (one hemisphere).
rootUrl = 'https://astrogeology.usgs.gov'
allHemi = soup.find_all('div', class_="description")

# Pair every hemisphere title with the absolute URL of its detail page.
zipHemi = []
for hemi in allHemi:
    anchor = hemi.find('a')
    pageUrl = rootUrl + anchor['href']
    heading = hemi.find('h3').text
    zipHemi.append((heading, pageUrl))

# Keep the separate title and URL lists alongside the paired list.
hemiTitles = [heading for heading, _ in zipHemi]
hemiMainUrl = [pageUrl for _, pageUrl in zipHemi]

In [17]:
# Visit each hemisphere's detail page and record the link to its image.
marsHemiImg = []
for curTitle, curMainUrl in zipHemi:
    browser.visit(curMainUrl)
    html = browser.html
    soup = bs(html, 'html.parser')
    # The first anchor inside the "wide-image-wrapper" div holds the image link.
    wrapper = soup.find('div', class_="wide-image-wrapper")
    marsHemiImg.append({'title': curTitle, 'image_url': wrapper.find('a')['href']})

print(marsHemiImg)

[{'title': 'Cerberus Hemisphere Enhanced', 'image_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'image_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}, {'title': 'Syrtis Major Hemisphere Enhanced', 'image_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'}, {'title': 'Valles Marineris Hemisphere Enhanced', 'image_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]
