# Mission to Mars: Scraping

In [1]:
import pandas as pd
import pymongo

from bs4 import BeautifulSoup as bs
from splinter import Browser

In [2]:
# Start a Chrome session driven by the chromedriver binary named below.
# headless=False was used during build and testing; it is set to True for submission.
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser(
    'chrome',
    headless=True,
    **executable_path
)

In [3]:
# Scrape the NASA Mars News site.
#
# The article list may be injected after the initial page load, so wait (up to
# 10 seconds) for the first teaser element before taking the HTML snapshot.
# Without the wait, the snapshot can miss the 'content_title' /
# 'article_teaser_body' divs that the next cell reads, and .find() there would
# return None. If the element never appears the call simply returns False and
# the snapshot is taken anyway, matching the previous behaviour.

news_url = "http://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
browser.visit(news_url)
browser.is_element_present_by_css("div.article_teaser_body", wait_time=10)
news_response = browser.html
soup_news = bs(news_response, 'html.parser')
# print(soup_news.prettify())

In [4]:
# Pull the first headline and its teaser paragraph out of the parsed news page.
# Results are kept in "news_title" and "news_teaser" for later use.

title_div = soup_news.find('div', class_='content_title')
teaser_div = soup_news.find('div', class_='article_teaser_body')

# Surrounding whitespace is stripped from both text values.
news_title = title_div.text.strip()
news_teaser = teaser_div.text.strip()
# print(news_title)
# print(news_teaser)

In [5]:
# Load the JPL Featured Space Image page (Mars category) and parse its HTML.

jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"

browser.visit(jpl_url)
jpl_response = browser.html
soup_jpl = bs(markup=jpl_response, features='html.parser')
# print(soup_jpl.prettify())

In [6]:
# Build "featured_image_url" for the current Featured Mars Image.
# The path is read from the 'data-fancybox-href' attribute of the first <a>
# inside the page's <footer>; the intent is the full-size .jpg, not a thumbnail.

jpl_base_url = "https://www.jpl.nasa.gov"

footer = soup_jpl.find('footer')
featured_img_a = footer.find('a')
url_tag = featured_img_a['data-fancybox-href']

# The attribute value is appended to the site root to form the complete URL.
featured_image_url = jpl_base_url + url_tag
# print(featured_image_url)

In [7]:
# Load the Mars weather report Twitter page and parse its HTML.

mars_twitter_url = "https://twitter.com/marswxreport?lang=en"

browser.visit(mars_twitter_url)
twitter_response = browser.html
soup_twitter = bs(markup=twitter_response, features='html.parser')
# print(soup_twitter.prettify())

In [8]:
# Scrape the weather tweet and assign its text to "mars_weather".
# find() returns the first matching <p>, which is expected to be the latest tweet.

latest_tweet = soup_twitter.find('p', class_="TweetTextSize--normal")
mars_weather = latest_tweet.text
# print(mars_weather)

In [9]:
# Load the Mars Facts page and parse its HTML.
# NOTE(review): the facts table is read by pandas directly from the URL in the
# following cell; soup_fact is not used by the cells visible here - confirm
# whether it is still needed.

mars_facts_url = "https://space-facts.com/mars/"

browser.visit(mars_facts_url)
fact_response = browser.html
soup_fact = bs(markup=fact_response, features='html.parser')
# print(soup_fact.prettify())

In [10]:
# Let pandas parse every HTML table found at the Mars Facts URL.
# read_html returns a list of DataFrames, one per table.

mars_table = pd.read_html(io=mars_facts_url)

# NOTE(review): index 1 picks the second table on the page - confirm this is
# the intended Mars info table.
mars_facts_df = mars_table[1]
# mars_facts_df

In [11]:
# Render the facts DataFrame as an HTML table string, leaving out both the
# row index and the column header row.
mars_facts_html = mars_facts_df.to_html(
    header=False,
    index=False,
)
# mars_facts_html

In [12]:
# Load the USGS Astrogeology search results for the enhanced Mars hemisphere
# images and parse the HTML. The high-resolution image links themselves are
# collected afterwards by clicking through each result.

astro_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

browser.visit(astro_url)
astro_response = browser.html
soup_astro = bs(markup=astro_response, features='html.parser')
# print(soup_astro)

In [13]:
# Every "description" div on the results page is treated as one hemisphere entry.
hem_image_set = soup_astro.find_all("div", class_="description")

# Collected as a list of {'title': ..., 'img_url': ...} dicts.
mars_image_urls = []

for link in hem_image_set:
    heading = link.find("h3").text
    hem_name = heading[:-9]     # heading minus its last 9 characters
    hem_search = heading[:5]    # first 5 characters, used to find the link to click

    # Open the hemisphere's page, then follow its "Sample" link.
    browser.click_link_by_partial_text(hem_search)
    browser.click_link_by_partial_text("Sample")

    # The image URL is taken from the second browser window, which is then
    # closed before navigating back for the next hemisphere.
    url_dict = {'title': hem_name, 'img_url': browser.windows[1].url}
    browser.windows[1].close()
    browser.back()

    mars_image_urls.append(url_dict)