In [1]:
# Dependencies
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd

def init_browser():
    # @NOTE: Replace the path with your actual path to the chromedriver
    executable_path = {"executable_path": "chromedriver"}
    return Browser("chrome", **executable_path, headless=False)



In [5]:
def scrape():
    browser = init_browser()
    # Create mars_data dict that we can insert into mongoDB
    mars_data = {}


    # Access and visit the NASA Mars News Site URL
    news_url = 'https://mars.nasa.gov/news/'
    browser.visit(news_url)

    # HTML object
    html = browser.html

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Retrieve all elements that contain news title
    latest_news = soup.find_all('div', class_="list_text")

    # Get the latest news    
    news = latest_news[0]

    # Use BeautifulSoup' find() method to navigate and retrieve attributes
    news_title = news.find('div', class_="content_title")
    news_p = news.find('div', class_="article_teaser_body")

    # Add them to our mars_data dict
    news_title = str(news_title)
    news_p = str(news_p)
    mars_data["news_title"] = str(news_title)
    mars_data["news_p"] = str(news_p)

    # Access and visit the JPL Mars Space Images URL
    featured_img_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(featured_img_url)


    # HTML object
    img_html = browser.html

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(img_html, 'html.parser')

    # Retrieve Featured Mars Image url from style tag 
    featured_image_url  = soup.find('article')['style'].replace('background-image: url(','').replace(');', '')[1:-1]

    # Put the website url together with the features image url
    featured_image_url = 'https://www.jpl.nasa.gov' + featured_image_url

    # Add it to our mars_data dict
    mars_data["featured_image_url"] = str(featured_image_url)


    # Access and visit Mars Weather twitter URL
    mars_twitter_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(mars_twitter_url)

    # HTML object
    twitter_marswx_html = browser.html

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(twitter_marswx_html, 'html.parser')

    # Get the text from the latest Mars weather tweet
    mars_weather = soup.find('p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").text

    # Add it to our mars_data dict
    mars_data["mars_weather"] = str(mars_weather)


    # Access and visit the Mars facts webpage
    mars_facts_url = 'https://space-facts.com/mars/'

    # Get any tabular data from the webpage
    facts_tables = pd.read_html(mars_facts_url)
    facts_tables

    # Slice off the dataframe that we want usin normal indexing
    facts_df = facts_tables[0]
    facts_df.columns = ['Fact', 'Value']

    # Set the index to the `Fact` column
    facts_df.set_index('Fact', inplace=True)

    # Convert the Dataframe to HTML
    html_table = facts_df.to_html()

    # Add facts table to our mars_data dict
    mars_data["facts_table"] = str(html_table)


    # Access and visit the USGS Astrogeology site
    mars_hemis_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_hemis_url)
    xpath = '//div//a[@class="itemLink product-item"]/img'

    # Use splinter to Click the image to bring up the full resolution image
    results = browser.find_by_xpath(xpath)

    # Initiate hemisphere_image_urls list
    hemisphere_image_urls = []

    # Loop over results to get image data
    for i in range(len(results)):
        img = results[i]
                
        img.click()
        
        # Scrape the browser into soup and use soup to find the full resolution image of mars
        # Save the image url to a variable called `img_url`
        mars_usgs_html = browser.html
        soup = BeautifulSoup(mars_usgs_html, 'html.parser')
        partial_img_url = soup.find("img", class_="wide-image")["src"]
        
        img_url = 'https://astrogeology.usgs.gov/' + partial_img_url
        
        # Scrape the browser into soup and use soup to find the title of the image
        # Save the image's title to a variable called `img_title`
        img_title = soup.find('h2', class_="title").text
        
        # Get the data into a dictionary
        img_dict = {
            'img_url': img_url,
            'img_title': img_title
        }
        # Append image dictionaries to the list
        hemisphere_image_urls.append(img_dict)

        browser.back()
        results = browser.find_by_xpath(xpath)
        i = i + 1

    # Add hemispheres dictionary to mars_data dictionary
    mars_data['hemisphere_image_urls'] = hemisphere_image_urls


    # Close the browser after scraping
    browser.quit()

    # Return our mars_data dict
    return mars_data

In [6]:
scrape()

{'news_title': '<div class="content_title"><a href="/news/8568/nasas-treasure-map-for-water-ice-on-mars/" target="_self">NASA\'s Treasure Map for Water Ice on Mars</a></div>',
 'news_p': '<div class="article_teaser_body">A new study identifies frozen water just below the Martian surface, where astronauts could easily dig it up.</div>',
 'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA19808-1920x1200.jpg',
 'mars_weather': 'InSight sol 371 (2019-12-12) low -96.6ºC (-141.9ºF) high -19.8ºC (-3.6ºF)\nwinds from the SW at 5.3 m/s (11.8 mph) gusting to 21.0 m/s (46.9 mph)\npressure at 6.60 hPapic.twitter.com/VKC855F0Mr',
 'facts_table': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Value</th>\n    </tr>\n    <tr>\n      <th>Fact</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar D