## Web Scraping Homework - Mission to Mars

## Step 1 - Scraping

### Scrape NASA Mars News

In [None]:
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [None]:
# Obtain a ChromeDriver path from webdriver_manager and start a visible
# (non-headless) Chrome session controlled through splinter.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Open the NASA Mars news listing in the automated browser
news_url = 'https://mars.nasa.gov/news/'
browser.visit(news_url)

# The news items may be added to the page after the initial load
# (NOTE(review): confirm against the live site). Wait up to 5 seconds for the
# first "li.slide" element so the snapshot below actually contains it;
# the call returns as soon as the element is present.
browser.is_element_present_by_css('li.slide', wait_time=5)

# Snapshot the page source for parsing
news_html = browser.html

In [None]:
# Build a BeautifulSoup tree from the captured page source and keep the first
# element matching "li.slide" (the first news item in document order).
soup = BeautifulSoup(markup=news_html, features='html.parser')
latest_slide = soup.select_one('li.slide')

In [None]:
# Inspect the title container of the latest slide (shown as the cell output)
latest_slide.find(name='div', class_='content_title')

In [None]:
# Extract the headline text from the latest slide's title <div> and show it
news_title = latest_slide.find(name='div', class_='content_title').get_text()
print(news_title)

In [None]:
# Extract the teaser paragraph text from the latest slide and show it
news_para = latest_slide.find(name='div', class_='article_teaser_body').get_text()
print(news_para)

### Scrape JPL Mars Space Images - Featured Image

In [None]:
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [None]:
# Obtain a ChromeDriver path from webdriver_manager and start a visible
# (non-headless) Chrome session controlled through splinter.
# NOTE(review): this rebinds `browser` to a new session; a session opened by
# an earlier section (if any) is not closed here.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Load the JPL Space Images page in the browser and keep its HTML source
# for parsing in the following cells.
image_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(image_url)
images_html = browser.html

In [None]:
# Build a BeautifulSoup tree from the JPL page source
soup = BeautifulSoup(markup=images_html, features='html.parser')

In [None]:
# Read the src attribute of the featured header <img> and display it
img_url = soup.find(name='img', class_='headerimage fade-in')['src']
img_url

In [None]:
# Resolve the image's src against the page URL to obtain an absolute URL.
# The page URL ends in a document name ("index.html"), so gluing the two
# strings together with "/" would produce ".../index.html/<src>", which is not
# where the image lives. urljoin replaces the document name with the relative
# path, and returns the src unchanged if it is already absolute.
from urllib.parse import urljoin

featured_image_url = urljoin(image_url, img_url)
print(featured_image_url)

### Scrape Space Facts URL

In [None]:
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [None]:
# Obtain a ChromeDriver path from webdriver_manager and start a visible
# (non-headless) Chrome session controlled through splinter.
# NOTE(review): this rebinds `browser` to a new session; a session opened by
# an earlier section (if any) is not closed here.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Load the Space Facts page for Mars in the browser and keep its HTML source
mars_facts_url = 'https://space-facts.com/mars/'
browser.visit(mars_facts_url)
facts_html = browser.html

In [None]:
# Build a BeautifulSoup tree from the Space Facts page source
soup = BeautifulSoup(markup=facts_html, features='html.parser')

In [None]:
# Let pandas fetch the page itself and parse the tables it finds into a list
# of DataFrames; the list is displayed as the cell output.
mars_fact_table = pd.read_html(io=mars_facts_url)
mars_fact_table

In [None]:
# Keep the first table parsed from the page and label its two columns.
# NOTE(review): assumes the first table has exactly two columns; assigning a
# two-item list to .columns fails otherwise.
mars_fact_df = mars_fact_table[0]
mars_fact_df.columns = ['Description', 'Data']
mars_fact_df

In [None]:
# Render the facts DataFrame as an HTML table string and display it
mars_facts_html_table = mars_fact_df.to_html()
mars_facts_html_table

In [None]:
# Strip newline characters from the generated table markup. str.replace
# returns a new string and leaves the original untouched, so the result is
# assigned back to keep the cleaned markup instead of only displaying it.
mars_facts_html_table = mars_facts_html_table.replace('\n', '')
mars_facts_html_table

### Scrape USGS Astrogeology URL

In [None]:
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [None]:
# Obtain a ChromeDriver path from webdriver_manager and start a visible
# (non-headless) Chrome session controlled through splinter.
# NOTE(review): this rebinds `browser` to a new session; a session opened by
# an earlier section (if any) is not closed here.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Load the USGS Astrogeology search results for enhanced Mars hemisphere
# images in the browser and keep the page's HTML source.
usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(usgs_url)
usgs_html = browser.html

In [None]:
# Build a BeautifulSoup tree from the USGS results page source
soup = BeautifulSoup(markup=usgs_html, features='html.parser')

In [None]:
# Collect one {"title": ..., "img_url": ...} record per hemisphere listed on
# the search results page.
hemisphere_image_urls = []

# Look the result headings up once to learn how many hemispheres there are.
links = browser.find_by_css("a.product-item h3")

# Iterate by position rather than over `links` directly: each pass navigates
# to a detail page and back, so the result list is queried again every time
# instead of reusing element handles obtained before the navigation.
for index in range(len(links)):
    hemisphere = {}

    # Open the detail page of the hemisphere at this position
    browser.find_by_css("a.product-item h3")[index].click()

    # The detail page heading is used as the hemisphere title
    hemisphere["title"] = browser.find_by_css("h2.title").text

    # The anchor labelled "Sample" carries the image URL in its href.
    # browser.links.find_by_text replaces the deprecated
    # browser.find_link_by_text, which later splinter releases removed.
    sample_image = browser.links.find_by_text("Sample").first
    hemisphere["img_url"] = sample_image["href"]

    # Store the record for this hemisphere
    hemisphere_image_urls.append(hemisphere)

    # Return to the search results before handling the next hemisphere
    browser.back()

In [None]:
# Display the collected list of hemisphere records (title and image URL each)
hemisphere_image_urls