In [1]:
# Dependencies
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import pymongo
from selenium import webdriver
import time

In [2]:
# Set up splinter: build the driver keyword arguments, then open a Chrome session.
# ChromeDriverManager().install() supplies the chromedriver path; the cell output
# below shows it reusing a driver that was already in the local cache.
executable_path = {'executable_path': ChromeDriverManager().install()}
# Alternative kept for reference: point at a fixed local driver instead of resolving one.
# executable_path = {'executable_path': r'C:\Users\pippi\.wdm\drivers\chromedriver\win32\93.0.4577.63\chromedriver.exe'}
# headless=False presumably opens a visible Chrome window -- confirm against the splinter docs.
# NOTE(review): no visible cell closes this session; consider browser.quit() once scraping is done.
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 93.0.4577
Get LATEST driver version for 93.0.4577
Driver [C:\Users\pippi\.wdm\drivers\chromedriver\win32\93.0.4577.63\chromedriver.exe] found in cache


In [3]:
# Scrape NASA Mars News
# Open the news site in the splinter-controlled browser.
newsUrl = 'https://redplanetscience.com/'
browser.visit(newsUrl)
# Check for a 'div.list_text' element, allowing wait_time=1 (presumably seconds) for it to appear.
# NOTE(review): the boolean result is discarded, so a missing element is not detected here.
browser.is_element_present_by_css('div.list_text', wait_time=1)
# Snapshot the current page HTML; the next cell parses it.
html = browser.html

In [4]:
# Create soup object
# Parses the HTML snapshot taken from the news page with the built-in 'html.parser'.
newsSoup = bs(html, 'html.parser')

In [5]:
# Parse out specific title
# select_one returns only the first 'div.list_text' match, i.e. the first news item on the page.
div_level = newsSoup.select_one('div.list_text')
# Bare expression: displays the matching title <div> as the cell output (nothing is stored).
div_level.find('div', class_='content_title')

<div class="content_title">NASA's Mars 2020 Heads Into the Test Chamber</div>

In [6]:
# Keep only the headline text of the first news item (markup stripped)
news_title = div_level.find('div', class_='content_title').get_text()

In [7]:
# Show the scraped headline as plain text.
print(news_title)

NASA's Mars 2020 Heads Into the Test Chamber


In [8]:
# Scrape the paragraph
# Re-selects the same first 'div.list_text' block that the title cell used.
div_level = newsSoup.select_one('div.list_text')
# Bare expression: displays the teaser <div> as the cell output (nothing is stored).
div_level.find('div', class_='article_teaser_body')

<div class="article_teaser_body">In this time-lapse video taken at JPL, engineers move the Mars 2020 rover into a large vacuum chamber for testing in Mars-like environmental conditions.</div>

In [9]:
# Keep only the teaser text of the first news item (markup stripped)
news_p = div_level.find('div', class_='article_teaser_body').get_text()

In [10]:
# Show the scraped teaser paragraph as plain text.
print(news_p)

In this time-lapse video taken at JPL, engineers move the Mars 2020 rover into a large vacuum chamber for testing in Mars-like environmental conditions.


In [11]:
# Scrape the Mars space images
# Point the shared browser session at the space-images site.
imageUrl = 'https://spaceimages-mars.com/'
browser.visit(imageUrl)

In [12]:
# Click the second <button> on the page (index 1).
# Presumably this is the button that opens the full-size featured image -- confirm against the page.
# NOTE(review): selecting by position breaks if the page gains or loses a button.
fullImage = browser.find_by_tag("button")[1]
fullImage.click()

In [13]:
# Create soup object
# Re-reads the browser HTML after the click; this rebinds the shared `html` name
# that earlier held the news-page snapshot.
html = browser.html
imageSoup = bs(html, 'html.parser')

In [14]:
# Find the featured image: take the first <img> whose class is 'headerimage fade-in'
# and read its src attribute (a site-relative path, as the printed output shows).
# NOTE(review): if no such <img> exists, find() returns None and the ['src'] lookup fails.
image_src = imageSoup.find('img', class_='headerimage fade-in')['src']
print(image_src)

image/featured/mars3.jpg


In [15]:
# Turn the site-relative image path into an absolute URL
featured_image_url = f'https://spaceimages-mars.com/{image_src}'
print(featured_image_url)

https://spaceimages-mars.com/image/featured/mars3.jpg


In [16]:
# Scrape the Mars facts
# Set up the tables
# pd.read_html fetches the page itself (no browser involved); the output of the
# next cell shows it yields a list containing two DataFrames.
factsUrl = 'https://galaxyfacts-mars.com/'
factTables = pd.read_html(factsUrl)

In [17]:
# Inspect every table that was parsed from the facts page.
factTables

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [18]:
# Set up DF
# Use the first parsed table, the Mars - Earth comparison.
# NOTE(review): this binds the same object as factTables[0]; it is not a copy.
mars_df = factTables[0]
mars_df.head()

Unnamed: 0,0,1,2
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"


In [19]:
# Drop the Earth column (label 2) so only the description and Mars values remain.
# errors='ignore' keeps this cell re-runnable: on a second run column 2 is already
# gone from mars_df, and the default errors='raise' would stop with a KeyError.
mars_df = mars_df.drop(columns=2, errors='ignore')
mars_df.head()

Unnamed: 0,0,1
0,Mars - Earth Comparison,Mars
1,Diameter:,"6,779 km"
2,Mass:,6.39 × 10^23 kg
3,Moons:,2
4,Distance from Sun:,"227,943,824 km"


In [20]:
# Relabel the top-left cell (row 0, column 0): 'Mars - Earth Comparison'
# becomes 'Name' now that the Earth column is gone. Nothing is dropped here.
mars_df.loc[0, 0] = 'Name'
mars_df.head()

Unnamed: 0,0,1
0,Name,Mars
1,Diameter:,"6,779 km"
2,Mass:,6.39 × 10^23 kg
3,Moons:,2
4,Distance from Sun:,"227,943,824 km"


In [21]:
# Plain-text dump of the full cleaned facts table.
print(mars_df)

                    0                1
0                Name             Mars
1           Diameter:         6,779 km
2               Mass:  6.39 × 10^23 kg
3              Moons:                2
4  Distance from Sun:   227,943,824 km
5     Length of Year:   687 Earth days
6        Temperature:     -87 to -5 °C


In [22]:
# Save mars_df to html
# Writes the table to the relative path 'Mars.html'; a file already at that path is presumably replaced -- confirm.
mars_df.to_html('Mars.html')

In [23]:
# Scrape Mars Hemispheres
# Point the shared browser session at the hemispheres listing page.
hemisphereUrl = 'https://marshemispheres.com/'
browser.visit(hemisphereUrl)

In [24]:
# Snapshot the hemispheres listing page and parse it into a soup object.
# NOTE(review): in the visible notebook, hemisphereSoup is referenced only by
# commented-out code; the scraping loop works through the browser instead.
html = browser.html
hemisphereSoup = bs(html, 'html.parser')

In [25]:
# Collect the "Sample" image link and the title for every hemisphere.
# Each thumbnail is clicked in turn, the page that opens supplies the link and
# the title, and browser.back() then returns to the listing page.
hemisphere_image_url = []
links = browser.find_by_css('a.itemLink img')
for link in range(len(links)):
    # Look the thumbnails up again on every pass rather than reusing `links`:
    # the browser has navigated away and back since that lookup was made.
    browser.find_by_css('a.itemLink img')[link].click()
    # Named sample_link, not imageUrl, so the URL string that the space-images
    # cell stored in imageUrl is not replaced by a browser element.
    sample_link = browser.find_by_text("Sample").first
    hemisphere_image_url.append({
        "img_url": sample_link["href"],
        "title": browser.find_by_css('h2.title').text,
    })
    browser.back()

In [26]:
# Show the collected list of {'img_url', 'title'} records, one per hemisphere.
print(hemisphere_image_url)

[{'img_url': 'https://marshemispheres.com/images/full.jpg', 'title': 'Cerberus Hemisphere Enhanced'}, {'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg', 'title': 'Schiaparelli Hemisphere Enhanced'}, {'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg', 'title': 'Syrtis Major Hemisphere Enhanced'}, {'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg', 'title': 'Valles Marineris Hemisphere Enhanced'}]


In [27]:
# links = browser.find_by_css('a.itemLink img')
# for link in range(len(links)):
    # hemisphere_image_urls = {}
    # Click the link
    # browser.find_by_css('a.itemLink img')[link].click()
    # Retrieve the image link
    # sample = browser.find_link_by_text("Sample").first
    # hemisphere_image_urls['img_url'] = sample['href']
    # Retrieve the image title
    # title = browser.find_by_css('h2.title').text
    # hemisphere_image_urls['title'] = title
    # browser.back()
    
    # title = browser.find_by_css('h2.title')\
    # title = link.find('h2', class_='title').text
    #title = hemisphereSoup.find('h2', class_='title').text
    # titles_list.append(title)
    # hemisphere_dict['title'] = title
    # titles_list.append(title)
    # href = browser.links.find_by_text('Sample')
    # hemisphere_dict['image url']= href['href']
    # title = hemisphereSoup.find('h3').text
    # titles_list.append(title)
    # html = browser.html
    # pictureSoup = bs(html, 'html.parser')
    # img_url_href = hemisphereSoup.find('a', target='_blank')['href']
    # img_url = 'https://marshemispheres.com/' + img_url_href
    # img_url_list.append(img_url)
    # original = browser.find_by_css('a._blank').first
    
    # hemisphere_image_urls.append({'title': titles_list, 'img_url': img_url_list})