In [1]:
# Dependencies
import os
from bs4 import BeautifulSoup as bs
import requests
import pymongo
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

## Step 1 - Scraping

### NASA Mars News

In [2]:
# A plain requests GET was tried first: it raised no error but also returned no results,
# so the pages are loaded in a real Chrome session driven by splinter instead.
# ChromeDriverManager().install() supplies the path of the chromedriver executable.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [C:\Users\kaldm\.wdm\drivers\chromedriver\win32\87.0.4280.88\chromedriver.exe] found in cache


 


In [3]:
# Address of the NASA Mars News site; open it in the automated browser
url_news = 'https://mars.nasa.gov/news/'
browser.visit(url_news)

In [4]:
# Create the soup object from the page as rendered in the browser.
# A plain GET request returned no results for this site, so the article list appears to be
# filled in by JavaScript after the initial load. Give the first teaser up to 10 seconds to
# appear before snapshotting the DOM; otherwise a Restart & Run All can capture the page
# before the headlines exist and the scraping cell fails with an IndexError.
browser.is_element_present_by_css('div.article_teaser_body', wait_time=10)

html = browser.html
soup = bs(html, 'html.parser')

In [5]:
# Pull the most recent headline and its teaser paragraph out of the parsed news page.
# The headline is the second 'content_title' div because the first match sits in the
# navigation bar; the teaser is the first 'article_teaser_body' div on the page.
title_divs = soup.find_all('div', class_='content_title')
title = title_divs[1].text
teaser_divs = soup.find_all('div', class_='article_teaser_body')
paragraph = teaser_divs[0].text

# Show the article info beneath a separator line, one item per line
print('-----------------', title, paragraph, sep='\n')

-----------------
5 Hidden Gems Are Riding Aboard NASA's Perseverance Rover
The symbols, mottos, and small objects added to the agency's newest Mars rover serve a variety of purposes, from functional to decorative.


### JPL Mars Space Images - Featured Image

In [6]:
# JPL Space Images gallery, filtered to the Mars category; open it in the automated browser
url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url_image)

In [7]:
# Locating the large-size image takes two hops. First hop: parse the gallery page and read
# the 'data-link' attribute of the element with id "full_image"; it holds the
# site-relative address of the featured image's detail page.
html = browser.html
soup = bs(html, 'html.parser')
full_image_element = soup.find(id="full_image")
image = full_image_element["data-link"]

# Prefix the site root to obtain an absolute address
feature_image_url = "https://www.jpl.nasa.gov" + image
print(feature_image_url)


https://www.jpl.nasa.gov/spaceimages/details.php?id=PIA18048


In [8]:
# Second hop: open the detail page whose address was built in the previous cell
browser.visit(feature_image_url)

In [9]:
# Parse the detail page and read the 'src' of its main image, which is the
# site-relative path of the large-size jpg.
html = browser.html
soup = bs(html, 'html.parser')
main_image_tag = soup.find('img', class_="main_image")
image_lg = main_image_tag["src"]

# Prefix the site root to obtain the absolute address of the jpg
image_lg_url = "https://www.jpl.nasa.gov" + image_lg
print(image_lg_url)

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA18048_hires.jpg


### Mars Facts

In [10]:
# Address of the Mars facts page; open it in the automated browser.
# NOTE(review): the facts table is later read with pd.read_html(url_facts), which is handed
# the URL rather than browser.html, so this browser visit looks redundant - confirm.
url_facts = 'https://space-facts.com/mars/'
browser.visit(url_facts)

In [11]:
# Use Pandas to scrape the table containing facts about the planet. First import dependency
import pandas as pd

In [37]:
# Read the HTML tables found at the facts URL. pd.read_html returns a list of
# DataFrames (one per table), so `table` is a list, not a single frame.
table = pd.read_html(url_facts)
table

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers,
   Mars - Earth Comparison             Mars            Earth
 0               Diameter:         6,779 km        12,742 km
 1                   Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 2                  Moons:                2                1
 3      Distance from Sun:   227,943,824 km   149,598,262 km
 4         Length of Year:   687 Earth days      365.24 days
 5            Temperature:     -87 to -5 °C      -88 to 58°C,
           

In [39]:
# Take the first DataFrame from the list: the Mars fact sheet. The page holds more than one
# table (the next one is the Mars/Earth comparison), so the [0] index is required.
# .copy() keeps the scraped list in `table` untouched when the columns are renamed later,
# so this cell displays the same frame no matter how often or in what order it is re-run.
planet_df = table[0].copy()
planet_df

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [45]:
# Replace the default 0 / 1 column labels with descriptive header names (modifies planet_df in place)
planet_df.columns = ['Fact Type', 'Answer']
planet_df

Unnamed: 0,Fact Type,Answer
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [57]:
# Render the DataFrame as an HTML table string, leaving out the index column
planet_table_html = planet_df.to_html(index=False)
planet_table_html

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th>Fact Type</th>\n      <th>Answer</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>'

In [58]:
# Remove the newline characters ('\n') from the HTML string; the indentation spaces stay

planet_table_html = planet_table_html.replace('\n', '')
planet_table_html

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th>Fact Type</th>      <th>Answer</th>    </tr>  </thead>  <tbody>    <tr>      <td>Equatorial Diameter:</td>      <td>6,792 km</td>    </tr>    <tr>      <td>Polar Diameter:</td>      <td>6,752 km</td>    </tr>    <tr>      <td>Mass:</td>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <td>Moons:</td>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <td>Orbit Distance:</td>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <td>Orbit Period:</td>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <td>Surface Temperature:</td>      <td>-87 to -5 °C</td>    </tr>    <tr>      <td>First Record:</td>      <td>2nd millennium BC</td>    </tr>    <tr>      <td>Recorded By:</td>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

In [60]:
# Write the table (without the index column) to the file 'table.html', a path relative to the working directory
planet_df.to_html('table.html', index=False)

### Mars Hemispheres

In [18]:
# USGS Astrogeology search results for the enhanced Mars hemisphere images; open in the automated browser
url_astro = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url_astro)

In [19]:
# Collect the title and the full-resolution image URL of each Mars hemisphere.
# Every result on the search page is opened in turn, its detail page is parsed, and the
# findings are stored as one dictionary per hemisphere inside hemi_list.

hemi_list = []

# four hemispheres, reached through the first four <h3> elements of the results page
for i in range(4):

    # follow the i-th heading to that hemisphere's detail page
    browser.find_by_tag('h3')[i].click()

    # snapshot and parse the detail page
    html = browser.html
    soup = bs(html, 'html.parser')

    # site-relative source of the full-resolution photo, made absolute with the site root
    image_1 = soup.find('img', class_='wide-image')['src']
    image_1_url = 'https://astrogeology.usgs.gov' + image_1

    # hemisphere title as shown on the photo page
    title_1 = soup.find('h2', class_='title').text

    # pair the title with its image and keep the record
    hemi_dict = dict(Title=title_1, Image_URL=image_1_url)
    hemi_list.append(hemi_dict)

    # return to the results page listing all four hemispheres before the next pass
    browser.back()


In [20]:
# Check where the browser ended up: it should be back on the original results page
# with all four hemisphere photos
browser.url

'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

In [21]:
# Show each hemisphere record (title + image URL) on its own line
for hemisphere in hemi_list:
    print(hemisphere)

{'Title': 'Cerberus Hemisphere Enhanced', 'Image_URL': 'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'}
{'Title': 'Schiaparelli Hemisphere Enhanced', 'Image_URL': 'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'}
{'Title': 'Syrtis Major Hemisphere Enhanced', 'Image_URL': 'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'}
{'Title': 'Valles Marineris Hemisphere Enhanced', 'Image_URL': 'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}


In [22]:
# Close the automated browser session now that the scraping cells above have run
browser.quit()

## Step 2 - MongoDB and Flask Application

# Reference Code Attempts

In [23]:
# # HTML object
# html = browser.html

# Create BeautifulSoup object; parse with 'html.parser'
# soup = bs(html, 'html.parser')

In [24]:
## Mongo DB stuff

# # Initialize PyMongo to work with MongoDBs
# conn = 'mongodb://localhost:27017'
# client = pymongo.MongoClient(conn)

# # Define database and collection
# db = client.nhl_db
# collection = db.articles

In [25]:
#     # Dictionary to be inserted into MongoDB
#     post = {
#         'title': header,
#         'paragraph': subheader,
#     }

#     # Insert dictionary into MongoDB as a document
#     collection.insert_one(post)

In [26]:
# Retrieve page with the requests module
# response = requests.get(url)

# # collect the latest News Title and Paragraph Text. 
# results = soup.find_all('div', class_='content_title')
# results = soup.find_all('div', class_='list_text')
# results = soup.find_all('div', class_='bottom_gradient')
# results = soup.find_all('h3', class_='list_text')

In [27]:
# for result in results: 
#     titles = soup.find_all('div', class_='content_title')
# #     paragraph = result.find('div', class_='article_teaser_body').text
   
#     # print article info
#     print('-----------------')
#     print(titles)
# #     print(paragraph)

In [28]:
# Print all title texts
# titles = soup.find_all('div', class_='content_title')
# for title in titles:
#     print(title.text)

# titles = soup.find_all('h3')
# for title in titles:
#     print(title)

# titles = soup.find_all('h3')
# for title in titles:
#     print(title.text)

In [29]:
# # Extract title text
# title = soup.title.text
# print (title)

In [30]:
# for result in results: 
#     title = result.find('div', class_='content_title').text
#     paragraph = result.find('div', class_='article_teaser_body').text
   
#     # print article info
#     print('-----------------')
#     print(title)
#     print(paragraph)


In [31]:
# # loop over results to get article title and paragraph and store in lists for later
# titles = []
# for result in results:
#     if (div.content_title):
#         if (div.content_title.text):
#             titles.append(div)
        
#     # scrape the article title 
# #     title = result.find('div', class_='content_title').text
# #     paragraph = result.find('div', class_='article_teaser_body').text


In [32]:
# planet_df.rename(columns={"0": "Property", "1": "Measurement"}, inplace=True)
# planet_df

In [33]:
# *** OTHER CODE **** 
# for title in titles:
#     print(title.text)

# titles = soup.find_all('h3')
# for title in titles:
#     print(title)

# titles = soup.find_all('h3')
# for title in titles:
#     print(title.text)


# title = soup.find_all('div', class_='content_title')[1].text
# paragraph = soup.find_all('div', class_='article_teaser_body')[0].text

In [34]:
# need a for-loop to iterate over the images?

# This works
# images = soup.find('img', class_='thumb')['src']

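# But this doesn't. Error: list indices must be integers or slices, not str - so I guess
# find_all returns a list of tags, and one tag has to be picked by index before ['src'] can be read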
# images = soup.find_all('img', class_='thumb')['src']


# Try this- ok this works, and entering diff index yields different image source, but it provides all the 
# info in the image tag. I want just the source. Need to get just the source, then iterate over these.  
# tried to add src into the params, does not work. SyntaxError: positional argument follows keyword argument
# images = soup.find_all('img', class_='thumb')[0]
# print (images)

# images = soup.find_all('img', class_='thumb')
# print (images)

# # what about just this - ok, this lists all of the image tag contents for each one 
# images = soup.find_all('img', class_='thumb')
# print (images)

In [35]:
#     time.sleep(1)