In [1]:
# Importing Dependencies
from bs4 import BeautifulSoup
import requests
from splinter import Browser
import pandas as pd
import time

In [2]:
# Setting up the Search Engine (this is necessary for Python to run a browser to allow scraping)

# (This executable file is necessary for the rest of the assignment to run, it basically runs a fork of Chrome to allow 
# commands to be run.) [P.S.: I removed the personal path I used for it, mainly for security reasons.]
exe_path = {"executable_path": r"<file path location>\chromedriver.exe"}
# (The "r"  is needed to be before the driver path as it needs for it to be converted to a raw string in order to work)

# [Extra Note: There are other methods of doing this, but I figured this would be easier as having it locally is better
# than having to install it every single time I want to use it for scraping data. However, I did use web install for
# everything else.]

# (This handles the bot navigation through each of the websites by using it as a variable down the line)
browser = Browser("chrome", **exe_path, headless=False)

In [3]:
# Estabilishing Connection to NASA site (the website was given to us) 
# [The latest headline will change every time this cell is run as there is always something new each day]
url = "https://redplanetscience.com/"
browser.visit(url)
time.sleep(3)

In [4]:
# Using results from the Driver and using BeautifulSoup to parse information (notice that its using the variable)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [5]:
# Finding the most recent article title and body description (the variables for it were given to us from the homework)
news_title = soup.find('div', class_='content_title')
news_p = soup.find('div', class_='article_teaser_body')
print(news_title, news_p)

<div class="content_title">How NASA's Perseverance Mars Team Adjusted to Work in the Time of Coronavirus </div> <div class="article_teaser_body">Like much of the rest of the world, the Mars rover team is pushing forward with its mission-critical work while putting the health and safety of their colleagues and community first.</div>


In [6]:
# Having the Chrome Driver go to the JPL image website (link was given to us)
url2 = "https://spaceimages-mars.com/"
browser.visit(url2)
time.sleep(3)
# (The browser has to be re-parsed each time you want to gather data from)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [7]:
# Gathering data from the featured image at the top of the website

# (This is telling to gather all image links and place them in a list, then have it display the first one in said list)
img = [i.get("src") for i in soup.find_all("img", class_="headerimage fade-in")]
img[0]

# (varible was given to us)
featured_image_url = url2 + img[0]
featured_image_url

'https://spaceimages-mars.com/image/featured/mars3.jpg'

In [8]:
# Having the driver go to the Galaxy Facts of Mars website (link was provided to us)
url3 = "https://galaxyfacts-mars.com/"
browser.visit(url3)
time.sleep(3)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [9]:
# Gathering data from the Mars information column and compiling it into a dataframe
# (Note: While the assignment directions did not specify, a comparison list of information between both Mars and Earth are
# needed, so the table that exists on this website comparing the two planets will be used.)
table_list = soup.find_all("table", class_="table")[0]
table_list

<table class="table">
<tbody>
<tr>
<th scope="row"><b> Mars - Earth Comparison</b></th>
<td><span class="orange"><b> Mars</b></span></td>
<td><span class="purple"> <b>Earth </b></span> </td>
</tr>
<tr>
<th scope="row">Diameter:</th>
<td><span class="orange">6,779 km</span></td>
<td><span class="purple">12,742 km</span> </td>
</tr>
<tr>
<th scope="row">Mass:</th>
<td><span class="orange">6.39 × 10^23 kg </span></td>
<td><span class="purple">5.97 × 10^24 kg</span> </td>
</tr>
<tr>
<th scope="row">Moons:</th>
<td><span class="orange">2</span></td>
<td><span class="purple">1</span> </td>
</tr>
<tr>
<th scope="row">Distance from Sun:</th>
<td><span class="orange">227,943,824 km</span></td>
<td><span class="purple">149,598,262 km</span> </td>
</tr>
<tr>
<th scope="row">Length of Year:</th>
<td><span class="orange">687 Earth days</span></td>
<td><span class="purple">365.24 days</span> </td>
</tr>
<tr>
<th scope="row">Temperature:</th>
<td><span class="orange">-87 to -5 °C</span></td>
<td><spa

In [10]:
# Creating the frame for the database (the headers that will be used) ["for" loops are used to make it easier to manage]
header = [i.text for i in table_list("th")] # (The "i" is really just a random letter, as any letter would do. Its just
header                                      # the one most commonly used)

[' Mars - Earth Comparison',
 'Diameter:',
 'Mass:',
 'Moons:',
 'Distance from Sun:',
 'Length of Year:',
 'Temperature:']

In [11]:
# Creating the tabs for the Mars and Earth Columns
column_Mars = [i.text for i in table_list("span", class_="orange")]
column_Mars

[' Mars',
 '6,779 km',
 '6.39 × 10^23 kg ',
 '2',
 '227,943,824 km',
 '687 Earth days',
 '-87 to -5 °C']

In [12]:
column_Earth = [i.text for i in table_list("span", class_="purple")]
column_Earth

[' Earth ',
 '12,742 km',
 '5.97 × 10^24 kg',
 '1',
 '149,598,262 km',
 '365.24 days',
 '\t-88 to 58°C']

In [13]:
# Combining the data together
combined_data = {"Description": header, "Mars": column_Mars, "Earth": column_Earth}
combined_data

{'Description': [' Mars - Earth Comparison',
  'Diameter:',
  'Mass:',
  'Moons:',
  'Distance from Sun:',
  'Length of Year:',
  'Temperature:'],
 'Mars': [' Mars',
  '6,779 km',
  '6.39 × 10^23 kg ',
  '2',
  '227,943,824 km',
  '687 Earth days',
  '-87 to -5 °C'],
 'Earth': [' Earth ',
  '12,742 km',
  '5.97 × 10^24 kg',
  '1',
  '149,598,262 km',
  '365.24 days',
  '\t-88 to 58°C']}

In [14]:
# Creating the Mars/Earth Comparision Dataframe
comparison_df = pd.DataFrame(combined_data)
comparison_df.set_index("Description", inplace=True)
comparison_df

Unnamed: 0_level_0,Mars,Earth
Description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,\t-88 to 58°C


In [15]:
# Cleaning up the dataframe slightly (as the "Earth Temperature" has a \t next to it)
comparison_df["Earth"] = comparison_df["Earth"].str.replace("\t", "")
comparison_df

Unnamed: 0_level_0,Mars,Earth
Description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [16]:
# Converting the Dataframe into a format used by HTML (for the index.html)
# (It uses the same html class as the one on the actual website for consistency)
comparison_html = comparison_df.to_html(classes="table table-striped")
print(comparison_html)

<table border="1" class="dataframe table table-striped">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Mars</th>
      <th>Earth</th>
    </tr>
    <tr>
      <th>Description</th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Mars - Earth Comparison</th>
      <td>Mars</td>
      <td>Earth</td>
    </tr>
    <tr>
      <th>Diameter:</th>
      <td>6,779 km</td>
      <td>12,742 km</td>
    </tr>
    <tr>
      <th>Mass:</th>
      <td>6.39 × 10^23 kg</td>
      <td>5.97 × 10^24 kg</td>
    </tr>
    <tr>
      <th>Moons:</th>
      <td>2</td>
      <td>1</td>
    </tr>
    <tr>
      <th>Distance from Sun:</th>
      <td>227,943,824 km</td>
      <td>149,598,262 km</td>
    </tr>
    <tr>
      <th>Length of Year:</th>
      <td>687 Earth days</td>
      <td>365.24 days</td>
    </tr>
    <tr>
      <th>Temperature:</th>
      <td>-87 to -5 °C</td>
      <td>-88 to 58°C</td>
    </tr>
  </tbody>
</table>


In [17]:
# Having the chrome driver go to the images of Mars' Hemispheres website (link also given to us)
url4 = "https://marshemispheres.com/"
browser.visit(url4)
time.sleep(3)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# Gathering each of the four full resoulution images of Mars' hemispheres
hemisphere_image_urls = []

# (A "for" loop is used to gather all the source urls into a list as to reduce the amount of work needed to be done)
for i in range(4):
    html = browser.html                       # (Every time a new website is opened, it needs to be parsed, which is why
    soup = BeautifulSoup(html, 'html.parser') # this code is here)
    
    title = soup.find_all("h3")[i].get_text() # ("h3" is used because each text header is using h3 in its html code)
    browser.find_by_tag('h3')[i].click()
    time.sleep(3)
    
    html = browser.html # (Same for here too, technically its a new website for each of the full resoultion images)
    soup = BeautifulSoup(html, 'html.parser')
    
    image_url = soup.find("img", class_="wide-image")["src"] # (Originally I thought .get("src") was needed but it didn't
    hemisphere_image_urls.append({                           # work unless this method was used instead. It does the same)
        "title": title,
        "img_url": "https://marshemispheres.com/" + image_url
    })
    browser.back() # (This is here so that once it is finished it can go back to the previous page and start again)
    time.sleep(3)
    
hemisphere_image_urls # (This will display the results)

In [None]:
browser.quit() # (May or may not be necessary, but it basically does what it implies. It closes once its finished.)