In [1]:
#set up the scraping

from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

# Set up Splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

# Visit the Mars Facts site
url = 'https://galaxyfacts-mars.com/'
browser.visit(url)

html = browser.html
html_soup = soup(html, 'html.parser')

In [2]:
# use the Beautiful Soup find method to find the table. Because we identified 
    #one of the class attributes of the sidebar table 
        # as table-striped 
#Assign the HTML code for this table to the table variable.        

table = html_soup.find('table', class_='table-striped')

In [3]:
#store the table data in a Python data structure

# 1) mars_facts dictionary, empty for now, will eventually hold the table data.
mars_facts = {}

# 2) We use the find_all method to find all the table rows (tr). We save the table rows in the rows variable
rows = table.find_all('tr')

for row in rows:
    # 3) In each for loop iteration, the text of the row's table header (th) gets saved to the row_heading variable.
    row_heading = row.find('th').text
    # 4) In each for loop iteration, text of the row’s table data (td) gets extracted. 
            # The strip method then strips any white space that this text contains. 
            # The text then gets saved to the row_data variable.
    row_data = row.find('td').text.strip()
    
#5) In each for loop iteration, the row becomes an entry in the mars_facts dictionary. 
#The table heading becomes the key, and the table data becomes the value
    mars_facts[row_heading] = row_data

In [4]:
print (mars_facts)

{'Equatorial Diameter:': '6,792 km', 'Polar Diameter:': '6,752 km', 'Mass:': '6.39 × 10^23 kg (0.11 Earths)', 'Moons:': '2 ( Phobos  &  Deimos )', 'Orbit Distance:': '227,943,824 km (1.38 AU)', 'Orbit Period:': '687 days (1.9 years)', 'Surface Temperature:': '-87 to -5 °C', 'First Record:': '2nd millennium BC', 'Recorded By:': 'Egyptian astronomers'}


In [5]:
browser.quit()

In [6]:
import pandas as pd

In [7]:
#creates a new DataFrame from the HTML table
#Pandas read_html method searches for tables and returns a list of those that exist in the HTML code of the webpage.
df = pd.read_html('https://galaxyfacts-mars.com')
df
    #for this examples the df variable thus gets assigned the two tables in the form of a list.

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [8]:
# first table is the one that we're interested in. The second table is the one that we previously scraped. 
#So, we’ll select the first table by using its index of 0.

mars_df = df[0]
mars_df

Unnamed: 0,0,1,2
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [12]:
#To rename the columns, in the next cell, enter and run the following code
mars_df.columns=['description', 'Mars', 'Earth']
mars_df

Unnamed: 0,description,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [13]:
#we want to eliminate the first row. 
mars_df = mars_df.iloc[1:]
#prints to screen
mars_df

Unnamed: 0,description,Mars,Earth
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C


In [15]:
import pandas as pd

# Read in HTML tables into a DataFrame
df = pd.read_html('https://galaxyfacts-mars.com')
# Select the first table
mars_df = df[0]

# Rename columns
mars_df.columns=['description', 'Mars', 'Earth']

# Remove the first row from the DataFrame
mars_df = mars_df.iloc[1:]
mars_df

Unnamed: 0,description,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"
5,Length of Year:,687 Earth days,365.24 days
6,Temperature:,-87 to -5 °C,-88 to 58°C
