# Web scraping from the Wikipedia webpage ["List of Falcon 9 and Falcon Heavy launches"](https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922)

  ## Objectives
Web scrap Falcon 9 launch records with `BeautifulSoup`: 
- Extract a Falcon 9 launch records HTML table from Wikipedia
- Parse the table and convert it into a Pandas data frame

In [11]:
# Import the necessary libraries
import pandas as pd # For data manipulation and analysis
import unicodedata  # For Unicode Database and normalization
import requests     # For sending HTTP requests for web scraping
import re           # For regular expressions operations 
import sys          # For system-specific parameters and functions

from bs4 import BeautifulSoup # For pulling data out of HTML and URLs
from io import StringIO       # Allows to treat strings as file-like objects
    

### Define the necessary function for the data collection. 

In [4]:
def date_time(table_cells):
    """
    This function returns the data and time from the HTML  table cell
    Input: the  element of a table data cell extracts extra row
    """
    return [data_time.strip() for data_time in list(table_cells.strings)][0:2]

def booster_version(table_cells):
    """
    This function returns the booster version from the HTML  table cell 
    Input: the  element of a table data cell extracts extra row
    """
    out=''.join([booster_version for i,booster_version in enumerate( table_cells.strings) if i%2==0][0:-1])
    return out

def landing_status(table_cells):
    """
    This function returns the landing status from the HTML table cell 
    Input: the  element of a table data cell extracts extra row
    """
    out=[i for i in table_cells.strings][0]
    return out


def get_mass(table_cells):
    mass=unicodedata.normalize("NFKD", table_cells.text).strip()
    if mass:
        mass.find("kg")
        new_mass=mass[0:mass.find("kg")+2]
    else:
        new_mass=0
    return new_mass


def extract_column_from_header(row):
    """
    This function returns the landing status from the HTML table cell 
    Input: the  element of a table data cell extracts extra row
    """
    if (row.br):
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()
        
    colunm_name = ' '.join(row.contents)
    
    # Filter the digit and empty names
    if not(colunm_name.strip().isdigit()):
        colunm_name = colunm_name.strip()
        return colunm_name   


Request the HTML page from the URL and get a `response` object

In [5]:
# Made the request to the URl
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

# Assign the response to a object
response = requests.get(static_url).text

# Create a BeautifulSoup object from a response text content
soup = BeautifulSoup(response, 'html.parser')

# Display the page title to verify if the BeautifulSoup object was created properly 
title = soup.title.string
print('The title of the page is: ', title)

The title of the page is:  List of Falcon 9 and Falcon Heavy launches - Wikipedia


### Data Scraping 


In [6]:
# Extract all column/variable names from the HTML table header
html_tables = soup.find_all('table')

all_headers = []

for table in html_tables:
    headers = [header.text.strip() for header in table.find_all('th')]
    all_headers.append(headers)

# Create a Data Frame of 'all_headers' for readability 
all_headers_df = pd.DataFrame(all_headers)

# Display the first 5 records
all_headers_df.head()



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,Flight No.,Date andtime (UTC),"Version,Booster [b]",Launch site,Payload[c],Payload mass,Orbit,Customer,Launchoutcome,Boosterlanding,...,,,,,,,,,,
3,Flight No.,Date andtime (UTC),"Version,Booster[b]",Launch site,Payload[c],Payload mass,Orbit,Customer,Launchoutcome,Boosterlanding,...,,,,,,,,,,
4,Flight No.,Date andtime (UTC),"Version,Booster[b]",Launch site,Payload[c],Payload mass,Orbit,Customer,Launchoutcome,Boosterlanding,...,,,,,,,,,,


In [None]:
# Starting from the third table is our target table contains the actual launch records.
first_launch_table = html_tables[2]

# Read the HTML table into a data frame for readability
html_table_str = str(first_launch_table)
table_io = StringIO(html_table_str)
html_table_df = pd.read_html(table_io)[0]

# Display the fist 5 records of the Data Frame  
html_table_df.head() 

Unnamed: 0,Flight No.,Date andtime (),Unnamed: 2,Launch site,Payload,Payload mass,Orbit,Customer,Launchoutcome,Unnamed: 9
0,1,"4 June 2010, 18:45",F9 v1.0[7] B0003.1[8],"CCAFS, SLC-40",Dragon Spacecraft Qualification Unit,,LEO,SpaceX,Success,Failure[9][10] (parachute)
1,1,First flight of Falcon 9 v1.0.[11] Used a boil...,First flight of Falcon 9 v1.0.[11] Used a boil...,First flight of Falcon 9 v1.0.[11] Used a boil...,First flight of Falcon 9 v1.0.[11] Used a boil...,First flight of Falcon 9 v1.0.[11] Used a boil...,First flight of Falcon 9 v1.0.[11] Used a boil...,First flight of Falcon 9 v1.0.[11] Used a boil...,First flight of Falcon 9 v1.0.[11] Used a boil...,First flight of Falcon 9 v1.0.[11] Used a boil...
2,2,"8 December 2010, 15:43[13]",F9 v1.0[7] B0004.1[8],"CCAFS, SLC-40",Dragon demo flight C1 (Dragon C101),,LEO (ISS),NASA (COTS) NRO,Success[9],Failure[9][14] (parachute)
3,2,"Maiden flight of Dragon capsule, consisting of...","Maiden flight of Dragon capsule, consisting of...","Maiden flight of Dragon capsule, consisting of...","Maiden flight of Dragon capsule, consisting of...","Maiden flight of Dragon capsule, consisting of...","Maiden flight of Dragon capsule, consisting of...","Maiden flight of Dragon capsule, consisting of...","Maiden flight of Dragon capsule, consisting of...","Maiden flight of Dragon capsule, consisting of..."
4,3,"22 May 2012, 07:44[17]",F9 v1.0[7] B0005.1[8],"CCAFS, SLC-40",Dragon demo flight C2+[18] (Dragon C102),"525 kg (1,157 lb)[19]",LEO (ISS),NASA (COTS),Success[20],No attempt


### Data Parsing

In [8]:
# Iterate through the `<th>` elements to extract column name one by one

# Find all th elements in the table
header_elements = first_launch_table.find_all('th')
column_names = []

# Get the column names 
for header in header_elements:
    name = extract_column_from_header(header)

    # Append the Non-empty column names
    if name is not None and len(name) >0:
        column_names.append(name)

# Display the column names
column_names

['Flight No.',
 'Date and time ( )',
 'Launch site',
 'Payload',
 'Payload mass',
 'Orbit',
 'Customer',
 'Launch outcome']

In [9]:
# Create a data frame by parsing the launch HTML tables

# Initialize Dictionary
launch_dict= dict.fromkeys(column_names)

# Remove an irrelevant column
del launch_dict['Date and time ( )']

# Let's initial the launch_dict with each value to be an empty list
launch_dict['Flight No.'    ] = []
launch_dict['Launch site'   ] = []
launch_dict['Payload'       ] = []
launch_dict['Payload mass'  ] = []
launch_dict['Orbit'         ] = []
launch_dict['Customer'      ] = []
launch_dict['Launch outcome'] = []

# Added some new columns
launch_dict['Version Booster'] = []
launch_dict['Booster landing'] = []
launch_dict['Date'           ] = []
launch_dict['Time'           ] = []


# Fill up the launch_dict with launch records extracted from table rows

# Initialize variables
extracted_row = 0

#Extract each table 
for table_number,table in enumerate(soup.find_all('table',"wikitable plainrowheaders collapsible")):

   # Get table row 
    for rows in table.find_all("tr"):

        #Check if first table heading is as number corresponding to launch a number 
        if rows.th:
            if rows.th.string:
                flight_number=rows.th.string.strip()
                flag=flight_number.isdigit()
        else:
            flag = False

        #Get table element 
        row = rows.find_all('td')
        
        #If it is number save cells in a dictionary 
        if flag:
            extracted_row += 1

            datatimelist = date_time(row[0])
            
            # Append the flight Number into launch_dict
            launch_dict['Flight No.'].append(flight_number)

            # Append the date into launch_dict 
            date = datatimelist[0].strip(',')
            launch_dict['Date'].append(date)
            
            # Append the time into launch_dict 
            time = datatimelist[1]
            launch_dict['Time'].append(time)
              
            # Append the bv into launch_dict 
            bv = booster_version(row[1])
            if not(bv):
                bv = row[1].a.string
            launch_dict['Version Booster'].append(bv)
            
            # Extract and append the Launch site into launch_dict 
            launch_site = row[2].a.string
            launch_dict['Launch site'].append(launch_site)
            
            # Append the payload into launch_dict
            payload = row[3].a.string
            launch_dict['Payload'].append(payload)
            
            # Append the payload_mass into launch_dict
            payload_mass = get_mass(row[4])
            launch_dict['Payload mass'].append(payload_mass)

            # Append the orbit into launch_dict
            orbit = row[5].a.string
            launch_dict['Orbit'].append(orbit)
            
            # Append the customer into launch_dict
            if len(row) > 6 and row[6].a is not None:
                customer = row[6].a.string
            else:
                customer = None 
            launch_dict['Customer'].append(customer)
            
            # Append the launch_outcome into launch_dict
            if len(row) > 7 and row[7].a is not None:
                launch_outcome = row[7].a.string
            else:
                launch_outcome = None     
            launch_dict['Launch outcome'].append(launch_outcome)   
            
            # Append the booster_landing into launch_dict
            if len(row) > 8 and row[8].a is not None:
                booster_landing = row[8].a.string
            else:
                booster_landing = None  
            launch_dict['Booster landing'].append(booster_landing)
           


In [10]:
# Create a data frame from the launch_dict export it to a CSV file format 
df= pd.DataFrame({ key:pd.Series(value) for key, value in launch_dict.items() })
df.to_csv('spacex_web_scraped.csv', index=False)
df.head(10)

Unnamed: 0,Flight No.,Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Version Booster,Booster landing,Date,Time
0,1,CCAFS,Dragon Spacecraft Qualification Unit,0,LEO,SpaceX,,F9 v1.07B0003.18,,4 June 2010,18:45
1,2,CCAFS,Dragon,0,LEO,NASA,,F9 v1.07B0004.18,,8 December 2010,15:43
2,3,CCAFS,Dragon,525 kg,LEO,NASA,,F9 v1.07B0005.18,,22 May 2012,07:44
3,4,CCAFS,SpaceX CRS-1,"4,700 kg",LEO,NASA,,F9 v1.07B0006.18,,8 October 2012,00:35
4,5,CCAFS,SpaceX CRS-2,"4,877 kg",LEO,NASA,,F9 v1.07B0007.18,,1 March 2013,15:10
5,6,VAFB,CASSIOPE,500 kg,Polar orbit,MDA,,F9 v1.17B10038,,29 September 2013,16:00
6,7,CCAFS,SES-8,"3,170 kg",GTO,SES,,F9 v1.1,,3 December 2013,22:41
7,8,CCAFS,Thaicom 6,"3,325 kg",GTO,Thaicom,,F9 v1.1,,6 January 2014,22:06
8,9,Cape Canaveral,SpaceX CRS-3,"2,296 kg",LEO,NASA,,F9 v1.1,,18 April 2014,19:25
9,10,Cape Canaveral,Orbcomm-OG2,"1,316 kg",LEO,Orbcomm,,F9 v1.1,,14 July 2014,15:15





This Jupyter Notebook was Completed By [Jonathan Scott](http://www.linkedin.com/in/jonathan-scott-140709317) as part of the IBM Data Science Certification.

### Authors
[Yan Luo](https://www.linkedin.com/in/yan-luo-96288783/)

[Nayef Abou Tayoun](https://www.linkedin.com/in/nayefaboutayoun/)


Copyright © 2021 IBM Corporation. All rights reserved.