In [9]:
!pip install beautifulsoup4
!pip install requests
!pip install pandas



In [17]:
# Sys allows access to system-specific parameters and functions
import sys
# Requests allows us to make HTTP requests which we will use to get data from an API
import requests
# Import BeautifulSoup library
from bs4 import BeautifulSoup
# Pandas is a software library written for the Python programming language for data manipulation and analysis.
import pandas as pd
# NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays
import numpy as np
# Datetime is a library that allows us to represent dates
import datetime
# Re is a library that provides support for regular expressions
import re
# Unicodedata is a library that provides access to unicode character database
import unicodedata

In [60]:
# This function returns the date and time from the html table cells. 
def get_Date_Time(table_cells):
    # Get all strings, stripping whitespace
    cell_strings = [s.strip() for s in table_cells.strings if s.strip()]
    # The below code is to ensure we grab just the date and time and not other characters as some tables have shown to have.
    date = None
    time = None
    for s in cell_strings:
        if re.search(r'\d{1,2} \w+ \d{4}', s):  # Pattern for '4 June 2010'
            date = s
        elif re.match(r'\d{2}:\d{2}', s): # Pattern for '18:45'
            time = s
    # Fallback to original logic if regex fails, but clean the output
    if not date and len(cell_strings) >= 1:
         date = cell_strings[0]
    if not time and len(cell_strings) >= 2:
         time = cell_strings[1]
    return [date if date is not None else '', time if time is not None else '']
    #return [data_time.strip() for data_time in list(table_cells.strings)][0:2]

In [61]:
# This function returns the booster version from the html table cells.
def get_Booster_Version(table_cells):
    out=''.join([booster_version for i, booster_version in enumerate(table_cells.strings) if i%2==0][0:-1])
    return out

In [62]:
# This function returns the landing status from the html table cells.
def get_Landing_Status(table_cells):
    out=[i for i in table_cells.strings][0]
    return out

In [63]:
# This function returns the mass of the rocket
def get_Mass(table_cells):
    mass=unicodedata.normalize("NFKD", table_cells.text).strip()
    if mass:
        mass.find("kg")
        new_mass=mass[0:mass.find("kg")+2]
    else:
        new_mass=0
    return new_mass

In [64]:
# This function cleans and extracts the text content from an HTML table cell (tag object)
#by removing extraneous tags (like <br>, <a>, <sup>) and whitespace.
def extract_column_from_header(row):

    if (row.br):
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()
        
    colunm_name = ' '.join(row.contents)
    
    # Filter the digit and empty names
    if not(colunm_name.strip().isdigit()):
        colunm_name = colunm_name.strip()
        return colunm_name

In [65]:
# The URL
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

In [66]:
#Task 1: Request the Falcon9 Launch Wiki page from its UR
#HTTP GET method to request the Falcon9 Launch HTML page, as an HTTP response.
# use requests.get() method with the provided static_url
# assign the response to a object
headers = {"User-Agent": "IBM-DataScienceCourse-Scraper/1.0 (educational use)"}
response = requests.get(static_url, headers=headers)
print(response.status_code)
print(response.text[:500])

200
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vect


In [67]:
# Use BeautifulSoup() to create a BeautifulSoup object from a response text content
soup = BeautifulSoup(response.text)

In [68]:
# Use soup.title attribute
soup.title

<title>List of Falcon 9 and Falcon Heavy launches - Wikipedia</title>

TASK 2: Extract all column/variable names from the HTML table header

In [69]:
# Use the find_all function in the BeautifulSoup object, with element type `table`
# Assign the result to a list called `html_tables
html_tables = soup.find_all('table')
print(len(html_tables))

25


In [70]:
# Let's print the third table and check its content
first_launch_table = html_tables[2]
print(str(first_launch_table)[0:500])

<table class="wikitable plainrowheaders collapsible" style="width: 100%;">
<tbody><tr>
<th scope="col">Flight No.
</th>
<th scope="col">Date and<br/>time (<a href="/wiki/Coordinated_Universal_Time" title="Coordinated Universal Time">UTC</a>)
</th>
<th scope="col"><a href="/wiki/List_of_Falcon_9_first-stage_boosters" title="List of Falcon 9 first-stage boosters">Version,<br/>Booster</a> <sup class="reference" id="cite_ref-booster_11-0"><a href="#cite_note-booster-11"><span class="cite-bracket">[<


In [71]:
# Apply find_all() function with `th` element on first_launch_table
html_header_cells = first_launch_table.find_all('th')

# Initialize the list to store the column names
column_names = []

# Iterate each th element and apply the provided extract_column_from_header()
for header in html_header_cells:
    name = extract_column_from_header(header)
    
    # Append the Non-empty column name (if name is not None and len(name) > 0)
    if name is not None and len(name) > 0:
        column_names.append(name)


In [72]:
# Display the final list of column names
print(column_names)

['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']


TASK 3: Create a data frame by parsing the launch HTML tables

In [73]:
launch_dict= dict.fromkeys(column_names)

# Remove an irrelvant column
del launch_dict['Date and time ( )']

# Let's initial the launch_dict with each value to be an empty list
launch_dict['Flight No.'] = []
launch_dict['Launch site'] = []
launch_dict['Payload'] = []
launch_dict['Payload mass'] = []
launch_dict['Orbit'] = []
launch_dict['Customer'] = []
launch_dict['Launch outcome'] = []
# Added some new columns
launch_dict['Version Booster']=[]
launch_dict['Booster landing']=[]
launch_dict['Date']=[]
launch_dict['Time']=[]

In [74]:
extracted_row = 0

#Extract each table
for table_number,table in enumerate(soup.find_all('table',"wikitable plainrowheaders collapsible")):
# get table row
    for rows in table.find_all("tr"):
        #check to see if first table heading is as number corresponding to launch a number
        if rows.th:
            if rows.th.string:
                flight_number=rows.th.string.strip()
                flag=flight_number.isdigit()
            else:
                flag=False
        else:
            flag=False # If no <th> element, it's not a launch record row.
        #get table element
        row=rows.find_all('td')
        #if it is number save cells in a dictonary
        if flag:
            # --- ADDED: Check if the row contains any <td> cells ---
            if not row: 
                # This handles rows that contain a <th> but no <td> elements.
                # print(f"Skipping row with flight number {flight_number} because it contains no <td> elements.")
                continue
            
            # The subsequent checks ensure the row is long enough for all required indices (like [8])
        if len(row) < 9:
            # print(f"Skipping row {flight_number} due to insufficient columns ({len(row)} found, 9 required).")
            continue
            
        extracted_row += 1
        
        # 1. Flight Number value
        launch_dict['Flight No.'].append(flight_number)
        
        # print(flight_number)
        datatimelist=get_Date_Time(row[0])
        #print(flight_number)

        # 2. Date value
        datatimelist=get_Date_Time(row[0])
        date = datatimelist[0].strip(',')
        launch_dict['Date'].append(date)
        #print(date)

        # 3. Time value
        time = datatimelist[1]
        launch_dict['Time'].append(time)
        #print(time)

        # 4. Booster version
        bv=get_Booster_Version(row[1])
        if not(bv):
            bv=row[1].a.string
        launch_dict['Version Booster'].append(bv)
        #print(bv)

        # 5. Launch site
        launch_site = row[2].a.string
        launch_dict['Launch site'].append(launch_site)
        #print(launch_site)

        # 6. Payload
        payload = row[3].a.string
        launch_dict['Payload'].append(payload)
        #print(payload)

        # 7. Payload Mass
        payload_mass = get_Mass(row[4])
        launch_dict['Payload mass'].append(payload_mass)
        #print(payload_mass)

        # 8. Orbit
        orbit = row[5].a.string
        launch_dict['Orbit'].append(orbit)
        #print(orbit)

        # 9. Customer
        # The below code handles missing customer link or None customer link by using .strings[0] or getting the text
        customer = None
        if row[6].a:
            customer = row[6].a.string
        elif row[6].strings:
            # Fallback to get text content if no 'a' tag (for companies with no link)
            customer = next(row[6].strings).strip()

        # The original logic expected a .a.string, which probably will fail. 
        if customer is None:
            customer = row[6].text.strip()
                
        launch_dict['Customer'].append(customer)
        #print(customer)

        # 10. Launch outcome
        launch_outcome = list(row[7].strings)[0].strip() # Added .strip()
        launch_dict['Launch outcome'].append(launch_outcome)
        #print(launch_outcome)

        # 11. Booster landing
        # NOTE: Correcting function name: landing_status should be get_Landing_Status
        booster_landing = get_Landing_Status(row[8])
        launch_dict['Booster landing'].append(booster_landing)
        #print(booster_landing)
        #print()

In [75]:
# Create a datafreme of the populated launc_dict table
df = pd.DataFrame({key: pd.Series(value) for key, value in launch_dict.items()})
print(df.shape)
df.head()

(125, 11)


Unnamed: 0,Flight No.,Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Version Booster,Booster landing,Date,Time
0,1,CCAFS,Dragon Spacecraft Qualification Unit,0,LEO,SpaceX,Success,F9 v1.07B0003.18,Failure,4 June 2010,18:45
1,2,CCAFS,Dragon,0,LEO,NASA,Success,F9 v1.07B0004.18,Failure,8 December 2010,15:43
2,3,CCAFS,Dragon,525 kg,LEO,NASA,Success,F9 v1.07B0005.18,No attempt\n,22 May 2012,07:44
3,4,CCAFS,SpaceX CRS-1,"4,700 kg",LEO,NASA,Success,F9 v1.07B0006.18,No attempt,8 October 2012,00:35
4,5,CCAFS,SpaceX CRS-2,"4,877 kg",LEO,NASA,Success,F9 v1.07B0007.18,No attempt\n,1 March 2013,15:10


In [76]:
# Export output to CSV
df.to_csv('spacex_web_scraped.csv', index=False)