In [1]:
!pip install beautifulsoup4 requests pandas



In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import unicodedata
import re

import sys

In [4]:
#return date and time from the td
def date_time(table_cells):
    return [data_time.strip() for data_time in list(table_cells.strings)][0:2]

#ret booster version
def booster_version(table_cells):
    out = ''.join([item for i, item in enumerate(table_cells.strings)])
    return out.strip()

#ret landing status
def landing_status(table_cells):
    out = [s for s in table_cells.strings]
    return out[0].strip() if out else None

#returns normalized payload mass
def get_mass(table_cells):
    mass = unicodedata.normalize("NFKD", table_cells.text).strip()
    if "kg" in mass:
        new_mass = mass[0:mass.find("kg")+2].strip()
    else:
        new_mass = None
    return new_mass

#filter
def extract_column_from_header(row):
    if row.br:
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()

    column_name = ' '.join(row.stripped_strings)
    
    if not column_name.strip().isdigit():
        return column_name.strip()
    return None

In [10]:
#fetch and parse
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

response = requests.get(static_url)
soup = BeautifulSoup(response.text, 'html.parser')

print(soup.title.string)

#get all tables
html_tables = soup.find_all("table", class_="wikitable")
first_launch_table = html_tables[0]

List of Falcon 9 and Falcon Heavy launches - Wikipedia


In [14]:
#extract column names
column_names = []
for th in first_launch_table("th"):
    name = extract_column_from_header(th)
    if name is not None and len(name) > 0:
        column_names.append(name)

print(column_names)

['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']


In [15]:
launch_dict = dict.fromkeys(column_names)

#remove irrelvant
del launch_dict['Date and time ( )']

#init
launch_dict = {key: [] for key in column_names}

launch_dict['Flight No.'] = []
launch_dict['Launch site'] = []
launch_dict['Payload'] = []
launch_dict['Payload mass'] = []
launch_dict['Orbit'] = []
launch_dict['Customer'] = []
launch_dict['Launch outcome'] = []
launch_dict['Version Booster']=[]
launch_dict['Booster landing']=[]
launch_dict['Date']=[]
launch_dict['Time']=[]

In [16]:
#populate the dicts
extracted_row = 0

for table in soup.find_all('table', "wikitable plainrowheaders collapsible"):
    for rows in table.find_all("tr"):
        if rows.th and rows.th.string:
            flight_number = rows.th.string.strip()
            if flight_number.isdigit():
                row = rows.find_all('td')

                datatimelist = date_time(row[0])
                date = datatimelist[0].strip(',')
                time = datatimelist[1]

                bv = booster_version(row[1])
                if not bv and row[1].a:
                    bv = row[1].a.string

                launch_site = row[2].a.string if row[2].a else None
                payload = row[3].a.string if row[3].a else None
                payload_mass = get_mass(row[4])
                orbit = row[5].a.string if row[5].a else None
                customer = row[6].a.string if row[6].a else None
                launch_outcome = list(row[7].strings)[0].strip()
                booster_landing = landing_status(row[8])

                launch_dict['Flight No.'].append(flight_number)
                launch_dict['Date'].append(date)
                launch_dict['Time'].append(time)
                launch_dict['Version Booster'].append(bv)
                launch_dict['Launch site'].append(launch_site)
                launch_dict['Payload'].append(payload)
                launch_dict['Payload mass'].append(payload_mass)
                launch_dict['Orbit'].append(orbit)
                launch_dict['Customer'].append(customer)
                launch_dict['Launch outcome'].append(launch_outcome)
                launch_dict['Booster landing'].append(booster_landing)

                extracted_row += 1


In [18]:
#save to csv
df = pd.DataFrame({key: pd.Series(value) for key, value in launch_dict.items()})
df.to_csv("spacex_web_scraped.csv", index=False)
df.head()


Unnamed: 0,Flight No.,Date and time ( ),Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Version Booster,Booster landing,Date,Time
0,1,,CCAFS,Dragon Spacecraft Qualification Unit,,LEO,SpaceX,Success,F9 v1.0[7]B0003.1[8],Failure,4 June 2010,18:45
1,2,,CCAFS,Dragon,,LEO,NASA,Success,F9 v1.0[7]B0004.1[8],Failure,8 December 2010,15:43
2,3,,CCAFS,Dragon,525 kg,LEO,NASA,Success,F9 v1.0[7]B0005.1[8],No attempt,22 May 2012,07:44
3,4,,CCAFS,SpaceX CRS-1,"4,700 kg",LEO,NASA,Success,F9 v1.0[7]B0006.1[8],No attempt,8 October 2012,00:35
4,5,,CCAFS,SpaceX CRS-2,"4,877 kg",LEO,NASA,Success,F9 v1.0[7]B0007.1[8],No attempt,1 March 2013,15:10
