In [10]:
import sys
!{sys.executable} -m pip install html5lib




[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import unicodedata
from datetime import datetime

# Helper functions
def date_time(table_cells):
    return [dt.strip() for dt in list(table_cells.strings)][0:2]

def booster_version(table_cells):
    out = ''.join([bv for i, bv in enumerate(table_cells.strings) if i % 2 == 0][0:-1])
    return out

def landing_status(table_cells):
    return [i for i in table_cells.strings][0]

def get_mass(table_cells):
    mass = unicodedata.normalize("NFKD", table_cells.text).strip()
    if "kg" in mass:
        return float(mass.split(" ")[0].replace(',', '').replace('~',''))
    return None

# Wikipedia static URL for consistent scraping
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

# Request and parse page
page = requests.get(static_url)
soup = BeautifulSoup(page.content, 'html.parser')

# Data container
launch_dict = {
    "FlightNumber": [], "Date": [], "Time": [], "BoosterVersion": [], "LaunchSite": [],
    "Payload": [], "PayloadMass": [], "Orbit": [], "Customer": [],
    "Wikipedia_Launch_Outcome": [], "BoosterLanding": []
}

# Extraction logic
for table in soup.find_all('table', class_='wikitable plainrowheaders collapsible'):
    for row in table.find_all("tr"):
        if row.th and row.th.string and row.th.string.strip().isdigit():
            flight_number = int(row.th.string.strip())
            cells = row.find_all("td")
            if len(cells) >= 9:
                dt = date_time(cells[0])
                try:
                    date_obj = datetime.strptime(dt[0].replace(',', ''), "%Y-%m-%d").date()
                except:
                    continue

                launch_dict["FlightNumber"].append(flight_number)
                launch_dict["Date"].append(date_obj)
                launch_dict["Time"].append(dt[1])
                launch_dict["BoosterVersion"].append(booster_version(cells[1]))
                launch_dict["LaunchSite"].append(cells[2].get_text(strip=True))
                launch_dict["Payload"].append(cells[3].get_text(strip=True))
                launch_dict["PayloadMass"].append(get_mass(cells[4]))
                launch_dict["Orbit"].append(cells[5].get_text(strip=True))
                launch_dict["Customer"].append(cells[6].get_text(strip=True))
                launch_dict["Wikipedia_Launch_Outcome"].append(cells[7].get_text(strip=True))
                launch_dict["BoosterLanding"].append(landing_status(cells[8]))

# Convert to DataFrame
wiki_df = pd.DataFrame(launch_dict)
wiki_df.head()


Unnamed: 0,FlightNumber,Date,Time,BoosterVersion,LaunchSite,Payload,PayloadMass,Orbit,Customer,Wikipedia_Launch_Outcome,BoosterLanding


In [29]:
# Load API dataset
api_df = pd.read_csv('dataset_part_1.csv')
api_df['Date'] = pd.to_datetime(api_df['Date']).dt.date

# Merge with Wikipedia data on Date
merged_df = pd.merge(api_df, wiki_df[['Date', 'Wikipedia_Launch_Outcome']], on='Date', how='left')

# Save merged dataset
merged_df.to_csv('dataset_part_2.csv', index=False)
merged_df.head()


Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude,Wikipedia_Launch_Outcome
0,1,2010-06-04,Falcon 9,6123.547647,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0003,-80.577366,28.561857,
1,2,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0005,-80.577366,28.561857,
2,3,2013-03-01,Falcon 9,677.0,ISS,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0007,-80.577366,28.561857,
3,4,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1,False,False,False,,1.0,0,B1003,-120.610829,34.632093,
4,5,2013-12-03,Falcon 9,3170.0,GTO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B1004,-80.577366,28.561857,
