# 1. Importing required libraries for data collection (web scraping)

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [3]:
# Wikipedia page that the rest of the notebook downloads and scrapes
url = (
    "https://en.wikipedia.org/wiki/"
    "List_of_Falcon_9_and_Falcon_Heavy_launches"
)

# 2. Send a GET request to fetch the page content

In [4]:
# Send a GET request to fetch the page content.
# requests has no default timeout, so without one a stalled connection would
# block this cell (and the kernel) forever; 30 seconds is a generous upper bound.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx responses instead of parsing an error page as data
response.raise_for_status()

# 3. Parse the HTML content using BeautifulSoup

In [5]:
# Build a BeautifulSoup tree from the downloaded HTML using Python's built-in parser
soup = BeautifulSoup(markup=response.text, features='html.parser')

# 4. Find all tables with class 'wikitable'

In [6]:
# Collect every <table> element whose CSS class list includes 'wikitable'
tables = soup.find_all(name='table', attrs={'class': 'wikitable'})

# Initialize an empty list to store launch data


In [7]:
# Accumulator for scraped rows; each entry will be one list of cell values
launch_data = list()

# Define column headers based on typical table structure

In [8]:
# Column names assumed for each scraped row, listed in cell order
# (based on the typical structure of the launch tables)
headers = [
    'Flight No.',
    'Date and time (UTC)',
    'Version, Booster',
    'Launch site',
    'Payload',
    'Payload mass',
    'Orbit',
    'Customer',
    'Launch outcome',
    'Booster landing',
]


# Process each table

In [9]:
# Process each table: turn every data row into a list of cleaned cell strings
# with exactly len(headers) entries, ready to be loaded into a DataFrame.

# Start from an empty list so that re-running this cell does not append a
# second copy of every row to whatever a previous run left behind.
launch_data = []

for table in tables:
    # Extract table rows; the first <tr> is treated as the header row and skipped
    rows = table.find_all('tr')

    for row in rows[1:]:
        cols = row.find_all(['td', 'th'])
        row_data = []

        # Extract the text of each cell.
        # NOTE: rowspan/colspan are NOT expanded here. A merged cell is seen
        # only in the row where its tag appears, so rows affected by a merge
        # yield fewer cells and their values shift left.
        for col in cols:
            text = col.get_text(strip=True)
            text = re.sub(r'\[\d+\]', '', text)  # Remove numeric reference tags like [123]
            text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
            row_data.append(text.strip())

        # Force the row to the expected width:
        # - short rows (merged cells, single-cell description rows) are padded with None
        # - long rows (tables with extra columns) are trimmed, because
        #   pd.DataFrame(..., columns=headers) raises if any row is wider than headers
        row_data.extend([None] * (len(headers) - len(row_data)))
        row_data = row_data[:len(headers)]

        # Keep the row only if at least one cell holds non-empty text
        if any(row_data):
            launch_data.append(row_data)

In [10]:
# Load the scraped rows into a DataFrame, labelling the columns with `headers`
df = pd.DataFrame(data=launch_data, columns=headers)


In [11]:
# Clean the DataFrame
# Keep only rows whose 'Flight No.' is neither missing nor an empty string.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning.
# NOTE(review): single-cell description rows carry their text in 'Flight No.'
# and therefore pass this filter - confirm whether they should be dropped too.
df = df[df['Flight No.'].notna() & (df['Flight No.'] != '')].copy()

In [11]:
# Save the launch table as it stands at this point in the notebook.
# The notebook only ever builds `df`; the previous name `launch_df` was never
# defined, so this cell raised NameError on a fresh kernel.
df.to_csv("wikipedia_falcon9_launches.csv", index=False)


In [12]:
# Tidy the 'Date and time (UTC)' column: drop numeric reference tags such as
# [12], then trim leading/trailing whitespace
df['Date and time (UTC)'] = (
    df['Date and time (UTC)']
    .str.replace(r'\[\d+\]', '', regex=True)
    .str.strip()
)


In [13]:
# Turn 'Payload mass' into a numeric column. The text is stripped step by step,
# then parsed; anything that still is not a number becomes NaN.
df['Payload mass'] = pd.to_numeric(
    df['Payload mass']
    .str.replace(r'\[\w+\]', '', regex=True)  # bracketed annotations such as [h]
    .str.replace(r'~', '', regex=True)        # approximation marker
    .str.replace(r'kg.*', '', regex=True)     # the 'kg' unit and everything after it
    .str.replace(r',', '', regex=True),       # thousands separators
    errors='coerce',
)

In [15]:
# Preview the first five rows of the cleaned table
df.head(n=5)


Unnamed: 0,Flight No.,Date and time (UTC),"Version, Booster",Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,Booster landing
0,195,"January 3, 202314:56",F9B5B1060‑15,"Cape Canaveral,SLC‑40",Transporter-6(115 payload smallsat rideshare),,SSO,Various,Success,Success (LZ‑1)
1,Dedicated SmallSat Rideshare mission to Sun-sy...,,,,,,,,,
2,196,"January 10, 202304:50",F9B5B1076‑2,"Cape Canaveral,SLC‑40",OneWeb 16(40 satellites),6000.0,PolarLEO,OneWeb,Success,Success (LZ‑1)
3,"Following theRussian invasion of Ukraine, OneW...",,,,,,,,,
4,FH 5,"January 15, 202322:56",Falcon HeavyB5B1070(core),"Kennedy,LC‑39A",USSF-67(CBAS-2&LDPE-3A),3750.0,GEO,USSF,Success,No attempt


# 5. Final step: save the DataFrame to a CSV file for use in later steps

In [16]:
# Write the cleaned launch table to disk, leaving out the DataFrame index
df.to_csv(path_or_buf='falcon_launches.csv', index=False)