## Module 1 Part 1.1 -- Data Collection with Web Scraping

In [2]:
import sys
import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd
# Display configuration: show every column and every row, and keep wide
# frames on one line instead of wrapping them.
for _option, _value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.expand_frame_repr', False),
):
    pd.set_option(_option, _value)

## Space X Falcon 9 First Stage Landing Prediction
### Web scraping Falcon 9 and Falcon Heavy Launch Records from Wikipedia

#### Objectives
- Web scrape Falcon 9 launch records with BeautifulSoup
- Extract a Falcon 9 launch records HTML table from Wikipedia
- Parse the table and convert it into a Pandas data frame

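#### To keep the lab tasks consistent, you will be asked to scrape the data from a snapshot of the List of Falcon 9 and Falcon Heavy launches Wikipedia page updated on 9th June 2021. Note that the URL used in the next cell has no pinned revision, so it fetches the current version of the page rather than that snapshot.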

In [3]:
# Address of the Wikipedia article to scrape.
# NOTE(review): this URL carries no pinned revision, so it presumably serves the current
# version of the article and the scraped tables may change between runs — confirm intended.
url = 'https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches'

## Next, request the HTML page from the above URL and get a response object.

### TASK 1: Request the Falcon9 Launch Wiki page from its URL.
### First, let's perform an HTTP GET method to request the Falcon9 Launch HTML page, as an HTTP response.

In [4]:
# Send an HTTP GET request to the web page.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)

# Fail loudly on a 4xx/5xx response instead of parsing an error page as if it were the article.
response.raise_for_status()

# Store the HTML content in a variable.
html_content = response.text

# Create a BeautifulSoup object to parse the HTML.
soup = BeautifulSoup(html_content, 'html.parser')

# Keep the page title so it can be inspected to verify the BeautifulSoup object was created properly.
soup_title = soup.title  # Expected: <title>List of Falcon 9 and Falcon Heavy launches - Wikipedia</title>

## TASK 2: Extract all column/variable names from the HTML table header
### Next, we want to collect all relevant column names from the HTML table header.

In [5]:
# Collect the <table> elements selected by the class filter below.
# NOTE(review): with a list filter BeautifulSoup is expected to match a table carrying ANY of
# the listed classes, not all of them — confirm against the bs4 documentation.
html_tables = soup.find_all('table', class_=["wikitable", "plainrowheaders", "collapsible"])

# The third matched table (index 2) is used as the launch-records table for header extraction.
# NOTE(review): the index depends on the page layout and must be re-checked if the page changes.
first_launch_table = html_tables[2]

In [6]:
# Define function to extract column names from <th> elements
def extract_column_from_header(row):
    """
    Return the column name held in an HTML table header cell.

    The first <br>, <a> and <sup> element found in the cell (if any) is removed
    from the cell, so calling this function modifies ``row``.

    Input: the <th> element of a table header
    Returns: the cleaned column name, or None when the remaining text is empty
    or consists only of digits
    """
    if row.br:
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()

    # Children left in the cell may still include nested tags (e.g. a second link).
    # str.join() accepts only strings, so use the text of any child that is not one.
    parts = [
        child if isinstance(child, str) else child.get_text()
        for child in row.contents
    ]
    column_name = ' '.join(parts).strip()
    return column_name if column_name and not column_name.isdigit() else None

In [7]:
# Get headers.
# extract_column_from_header() removes elements from the cell it is given, so it is called
# exactly once per <th>; using one call to filter and a second call to collect the value
# would run the second call on an already-modified cell.
column_names = [
    name
    for name in map(extract_column_from_header, first_launch_table.find_all('th'))
    if name
]

# Display the extracted column names
column_names  # Output: ['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']

['Flight No.',
 'Date and time ( )',
 'Launch site',
 'Payload',
 'Payload mass',
 'Orbit',
 'Customer',
 'Launch outcome']

## TASK 3: Create a data frame by parsing the launch HTML tables.
### We will create an empty dictionary with keys from the extracted column names in the previous task. Later, this dictionary will be converted into a Pandas dataframe. 

In [8]:
# One empty list per extracted column name; the lists are filled while parsing the tables.
launch_dict = {col: [] for col in column_names}

# Remove an irrelevant column. pop() with a default avoids a KeyError when the scraped
# header text does not match this exact string.
launch_dict.pop('Date and time ( )', None)

In [9]:


# Initialise every launch_dict column as its own empty list: first the columns
# taken from the table header, then the additional columns.
for _column in (
    'Flight No.',
    'Launch site',
    'Payload',
    'Payload mass',
    'Orbit',
    'Customer',
    'Launch outcome',
    # Added some new columns
    'Version Booster',
    'Booster landing',
    'Date',
    'Time',
):
    launch_dict[_column] = []

In [10]:
# Other functions 
def date_time(table_cells):
    """
    Return the date and time strings from an HTML table cell.

    Input: a table data cell element
    Returns: a list with at most the first two stripped strings of the cell
    """
    pieces = []
    for text in table_cells.stripped_strings:
        pieces.append(text.strip())
    return pieces[:2]

def booster_version(table_cells):
    """
    Return the booster version text from an HTML table cell.

    Input: a table data cell element
    Returns: the strings found at even positions in the cell, without the last
    of them, concatenated; an empty string when fewer than two remain
    """
    strings = list(table_cells.stripped_strings)
    even_positioned = strings[::2]
    return ''.join(even_positioned[:-1])
    

def landing_status(table_cells):
    """
    Return the landing status from an HTML table cell.

    Input: a table data cell element
    Returns: the first stripped string of the cell, or "Unknown" when the cell
    contains no text
    """
    for text in table_cells.stripped_strings:
        return text
    return "Unknown"
    
def get_mass(table_cells):
    """
    Return the payload mass from an HTML table cell.

    Input: a table data cell element
    Returns: the NFKD-normalised cell text up to and including the first "kg",
    or "0 kg" when the text contains no "kg"
    """
    text = unicodedata.normalize("NFKD", table_cells.get_text(strip=True))
    unit_at = text.find("kg")
    if unit_at == -1:
        return "0 kg"
    return text[:unit_at + 2]

#### To simplify the parsing process, we have provided an incomplete code snippet below to help you to fill up the launch_dict. Please complete the following code snippet with TODOs or you can choose to write your own logic to parse all launch tables.

#### Extract and parse all launch tables

In [11]:

extracted_row = 0

# Empty every column list first so that re-running this cell rebuilds the data
# instead of appending a second copy of each launch.
for column_values in launch_dict.values():
    column_values.clear()

for table in html_tables:
    for rows in table.find_all("tr"):
        # Only rows whose first table heading is a number (the launch number) are launch records
        header = rows.th
        if not (header and header.string and header.string.strip().isdigit()):
            continue
        flight_number = header.string.strip()

        row = rows.find_all('td')
        # A launch record must provide the nine data cells indexed below (row[0] .. row[8])
        if len(row) < 9:
            continue

        extracted_row += 1

        # First string of the cell is the date (trailing comma removed), second is the time
        datatimelist = date_time(row[0])
        date = datatimelist[0].strip(',') if len(datatimelist) > 0 else "Unknown"
        time = datatimelist[1] if len(datatimelist) > 1 else "Unknown"

        # Parse the booster cell once; fall back to the raw cell text when the parser
        # returns an empty string
        bv = booster_version(row[1]) or row[1].get_text(strip=True)
        launch_site = row[2].get_text(strip=True)
        payload = row[3].get_text(strip=True)
        payload_mass = get_mass(row[4])
        orbit = row[5].get_text(strip=True)
        customer = row[6].get_text(strip=True)
        launch_outcome = landing_status(row[7])
        booster_landing = landing_status(row[8])

        launch_dict['Flight No.'].append(flight_number)
        launch_dict['Date'].append(date)
        launch_dict['Time'].append(time)
        launch_dict['Version Booster'].append(bv)
        launch_dict['Launch site'].append(launch_site)
        launch_dict['Payload'].append(payload)
        launch_dict['Payload mass'].append(payload_mass)
        launch_dict['Orbit'].append(orbit)
        launch_dict['Customer'].append(customer)
        launch_dict['Launch outcome'].append(launch_outcome)
        launch_dict['Booster landing'].append(booster_landing)

#### Save to Dataframe

In [12]:
# Wrap each column list in a Series before building the frame, one column per launch_dict key.
series_by_column = {}
for key, value in launch_dict.items():
    series_by_column[key] = pd.Series(value)
df = pd.DataFrame(series_by_column)

#### Resetting the flight number column 

In [13]:
# Renumber the flights 1..N in row order. Assigning the column directly replaces the scraped
# string values outright, so the column ends up with an integer dtype; setting through
# df.loc[:, ...] may instead write into the existing object-dtype column, depending on the
# pandas version.
df['Flight No.'] = range(1, len(df) + 1)

#### Save dataframe to CSV

In [14]:
#df.to_csv('IBM Data Certificate/Course11_Applied_Data_Science_Capstone/spacex_web_scraped.csv',index=False)
#print(df)

    Flight No.            Launch site                                            Payload Payload mass                  Orbit                               Customer Launch outcome Version Booster    Booster landing                Date   Time
0            1  Cape Canaveral,SLC‑40      Transporter-6(115 payload smallsat rideshare)         0 kg                    SSO                                Various        Success              F9          Success (      3 January 2023  14:56
1            2  Cape Canaveral,SLC‑40                           OneWeb 16(40 satellites)     6,000 kg               PolarLEO                                 OneWeb        Success              F9          Success (     10 January 2023  04:50
2            3  Cape Canaveral,SLC‑40                              USA-343(GPS-III SV06)     4,352 kg                    MEO                                   USSF        Success              F9          Success (     18 January 2023  12:24
3            4      Vandenberg,SLC‑4