# Data Collection and Wrangling Notebook
This notebook loads SpaceX launch records from a hosted JSON snapshot of a SpaceX API response and scrapes Falcon 9 launch data from Wikipedia.

In [1]:
import requests
import pandas as pd
import numpy as np
import datetime
from bs4 import BeautifulSoup
import unicodedata

# Lift pandas' display truncation so wide frames and long cell values render in full.
for _display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

## Part 1: Load JSON from SpaceX API

In [2]:
# Hosted JSON copy of a SpaceX API response.
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/API_call_spacex_api.json'
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of trying to decode an error body as JSON.
response.raise_for_status()
data = pd.json_normalize(response.json())

# Keep only the columns used below.
data = data[['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']]
# Keep launches whose 'cores' and 'payloads' lists each hold exactly one entry.
data = data[data['cores'].map(len) == 1]
# .copy() detaches the filtered frame so the column assignments below write to
# an independent object rather than a slice of the previous frame
# (avoids pandas' SettingWithCopyWarning).
data = data[data['payloads'].map(len) == 1].copy()
# Unwrap the single-element lists into their only value.
data['cores'] = data['cores'].map(lambda x: x[0])
data['payloads'] = data['payloads'].map(lambda x: x[0])
# Reduce the UTC timestamp string to a plain calendar date.
data['date'] = pd.to_datetime(data['date_utc']).dt.date
# Restrict to launches dated on or before 2020-11-13.
data = data[data['date'] <= datetime.date(2020, 11, 13)]

data.to_csv("spacex_api_cleaned.csv", index=False)
data.head()

Unnamed: 0,rocket,payloads,launchpad,cores,flight_number,date_utc,date
0,5e9d0d95eda69955f709d1eb,5eb0e4b5b6c3bb0006eeb1e1,5e9e4502f5090995de566f86,"{'core': '5e9e289df35918033d3b2623', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",1,2006-03-24T22:30:00.000Z,2006-03-24
1,5e9d0d95eda69955f709d1eb,5eb0e4b6b6c3bb0006eeb1e2,5e9e4502f5090995de566f86,"{'core': '5e9e289ef35918416a3b2624', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",2,2007-03-21T01:10:00.000Z,2007-03-21
3,5e9d0d95eda69955f709d1eb,5eb0e4b7b6c3bb0006eeb1e5,5e9e4502f5090995de566f86,"{'core': '5e9e289ef3591855dc3b2626', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",4,2008-09-28T23:15:00.000Z,2008-09-28
4,5e9d0d95eda69955f709d1eb,5eb0e4b7b6c3bb0006eeb1e6,5e9e4502f5090995de566f86,"{'core': '5e9e289ef359184f103b2627', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",5,2009-07-13T03:35:00.000Z,2009-07-13
5,5e9d0d95eda69973a809d1ec,5eb0e4b7b6c3bb0006eeb1e7,5e9e4501f509094ba4566f84,"{'core': '5e9e289ef359185f2b3b2628', 'flight': 1, 'gridfins': False, 'legs': False, 'reused': False, 'landing_attempt': False, 'landing_success': None, 'landing_type': None, 'landpad': None}",6,2010-06-04T18:45:00.000Z,2010-06-04


## Part 2: Web Scraping Wikipedia Falcon 9 Launch Data

In [3]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches"
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
wiki_response = requests.get(wiki_url, timeout=30)
# Stop here on a 4xx/5xx reply; otherwise an error page would be parsed below
# as if it were the launch list.
wiki_response.raise_for_status()
html = wiki_response.text
# NOTE(review): this is the live article, so its markup can change between runs.
soup = BeautifulSoup(html, 'html.parser')

def date_time(table_cells):
    """Return the first two text fragments of a table cell, whitespace-stripped.

    `table_cells` must expose a `.strings` iterable of text fragments. The
    result has fewer than two items when the cell holds fewer than two
    fragments.
    """
    fragments = list(table_cells.strings)
    leading = fragments[:2]
    return [fragment.strip() for fragment in leading]

def booster_version(table_cells):
    """Concatenate every other text fragment of a cell, starting with the first.

    Fragments at even positions (0, 2, 4, ...) of `table_cells.strings` are
    joined with no separator; fragments at odd positions are dropped.
    """
    fragments = list(table_cells.strings)
    even_positioned = fragments[::2]
    return ''.join(even_positioned)

def landing_status(table_cells):
    """Return the first text fragment of a table cell, unmodified.

    Raises IndexError when `table_cells.strings` yields no fragments.
    """
    fragments = list(table_cells.strings)
    return fragments[0]

def get_mass(table_cells):
    """Extract the leading "... kg" portion of a payload-mass cell.

    The cell's `.text` is NFKD-normalized and stripped; everything up to and
    including the first "kg" is returned. When the text contains no "kg",
    NaN is returned instead.
    """
    normalized = unicodedata.normalize("NFKD", table_cells.text).strip()
    before_unit, unit, _ = normalized.partition("kg")
    if not unit:
        return np.nan
    return before_unit + unit

def extract_column_from_header(row):
    """Build a column name from a header cell, mutating the cell in place.

    The first `br`, `a` and `sup` child reported by `row` (checked in that
    order) is detached via `.extract()`. The remaining `row.contents` are then
    joined with single spaces and stripped. The join expects every remaining
    item to be a string.
    """
    for tag_name in ("br", "a", "sup"):
        child = getattr(row, tag_name)
        if child:
            child.extract()
    remaining_text = ' '.join(row.contents)
    return remaining_text.strip()

# Column names are read from the header cells of the third <table> on the page.
# NOTE(review): the positional index 2 depends on the page layout — confirm it
# still points at a launch table; it raises IndexError if fewer tables exist.
html_tables = soup.find_all("table")
first_launch_table = html_tables[2]

column_names = []
for th in first_launch_table.find_all('th'):
    name = extract_column_from_header(th)
    # Skip empty headers and purely numeric header cells.
    if name and not name.isdigit():
        column_names.append(name)

# One list per column; every accepted row appends to the same set of lists so
# that they stay aligned.
launch_dict = {key: [] for key in column_names}
launch_dict.update({'Version Booster': [], 'Booster landing': [], 'Date': [], 'Time': []})

# NOTE(review): a multi-word class_ string matches only tables whose class
# attribute is exactly this string — confirm against the page's current markup.
for table in soup.find_all('table', class_='wikitable plainrowheaders collapsible'):
    for row in table.find_all('tr'):
        # A launch row is one whose header cell is a bare flight number.
        if row.th and row.th.string and row.th.string.strip().isdigit():
            cells = row.find_all('td')
            if len(cells) > 7:
                datelist = date_time(cells[0])
                launch_dict['Flight No.'].append(row.th.string.strip())
                launch_dict['Date'].append(datelist[0])
                launch_dict['Time'].append(datelist[1])
                launch_dict['Version Booster'].append(booster_version(cells[1]))
                launch_dict['Launch site'].append(cells[2].a.string if cells[2].a else '')
                launch_dict['Payload'].append(cells[3].a.string if cells[3].a else '')
                launch_dict['Payload mass'].append(get_mass(cells[4]))
                launch_dict['Orbit'].append(cells[5].a.string if cells[5].a else '')
                launch_dict['Customer'].append(cells[6].a.string if cells[6].a else '')
                launch_dict['Launch outcome'].append(list(cells[7].strings)[0])
                # The guard above admits rows with exactly eight cells, which
                # have no cells[8]. Record a missing value for those instead
                # of raising IndexError part-way through the row.
                if len(cells) > 8:
                    booster_landing = landing_status(cells[8])
                else:
                    booster_landing = np.nan
                launch_dict['Booster landing'].append(booster_landing)

# Wrapping each list in a Series lets columns of different lengths (e.g. header
# names that were never filled) coexist; shorter columns are padded with NaN.
df_wiki = pd.DataFrame({key: pd.Series(value) for key, value in launch_dict.items()})
df_wiki.to_csv("spacex_wiki_scraped.csv", index=False)
df_wiki.head()

Unnamed: 0,Flight No.,Date and time ( ),Launch site,Payload,Payload mass,Orbit,Customer,Launch outcome,FH 5,FH 6,FH 7,FH 8,FH 9,Version Booster,Booster landing,Date,Time
