# Tarea 1 - Introducción a Data Science

Integrantes:
- Axel Mondaca
- Sebastián Hernández

### Instalación de librerías

Se instalan las librerías necesarias para el correcto funcionamiento del código.

In [None]:
%pip install selenium
%pip install pandas
%pip install matplotlib

### Importar librerías

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import matplotlib.pyplot as plt
import math

### Código

In [None]:
# Listing of past launches that the scraping cell paginates over.
base_url = "https://nextspaceflight.com/launches/past/"

# Total number of listing pages.
last_page = 240

In [None]:
# Scrape every past launch from the listing pages plus each launch's detail
# page, then build a DataFrame with one row per launch.

# Detail-page fields to extract, checked in this order (first match wins):
# (label searched in the cell text, accumulator key, substrings removed
#  before conversion, numeric type of the parsed value).
_DETAIL_FIELDS = (
    ("Payload to LEO", "total_payload", (",", " kg"), int),
    ("Payload to GTO", "total_payload", (",", " kg"), int),
    ("Rocket Height", "rocket_height", ("m",), float),
    ("Fairing Diameter", "fairing_diameter", ("m",), float),
    ("Fairing Height", "fairing_height", ("m",), float),
    ("Price", "price", (",", "$", "million"), float),
)

# Card border colours with a known outcome; any other colour is reported as
# "Partial Failure".
_STATUS_BY_COLOR = {
    "rgb(69, 207, 93)": "Success",
    "rgb(218, 52, 50)": "Failed",
}


def _parse_number(text, tokens_to_strip, cast):
    """Return the value after the first ':' in *text*, converted with *cast*.

    Every substring in *tokens_to_strip* is removed before conversion.
    Raises IndexError when *text* has no ':' and ValueError when the
    remaining text is not a valid number.
    """
    value = text.split(":")[1]
    for token in tokens_to_strip:
        value = value.replace(token, "")
    return cast(value.strip())


def _scrape_details(details_browser, details_url):
    """Load a launch detail page and return its accumulated numeric fields.

    Best effort: a navigation/lookup failure is printed and whatever was
    collected so far is returned; a single malformed value is printed and
    skipped so the remaining fields of the launch are still collected.
    Fields that never appear stay at 0.
    """
    totals = {
        "total_payload": 0,
        "rocket_height": 0,
        "fairing_diameter": 0,
        "fairing_height": 0,
        "price": 0,
    }
    try:
        details_browser.get(details_url)
        sections = details_browser.find_elements(By.CLASS_NAME, "mdl-card__supporting-text")
        for section in sections:
            for element in section.find_elements(By.CLASS_NAME, "mdl-cell"):
                text = element.text.strip()
                for label, key, tokens, cast in _DETAIL_FIELDS:
                    if label not in text:
                        continue
                    try:
                        # LEO and GTO payloads share one accumulator, so
                        # "total_payload" is their sum.
                        totals[key] += _parse_number(text, tokens, cast)
                    except (ValueError, IndexError) as e:
                        print(f"Valor no numérico en {details_url}: {text!r} ({e})")
                    break  # only the first matching label applies to a cell
    except Exception as e:
        print(f"Error al obtener detalles de la URL {details_url}: {e}")
    return totals


def _launch_status(style):
    """Map a card's inline style to "Success", "Failed" or "Partial Failure".

    Returns "Desconocido" when the style carries no border colour at all,
    instead of failing on the missing value.
    """
    if not style or "border-color:" not in style:
        return "Desconocido"
    color = style.split("border-color:")[1].split(";")[0].strip().lower()
    return _STATUS_BY_COLOR.get(color, "Partial Failure")


options = Options()
options.add_argument("--headless")

launch_data = []

# Two browsers are used: `browser` walks the listing pages while `browser2`
# opens each detail page (presumably so the card elements of the listing are
# not invalidated by navigating away from it).
browser = webdriver.Chrome(options=options)
try:
    browser2 = webdriver.Chrome(options=options)
    try:
        # `last_page` is the total number of pages, so it must be included.
        for page in range(1, last_page + 1):
            browser.get(f"{base_url}?page={page}")
            cards = browser.find_elements(By.CLASS_NAME, "mdl-card")

            for card in cards:
                company = card.find_element(By.CLASS_NAME, "mdl-card__title-text").text
                rocket_and_payload = card.find_element(By.CLASS_NAME, "header-style").text
                # Commas and the local time-zone labels are removed here so
                # the later date parsing only sees space-separated tokens.
                date_and_location = (
                    card.find_element(By.CLASS_NAME, "mdl-card__supporting-text").text
                    .replace(",", "")
                    .replace("CLST", "")
                    .replace("CLT", "")
                    .replace("GMT-4", "")
                )
                details_url = card.find_element(By.CLASS_NAME, "mdc-button").get_attribute("href")
                success = _launch_status(card.get_attribute("style"))

                details = _scrape_details(browser2, details_url)

                launch_data.append({
                    'company': company,
                    'rocket_and_payload': rocket_and_payload,
                    'date_and_location': date_and_location,
                    'details_url': details_url,
                    'total_payload': details["total_payload"],
                    'fairing_diameter': details["fairing_diameter"],
                    'fairing_height': details["fairing_height"],
                    'rocket_height': details["rocket_height"],
                    'price': details["price"],
                    # Placeholder: the volume is computed during normalisation.
                    'volume': 0,
                    'success': success,
                })
    finally:
        browser2.quit()
finally:
    # Always release the Chrome processes, even if the scrape fails midway.
    browser.quit()

# Build the DataFrame; the explicit column list fixes the column order and
# keeps the columns present even when nothing was scraped.
df = pd.DataFrame(launch_data, columns=['company', 'rocket_and_payload', 'date_and_location', 'details_url',
                                        'total_payload', 'fairing_diameter', 'fairing_height', 'rocket_height',
                                        'price', 'volume', 'success'])
df.shape



# Función para establecer el formato de la fecha


In [43]:
# English three-letter month abbreviation -> zero-padded month number.
_MONTH_NUMBERS = {
    "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05", "Jun": "06",
    "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
}


def clean_date(scraped_date_string: str) -> str:
    """Convert a scraped launch date into ``YYYY/MM/DD``.

    The string is read as whitespace-separated tokens: the first token is
    ignored, the second is a three-letter English month abbreviation, the
    third is the day (an attached comma is dropped) and the fourth is the
    year. Any further tokens (time, time zone) are ignored.

    Args:
        scraped_date_string: Date text, e.g. ``"Fri Aug 07, 2020 05:12 UTC"``.

    Returns:
        The date as ``"YYYY/MM/DD"`` with month and day zero-padded.

    Raises:
        KeyError: If the month abbreviation is not recognised.
        IndexError: If the string has fewer than four tokens.
    """
    # split() without arguments tolerates leading/trailing and repeated
    # whitespace, which split(" ") would turn into empty tokens.
    parts = scraped_date_string.split()
    month = _MONTH_NUMBERS[parts[1]]
    # Zero-pad so every date has the same width and sorts correctly as text.
    day = parts[2].replace(",", "").zfill(2)
    year = parts[3]
    return f"{year}/{month}/{day}"

# Normalización de los datos

En esta sección nos aseguramos de que el DataFrame no tenga ruido en los datos y los formateamos según el uso que se les quiere dar.

In [None]:
# Split the combined columns and clean the data.
# Each combined column is split once; surrounding whitespace (presumably left
# around the "|" separator by the page text) is stripped so equal names
# compare equal.
rocket_payload_parts = df["rocket_and_payload"].str.split("|")
df["rocket"] = rocket_payload_parts.str[0].str.strip()
df["payload"] = rocket_payload_parts.str[1].str.strip()

# The first line of the scraped text is the date, the second the location.
date_location_parts = df["date_and_location"].str.split("\n")
df["date"] = date_location_parts.str[0].str.strip()
df["location"] = date_location_parts.str[1].str.strip()

# Drop the combined columns that are no longer needed.
df.drop(columns=["rocket_and_payload", "date_and_location"], inplace=True)

# A fairing height of 0 means the value was not found while scraping; replace
# it with the mean of the known heights. The built-in round() is used because
# the mean may be a plain float NaN (when no row has a known value), which
# has no .round() method.
mean_height = round(df.loc[df["fairing_height"] != 0.0, "fairing_height"].mean(), 2)
df.loc[df["fairing_height"] == 0.0, "fairing_height"] = mean_height

# Volume = cone (fairing height) + cylinder (rocket height), both using the
# fairing diameter as the base diameter.
radius_squared = (df["fairing_diameter"] / 2) ** 2
df["volume"] = (
    (1 / 3) * math.pi * radius_squared * df["fairing_height"]
    + math.pi * radius_squared * df["rocket_height"]
)
df["volume"] = df["volume"].round(2)

# Replace unknown prices (0.0) with the mean of the known prices.
mean_price = round(df.loc[df["price"] != 0.0, "price"].mean(), 2)
df.loc[df["price"] == 0.0, "price"] = mean_price

# Reformat the date column as YYYY/MM/DD.
df["date"] = df["date"].apply(clean_date)

# Export to CSV.
df.to_csv('launch_data.csv', index=False)

# Last expression of the cell so the notebook displays the resulting table.
df