In [4]:
import os
import requests
import pandas as pd
import numpy as np
%run -i assets/lists.ipynb

def download_if_missing(url, destination, timeout=300):
    """Download ``url`` to ``destination`` unless that file already exists.

    Args:
        url: Address fetched with ``requests.get``.
        destination: Local path the response body is written to (binary mode).
        timeout: Seconds ``requests`` waits on the server before giving up;
            pass ``None`` to wait indefinitely.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status (4xx/5xx). Nothing is written in that case.
    """
    if os.path.exists(destination):
        print(f"{destination} already exists.")
        return

    print(f"Downloading {destination}...")
    response = requests.get(url, timeout=timeout)
    # Fail before touching the disk: otherwise an error page would be saved
    # under `destination`, and every later run would skip the download
    # because the file "already exists".
    response.raise_for_status()
    with open(destination, 'wb') as f:
        f.write(response.content)
    print("Done.")

def extract_lum_class(s):
    """Return the luminosity class found at the end of a spectral-type string.

    Backslashes, slashes and hyphens are treated as separators. The last
    whitespace-separated token is returned when it is one of the recognised
    classes ('0', 'I' ... 'VII') and at least one other token precedes it.
    Everything else yields "unknown": non-string input, single-token strings
    (so a fused type such as "G2V" is not parsed), or an unrecognised suffix.
    """
    if not isinstance(s, str):
        return "unknown"

    # One pass turns every separator character into a blank.
    separators = str.maketrans({'\\': ' ', '/': ' ', '-': ' '})
    parts = s.translate(separators).split()
    if len(parts) < 2:
        return "unknown"

    candidate = parts[-1]
    if candidate in ('0', 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII'):
        return candidate
    return "unknown"

def estimate_planet_density(row):
    """Return a planet's bulk density, estimating it when none is recorded.

    Args:
        row: Mapping/Series with ``pl_dens``, ``pl_bmasse`` and ``pl_rade``.

    Returns:
        ``pl_dens`` when present. Otherwise ``pl_bmasse / pl_rade**3 * 5.51``
        when both are present and the radius is positive, else NaN.
    """
    if pd.notnull(row["pl_dens"]):
        return row["pl_dens"]  # Copy actual value if it exists

    mass, radius = row["pl_bmasse"], row["pl_rade"]
    if pd.notnull(mass) and pd.notnull(radius) and radius > 0:
        # Mass and radius are relative to Earth (the Earth row in
        # create_solar_data is mass 1, radius 1, density 5.51), so the
        # density relative to Earth is simply M / R**3: the (4/3)*pi of the
        # sphere volume cancels and must not be applied again.
        return mass / radius**3 * 5.51
    return np.nan
    
def estimate_stellar_density(row):
    """Return a star's mean density, estimating it when none is recorded.

    Args:
        row: Mapping/Series with ``st_dens``, ``st_mass`` and ``st_rad``.

    Returns:
        ``st_dens`` when present. Otherwise ``st_mass / st_rad**3 * 1.408``
        when both are present and the radius is positive, else NaN.
    """
    if pd.notnull(row["st_dens"]):
        return row["st_dens"]  # Copy actual value if it exists

    mass, radius = row["st_mass"], row["st_rad"]
    if pd.notnull(mass) and pd.notnull(radius) and radius > 0:
        # Mass and radius are relative to the Sun (the Sun row in
        # create_solar_data is mass 1, radius 1, density 1.408), so scale the
        # ratio M / R**3 by the solar mean density. The (4/3)*pi of the
        # sphere volume cancels in the ratio, and Earth's 5.51 does not
        # apply to solar units.
        return mass / radius**3 * 1.408
    return np.nan

def estimate_teffclass(row):
    """Return ``(actual, estimated)`` temperature-class letters for a row.

    When ``st_spectype`` is a non-empty string, its first character
    (upper-cased) is used for both values. Otherwise the actual class is
    "unknown" and the estimate is derived from ``st_teff``; it stays
    "unknown" when ``st_teff`` is missing or not positive.
    """
    spectype = row.get("st_spectype")
    if isinstance(spectype, str) and spectype:
        letter = spectype[0].upper()
        return letter, letter

    teff = row.get("st_teff", np.nan)
    if pd.isnull(teff):
        return "unknown", "unknown"

    # Inclusive lower temperature bounds, hottest class first.
    lower_bounds = (
        (30000, "O"),
        (10000, "B"),
        (7500, "A"),
        (6000, "F"),
        (5200, "G"),
        (3700, "K"),
    )
    for bound, letter in lower_bounds:
        if teff >= bound:
            return "unknown", letter

    # Anything cooler is M, provided the temperature is positive.
    if teff > 0:
        return "unknown", "M"
    return "unknown", "unknown"

def estimate_lumclass(row):
    """Return ``(actual, estimated)`` luminosity classes for a row.

    When ``st_spectype`` carries a luminosity class that
    ``extract_lum_class`` recognises, that class is used for both values.
    Otherwise the actual class is "unknown" and the estimate is derived from
    ``st_logg``. This fallback also applies when a spectral type is present
    but has no parseable luminosity class, so an available ``st_logg`` is
    not wasted. The estimate stays "unknown" when ``st_logg`` is missing.
    """
    # Try from st_spectype first
    spectype = row.get("st_spectype")
    if isinstance(spectype, str) and len(spectype) > 0:
        parsed = extract_lum_class(spectype)
        if parsed != "unknown":
            return parsed, parsed  # Use the same value for both actual and estimated

    # Fallback from logg
    logg = row.get("st_logg", np.nan)
    if pd.isnull(logg):
        return "unknown", "unknown"

    # Exclusive lower bounds on log g, most compact class first.
    # NOTE(review): the Sun row in create_solar_data has st_logg 4.44 and
    # class V, but 4.44 falls in the VI bin below -- confirm these cut-offs.
    lower_bounds = (
        (4.6, "VII"),
        (4.2, "VI"),
        (3.6, "V"),
        (3.0, "IV"),
        (2.0, "III"),
        (1.0, "II"),
    )
    for bound, label in lower_bounds:
        if logg > bound:
            return "unknown", label

    # Remaining values: [0, 1] is class I, negative log g is class 0.
    return "unknown", ("I" if logg >= 0.0 else "0")

def enrich_exoplanet_data():
    """Add derived columns to assets/Planetary_Systems.csv, in place.

    Reads the CSV, then:
      * sets ``pl_type`` from the planet-name collections loaded from
        assets/lists.ipynb ("tba" when a name is in none of them);
      * normalises the capitalisation of ``st_metratio`` and ``disc_locale``;
      * sets ``is_solar`` to False;
      * adds ``pl_densest``, ``st_densest``, ``st_teffclass`` /
        ``st_teffclassest`` and ``st_lumclass`` / ``st_lumclassest``;
    and writes the result back to the same path.
    """
    ps = pd.read_csv("assets/Planetary_Systems.csv", comment="#", low_memory=False)
    # sh = pd.read_csv("assets/Stellar_Hosts.csv", comment="#", low_memory=False)
    print("Enriching planet data...")

    # Categories in priority order: a name found in several collections takes
    # the first matching label. Assumes each collection is an iterable of
    # planet-name strings (they come from lists.ipynb via %run -i).
    categories = (
        ("terrestrial", terrestrial),  # type: ignore
        ("super_earth", super_earths),  # type: ignore
        ("unknown", unknown),  # type: ignore
        ("neptune_like", neptune_like),  # type: ignore
        ("gas_giant", gas_giants),  # type: ignore
    )
    # Build one name -> label lookup up front instead of scanning every
    # collection for every row. Filling it in reverse order lets the
    # higher-priority categories overwrite the lower ones.
    type_by_name = {}
    for label, names in reversed(categories):
        type_by_name.update(dict.fromkeys(names, label))
    ps["pl_type"] = ps["pl_name"].apply(lambda name: type_by_name.get(name, "tba"))

    def canonical_label(value, labels):
        # Match case-insensitively and ignoring outer whitespace; values with
        # no canonical spelling (including NaN) pass through untouched.
        return labels.get(str(value).strip().lower(), value)

    metratio_labels = {"[m/h]": "[M/H]", "[fe/h]": "[Fe/H]"}
    ps["st_metratio"] = ps["st_metratio"].apply(
        lambda x: canonical_label(x, metratio_labels)
    )
    locale_labels = {"ground": "Ground", "space": "Space"}
    ps["disc_locale"] = ps["disc_locale"].apply(
        lambda x: canonical_label(x, locale_labels)
    )

    ps["is_solar"] = False
    ps["pl_densest"] = ps.apply(estimate_planet_density, axis=1)
    ps["st_densest"] = ps.apply(estimate_stellar_density, axis=1)
    # Each estimator returns an (actual, estimated) pair; "expand" spreads the
    # pair over the two target columns.
    ps[["st_teffclass", "st_teffclassest"]] = ps.apply(estimate_teffclass, axis=1, result_type="expand")
    ps[["st_lumclass", "st_lumclassest"]] = ps.apply(estimate_lumclass, axis=1, result_type="expand")
    ps.to_csv("assets/Planetary_Systems.csv", index=False)

    # sh["is_solar"] = False
    # sh["st_densest"] = sh.apply(estimate_stellar_density, axis=1)
    # sh[["st_teffclass", "st_teffclassest"]] = sh.apply(estimate_teffclass, axis=1, result_type="expand")
    # sh[["st_lumclass", "st_lumclassest"]] = sh.apply(estimate_lumclass, axis=1, result_type="expand")
    # sh.to_csv("assets/Stellar_Hosts.csv", index=False)

    print("Enriched data saved.")

def create_solar_data():
    """Write the eight Solar System planets to assets/Solar_Values.csv.

    Every row is the planet's own values merged with ``univ_props`` (the
    host-star and system values shared by all planets). Uranus and Neptune
    additionally receive discovery metadata; those ``disc_*`` columns are
    created by the assignment and stay empty for the other planets.
    """
    print("Creating assets/Solar_Values.csv...")
    univ_props = {
        "hostname": "Sun", "discoverymethod": "Known since antiquity", "st_teff": 5778, "st_rad": 1, "st_mass": 1, "st_dens": 1.408, "st_densest": 1.408, "st_logg": 4.44, "st_age": 4.6, "st_rotp": 25.4, "st_vsin": 1.9997, "st_met": 0, "st_lum": 0,
        "sy_umag": 6.39, "sy_gmag": 5.12, "sy_rmag": 4.68, "sy_imag": 4.57, "sy_zmag": 4.52, "sy_bmag": 5.44, "sy_vmag": 4.81, "sy_icmag": 4.08, "sy_kepmag": 4.64, "default_flag": True, "pl_controv_flag": False, "st_spectype": "G2 V",
        "sy_gaiamag": 4.67, "sy_tmag": 4.68, "sy_jmag": 3.65, "sy_hmag": 3.30, "sy_kmag": 3.25, "sy_w1mag": 3.24, "sy_w2mag": 3.27, "sy_w3mag": 3.23, "sy_w4mag": 3.24, "is_solar": True, "st_teffclass": "G", "st_lumclass": "V", "st_teffclassest": "G", "st_lumclassest": "V",
        "pl_trandur": np.nan, "pl_tranmid": np.nan, "pl_trandep": np.nan, "pl_imppar": np.nan, "pl_projobliq": np.nan, "pl_occdep": np.nan, "lastupdate": np.nan, "st_radv": np.nan, "sy_plx": np.nan, "sy_dist": np.nan, "ra": np.nan, "dec": np.nan, "sy_snum": 1, "sy_pnum": 8, "sy_mnum": 416
    }
    # Anything transit related is nan, as well as proj_obliq, pl_imppar, and pl_occdep, lastupdate, st_radv, sy_plx, sy_dist
    # NOTE(review): univ_props is merged last, so its sy_dist (NaN) replaces
    # the per-planet 0.00001 below and the CSV holds NaN for sy_dist. The
    # per-planet key still fixes where the sy_dist column is placed. Confirm
    # which value is intended.
    solar_planets = pd.DataFrame([
        {**{"pl_name": "Mercury", "pl_type": "terrestrial", "pl_orbper": 88,       "pl_orbsmax": 0.387, "pl_orbtper": 2451590.257, "pl_orblper": 29.124,  "pl_trueobliq": 0.1,   "sy_dist": 0.00001, "pl_bmasse": 0.055, "pl_rade": 0.383, "pl_dens": 5.43, "pl_densest": 5.43, "pl_orbeccen": 0.206, "pl_insol": 6.67,   "pl_eqt": 440, "pl_rvamp": 0.008}, **univ_props},
        {**{"pl_name": "Venus",   "pl_type": "terrestrial", "pl_orbper": 225,      "pl_orbsmax": 0.723, "pl_orbtper": 2451996.500, "pl_orblper": 54.884,  "pl_trueobliq": 177.4, "sy_dist": 0.00001, "pl_bmasse": 0.815, "pl_rade": 0.949, "pl_dens": 5.24, "pl_densest": 5.24, "pl_orbeccen": 0.007, "pl_insol": 1.91,   "pl_eqt": 328, "pl_rvamp": 0.086}, **univ_props},
        {**{"pl_name": "Earth",   "pl_type": "terrestrial", "pl_orbper": 365.25,   "pl_orbsmax": 1.000, "pl_orbtper": 2451547.507, "pl_orblper": 114.207, "pl_trueobliq": 23.45, "sy_dist": 0.00001, "pl_bmasse": 1.0,   "pl_rade": 1.0,   "pl_dens": 5.51, "pl_densest": 5.51, "pl_orbeccen": 0.017, "pl_insol": 1.00,   "pl_eqt": 255, "pl_rvamp": 0.089}, **univ_props},
        {**{"pl_name": "Mars",    "pl_type": "terrestrial", "pl_orbper": 687,      "pl_orbsmax": 1.524, "pl_orbtper": 2452195.000, "pl_orblper": 286.502, "pl_trueobliq": 25.19, "sy_dist": 0.00001, "pl_bmasse": 0.107, "pl_rade": 0.532, "pl_dens": 3.93, "pl_densest": 3.93, "pl_orbeccen": 0.093, "pl_insol": 0.43,   "pl_eqt": 210, "pl_rvamp": 0.008}, **univ_props},
        {**{"pl_name": "Jupiter", "pl_type": "gas_giant",   "pl_orbper": 4332.82,  "pl_orbsmax": 5.20,  "pl_orbtper": 2455636.000, "pl_orblper": 273.867, "pl_trueobliq": 3.12,  "sy_dist": 0.00001, "pl_bmasse": 317.8, "pl_rade": 11.2,  "pl_dens": 1.33, "pl_densest": 1.33, "pl_orbeccen": 0.049, "pl_insol": 0.037,  "pl_eqt": 112, "pl_rvamp": 12.4}, **univ_props},
        {**{"pl_name": "Saturn",  "pl_type": "gas_giant",   "pl_orbper": 10755.7,  "pl_orbsmax": 9.54,  "pl_orbtper": 2452830.000, "pl_orblper": 339.392, "pl_trueobliq": 26.73, "sy_dist": 0.00001, "pl_bmasse": 95.2,  "pl_rade": 9.45,  "pl_dens": 0.69, "pl_densest": 0.69, "pl_orbeccen": 0.056, "pl_insol": 0.011,  "pl_eqt": 84,  "pl_rvamp": 2.75}, **univ_props},
        {**{"pl_name": "Uranus",  "pl_type": "neptune_like",   "pl_orbper": 30687.15, "pl_orbsmax": 19.19, "pl_orbtper": 2451545.000, "pl_orblper": 96.998,  "pl_trueobliq": 97.86, "sy_dist": 0.00001, "pl_bmasse": 14.5,  "pl_rade": 4.0,   "pl_dens": 1.27, "pl_densest": 1.27, "pl_orbeccen": 0.046, "pl_insol": 0.0037, "pl_eqt": 59,  "pl_rvamp": 0.30}, **univ_props},
        {**{"pl_name": "Neptune", "pl_type": "neptune_like",   "pl_orbper": 60190.03, "pl_orbsmax": 30.06, "pl_orbtper": 2451545.000, "pl_orblper": 276.336, "pl_trueobliq": 29.56, "sy_dist": 0.00001, "pl_bmasse": 17.1,  "pl_rade": 3.88,  "pl_dens": 1.64, "pl_densest": 1.64, "pl_orbeccen": 0.010, "pl_insol": 0.0015, "pl_eqt": 46,  "pl_rvamp": 0.28}, **univ_props}
    ])
    # The two telescopic discoveries override the shared discoverymethod and
    # gain the disc_* columns, which do not exist before this assignment.
    solar_planets.loc[solar_planets["pl_name"] == "Uranus", [
        "discoverymethod", "disc_refname", "disc_locale",
        "disc_facility", "disc_telescope", "disc_instrument", "disc_year"
    ]] = [
        "Imaging",
        "Herschel (1781)",
        "Bath, England",
        "Private Observatory",
        "6.2-inch Reflecting Telescope",
        "Homemade Reflector",
        1781
    ]
    solar_planets.loc[solar_planets["pl_name"] == "Neptune", [
        "discoverymethod", "disc_refname", "disc_locale",
        "disc_facility", "disc_telescope", "disc_instrument", "disc_year"
    ]] = [
        "Imaging + Prediction",
        "Galle & d'Arrest (1846), after Le Verrier",
        "Berlin Observatory, Germany",
        "Berlin Observatory",
        "9.6-inch Fraunhofer Refractor",
        "Fraunhofer Refractor",
        1846
    ]

    # The frame is saved with exactly the columns defined above; no further
    # schema alignment is applied.
    solar_planets.to_csv("assets/Solar_Values.csv", index=False)
    print("Done.")

# Fetch the Planetary Systems table unless a local copy already exists.
# `hostname` is requested once; the query previously listed it twice, which
# asks the service for the same column a second time.
download_if_missing(
    "https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query=select+sy_snum,sy_pnum,sy_mnum,pl_orbper,pl_orbsmax,pl_orbtper,pl_orblper,pl_projobliq,pl_trueobliq,pl_rade,pl_bmasse,pl_dens,pl_orbeccen,pl_insol,pl_eqt,pl_trandur,ra,dec,pl_tranmid,pl_trandep,pl_imppar,pl_occdep,pl_rvamp,disc_year,rowupdate,releasedate,st_teff,st_rad,st_mass,st_dens,st_logg,st_age,st_rotp,st_vsin,st_radv,st_met,st_lum,sy_plx,sy_dist,sy_umag,sy_bmag,sy_gmag,sy_vmag,sy_kepmag,sy_rmag,sy_gaiamag,sy_imag,sy_icmag,sy_tmag,sy_zmag,sy_jmag,sy_hmag,sy_kmag,sy_w1mag,sy_w2mag,sy_w3mag,sy_w4mag,pl_name,default_flag,pl_controv_flag,hostname,discoverymethod,disc_refname,disc_locale,disc_facility,disc_telescope,disc_instrument,st_spectype,st_metratio+from+ps&format=csv",
    "assets/Planetary_Systems.csv"
)
# download_if_missing(
#     "https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query=select+sy_snum,sy_pnum,sy_mnum,st_teff,st_rad,st_mass,st_dens,st_logg,st_age,st_rotp,st_vsin,st_radv,st_met,st_lum,sy_plx,sy_dist,sy_umag,sy_bmag,sy_gmag,sy_vmag,sy_kepmag,sy_rmag,sy_gaiamag,sy_imag,sy_icmag,sy_tmag,sy_zmag,sy_jmag,sy_hmag,sy_kmag,sy_w1mag,sy_w2mag,sy_w3mag,sy_w4mag,hostname,hostname,st_spectype,st_metratio+from+stellarhosts&format=csv",
#     "assets/Stellar_Hosts.csv"
# )
# Enrichment runs on every execution and rewrites the planetary CSV in place.
enrich_exoplanet_data()
# Planet names that have entries in the archive's spectra table.
download_if_missing(
    "https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query=select+pl_name+from+spectra&format=csv",
    "assets/Atmospheric_Spectroscopy.csv"
)
create_solar_data()



assets/Planetary_Systems.csv already exists.
Enriching planet data...
Enriched data saved.
assets/Atmospheric_Spectroscopy.csv already exists.
Creating assets/Solar_Values.csv...
Done.
