In [None]:
import os
import requests
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from pandas.api.types import is_object_dtype, is_numeric_dtype

%run -i assets/lists.ipynb

# Session flag: download_if_missing() sets it to True when it goes ahead with
# a download, and create_estimates() skips its overwrite prompt when it is set.
downloaded_original = False

def download_if_missing(url, destination):
    """Download ``url`` to ``destination``, asking before overwriting a file.

    If ``destination`` already exists the user is prompted; any answer other
    than ``y`` leaves the file untouched and returns without downloading.
    After a successful download the module-level ``downloaded_original`` flag
    is set to True. The flag is set for every download, but it only matters
    for the planetary-systems table: ``create_estimates`` reads it to bypass
    its own overwrite confirmation.

    Args:
        url: Address fetched with an HTTP GET request.
        destination: Path of the file that receives the response body.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status. Nothing is written to
            ``destination`` in that case.
    """
    global downloaded_original
    if os.path.exists(destination):
        response = input(f"'{destination}' already exists. Overwrite? (y/n): ").strip().lower()
        if response != 'y':
            print("Skipping creation — user declined overwrite.")
            return
    print(f"Downloading {destination}...")
    # (connect, read) timeouts so a stalled connection cannot hang the notebook.
    r = requests.get(url, timeout=(10, 600))
    # Fail loudly instead of saving an HTTP error page as if it were CSV data.
    r.raise_for_status()
    with open(destination, 'wb') as f:
        f.write(r.content)
    # Only flag the data as refreshed once the new file is actually on disk.
    downloaded_original = True
    print(f"Downloaded {destination}.")

def extract_lum_class(s):
    """Return the luminosity class found at the end of a spectral-type string.

    Backslashes, slashes and hyphens are treated as separators. The last
    whitespace-separated token is returned when it is one of the recognised
    luminosity classes and at least one other token precedes it; otherwise
    (including for non-string input) NaN is returned.
    """
    if not isinstance(s, str):
        return np.nan

    separators_to_spaces = str.maketrans({'\\': ' ', '/': ' ', '-': ' '})
    parts = s.translate(separators_to_spaces).split()

    # A lone token cannot be "<spectral class> <luminosity class>".
    if len(parts) < 2:
        return np.nan

    candidate = parts[-1]
    if candidate in ('I', 'II', 'III', 'IV', 'V', 'VI', 'VII', '0'):
        return candidate
    return np.nan
def estimate_lumclass(row):
    """Derive a luminosity class for ``row`` from its ``st_spectype`` entry.

    Returns whatever ``extract_lum_class`` yields for a non-empty string
    spectral type, and NaN when the entry is missing, empty or not a string.
    """
    spectral_type = row.get("st_spectype")
    if not isinstance(spectral_type, str) or not spectral_type:
        return np.nan
    return extract_lum_class(spectral_type)
def estimate_teffclass(row):
    """Derive a temperature class for ``row`` from its ``st_spectype`` entry.

    The class is the first character of the spectral-type string, upper-cased.
    NaN is returned when the entry is missing, empty or not a string.
    """
    spectral_type = row.get("st_spectype")
    has_text = isinstance(spectral_type, str) and spectral_type != ""
    return spectral_type[0].upper() if has_text else np.nan
def enrich_exoplanet_data():
    """Add derived columns to the planetary-systems CSV and save it in place.

    Reads ``assets/Planetary_Systems.csv``, then writes these columns:
    ``pl_type`` (looked up from the planet-name collections), ``st_metratio``
    and ``disc_locale`` (spelling normalised), ``is_solar`` (always False),
    and ``st_lumclass`` / ``st_teffclass`` (derived from ``st_spectype``).
    If all of those columns are already present the user is asked before
    they are recomputed.

    The name collections (``terrestrial``, ``super_earths``, ``unknown``,
    ``neptune_like``, ``gas_giants``) are not defined in this function;
    presumably they come from the ``%run -i assets/lists.ipynb`` line.
    """
    path = "assets/Planetary_Systems.csv"
    enrichment_columns = ["pl_type", "st_metratio", "disc_locale", "is_solar", "st_lumclass", "st_teffclass"]

    ps = pd.read_csv(path, comment="#", low_memory=False)

    # Only prompt when every derived column is already in the file.
    absent = [name for name in enrichment_columns if name not in ps.columns]
    if not absent:
        answer = input("Enriched columns already exist. Re-run enrichment and overwrite them? (y/n): ").strip().lower()
        if answer != 'y':
            print("Enrichment skipped — existing columns retained.")
            return

    print("Enriching planet data...")

    def classify_planet(name):
        # First matching collection wins; the order of checks is significant.
        if name in terrestrial:  # type: ignore
            return "terrestrial"
        if name in super_earths:  # type: ignore
            return "super_earth"
        if name in unknown:  # type: ignore
            return "unknown"
        if name in neptune_like:  # type: ignore
            return "neptune_like"
        if name in gas_giants:  # type: ignore
            return "gas_giant"
        return "tba"

    metratio_spellings = {"[m/h]": "[M/H]", "[fe/h]": "[Fe/H]"}
    locale_spellings = {"ground": "Ground", "space": "Space"}

    # Categorical fixes: values outside the lookup tables pass through unchanged.
    ps["pl_type"] = ps["pl_name"].apply(classify_planet)
    ps["st_metratio"] = ps["st_metratio"].apply(
        lambda value: metratio_spellings.get(str(value).strip().lower(), value)
    )
    ps["disc_locale"] = ps["disc_locale"].apply(
        lambda value: locale_spellings.get(str(value).strip().lower(), value)
    )
    ps["is_solar"] = False
    ps["st_lumclass"] = ps.apply(estimate_lumclass, axis=1)
    ps["st_teffclass"] = ps.apply(estimate_teffclass, axis=1)

    ps.to_csv(path, index=False)
    print("Enriched data saved.")

def estimate_planet_density(row):
    """Return the planet's bulk density, estimating it when it is missing.

    An existing ``pl_dens`` value is returned unchanged. Otherwise the density
    is derived from ``pl_bmasse`` and ``pl_rade``, which are taken to be in
    Earth masses and Earth radii. In those units the density relative to
    Earth is simply ``mass / radius**3`` (the (4/3)*pi of the sphere volume
    cancels in the ratio), so scaling by Earth's 5.51 g/cm^3 gives the result.

    Args:
        row: Mapping or Series with ``pl_dens``, ``pl_bmasse``, ``pl_rade``.

    Returns:
        The density, or NaN when it is neither given nor computable (missing
        mass or radius, or a non-positive radius).
    """
    if pd.notnull(row["pl_dens"]):
        return row["pl_dens"]  # Copy actual value if it exists

    mass, radius = row["pl_bmasse"], row["pl_rade"]
    if pd.isnull(mass) or pd.isnull(radius) or radius <= 0:
        return np.nan

    earth_density = 5.51  # g/cm^3
    return earth_density * mass / radius**3
def estimate_stellar_density(row):
    """Return the star's mean density, estimating it when it is missing.

    An existing ``st_dens`` value is returned unchanged. Otherwise the density
    is derived from ``st_mass`` and ``st_rad``, which are taken to be in solar
    masses and solar radii. In those units the density relative to the Sun is
    ``mass / radius**3`` (the (4/3)*pi of the sphere volume cancels in the
    ratio), so scaling by the Sun's 1.41 g/cm^3 gives the result.

    Args:
        row: Mapping or Series with ``st_dens``, ``st_mass``, ``st_rad``.

    Returns:
        The density, or NaN when it is neither given nor computable (missing
        mass or radius, or a non-positive radius).
    """
    if pd.notnull(row["st_dens"]):
        return row["st_dens"]  # Copy actual value if it exists

    mass, radius = row["st_mass"], row["st_rad"]
    if pd.isnull(mass) or pd.isnull(radius) or radius <= 0:
        return np.nan

    solar_density = 1.41  # g/cm^3
    return solar_density * mass / radius**3
def estimate_stellar_surface_gravity(row):
    """Return the star's log10 surface gravity, estimating it when missing.

    An existing ``st_logg`` value (log10 of cm/s^2) is returned unchanged.
    Otherwise gravity is scaled from the Sun as ``g_sun * mass / radius**2``,
    with ``st_mass`` and ``st_rad`` taken to be in solar masses and solar
    radii. The reference therefore has to be the Sun's surface gravity, not
    Earth's: a star with mass 1 and radius 1 must come out at log g of
    about 4.44.

    Args:
        row: Mapping or Series with ``st_logg``, ``st_mass``, ``st_rad``.

    Returns:
        log10 of the surface gravity in cm/s^2, or NaN when it is neither
        given nor computable (missing or non-positive mass or radius).
    """
    if pd.notnull(row["st_logg"]):
        return row["st_logg"]  # Already in log10(cm/s²)

    mass, radius = row["st_mass"], row["st_rad"]
    if pd.isnull(mass) or pd.isnull(radius) or mass <= 0 or radius <= 0:
        # Also keeps log10 away from zero and negative arguments.
        return np.nan

    g_sun_cm_s2 = 27420.0  # Solar surface gravity, about 274.2 m/s²
    return np.log10(g_sun_cm_s2 * mass / radius**2)
def train_general_estimator(df, min_samples=1, max_missing_fraction=0.95):
    """Fill missing values in ``df`` column by column with random forests.

    For every eligible column a model is trained on the numeric columns that
    share the target's three-character name prefix and are more than 75 %
    populated. Object columns and columns with fewer than 10 distinct values
    are handled as classification targets (labels are encoded from their
    string form); all other columns are regression targets. Predictions are
    written into ``df`` in place for rows whose target is missing and that
    have at least 75 % of the features present. For ``pl_type`` the entries
    to fill are those equal to "unknown" or "tba" rather than NaN.

    The number of trees is read interactively; empty, non-integer or
    non-positive input falls back to 100. A failure while estimating one
    column is printed and does not stop the remaining columns.

    Args:
        df: Frame to complete. It is modified in place.
        min_samples: Minimum number of usable training rows for a column.
        max_missing_fraction: Columns with a larger NaN fraction are skipped.

    Returns:
        The same ``df`` object, without any ``*_estimated`` helper columns.
    """
    try:
        user_input = input("Enter number of trees for Random Forest (default 100): ").strip()
        n_trees = int(user_input) if user_input else 100
        if n_trees < 1:
            # A forest needs at least one tree; treat this like any bad input
            # instead of letting every single target fail later on.
            raise ValueError("tree count must be positive")
        if n_trees > 500:
            print("Large tree count may increase runtime significantly. Please be patient.")
    except ValueError:
        print("Invalid input — using default: 100 trees.")
        n_trees = 100

    # Identifier, flag and bookkeeping columns that are never estimated.
    excluded_targets = ["pl_name", "pl_controv_flag", "hostname", "hostname.1", "is_solar", "default_flag", "disc_refname", "rowupdate", "releasedate"]

    # df.columns is evaluated once, so columns added below are not visited.
    for target in df.columns:
        if target in excluded_targets:
            continue
        # Skip columns already estimated or with too many missing values
        if target.endswith("_estimated") or df[target].isna().mean() > max_missing_fraction:
            continue

        # Choose features based on prefix
        # NOTE(review): the "<target>_estimated" copies created further down
        # match this filter too, so they can be picked up as features for
        # later targets with the same prefix — confirm this is intended.
        prefix = target[:3]
        features = [
            col for col in df.columns
            if col != target
            and col.startswith(prefix)
            and df[col].notna().mean() > 0.75
            and is_numeric_dtype(df[col])
        ]

        if not features:
            print(f"Skipping '{target}' — no usable features.")
            continue

        # Prepare training data: rows with the target and every feature present
        valid_mask = df[target].notna() & df[features].notna().all(axis=1)
        df_valid = df.loc[valid_mask]

        entries = df.loc[valid_mask, target].astype(str)
        entries_clean = entries[entries != "nan"]
        if entries_clean.empty:
            print(f"Skipping '{target}' — no encodable values.")
            continue

        if len(df_valid) < min_samples:
            print(f"Skipping '{target}' — not enough valid rows.")
            continue

        encoded_col = target + "_encoded"
        encoded_created = False
        try:
            if is_object_dtype(df[target]) or df[target].nunique() < 10:
                # Classification for strings or low-cardinality discrete
                label_encoder = LabelEncoder()

                if is_object_dtype(df[target]):
                    # Preserve true NaNs before cast
                    missing_mask = df[target].isna()
                    df[target] = df[target].astype(str)
                    df.loc[missing_mask, target] = np.nan  # Restore real NaNs
                entries_clean = df.loc[valid_mask, target].astype(str)
                entries_clean = entries_clean[~entries_clean.isin(["nan", "None", "NaN", ""])]

                # Build encoded column
                encoded_values = label_encoder.fit_transform(entries_clean).astype(float)
                df[encoded_col] = pd.Series(np.nan, index=df.index)
                encoded_created = True

                df.loc[entries_clean.index, encoded_col] = pd.Series(encoded_values, index=entries_clean.index, dtype='float64')

                # Now validate before training
                if encoded_col not in df.columns or df[encoded_col].dropna().empty:
                    print(f"Encoding failed for '{target}' — column not created or contains no data.")
                    continue

                # Align encoded values with valid training rows
                training_mask = df_valid.index.intersection(df[encoded_col].dropna().index)
                if len(training_mask) < min_samples:
                    print(f"Skipping '{target}' — not enough encoded samples for training.")
                    continue

                model = RandomForestClassifier(n_estimators=n_trees, max_depth=6, random_state=42)
                model.fit(df.loc[training_mask, features], df.loc[training_mask, encoded_col])

                threshold = int(len(features) * 0.75)
                enough_features = df[features].notna().sum(axis=1) >= threshold
                if target == "pl_type":
                    needs_value = df[target].isin(["unknown", "tba"])
                else:
                    needs_value = df[target].isna()
                predict_mask = needs_value & enough_features
                skipped_mask = needs_value & ~enough_features
                df_missing = df.loc[predict_mask]

                if not df_missing.empty:
                    predictions = model.predict(df_missing[features])
                    predictions = predictions.astype(int)

                    decoded = label_encoder.inverse_transform(predictions)
                    if df[target].dtype.kind in "iuf":
                        # Labels were encoded from their string form; turn them
                        # back into numbers so a numeric column stays numeric.
                        decoded = pd.to_numeric(decoded)
                    df[target + "_estimated"] = df[target]
                    df.loc[df_missing.index, target] = decoded

                remaining = df[target].isna().sum()
                print(f"Estimated '{target}' using {len(df_valid)} samples. {df_missing.shape[0]} rows estimated, {remaining} missing rows remaining ({skipped_mask.sum()} skipped due to insufficient features).")
            else:
                # Regression for continuous numeric
                model = RandomForestRegressor(n_estimators=n_trees, max_depth=6, random_state=42)
                model.fit(df_valid[features], df_valid[target])

                threshold = int(len(features) * 0.75)
                enough_features = df[features].notna().sum(axis=1) >= threshold
                predict_mask = df[target].isna() & enough_features
                skipped_mask = df[target].isna() & ~enough_features
                df_missing = df.loc[predict_mask]

                if not df_missing.empty:
                    predictions = model.predict(df_missing[features])
                    df[target + "_estimated"] = df[target]
                    df.loc[df_missing.index, target] = predictions

                remaining = df[target].isna().sum()
                print(f"Estimated '{target}' using {len(df_valid)} samples. {df_missing.shape[0]} rows estimated, {remaining} missing rows remaining ({skipped_mask.sum()} skipped due to insufficient features).")
        except Exception as e:
            print(f"Error estimating '{target}': {e}")
        finally:
            # Remove the temporary label column on every exit path (success,
            # early skip or error) so it never reaches the returned frame or
            # gets selected as a feature for a later target.
            if encoded_created and encoded_col in df.columns:
                df.drop(columns=[encoded_col], inplace=True)
    df.drop(columns=[col for col in df.columns if col.endswith("_estimated")], inplace=True)
    return df

def create_estimates():
    """Build ``assets/Planetary_Systems_Estimated.csv`` from the enriched table.

    Reads ``assets/Planetary_Systems.csv``, fills ``pl_dens``, ``st_dens`` and
    ``st_logg`` with the row-wise estimators, runs ``train_general_estimator``
    on the result and writes it out. If the output file already exists the
    user is asked before it is replaced, unless ``downloaded_original`` is set.
    """
    output_path = "assets/Planetary_Systems_Estimated.csv"

    must_confirm = os.path.exists(output_path) and not downloaded_original
    if must_confirm:
        answer = input(f"'{output_path}' already exists. Overwrite? (y/n): ").strip().lower()
        if answer != 'y':
            print("Skipping creation — user declined overwrite.")
            return

    print(f"Creating {output_path}...")

    source = pd.read_csv("assets/Planetary_Systems.csv", comment="#", low_memory=False)
    estimated = source.copy()

    # Closed-form estimates first, then the model-based fill for the rest.
    row_estimators = (
        ("pl_dens", estimate_planet_density),
        ("st_dens", estimate_stellar_density),
        ("st_logg", estimate_stellar_surface_gravity),
    )
    for column, estimator in row_estimators:
        estimated[column] = estimated.apply(estimator, axis=1)

    estimated = train_general_estimator(estimated)
    estimated.to_csv(output_path, index=False)

    print(f"Created {output_path}.")
    
def create_solar_data():
    """Write ``assets/Solar_Values.csv`` with one row per Solar System planet.

    Each row merges planet-specific values with ``univ_props``, the values
    shared by all eight rows (host star "Sun", magnitudes, flags, and NaN
    placeholders). Discovery details are then filled in for Uranus and
    Neptune only. If the output file already exists the user is asked before
    it is replaced.
    """
    output_path = "assets/Solar_Values.csv"

    if os.path.exists(output_path):
        response = input(f"'{output_path}' already exists. Overwrite? (y/n): ").strip().lower()
        if response != 'y':
            print("Skipping creation — user declined overwrite.")
            return

    print(f"Creating {output_path}...")
    # Values that are identical for every row (stellar, system and flag columns).
    univ_props = {
        "hostname": "Sun", "discoverymethod": "Known Since Antiquity", "st_teff": 5778, "st_rad": 1, "st_mass": 1, "st_dens": 1.408, "st_densest": 1.408, "st_logg": 4.44, "st_age": 4.6, "st_rotp": 25.4, "st_vsin": 1.9997, "st_met": 0, "st_lum": 0,
        "sy_umag": 6.39, "sy_gmag": 5.12, "sy_rmag": 4.68, "sy_imag": 4.57, "sy_zmag": 4.52, "sy_bmag": 5.44, "sy_vmag": 4.81, "sy_icmag": 4.08, "sy_kepmag": 4.64, "default_flag": True, "pl_controv_flag": False, "st_spectype": "G2 V",
        "sy_gaiamag": 4.67, "sy_tmag": 4.68, "sy_jmag": 3.65, "sy_hmag": 3.30, "sy_kmag": 3.25, "sy_w1mag": 3.24, "sy_w2mag": 3.27, "sy_w3mag": 3.23, "sy_w4mag": 3.24, "is_solar": True, "st_teffclass": "G", "st_lumclass": "V", "st_teffclassest": "G", "st_lumclassest": "V",
        "pl_trandur": np.nan, "pl_tranmid": np.nan, "pl_trandep": np.nan, "pl_imppar": np.nan, "pl_projobliq": np.nan, "pl_occdep": np.nan, "lastupdate": np.nan, "st_radv": np.nan, "sy_plx": np.nan, "sy_dist": np.nan, "ra": np.nan, "dec": np.nan, "sy_snum": 1, "sy_pnum": 8, "sy_mnum": 416
    }
    # Transit-related columns are NaN, as are pl_projobliq, pl_imppar, pl_occdep,
    # lastupdate, st_radv, sy_plx, sy_dist, ra and dec.
    # NOTE(review): univ_props is unpacked last in each row below, so its NaN
    # "sy_dist" replaces the per-planet 0.00001 — confirm which value is wanted.
    solar_planets = pd.DataFrame([
        {**{"pl_name": "Mercury", "pl_type": "terrestrial", "pl_orbper": 88,       "pl_orbsmax": 0.387, "pl_orbtper": 2451590.257, "pl_orblper": 29.124,  "pl_trueobliq": 0.1,   "sy_dist": 0.00001, "pl_bmasse": 0.055, "pl_rade": 0.383, "pl_dens": 5.43, "pl_densest": 5.43, "pl_orbeccen": 0.206, "pl_insol": 6.67,   "pl_eqt": 440, "pl_rvamp": 0.008}, **univ_props},
        {**{"pl_name": "Venus",   "pl_type": "terrestrial", "pl_orbper": 225,      "pl_orbsmax": 0.723, "pl_orbtper": 2451996.500, "pl_orblper": 54.884,  "pl_trueobliq": 177.4, "sy_dist": 0.00001, "pl_bmasse": 0.815, "pl_rade": 0.949, "pl_dens": 5.24, "pl_densest": 5.24, "pl_orbeccen": 0.007, "pl_insol": 1.91,   "pl_eqt": 328, "pl_rvamp": 0.086}, **univ_props},
        {**{"pl_name": "Earth",   "pl_type": "terrestrial", "pl_orbper": 365.25,   "pl_orbsmax": 1.000, "pl_orbtper": 2451547.507, "pl_orblper": 114.207, "pl_trueobliq": 23.45, "sy_dist": 0.00001, "pl_bmasse": 1.0,   "pl_rade": 1.0,   "pl_dens": 5.51, "pl_densest": 5.51, "pl_orbeccen": 0.017, "pl_insol": 1.00,   "pl_eqt": 255, "pl_rvamp": 0.089}, **univ_props},
        {**{"pl_name": "Mars",    "pl_type": "terrestrial", "pl_orbper": 687,      "pl_orbsmax": 1.524, "pl_orbtper": 2452195.000, "pl_orblper": 286.502, "pl_trueobliq": 25.19, "sy_dist": 0.00001, "pl_bmasse": 0.107, "pl_rade": 0.532, "pl_dens": 3.93, "pl_densest": 3.93, "pl_orbeccen": 0.093, "pl_insol": 0.43,   "pl_eqt": 210, "pl_rvamp": 0.008}, **univ_props},
        {**{"pl_name": "Jupiter", "pl_type": "gas_giant",   "pl_orbper": 4332.82,  "pl_orbsmax": 5.20,  "pl_orbtper": 2455636.000, "pl_orblper": 273.867, "pl_trueobliq": 3.12,  "sy_dist": 0.00001, "pl_bmasse": 317.8, "pl_rade": 11.2,  "pl_dens": 1.33, "pl_densest": 1.33, "pl_orbeccen": 0.049, "pl_insol": 0.037,  "pl_eqt": 112, "pl_rvamp": 12.4}, **univ_props},
        {**{"pl_name": "Saturn",  "pl_type": "gas_giant",   "pl_orbper": 10755.7,  "pl_orbsmax": 9.54,  "pl_orbtper": 2452830.000, "pl_orblper": 339.392, "pl_trueobliq": 26.73, "sy_dist": 0.00001, "pl_bmasse": 95.2,  "pl_rade": 9.45,  "pl_dens": 0.69, "pl_densest": 0.69, "pl_orbeccen": 0.056, "pl_insol": 0.011,  "pl_eqt": 84,  "pl_rvamp": 2.75}, **univ_props},
        {**{"pl_name": "Uranus",  "pl_type": "neptune_like",   "pl_orbper": 30687.15, "pl_orbsmax": 19.19, "pl_orbtper": 2451545.000, "pl_orblper": 96.998,  "pl_trueobliq": 97.86, "sy_dist": 0.00001, "pl_bmasse": 14.5,  "pl_rade": 4.0,   "pl_dens": 1.27, "pl_densest": 1.27, "pl_orbeccen": 0.046, "pl_insol": 0.0037, "pl_eqt": 59,  "pl_rvamp": 0.30}, **univ_props},
        {**{"pl_name": "Neptune", "pl_type": "neptune_like",   "pl_orbper": 60190.03, "pl_orbsmax": 30.06, "pl_orbtper": 2451545.000, "pl_orblper": 276.336, "pl_trueobliq": 29.56, "sy_dist": 0.00001, "pl_bmasse": 17.1,  "pl_rade": 3.88,  "pl_dens": 1.64, "pl_densest": 1.64, "pl_orbeccen": 0.010, "pl_insol": 0.0015, "pl_eqt": 46,  "pl_rvamp": 0.28}, **univ_props}
    ])
    # Uranus and Neptune get explicit discovery details; for the other rows
    # "discoverymethod" keeps the shared value from univ_props.
    solar_planets.loc[solar_planets["pl_name"] == "Uranus", [
        "discoverymethod", "disc_refname", "disc_locale",
        "disc_facility", "disc_telescope", "disc_instrument", "disc_year"
    ]] = [
        "Imaging",
        "Herschel (1781)",
        "Ground",
        "Private Observatory",
        "6.2-inch Reflecting Telescope",
        "Homemade Reflector",
        1781
    ]
    solar_planets.loc[solar_planets["pl_name"] == "Neptune", [
        "discoverymethod", "disc_refname", "disc_locale",
        "disc_facility", "disc_telescope", "disc_instrument", "disc_year"
    ]] = [
        "Imaging",
        "Galle & d'Arrest (1846), after Le Verrier",
        "Ground",
        "Berlin Observatory",
        "9.6-inch Fraunhofer Refractor",
        "Fraunhofer Refractor",
        1846
    ]

    # NOTE(review): master_column_list is taken from solar_planets itself, so
    # missing_cols is always empty and the loop below adds nothing. Presumably
    # the list was meant to come from the full planetary-systems schema.
    master_column_list = list(solar_planets.columns)
    expected_cols = set(master_column_list)  # Intended to hold the full schema
    existing_cols = set(solar_planets.columns)
    missing_cols = expected_cols - existing_cols

    for col in missing_cols:
        solar_planets[col] = np.nan
    
    # The literal path below is the same string as output_path.
    solar_planets.to_csv("assets/Solar_Values.csv", index=False)
    print(f"Created {output_path}.")


# Pipeline, in order: fetch the planetary-systems table, add the derived
# columns, build the estimated table, then fetch the spectroscopy planet list.
# Note that the query selects "hostname" twice.
download_if_missing(
    "https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query=select+sy_snum,sy_pnum,sy_mnum,pl_orbper,pl_orbsmax,pl_orbtper,pl_orblper,pl_projobliq,pl_trueobliq,pl_rade,pl_bmasse,pl_dens,pl_orbeccen,pl_insol,pl_eqt,pl_trandur,ra,dec,pl_tranmid,pl_trandep,pl_imppar,pl_occdep,pl_rvamp,disc_year,rowupdate,releasedate,st_teff,st_rad,st_mass,st_dens,st_logg,st_age,st_rotp,st_vsin,st_radv,st_met,st_lum,sy_plx,sy_dist,sy_umag,sy_bmag,sy_gmag,sy_vmag,sy_kepmag,sy_rmag,sy_gaiamag,sy_imag,sy_icmag,sy_tmag,sy_zmag,sy_jmag,sy_hmag,sy_kmag,sy_w1mag,sy_w2mag,sy_w3mag,sy_w4mag,pl_name,default_flag,pl_controv_flag,hostname,discoverymethod,disc_refname,disc_locale,disc_facility,disc_telescope,disc_instrument,hostname,st_spectype,st_metratio+from+ps&format=csv",
    "assets/Planetary_Systems.csv"
)
# Both steps below read assets/Planetary_Systems.csv, so they run after the download.
enrich_exoplanet_data()
create_estimates()
# Names of planets that have entries in the archive's "spectra" table.
download_if_missing(
    "https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query=select+pl_name+from+spectra&format=csv",
    "assets/Atmospheric_Spectroscopy.csv"
)
# To regenerate the solar data, uncomment the next line and edit create_solar_data().
# create_solar_data()
print("All tasks completed.")



Skipping creation — user declined overwrite.
Enrichment skipped — existing columns retained.
Skipping creation — user declined overwrite.
Skipping creation — user declined overwrite.
Creating assets/Solar_Values.csv...
Created assets/Solar_Values.csv.
