In [None]:
# UMAP (Credit to Microsoft Copilot)
# Note to self: figure out how this works.
# For UMAP, use 2-10 neighbors (local) and 30-100 (global). Put after PCA
import dash
import pandas as pd
import numpy as np
from dash import Dash, dcc, html, Input, Output, State, ctx, MATCH, ALL
from dash.exceptions import PreventUpdate
import plotly.graph_objects as go
import plotly.express as px
import umap
import os
import signal
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

%run -i assets/lists.ipynb

# Options shared by every CSV load below.
_CSV_READ_OPTIONS = {"comment": "#", "low_memory": False}

# Planetary Systems tables (the second one is selected by the "estimate" option)
pl_s = pd.read_csv('assets/Planetary_Systems.csv', **_CSV_READ_OPTIONS)
pl_es = pd.read_csv('assets/Planetary_Systems_Estimated.csv', **_CSV_READ_OPTIONS)

# Atmospheric spectroscopy list, used by the "Has atmospheric data" filter
at_s = pd.read_csv('assets/Atmospheric_Spectroscopy.csv', **_CSV_READ_OPTIONS)

# Rows appended when the "Include Solar System" filter is ticked
solar_planets = pd.read_csv('assets/Solar_Values.csv', **_CSV_READ_OPTIONS)

# Dash app
app = Dash(__name__, external_stylesheets=['assets/style.css'])

# Database columns that can be selected as UMAP input features, grouped the way
# they are shown in the UI. Each option's 'value' is the column name in the CSV;
# the 'label' repeats that column name so the user can look it up.
planet_features = {
    'Planet (Compare Planets only)': [
        {'label': "Orbital Period (days, pl_orbper, Recommended)", 'value': "pl_orbper"},
        {'label': "Orbit Semi-Major Axis (au, pl_orbsmax)", 'value': "pl_orbsmax"},
        {'label': "Epoch of Periastron (days, pl_orbtper)", 'value': "pl_orbtper"},
        {'label': "Argument of Periastron (deg, pl_orblper)", 'value': "pl_orblper"},
        {'label': "Proj. Obliquity (deg, pl_projobliq)", 'value': "pl_projobliq"},
        {'label': "True Obliquity (deg, pl_trueobliq)", 'value': "pl_trueobliq"},
        {'label': "Radius (Earth radius, pl_rade, Recommended)", 'value': "pl_rade"},
        {'label': "Mass (Earth mass, pl_bmasse, Recommended)", 'value': "pl_bmasse"},
        {'label': "Density (g/cm^3, pl_dens, Recommended)", 'value': "pl_dens"},
        {'label': "Orbital Eccentricity (pl_orbeccen)", 'value': "pl_orbeccen"},
        {'label': "Insol. Flux (Earth flux, pl_insol)", 'value': "pl_insol"},
        {'label': "Equil. Temp. (K, pl_eqt, Recommended)", 'value': "pl_eqt"},
        {'label': "Transit Duration (hrs, pl_trandur)", 'value': "pl_trandur"},
        {'label': "Transit Midpoint (days, pl_tranmid)", 'value': "pl_tranmid"},
        {'label': "Transit Depth (%, pl_trandep)", 'value': "pl_trandep"},
        {'label': "Impact Parameter (pl_imppar)", 'value': "pl_imppar"},
        {'label': "Occultation Depth (%, pl_occdep)", 'value': "pl_occdep"},
        {'label': "Rad. Velocity Amplitude (m/s, pl_rvamp)", 'value': "pl_rvamp"},
        {'label': "Discovery Year (disc_year)", 'value': "disc_year"},
        {'label': "Last Update (rowupdate)", 'value': "rowupdate"},
    ],
    'Stellar': [
        {'label': "Effec. Temp. (K, st_teff, Recommended)", 'value': "st_teff"},
        {'label': "Radius (Solar radius, st_rad, Recommended)", 'value': "st_rad"},
        {'label': "Mass (Solar mass, st_mass, Recommended)", 'value': "st_mass"},
        {'label': "Density (g/cm^3, st_dens)", 'value': "st_dens"},
        {'label': "Surface Grav. (log10(cm/s^2), st_logg)", 'value': "st_logg"},
        {'label': "Age (Gyr, st_age)", 'value': "st_age"},
        {'label': "Rot. Period (days, st_rotp)", 'value': "st_rotp"},
        {'label': "Rot. Velocity (km/s, st_vsin)", 'value': "st_vsin"},
        {'label': "Rad. Velocity (km/s, st_radv)", 'value': "st_radv"},
        {'label': "Metallicity (dex, st_met, Recommended)", 'value': "st_met"},
        {'label': "Luminosity (log10(Solar), st_lum)", 'value': "st_lum"},
    ],
    'System': [
        {'label': "Parallax (mas, sy_plx)", 'value': "sy_plx"},
        {'label': "Dist from Earth (pc, sy_dist)", 'value': "sy_dist"},
        {'label': "No. Stars (sy_snum)", 'value': "sy_snum"},
        {'label': "No. Planets (sy_pnum)", 'value': "sy_pnum"},
        {'label': "No. Moons (sy_mnum)", 'value': "sy_mnum"},
        {'label': "u (Sloan) Magnitude (sy_umag, ~354 nm)", 'value': "sy_umag"},
        {'label': "B (Johnson) Magnitude (sy_bmag, ~442 nm)", 'value': "sy_bmag"},
        {'label': "g (Sloan) Magnitude (sy_gmag, ~475 nm)", 'value': "sy_gmag"},
        {'label': "V (Johnson) Magnitude (sy_vmag, ~540 nm)", 'value': "sy_vmag"},
        {'label': "Kepler Magnitude (sy_kepmag, ~600 nm)", 'value': "sy_kepmag"},
        {'label': "r (Sloan) Magnitude (sy_rmag, ~622 nm)", 'value': "sy_rmag"},
        {'label': "Gaia Magnitude (sy_gaiamag, ~673 nm)", 'value': "sy_gaiamag"},
        {'label': "i (Sloan) Magnitude (sy_imag, ~763 nm)", 'value': "sy_imag"},
        {'label': "I (Cousins) Magnitude (sy_icmag, ~786.5 nm)", 'value': "sy_icmag"},
        {'label': "TESS Magnitude (sy_tmag, ~800 nm)", 'value': "sy_tmag"},
        {'label': "z (Sloan) Magnitude (sy_zmag, ~905 nm)", 'value': "sy_zmag"},
        {'label': "J (2MASS) Magnitude (sy_jmag, ~1.25 μm)", 'value': "sy_jmag"},
        {'label': "H (2MASS) Magnitude (sy_hmag, ~1.65 μm)", 'value': "sy_hmag"},
        {'label': "Ks (2MASS) Magnitude (sy_kmag, ~2.15 μm)", 'value': "sy_kmag"},
        {'label': "W1 (WISE) Magnitude (sy_w1mag, ~3.4 μm)", 'value': "sy_w1mag"},
        {'label': "W2 (WISE) Magnitude (sy_w2mag, ~4.6 μm)", 'value': "sy_w2mag"},
        {'label': "W3 (WISE) Magnitude (sy_w3mag, ~12 μm)", 'value': "sy_w3mag"},
        {'label': "W4 (WISE) Magnitude (sy_w4mag, ~22 μm)", 'value': "sy_w4mag"},
    ]
}
# Features ticked by default, one list per group in the same order as planet_features.
initial_values = [["pl_rade", "pl_bmasse", "pl_dens", "pl_eqt", "pl_orbper"], ["st_teff", "st_rad", "st_mass", "st_met"], []]

# Page layout: data filters, feature selection, grouping / NaN / scaling options,
# the UMAP neighbour slider, the graph itself and the stores used to persist settings.
app.layout = html.Div([
    html.H1("UMAP Analysis"),

    html.H2("Data filters:"),
    dcc.Checklist(
        id='filter-checklist',
        options=[
            {'label': 'Has atmospheric data (Compare Planets only)', 'value': 'atmoData'},
            {'label': 'Default parameter set', 'value': 'default'},
            {'label': 'No controversial flag', 'value': 'noControv'},
            {'label': 'Water to Metal Density (Compare Planets only)', 'value': 'densRange'},
            {'label': 'In Target Star Catalog', 'value': 'target'},
            {'label': 'Include Solar System', 'value': 'solar'},
        ],
        value=['default', 'noControv', 'solar', 'target'],  # Default values to filter by
        className="checkbox-container"
    ),

    html.Details([
        html.Summary("Planet type (Compare Planets only)"),
        dcc.Checklist(
            id='pltype-checklist',
            options=[
                {'label': 'Terrestrial', 'value': 'terrestrial'},
                {'label': 'Super-Earth', 'value': 'super_earth'},
                {'label': 'Neptune-like', 'value': 'neptune_like'},
                {'label': 'Gas Giant', 'value': 'gas_giant'},
                {'label': 'Unknown', 'value': 'unknown'},
                {'label': 'TBA', 'value': 'tba'},
            ],
            value=['terrestrial', 'super_earth', 'neptune_like', 'gas_giant', 'unknown', 'tba'],  # Default values to filter by
            className="checkbox-container"
        ),
    ]),

    html.Details([
        html.Summary("Discovery method (Compare Planets only)"),
        dcc.Checklist(
            id='discmethod-checklist',
            options=[
                {'label': 'Transit', 'value': 'Transit'},
                {'label': 'Transit Timing Variations', 'value': 'Transit Timing Variations'},
                {'label': 'Eclipse Timing Variations', 'value': 'Eclipse Timing Variations'},
                {'label': 'Orbital Brightness Modulation', 'value': 'Orbital Brightness Modulation'},
                {'label': 'Radial Velocity', 'value': 'Radial Velocity'},
                {'label': 'Astrometry', 'value': 'Astrometry'},
                {'label': 'Imaging', 'value': 'Imaging'},
                {'label': 'Disc Kinematics', 'value': 'Disc Kinematics'},
                {'label': 'Microlensing', 'value': 'Microlensing'},
                {'label': 'Pulsar Timing', 'value': 'Pulsar Timing'},
                {'label': 'Pulsation Timing Variations', 'value': 'Pulsation Timing Variations'},
                {'label': 'Known Since Antiquity', 'value': 'Known Since Antiquity'},
                {'label': 'Unknown', 'value': 'null'},
            ],
            value=['Transit', 'Radial Velocity', 'Imaging', 'Eclipse Timing Variations', 'Microlensing', 'Pulsar Timing', 'Pulsation Timing Variations', 'Orbital Brightness Modulation', 'Transit Timing Variations', 'Astrometry', 'Disc Kinematics', 'Known Since Antiquity', 'null'],  # automatically selects all
            className="checkbox-container"
        ),
    ]),

    html.Details([
        html.Summary("Discovery locale (Compare Planets only)"),
        dcc.Checklist(
            id='disclocale-checklist',
            options=[
                {'label': 'Space', 'value': 'Space'},
                {'label': 'Ground', 'value': 'Ground'},
                {'label': 'Multiple Locales', 'value': 'Multiple Locales'},
                {'label': 'Unknown', 'value': 'null'},
            ],
            value=['Space', 'Ground', 'Multiple Locales', 'null'],  # Default values to filter by
            className="checkbox-container"
        ),
    ]),

    html.Details([
        html.Summary("Harvard spectral classes"),
        dcc.Checklist(
            id='teff-checklist',
            options=[
                {'label': 'O type star (>33000 K)', 'value': 'O'},
                {'label': 'B type star (10000-33000 K)', 'value': 'B'},
                {'label': 'A type star (7300-10000 K)', 'value': 'A'},
                {'label': 'F type star (6000-7300 K)', 'value': 'F'},
                {'label': 'G type star (5300-6000 K)', 'value': 'G'},
                {'label': 'K type star (3900-5300 K, recommended)', 'value': 'K'},
                {'label': 'M type star (2300-3900 K)', 'value': 'M'},
                {'label': 'L type star (1300-2500 K)', 'value': 'L'},
                {'label': 'T type star (700-1300 K)', 'value': 'T'},
                {'label': 'Y type star (<700 K)', 'value': 'Y'},
                {'label': 'Wolf-Rayet star (>30000 K)', 'value': 'W'},
                {'label': 'White Dwarf (~5000-100000+ K)', 'value': 'D'},
                {'label': 'Unknown', 'value': 'null'},
            ],
            value=['O', 'B', 'A', 'F', 'G', 'K', 'M', 'null'],  # Default values to filter by
            className="checkbox-container"
        ),
    ]),

    html.Details([
        html.Summary("Yerkes spectral classes"),
        dcc.Checklist(
            id='lum-checklist',
            options=[
                # {'label': 'Hypergiant (0)', 'value': '0'},
                # {'label': 'Supergiant (I)', 'value': 'I'},
                {'label': 'Bright Giant (II)', 'value': 'II'},
                {'label': 'Giant (III)', 'value': 'III'},
                {'label': 'Subgiant (IV)', 'value': 'IV'},
                {'label': 'Main-sequence/Dwarf (V, recommended)', 'value': 'V'},
                {'label': 'Subdwarf (VI)', 'value': 'VI'},
                # {'label': 'White Dwarf (VII)', 'value': 'VII'},
                {'label': 'Unknown', 'value': 'null'},
            ],
            # NOTE: '0', 'I' and 'VII' have no checkbox (options commented out above) but
            # stay in the default value, so rows of those classes pass the filter by default.
            value=['0', 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'null'],  # Default values to filter by
            className="checkbox-container"
        ),
    ]),

    html.Details([
        html.Summary("Stellar metallicity ratio"),
        dcc.Checklist(
            id='met-checklist',
            options=[
                {'label': 'Iron abundance (recommended)', 'value': '[Fe/H]'},
                {'label': 'General metal content', 'value': '[M/H]'},
                {'label': 'Unknown', 'value': 'null'},
            ],
            value=['[Fe/H]', '[M/H]', 'null'],  # Default values to filter by
            className="checkbox-container"
        ),
    ]),

    html.H2("Select Features:"),
    # One collapsible checklist per feature group; the pattern-matching id lets the
    # callbacks read/write all groups at once via {'type': 'feature-checklist', 'index': ALL}.
    *[
        html.Details([
            html.Summary(group),
            dcc.Checklist(
                id={'type': 'feature-checklist', 'index': i},
                options=options,
                value=initial_values[i],
                className="checkbox-container"
            )
        ]) for i, (group, options) in enumerate(planet_features.items())
    ],

    html.H2("Group by:"),
    dcc.RadioItems(
        id="colorcode-radioitems",
        options=[
            {'label': 'Planet Type (Compare Planets only)', 'value': 'pl_type'},
            {'label': 'Harvard Spec. Class', 'value': 'st_teffclass'},
            {'label': 'Yerkes Spec. Class', 'value': 'st_lumclass'},
            {'label': 'Stellar Metallicity', 'value': 'st_metratio'},
            {'label': 'Discovery Method (Compare Planets only)', 'value': 'discoverymethod'},
            {'label': 'Discovery Locale (Compare Planets only)', 'value': 'disc_locale'},
            # {'label': 'Host Star', 'value': 'hostname'}, # too much computational power.
        ],
        value="pl_type",
        className="checkbox-container"
    ),

    html.H2("How to handle NaN values:"),
    dcc.RadioItems(
        id="nanhandle-radioitems",
        options=[
            # {'label': 'Set to 0', 'value': 'zero'},
            {'label': 'Set to mean', 'value': 'mean'},
            {'label': 'Set to median', 'value': 'median'},
            {'label': 'Remove rows', 'value': 'remove'},
        ],
        value="median",
        className="checkbox-container"
    ),

    html.H2("Extra options:"),
    dcc.Checklist(
        id='extra-checklist',
        options=[
            {'label': html.Span(['Simplify planet types and discovery methods (Compare Planets only)'], style={'fontWeight': 'bold'}), 'value': 'simple'},
            {'label': html.Span(['Estimate missing values'], style={'fontWeight': 'bold'}), 'value': 'estimate'},
            {'label': html.Span(['Compare stars'], style={'fontWeight': 'bold'}), 'value': 'suns'},
        ],
        value=[],  # Start unchecked
        className="checkbox-container"
    ),

    html.H2("Scale Type:"),
    dcc.RadioItems(
        id="scale-radioitems",
        options=[
            {'label': 'Linear', 'value': 'linear'},
            {'label': 'Log (zero negatives)', 'value': 'logsafe'},
            {'label': 'Log (remove negatives)', 'value': 'logstrict'},
        ],
        value='linear',
        className="checkbox-container"
    ),

    html.H2("Neighbors:"),
    html.P("Recommended: Generate a 2-10 and 30-100 neighbors graph for each set of values."),
    dcc.Slider(
        id="neighbors-slider",
        min=2,
        max=100,
        step=1,
        value=15,
        # Mark both ends of the slider (2 and 100) plus every multiple of 10 in between.
        marks={2: '2', **{i: str(i) for i in range(10, 101, 10)}},
        tooltip={"placement": "bottom", "always_visible": True}
    ),

    html.Button('Update Graph', id='update-button', n_clicks=1),

    dcc.Graph(id="umap", figure={}),

    html.P('NaN info:', id='naninfo-box'),

    html.P("Data sourced from:"),
    html.Div([
        html.A("https://www.doi.org/10.26133/NEA12", href="https://www.doi.org/10.26133/NEA12", target="_blank"),
        html.Br(),
        html.A("https://www.doi.org/10.26133/NEA36", href="https://www.doi.org/10.26133/NEA36", target="_blank"),
        html.Br(),
        html.A("https://science.nasa.gov/exoplanets/exoplanet-catalog/", href="https://science.nasa.gov/exoplanets/exoplanet-catalog/", target="_blank"),
        html.Br(),
        html.A("https://science.nasa.gov/exoplanets/target-star-catalog/", href="https://science.nasa.gov/exoplanets/target-star-catalog/", target="_blank"),
    ]),

    html.Button("Reset Settings", id="reset-button", n_clicks=0),

    # settings-store persists the last used settings in the browser's local storage;
    # init-flag records that the stored settings have already been restored once.
    dcc.Store(id='settings-store', storage_type='local'),
    dcc.Store(id='rehydration-complete', data=False),
    dcc.Store(id='init-flag', data=False)
# Dash style dictionaries use camelCase keys (fontFamily, textAlign), not hyphenated CSS names.
], style={'color': 'black', 'fontFamily': 'Arial', 'backgroundColor': 'white', 'padding': '20px', 'textAlign': 'center'})


def update_graph(update, filters, extra, scale, feature_vals, colorcode, pltype, discmethod, disclocale, teff, lum, met, nanhandle, neighbors):
    """Filter the catalogue, reduce it with PCA + UMAP and build the scatter plot.

    Args:
        update: Click count of the update button; not used by the computation.
        filters: Values ticked in the data-filter checklist ('atmoData',
            'default', 'noControv', 'densRange', 'target', 'solar').
        extra: Values ticked in the extra-options checklist ('simple',
            'estimate', 'suns').
        scale: 'linear', 'logsafe' or 'logstrict'.
        feature_vals: One list of selected column names per feature group.
            The first group is ignored when 'suns' (compare stars) is active.
        colorcode: Column used to group and colour the points.
        pltype, discmethod, disclocale, teff, lum, met: Accepted values for
            the matching categorical column; the value 'null' additionally
            keeps rows where that column is missing.
        nanhandle: 'mean' or 'median' (fill NaNs per colour group); any other
            value drops rows with a NaN in a selected feature.
        neighbors: ``n_neighbors`` passed to UMAP.

    Returns:
        A ``(figure, info)`` pair. On success ``info`` is an ``html.Pre`` with
        the per-feature NaN summary. When the inputs cannot be plotted,
        ``figure`` is an empty ``go.Figure`` and ``info`` is an error string.
    """
    def error_result(message):
        # Same arity as the success path so callers can always unpack two values.
        return go.Figure(), message

    def solar_mask(frame):
        # Explicit boolean mask: missing is_solar values count as "not solar".
        return frame["is_solar"].map(lambda flag: bool(flag) if pd.notnull(flag) else False).astype(bool)

    def keep_selected(frame, column, selected):
        # Keep rows whose value is ticked; 'null' also keeps rows with no value.
        mask = frame[column].isin(selected)
        if 'null' in selected:
            mask = mask | frame[column].isna()
        return frame[mask]

    compare_stars = 'suns' in extra

    # Load dataset
    df = (pl_es if 'estimate' in extra else pl_s).copy()

    # Combines all feature checkboxes into one list (planet features are skipped for stars).
    selected_groups = feature_vals[1:] if compare_stars else feature_vals
    features = sorted({val for group in selected_groups for val in group})

    # Optionally append Solar System rows (only the first row when comparing stars).
    if 'solar' in filters:
        solar_rows = solar_planets.iloc[[0]].copy() if compare_stars else solar_planets.copy()
        df = pd.concat([df, solar_rows], ignore_index=True)  # type: ignore
    if "is_solar" not in df.columns:
        # The filters and hover info below rely on this column being present.
        df["is_solar"] = False

    # Apply filters
    if not compare_stars:
        if 'atmoData' in filters:
            df = df[df['pl_name'].isin(at_s['pl_name']) | solar_mask(df)]
        if 'densRange' in filters:
            df = df[(df['pl_dens'] > 1) & (df['pl_dens'] < 5.6)]  # Only gets planets with a density between water and metallic iron
        df = df[df['pl_type'].isin(pltype)]
        if 'simple' in extra:
            df = df.copy()  # assign on an independent frame, not on a filtered slice
            df["pl_type"] = df["pl_type"].replace({"super_earth": "terrestrial", "neptune_like": "gas_giant"})
            df["discoverymethod"] = df["discoverymethod"].replace({"Transit Timing Variations": "Transit", "Eclipse Timing Variations": "Transit", "Orbital Brightness Modulation": "Transit", "Astrometry": "Radial Velocity", "Pulsation Timing Variations": "Pulsar Timing", "Disc Kinematics": "Imaging"})
        df = keep_selected(df, 'discoverymethod', discmethod)
        df = keep_selected(df, 'disc_locale', disclocale)
    df = keep_selected(df, 'st_teffclass', teff)
    df = keep_selected(df, 'st_lumclass', lum)
    if 'default' in filters:
        df = df[df['default_flag'] == True]
    if 'noControv' in filters:
        df = df[df['pl_controv_flag'] == False]
    if 'target' in filters:
        df = df[df['hostname'].isin(target_stars) | solar_mask(df)]  # type: ignore
    df = keep_selected(df, 'st_metratio', met)

    # Report unknown feature columns before selecting them (selection would raise KeyError).
    missing_features = [f for f in features if f not in df.columns]
    if missing_features:
        return error_result("ERROR: Missing columns " + ", ".join(missing_features))

    # Select features to be evaluated (keeps certain others for hover info)
    star_info_cols = ["hostname", "is_solar", "st_spectype", "st_teffclass", "st_lumclass", "st_metratio"]
    if compare_stars:
        df = df[features + star_info_cols]

        def get_mode(series):
            mode_vals = series.mode()
            if not mode_vals.empty:
                return mode_vals.iloc[0]  # Return the first mode (if multiple)
            return np.nan

        planet_cols = feature_vals[0]  # First group of features
        df = df.drop(columns=[col for col in planet_cols if col in df.columns])
        # One row per host star: every column is reduced to its most frequent value.
        df = df.groupby("hostname", as_index=False).agg(get_mode)
    else:
        planet_info_cols = ["pl_name", "pl_type", "discoverymethod", "disc_refname", "disc_locale", "disc_facility", "disc_telescope", "disc_instrument"]
        df = df[planet_info_cols + star_info_cols + features].copy()

    # Returns an empty graph and error message for certain anomalies
    num_components = 4 if colorcode == "PC" else 3
    if df.empty:
        return error_result('ERROR: filtered dataset is empty')
    if len(features) < num_components:
        return error_result('ERROR: not enough features selected')
    if compare_stars and colorcode in ("pl_type", "disc_locale", "discoverymethod"):
        return error_result("ERROR: Cannot filter by planet type when using star data.")

    # converting date columns to seconds since epoch
    epoch = pd.Timestamp("1970-01-01")
    for date_col in ("rowupdate", "releasedate"):
        if date_col in df.columns:
            parsed = pd.to_datetime(df[date_col], errors="coerce")
            df[date_col] = (parsed - epoch).dt.total_seconds()

    # Gets information on NaNs in each column
    naninfo = [f"Feature info:\nTotal entries: {len(df)}\nRows without NaNs: {(~df[features].isna().any(axis=1)).sum()}"]
    nan_counts = df[features].isna().sum()
    most_nans_index = nan_counts.idxmax()
    threshold = len(df) / 2
    for col_name, count in nan_counts.items():
        flag = " (WARNING: >50% missing)" if count > threshold else ""
        suffix = " (highest)" if col_name == most_nans_index else ""
        naninfo.append(f"{col_name}{flag}: {count} NaNs{suffix}, mean {df[col_name].mean():.2f}, median {df[col_name].median():.2f}, variance {df[col_name].var():.2f}")
    nanstring = "\n".join(naninfo)

    def fill_missing(frame):
        # Replace NaNs with the column mean/median; columns that are entirely NaN fall back to 0.
        filled = frame.copy()
        fill_vals = filled[features].mean() if nanhandle == 'mean' else filled[features].median()
        filled[features] = filled[features].fillna(fill_vals.fillna(0))
        return filled

    # Replace NaNs with the value given by nanhandle for PCA
    if nanhandle in ['mean', 'median']:
        if colorcode == "PC":
            df_cleaned = fill_missing(df)
        else:
            # dropna=False keeps rows whose grouping value is missing (the "Unknown"
            # checkboxes let them through on purpose); they are filled as their own group.
            cleaned_groups = [fill_missing(df_group) for _, df_group in df.groupby(colorcode, dropna=False)]
            df_cleaned = pd.concat(cleaned_groups, ignore_index=True)
    else:
        df_cleaned = df.dropna(subset=features)

    # removes values that are not valid for log scaling if logstrict used
    if scale == 'logstrict':
        df_cleaned = df_cleaned[np.all(df_cleaned[features] > 0, axis=1)]

    # Final check (after every row-removing step)
    if df_cleaned.empty:
        return error_result('ERROR: filtered dataset is empty')
    if len(df_cleaned) < 3:
        # NOTE(review): PCA covariance and the UMAP neighbour graph need more than a couple of rows.
        return error_result('ERROR: not enough data points')

    X = df_cleaned[features].values

    if scale in ('logsafe', 'logstrict'):
        # Non-positive entries become 0; np.log is still evaluated on them first,
        # so silence the warnings it emits for values that are discarded anyway.
        with np.errstate(divide="ignore", invalid="ignore"):
            X = np.where(X > 0, np.log(X), 0)

    # PCA-required data normalization
    X_mean = np.mean(X, axis=0)
    X_std = np.std(X, axis=0)
    X_std[X_std == 0] = 1  # prevents divide-by-zero
    X_normalized = (X - X_mean) / X_std

    # Numpy handles covariance matrix, eigenvalues, and eigenvectors
    cov_matrix = np.cov(X_normalized, rowvar=False)
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # Sort eigenvectors by descending eigenvalues
    idx = np.argsort(eigenvalues)[::-1]
    eigenvectors = eigenvectors[:, idx]
    eigenvalues = eigenvalues[idx]

    # make PCA array
    loadings = eigenvectors[:, :2]  # Extract first 2 principal components to keep
    X_pca = np.dot(X_normalized, loadings)

    # UMAP (n_neighbors is kept below the number of samples)
    n_neighbors = min(neighbors, len(X_pca) - 1)
    embedding = umap.UMAP(n_neighbors=n_neighbors, min_dist=0.1).fit_transform(X_pca)

    # Create DataFrame for visualization by combining UMAP columns and original DataFrame
    df_umap = df_cleaned.join(pd.DataFrame(embedding, columns=["UMAP1", "UMAP2"], index=df_cleaned.index))
    if "pl_name" in df_umap.columns:
        # Planet mode only: the star comparison has no pl_name column.
        df_umap = df_umap.dropna(subset=["pl_name"])  # Remove rows where pl_name is missing
    df_umap = df_umap.copy()
    if df_umap.empty:
        return error_result('ERROR: filtered dataset is empty')

    # Creates list of additional features for hover info
    exclude_cols = ["UMAP1", "UMAP2", "pl_type", "st_teffclass", "st_lumclass"]
    hover_data_dict = {col: True for col in df_umap.columns if col not in exclude_cols}

    # color coding
    color_map = {
        "terrestrial": "blue", "super_earth": "red", "neptune_like": "green", "gas_giant": "purple", "tba": "black",
        "II": "yellow", "III": "orange", "IV": "green", "V": "blue", "VI": "black",
        "B": "darkblue", "A": "royalblue", "F": "seagreen", "G": "yellow", "K": "orange", "M": "red",
        "Transit": "royalblue", "Transit Timing Variations": "slateblue", "Eclipse Timing Variations": "mediumslateblue", "Orbital Brightness Modulation": "cornflowerblue", "Radial Velocity": "crimson", "Astrometry": "firebrick", "Imaging": "seagreen", "Disc Kinematics": "mediumseagreen", "Microlensing": "darkgoldenrod", "Pulsar Timing": "mediumvioletred", "Pulsation Timing Variations": "orchid", "Known Since Antiquity": "black",
        "Space": "deepskyblue", "Ground": "sienna", "Multiple Locales": "mediumorchid",
        "[Fe/H]": "steelblue", "[M/H]": "darkcyan",
        "unknown": "gray",
    }

    # relabels discrete columns for color coding, e.g. "terrestrial" -> "terrestrial (42)"
    if colorcode in df_umap.columns and df_umap[colorcode].dtype == "object":
        group_values = df_umap[colorcode]
        fallback_colors = px.colors.qualitative.Plotly
        fallback_used = 0  # index into the fallback palette; wraps around when exhausted

        labeled_with_counts = {}
        color_map_labeled = {}
        # Counts come from the rows that are actually plotted.
        for key, count in group_values.value_counts().items():
            label = f"{key} ({count})"
            labeled_with_counts[key] = label
            if key in color_map:
                color_map_labeled[label] = color_map[key]
            else:
                color_map_labeled[label] = fallback_colors[fallback_used % len(fallback_colors)]
                fallback_used += 1

        labeled = group_values.map(labeled_with_counts)
        missing_count = int(group_values.isna().sum())
        if missing_count:
            # Rows without a grouping value get their own legend entry.
            missing_label = f"nan ({missing_count})"
            labeled = labeled.where(group_values.notna(), missing_label)
            color_map_labeled[missing_label] = fallback_colors[fallback_used % len(fallback_colors)]

        df_umap["colorcode_labeled"] = labeled
        colorcode_use = "colorcode_labeled"
    else:
        # For continuous or numeric colorcodes, no relabeling
        colorcode_use = colorcode
        color_map_labeled = color_map

    # Increases size of solar system planets
    df_umap["is_solar"] = df_umap["is_solar"].map(lambda x: bool(x) if pd.notnull(x) else False)
    df_umap["marker_size"] = df_umap["is_solar"].fillna(False).apply(lambda x: 3 if x else 1)

    # Scatterplot
    umap_graph = px.scatter(
        df_umap, x="UMAP1", y="UMAP2", color=colorcode_use, color_discrete_map=color_map_labeled if df_umap[colorcode_use].dtype == "object" else None, hover_data=hover_data_dict, size="marker_size"
    )
    umap_graph.update_traces(
        hoverlabel=dict(
            font=dict(color="black"),
            bgcolor="white",
        )
    )
    umap_graph.update_layout(
        legend=dict(
            font=dict(size=20),  # Adjust size here
            itemsizing='constant'  # Keeps marker size consistent
        )
    )
    return umap_graph, html.Pre(nanstring)

# updates graphs based on user input or stored settings
@app.callback(
    [Output("umap", "figure"),
     Output("naninfo-box", "children"),
     Output('settings-store', 'data')],
    [Input('update-button', 'n_clicks'),
     Input('settings-store', 'data')],
    [State('filter-checklist', 'value'),
     State('extra-checklist', 'value'),
     State('scale-radioitems', 'value'),
     State({'type': 'feature-checklist', 'index': ALL}, 'value'),
     State('colorcode-radioitems', 'value'),
     State('pltype-checklist', 'value'),
     State('discmethod-checklist', 'value'),
     State('disclocale-checklist', 'value'),
     State('teff-checklist', 'value'),
     State('lum-checklist', 'value'),
     State('met-checklist', 'value'),
     State('nanhandle-radioitems', 'value'),
     State('neighbors-slider', 'value')],
    prevent_initial_call=False
)
def update_figures(n_clicks, settings, *states):
    """Redraw the UMAP graph from the current controls or from stored settings.

    Args:
        n_clicks: Click count of the update button.
        settings: Contents of ``settings-store`` (a dict of saved settings, or
            an empty/None value when nothing is stored).
        *states: Current values of the controls, in the order of the ``State``
            list in the decorator.

    Returns:
        ``(figure, nan_info, settings_store_data)``. The store is overwritten
        with the current control values on a button click and left untouched
        (``dash.no_update``) when redrawing from stored settings.

    Raises:
        PreventUpdate: If nothing triggered the callback, the stored settings
            are empty or incomplete, or the trigger is not recognised.
    """
    triggered = dash.callback_context.triggered
    if not triggered:
        raise PreventUpdate
    # Look at every trigger, not just the first, in case both inputs fire together.
    triggered_props = {item['prop_id'] for item in triggered}

    if 'update-button.n_clicks' in triggered_props:
        # Use current inputs
        result = update_graph(n_clicks, *states)
        settings_dict = dict(zip([
            'filter', 'extra', 'scale', 'features', 'colorcode', 'pltype',
            'discmethod', 'disclocale', 'teff', 'lum', 'met', 'nanhandle', 'neighbors'
        ], states))
        # First element is the figure and last is the info/error text, whatever
        # the length of the tuple update_graph returned.
        return result[0], result[-1], settings_dict

    if 'settings-store.data' in triggered_props:
        # Use stored settings, not states
        if not settings:
            raise PreventUpdate
        try:
            stored_args = unpack_settings(settings)
        except (KeyError, TypeError):
            # Stored settings are missing a key or are not a mapping: ignore them.
            raise PreventUpdate from None
        result = update_graph(0, *stored_args)
        return result[0], result[-1], dash.no_update

    # Unknown trigger: leave every output as it is instead of returning None.
    raise PreventUpdate
    
# Restore settings from local storage when the app starts
@app.callback(
    [Output('filter-checklist', 'value'),
     Output('extra-checklist', 'value'),
     Output('scale-radioitems', 'value'),
     Output({'type': 'feature-checklist', 'index': ALL}, 'value'),
     Output('colorcode-radioitems', 'value'),
     Output('pltype-checklist', 'value'),
     Output('discmethod-checklist', 'value'),
     Output('disclocale-checklist', 'value'),
     Output('teff-checklist', 'value'),
     Output('lum-checklist', 'value'),
     Output('met-checklist', 'value'),
     Output('nanhandle-radioitems', 'value'),
     Output('neighbors-slider', 'value'),
     Output('init-flag', 'data')],
    Input('settings-store', 'data'),
    State('init-flag', 'data'),
    prevent_initial_call="initial_duplicate"
)
def restore_settings(settings, already_initialized):
    """Copy the stored settings back into the controls, once.

    Does nothing (PreventUpdate) when the store is empty or when the init flag
    is already set. Otherwise returns the stored value for each control in the
    order of the Output list, followed by True to set the init flag.
    """
    if already_initialized or not settings:
        raise dash.exceptions.PreventUpdate

    # Stored keys, in the same order as the control outputs above.
    ordered_keys = (
        'filter', 'extra', 'scale',
        'features', 'colorcode', 'pltype',
        'discmethod', 'disclocale', 'teff',
        'lum', 'met', 'nanhandle', 'neighbors',
    )
    restored = [settings[key] for key in ordered_keys]
    restored.append(True)  # init-flag
    return tuple(restored)

# Unpack settings from the stored dictionary
def unpack_settings(settings):
    """Return the stored settings as a tuple ordered like update_graph's arguments
    (everything after its first parameter)."""
    argument_order = (
        "filter", "extra", "scale", "features", "colorcode", "pltype",
        "discmethod", "disclocale", "teff", "lum", "met", "nanhandle",
        "neighbors",
    )
    return tuple(settings[name] for name in argument_order)

# Clear the store when the reset button is clicked
@app.callback(
    Output('settings-store', 'data', allow_duplicate=True),
    Input('reset-button', 'n_clicks'),
    prevent_initial_call=True
)
def clear_store(n):
    """Reset-button handler: overwrite the stored settings with None."""
    emptied_store = None  # or {}
    return emptied_store

if __name__ == '__main__':
    # Announce the address before starting: when run as a script app.run() blocks
    # until the server stops, so a message printed afterwards would only show on shutdown.
    print("running on localhost:8052")
    app.run(debug=True, port=8052)