In [3]:
# PCA (Credit to Microsoft Copilot)
# Note to self: figure out how the heck this works.
import pandas as pd
import numpy as np
from dash import Dash, dcc, html, Input, Output, ctx, MATCH, ALL
import plotly.graph_objects as go
import plotly.express as px
%run -i assets/lists.ipynb

# Read the three source tables with identical parser settings ('#'-prefixed
# lines are treated as comments, low_memory is switched off):
#   pl_s          - Planetary Systems table
#   at_s          - Atmospheric Spectroscopy table (used by one filter)
#   solar_planets - Solar System values
pl_s, at_s, solar_planets = (
    pd.read_csv(csv_path, comment="#", low_memory=False)
    for csv_path in (
        'assets/Planetary_Systems.csv',
        'assets/Atmospheric_Spectroscopy.csv',
        'assets/Solar_Values.csv',
    )
)

# Dash application object; the stylesheet lives in the assets folder.
app = Dash(
    __name__,
    external_stylesheets=["assets/style.css"],
)

# Database columns offered as PCA features, grouped by the heading they are
# shown under. Each option's 'value' is the column name in the dataframe and
# each 'label' repeats that column name in parentheses so the user can map the
# checkbox back to the catalogue column.
planet_features = {
    'Planet': [
        {'label': "Orbital Period (pl_orbper)", 'value': "pl_orbper"}, # in days (Not in values dataset)
        {'label': "Orbit Semi-Major Axis (pl_orbsmax)", 'value': "pl_orbsmax"}, # in Astronomical Units (Not in values dataset)
        {'label': "Epoch of Periastron (pl_orbtper)", 'value': "pl_orbtper"}, # in days (Not in values dataset)
        {'label': "Argument of Periastron (pl_orblper)", 'value': "pl_orblper"}, # in degrees (Not in values dataset)
        {'label': "Proj. Obliquity (pl_projobliq)", 'value': "pl_projobliq"}, # in degrees (Not in values dataset)
        {'label': "True Obliquity (pl_trueobliq)", 'value': "pl_trueobliq"}, # in degrees (Not in values dataset)
        {'label': "Radius (pl_rade)", 'value': "pl_rade"}, # in Earth radii (Not in values dataset)
        {'label': "Mass (pl_bmasse)", 'value': "pl_bmasse"}, # estimation, in Earth masses (Not in values dataset)
        {'label': "Density (pl_dens)", 'value': "pl_dens"}, # in g/cm^3 (Not in values dataset)
        {'label': "Orbital Eccentricity (pl_orbeccen)", 'value': "pl_orbeccen"}, # (Not in values dataset)
        {'label': "Insol. Flux (pl_insol)", 'value': "pl_insol"}, # in Earth flux (Not in values dataset)
        {'label': "Equil. Temp. (pl_eqt)", 'value': "pl_eqt"},  # in Kelvin (Not in values dataset)
        {'label': "Transit Duration (pl_trandur)", 'value': "pl_trandur"}, # in hours (Not in values dataset)
        {'label': "Transit Midpoint (pl_tranmid)", 'value': "pl_tranmid"}, # in days (Not in values dataset)
        {'label': "Transit Depth (pl_trandep)", 'value': "pl_trandep"}, # percentage (Not in values dataset)
        {'label': "Impact Parameter (pl_imppar)", 'value': "pl_imppar"}, # (Not in values dataset)
        {'label': "Occultation Depth (pl_occdep)", 'value': "pl_occdep"}, # percentage (Not in values dataset)
        {'label': "Rad. Velocity Amplitude (pl_rvamp)", 'value': "pl_rvamp"}, # in m/s (Not in values dataset)
        {'label': "Discovery Year (disc_year)", 'value': "disc_year"},
        {'label': "Last Update (rowupdate)", 'value': "rowupdate"}, # last update of parameters
        # {'label': "Public Release Date", 'value': "releasedate"}, # date publicly released
    ],
    'Stellar': [
        {'label': "Effec. Temp. (st_teff)", 'value': "st_teff"}, # in Kelvin
        {'label': "Radius (st_rad)", 'value': "st_rad"}, # in Solar radii
        {'label': "Mass (st_mass)", 'value': "st_mass"},
        {'label': "Density (st_dens)", 'value': "st_dens"},
        {'label': "Surface Grav. (st_logg)", 'value': "st_logg"},
        {'label': "Age (st_age)", 'value': "st_age"}, # in gigayears
        {'label': "Rot. Period (st_rotp)", 'value': "st_rotp"},
        {'label': "Rot. Velocity (st_vsin)", 'value': "st_vsin"},
        {'label': "Rad. Velocity (st_radv)", 'value': "st_radv"},
        {'label': "Metallicity (st_met)", 'value': "st_met"},
        {'label': "Luminosity (st_lum)", 'value': "st_lum"},
    ],
    'System': [
        {'label': "Parallax (sy_plx)", 'value': "sy_plx"},
        {'label': "Dist from Earth (sy_dist)", 'value': "sy_dist"}, # in parsecs
        {'label': "No. Stars (sy_snum)", 'value': "sy_snum"},
        {'label': "No. Planets (sy_pnum)", 'value': "sy_pnum"},
        {'label': "No. Moons (sy_mnum)", 'value': "sy_mnum"},
        {'label': "u (Sloan) Magnitude (sy_umag, ~354 nm)", 'value': "sy_umag"},
        {'label': "B (Johnson) Magnitude (sy_bmag, ~442 nm)", 'value': "sy_bmag"},
        {'label': "g (Sloan) Magnitude (sy_gmag, ~475 nm)", 'value': "sy_gmag"},
        {'label': "V (Johnson) Magnitude (sy_vmag, ~540 nm)", 'value': "sy_vmag"},
        {'label': "Kepler Magnitude (sy_kepmag, ~600 nm)", 'value': "sy_kepmag"},
        {'label': "r (Sloan) Magnitude (sy_rmag, ~622 nm)", 'value': "sy_rmag"},
        {'label': "Gaia Magnitude (sy_gaiamag, ~673 nm)", 'value': "sy_gaiamag"},
        {'label': "i (Sloan) Magnitude (sy_imag, ~763 nm)", 'value': "sy_imag"},
        {'label': "I (Cousins) Magnitude (sy_icmag, ~786.5 nm)", 'value': "sy_icmag"},
        {'label': "TESS Magnitude (sy_tmag, ~800 nm)", 'value': "sy_tmag"},
        {'label': "z (Sloan) Magnitude (sy_zmag, ~905 nm)", 'value': "sy_zmag"},
        {'label': "J (2MASS) Magnitude (sy_jmag, ~1.25 μm)", 'value': "sy_jmag"},
        {'label': "H (2MASS) Magnitude (sy_hmag, ~1.65 μm)", 'value': "sy_hmag"},
        {'label': "Ks (2MASS) Magnitude (sy_kmag, ~2.15 μm)", 'value': "sy_kmag"},
        {'label': "W1 (WISE) Magnitude (sy_w1mag, ~3.4 μm)", 'value': "sy_w1mag"},
        {'label': "W2 (WISE) Magnitude (sy_w2mag, ~4.6 μm)", 'value': "sy_w2mag"},
        {'label': "W3 (WISE) Magnitude (sy_w3mag, ~12 μm)", 'value': "sy_w3mag"},
        {'label': "W4 (WISE) Magnitude (sy_w4mag, ~22 μm)", 'value': "sy_w4mag"},
    ]
}
# Features ticked on first load, one list per group in the same order as the
# keys of planet_features (Planet, Stellar, System).
initial_values = [['pl_orbper'], [], ["sy_plx", "sy_dist", "sy_bmag", "sy_vmag", "sy_jmag", "sy_hmag", "sy_kmag", "sy_gaiamag", "sy_tmag"]]

def _checklist(component_id, pairs, selected=None):
    """Build a styled dcc.Checklist from (label, value) pairs.

    Every option starts ticked unless an explicit `selected` list is given.
    """
    if selected is None:
        selected = [value for _, value in pairs]
    return dcc.Checklist(
        id=component_id,
        options=[{'label': label, 'value': value} for label, value in pairs],
        value=selected,
        className="checkbox-container",
    )


def _radio(component_id, pairs, selected):
    """Build a styled dcc.RadioItems from (label, value) pairs."""
    return dcc.RadioItems(
        id=component_id,
        options=[{'label': label, 'value': value} for label, value in pairs],
        value=selected,
        className="checkbox-container",
    )


def _toggle(component_id, caption):
    """Build a centred single-option checklist with a bold caption, initially unticked."""
    return html.Div([
        dcc.Checklist(
            id=component_id,
            options=[
                {'label': html.Span([caption], style={'fontWeight': 'bold'}), 'value': 'enable'},
            ],
            value=[],  # Start unchecked
            className="checkbox-container",
        ),
    ], className="checkbox-container", style={'display': 'flex', 'alignItems': 'center', 'justifyContent': 'center', 'gap': '10px'})


def _source_links(urls):
    """Build a Div of links (each opening in a new tab) separated by line breaks."""
    children = []
    for url in urls:
        if children:
            children.append(html.Br())
        children.append(html.A(url, href=url, target="_blank"))
    return html.Div(children)


# Page layout: filter controls, feature selection, colouring / NaN options,
# the four tabbed graphs, the NaN summary box and the data-source links.
app.layout = html.Div([
    html.H1("PCA Analysis"),

    html.H2("Data filters:"),
    _checklist(
        'filter-checklist',
        [
            ('Has atmospheric data', 'atmoData'),
            ('Default parameter set', 'default'),
            ('No controversial flag', 'noControv'),
            ('Water to Metal Density', 'densRange'),
            ('In Target Star Catalog', 'target'),
            ('Include Solar System', 'solar'),
        ],
        selected=['default', 'noControv', 'solar'],  # Default values to filter by
    ),

    html.H3("Planet type:"),
    _checklist(
        'pltype-checklist',
        [
            ('Terrestrial', 'terrestrial'),
            ('Super-Earth', 'super_earth'),
            ('Neptune-like', 'neptune_like'),
            ('Gas Giant', 'gas_giant'),
            ('Unknown', 'unknown'),
            ('TBA', 'tba'),
        ],
    ),
    _toggle('simplification-checkbox', 'Simplify planet types?'),

    html.H3("Stellar effective temp:"),
    _checklist(
        'teff-checklist',
        [
            ('O type (>33000 K)', 'O'),
            ('B type (10000-33000 K)', 'B'),
            ('A type (7300-10000 K)', 'A'),
            ('F type (6000-7300 K)', 'F'),
            ('G type (5300-6000 K)', 'G'),
            ('K type (3900-5300 K, recommended)', 'K'),
            ('M type (2300-3900 K)', 'M'),
            ('Unknown', 'unknown'),
        ],
    ),

    html.H3("Stellar luminosity:"),
    _checklist(
        'lum-checklist',
        [
            ('Hypergiant', '0'),
            ('Supergiant', 'I'),
            ('Bright Giant', 'II'),
            ('Giant', 'III'),
            ('Subgiant', 'IV'),
            ('Main-sequence/Dwarf (recommended)', 'V'),
            ('Subdwarf', 'VI'),
            ('White Dwarf', 'VII'),
            ('Unknown', 'unknown'),
        ],
    ),

    html.H3("Stellar metallicity ratio:"),
    _checklist(
        'met-checklist',
        [
            ('Iron abundance (recommended)', '[Fe/H]'),
            ('General metal content', '[M/H]'),
            ('Unknown', 'null'),
        ],
    ),

    html.H2("Select Features:"),
    # One checklist per feature group; the pattern-matching id lets the
    # callback collect all of them with ALL.
    *[
        html.Div([
            html.H3(group),
            dcc.Checklist(
                id={'type': 'feature-checklist', 'index': i},
                options=options,
                value=initial_values[i],
                className="checkbox-container",
            )
        ]) for i, (group, options) in enumerate(planet_features.items())
    ],

    html.H2("Color coding:"),
    _radio(
        "colorcode-radioitems",
        [
            ('Planet Type', 'pl_type'),
            ('St. Effective Temp.', 'st_teffclass'),
            ('Stellar Luminosity', 'st_lumclass'),
            ('Next Principal Component', 'PC'),
            # ('Host Star', 'hostname'), # too much computational power.
        ],
        "pl_type",
    ),

    html.H2("How to handle NaN values:"),
    _radio(
        "nanhandle-radioitems",
        [
            # ('Set to 0', 'zero'),
            ('Set to mean', 'mean'),
            ('Set to median', 'median'),
            ('Remove rows', 'remove'),
        ],
        "median",
    ),
    _toggle('estimates-checkbox', 'Estimate certain values?'),

    dcc.Tabs(id="tabs", value="tab-1", children=[
        dcc.Tab(label="3D PCA Scatter Plot", value="tab-1", className="tab"),
        dcc.Tab(label="2D PCA Scatter Plot", value="tab-2", className="tab"),
        dcc.Tab(label="Explained Variance", value="tab-3", className="tab"),
        dcc.Tab(label="Feature Loadings Heatmap", value="tab-4", className="tab"),
    ]),
    # All four graphs are always present; only their 'display' style is toggled.
    html.Div(id="tabs-content", children=[
        dcc.Graph(id="graph-3d", figure={}, style={"display": "block"}),
        dcc.Graph(id="graph-2d", figure={}, style={"display": "none"}),
        dcc.Graph(id="variance-bar", figure={}, style={"display": "none"}),
        dcc.Graph(id="loadings-heatmap", figure={}, style={"display": "none"}),
    ]),

    html.P('NaN info:', id='naninfo-box'),

    html.P("Data sourced from:"),
    _source_links([
        "https://www.doi.org/10.26133/NEA12",
        "https://www.doi.org/10.26133/NEA36",
        "https://science.nasa.gov/exoplanets/exoplanet-catalog/",
        "https://science.nasa.gov/exoplanets/target-star-catalog/",
    ]),
# Dash style dictionaries use camelCased CSS property names.
], style={'color': 'black', 'fontFamily': 'Arial', 'backgroundColor': 'white', 'padding': '20px', 'textAlign': 'center'})

@app.callback(
    [Output('graph-3d', 'figure'),
     Output('graph-2d', 'figure'),
     Output('variance-bar', 'figure'),
     Output('loadings-heatmap', 'figure'),
     Output('naninfo-box', 'children')],
    [Input('filter-checklist', 'value'),
     Input('simplification-checkbox', 'value'),
     Input('estimates-checkbox', 'value'),
     Input({'type': 'feature-checklist', 'index': ALL}, 'value'),
     Input('colorcode-radioitems', 'value'),
     Input('pltype-checklist', 'value'),
     Input('teff-checklist', 'value'),
     Input('lum-checklist', 'value'),
     Input('met-checklist', 'value'),
     Input('nanhandle-radioitems', 'value'),]
)
def update_graph(filters, simple, estimate, feature_vals, colorcode, pltype, teff, lum, met, nanhandle):
    """Filter the catalogue, run a PCA on the chosen features and rebuild every output.

    Args:
        filters: ticked values of the data-filter checklist ('solar', 'atmoData',
            'default', 'noControv', 'densRange', 'target').
        simple: non-empty when planet types should be merged into two classes.
        estimate: non-empty when the *est columns should fill missing values.
        feature_vals: one list of selected column names per feature checklist.
        colorcode: column used for colouring, or "PC" to colour by the next
            principal component (PC3 in the 2D plot, PC4 in the 3D plot).
        pltype, teff, lum, met: allowed values for pl_type, st_teffclass,
            st_lumclass and st_metratio.
        nanhandle: 'mean' or 'median' to impute per planet type, anything else
            drops rows that have a NaN in a selected feature.

    Returns:
        (3D scatter, 2D scatter, explained-variance bar chart, loadings heatmap,
        NaN summary). On a validation failure the four figures are empty and
        the last element is an error string.
    """

    def error_response(message):
        """Return four blank figures together with `message` for the info box."""
        return go.Figure(), go.Figure(), go.Figure(), go.Figure(), message

    # Combines all feature checkboxes into one de-duplicated, sorted list.
    features = sorted(set(val for group in feature_vals for val in group))

    # Load dataset, optionally with the Solar System rows appended.
    if 'solar' in filters:
        df = pd.concat([pl_s, solar_planets], ignore_index=True) # type: ignore
    else:
        df = pl_s.copy()
    # NOTE(review): assumes is_solar is only guaranteed to exist in the solar
    # table; add it when absent so the filters and hover columns below work.
    if "is_solar" not in df.columns:
        df["is_solar"] = False

    # Apply filters. `.eq(True)` is used for the solar flag because
    # `a | b == True` parses as `(a | b) == True`, and NaN flags must count as False.
    if 'atmoData' in filters:
        df = df[df['pl_name'].isin(at_s['pl_name']) | df["is_solar"].eq(True)]
    if 'default' in filters:
        df = df[df['default_flag'] == True]
    if 'noControv' in filters:
        df = df[df['pl_controv_flag'] == False]
    if 'densRange' in filters:
        df = df[(df['pl_dens'] > 1) & (df['pl_dens'] < 5.6)] # Only gets planets with a density between water and metallic iron
    if 'target' in filters:
        df = df[df['hostname'].isin(target_stars) | df["is_solar"].eq(True)] # type: ignore
    df = df[df['pl_type'].isin(pltype)]
    df = df[df['st_teffclass'].isin(teff)]
    df = df[df['st_lumclass'].isin(lum)]
    if 'null' in met:
        df = df[df['st_metratio'].isin(met) | df['st_metratio'].isna()]
    else:
        df = df[df['st_metratio'].isin(met)]

    # Work on an independent copy so the column assignments below do not write
    # into a slice of the source table (avoids SettingWithCopyWarning).
    df = df.copy()
    if simple:
        df["pl_type"] = df["pl_type"].replace({"super_earth": "terrestrial", "neptune_like": "gas_giant"})
    if estimate:
        df["pl_dens"] = df["pl_dens"].fillna(df["pl_densest"])
        df["st_teffclass"] = df["st_teffclass"].replace("unknown", np.nan).fillna(df["st_teffclassest"])
        df["st_lumclass"] = df["st_lumclass"].replace("unknown", np.nan).fillna(df["st_lumclassest"])

    # Returns empty graphs and error message for certain anomalies
    missing_features = [f for f in features if f not in df.columns]
    num_components = 4 if colorcode == "PC" else 3  # PC colouring needs one extra component
    if missing_features:
        return error_response("ERROR: Missing columns " + ", ".join(missing_features))
    if df.empty:
        return error_response('ERROR: filtered dataset is empty')
    if len(features) < num_components:
        return error_response('ERROR: not enough features selected')

    # Select features to be evaluated (keeps certain others for hover info)
    info_columns = [
        "pl_name", "pl_type",
        "discoverymethod", "disc_refname", "disc_locale", "disc_facility", "disc_telescope", "disc_instrument",
        "hostname", "is_solar", "st_spectype", "st_teffclass", "st_lumclass",
    ]
    df = df[info_columns + features].copy()

    # Date columns become seconds since 1970-01-01 so they can enter the PCA.
    epoch = pd.Timestamp("1970-01-01")
    for date_col in ("rowupdate", "releasedate"):
        if date_col in df.columns:
            as_datetime = pd.to_datetime(df[date_col], errors="coerce")
            df[date_col] = (as_datetime - epoch).dt.total_seconds()

    # Gets information on NaNs in each column
    naninfo = [f"Feature info:\nTotal entries: {len(df)}\nRows without NaNs: {(~df[features].isna().any(axis=1)).sum()}"]
    nan_counts = df[features].isna().sum()  # Column-wise NaNs in df
    most_nans_index = nan_counts.idxmax()  # Find index of the feature with the most NaNs
    for col_name, count in nan_counts.items():
        suffix = " (highest)" if col_name == most_nans_index else ""
        naninfo.append(f"{col_name}: {count} NaNs{suffix}, mean {df[col_name].mean():.2f}, median {df[col_name].median():.2f}, variance {df[col_name].var():.2f}")
    nanstring = "\n".join(naninfo)

    # Replace NaNs with the value given by nanhandle for PCA
    if nanhandle in ['mean', 'median']:
        # Impute within each planet type so one class does not skew another.
        cleaned_groups = []
        for _, df_group in df.groupby('pl_type'):
            df_sub = df_group.copy()
            fill_vals = (
                df_sub[features].mean() if nanhandle == 'mean'
                else df_sub[features].median()
            )
            fill_vals = fill_vals.fillna(0)  # fallback for all-NaN columns
            df_sub[features] = df_sub[features].fillna(fill_vals)
            cleaned_groups.append(df_sub)
        df_cleaned = pd.concat(cleaned_groups)
    else:
        df_cleaned = df.dropna(subset=features)

    # Final checks: the covariance matrix needs at least two observations.
    if df_cleaned.empty:
        return error_response('ERROR: filtered dataset is empty')
    if len(df_cleaned) < 2:
        return error_response('ERROR: at least two rows are required for PCA')

    X = df_cleaned[features].values

    # PCA-required data normalization
    X_mean = np.mean(X, axis=0)
    X_std = np.std(X, axis=0)
    X_std[X_std == 0] = 1  # prevents divide-by-zero
    X_normalized = (X - X_mean) / X_std

    # Numpy handles covariance matrix, eigenvalues, and eigenvectors
    cov_matrix = np.cov(X_normalized, rowvar=False)
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # Sort eigenvectors by descending eigenvalues
    idx = np.argsort(eigenvalues)[::-1]
    eigenvectors = eigenvectors[:, idx]
    eigenvalues = eigenvalues[idx]
    explained_variance_ratio = eigenvalues / np.sum(eigenvalues)

    # make PCA array
    loadings = eigenvectors[:, :num_components]
    X_pca = np.dot(X_normalized, loadings)

    # Create DataFrame for visualization by combining PCA columns and original DataFrame
    pca_columns = [f"PC{i+1}" for i in range(num_components)]
    df_pca = df_cleaned.join(pd.DataFrame(X_pca, columns=pca_columns, index=df_cleaned.index))
    df_pca = df_pca.dropna(subset=["pl_name"])  # Remove rows where pl_name is missing

    # Creates list of additional features for hover info
    exclude_cols = ["PC1", "PC2", "PC3", "PC4", "pl_type", "st_teffclass", "st_lumclass"]
    hover_data_dict = {col: True for col in df_pca.columns if col not in exclude_cols}

    color_map = {
        "terrestrial": "blue", "super_earth": "red", "neptune_like": "green", "gas_giant": "purple", "tba": "black",
        "II": "yellow", "III": "orange", "IV": "green", "V": "blue", "VI": "black",
        "B": "darkblue", "A": "royalblue", "F": "seagreen", "G": "yellow", "K": "orange", "M": "red",
        "unknown": "gray",
    }

    # Increases size of solar system planets
    df_pca["marker_size"] = df_pca["is_solar"].eq(True).map({True: 3, False: 1})

    def relabel_discrete_column(df, colname):
        """Append per-category counts to the labels of a discrete column.

        Returns the relabelled column and a colour map keyed by the new
        labels; numeric columns are returned unchanged with a None map.
        """
        if df[colname].dtype == "object" or df[colname].dtype.name == "category":
            counts = df[colname].value_counts().to_dict()
            label_map = {k: f"{k} ({counts.get(k, 0)})" for k in df[colname].unique()}
            labeled = df[colname].map(label_map)
            relabeled_color_map = {
                label_map[k]: color_map[k]
                for k in df[colname].unique()
                if k in color_map and k in label_map
            }
            return labeled, relabeled_color_map
        else:
            return df[colname], None  # Leave numeric columns alone

    # If using the PC color coding, then the column will be different for both graphs in this one.
    if colorcode != "PC":
        # Only real columns can be filled; "PC" is a mode, not a column name,
        # so indexing df_pca with it would raise KeyError.
        df_pca[colorcode] = df_pca[colorcode].fillna("unknown")
        df_pca["colorcode_labeled"], relabeled_color_map = relabel_discrete_column(df_pca, colorcode)
        df_pca["colorcode_2d_labeled"] = df_pca["colorcode_labeled"]
        df_pca["colorcode_3d_labeled"] = df_pca["colorcode_labeled"]
        color_map_2d = color_map_3d = relabeled_color_map
    else:
        df_pca["colorcode_2d_labeled"] = df_pca["PC3"]
        df_pca["colorcode_3d_labeled"] = df_pca["PC4"]
        color_map_2d = color_map_3d = None  # continuous mode → let Plotly handle it

    hover_style = dict(font=dict(color="black"), bgcolor="white")

    # **Graph 1: 3D PCA Scatter Plot**
    fig_3d = px.scatter_3d(df_pca, x="PC1", y="PC2", z="PC3", color="colorcode_3d_labeled", color_discrete_map=color_map_3d or color_map, hover_data=hover_data_dict, size="marker_size")
    fig_3d.update_traces(hoverlabel=hover_style)

    # **Graph 2: 2D PCA Scatter Plot**
    fig_2d = px.scatter(df_pca, x="PC1", y="PC2", color="colorcode_2d_labeled", color_discrete_map=color_map_2d or color_map, hover_data=hover_data_dict, size="marker_size")
    fig_2d.update_traces(hoverlabel=hover_style)

    # **Graph 3: Explained Variance Bar Chart**
    cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
    df_eigen = pd.DataFrame({
        "Principal Component": [f"PC{i+1}" for i in range(len(explained_variance_ratio))],
        "Explained Variance Ratio": explained_variance_ratio,
        "Cumulative Variance Ratio": cumulative_variance_ratio
    })
    fig_variance = px.bar(df_eigen, x="Principal Component", y="Explained Variance Ratio", text_auto=True)
    fig_variance.add_scatter(x=df_eigen["Principal Component"], y=df_eigen["Cumulative Variance Ratio"], mode="lines+markers", name="Cumulative Variance")

    # **Graph 4: Feature Loadings Heatmap** (rows ordered by |PC1| loading)
    df_loadings = pd.DataFrame(loadings, index=features, columns=pca_columns)
    df_loadings = df_loadings.reindex(df_loadings["PC1"].abs().sort_values(ascending=False).index)
    fig_loadings = px.imshow(df_loadings, labels={"x": "Principal Component", "y": "Feature", "color": "Contribution",})

    return fig_3d, fig_2d, fig_variance, fig_loadings, html.Pre(nanstring)

@app.callback(
    [Output("graph-3d", "style"),
     Output("graph-2d", "style"),
     Output("variance-bar", "style"),
     Output("loadings-heatmap", "style")],
    Input("tabs", "value"),
)
def update_tabs(tab):
    """Make only the graph belonging to the selected tab visible.

    Returns four style dicts in output order (3D, 2D, variance, loadings);
    exactly one has display "block". An unrecognised tab value shows the
    first graph.
    """
    slot_for_tab = {"tab-1": 0, "tab-2": 1, "tab-3": 2, "tab-4": 3}
    visible_slot = slot_for_tab.get(tab, 0)
    return [
        {"display": "block" if slot == visible_slot else "none"}
        for slot in range(4)
    ]
    
# Launch the app via app.run on port 8051 with debug mode enabled, but only
# when this code is executed as the main module.
if __name__ == '__main__':
    app.run(debug=True, port=8051)

# Hypothesis: Is stellar age related to the differentiation of planets?
# TODO: Move to Streamlit (must move to a plain Python base first).
# TODO: Understand in the context of the star system (apparently too computationally taxing).
# TODO: Get NaN filling to work before simplification, or rework simplification to be applied when coloring.
# TODO: Put this after the starmaps; suggestions:

In [None]:
# UMAP (Credit to Microsoft Copilot)
# Note to self: figure out how the heck this works.
# For UMAP, use 2-10 neighbors (local) and 30-100 (global). Put after PCA
import pandas as pd
import numpy as np
from dash import Dash, dcc, html, Input, Output, State, ctx, MATCH, ALL
import plotly.graph_objects as go
import plotly.express as px
import umap
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
%run -i assets/lists.ipynb

# Load the three CSV inputs with identical parsing options ('#' lines are
# treated as comments, low_memory disabled).
pl_s, at_s, solar_planets = (
    pd.read_csv(csv_path, comment="#", low_memory=False)
    for csv_path in (
        'assets/Planetary_Systems.csv',       # planetary systems table
        'assets/Atmospheric_Spectroscopy.csv',  # atmospheric list, used by one filter
        'assets/Solar_Values.csv',            # solar system planets
    )
)

# Dash app
app = Dash(__name__, external_stylesheets=["assets/style.css"])

# List of database columns to take into account for graphs
# Selectable dataset columns, grouped by category. Each entry is a Dash option
# dict: 'label' is the text shown next to the checkbox and 'value' is the
# column name used to index the dataframe.
planet_features = {
    'Planet': [
        {'label': "Orbital Period (pl_orbper)", 'value': "pl_orbper"}, # in days (Not in values dataset)
        {'label': "Orbit Semi-Major Axis (pl_orbsmax)", 'value': "pl_orbsmax"}, # in Astronomical Units (Not in values dataset)
        {'label': "Epoch of Periastron (pl_orbtper)", 'value': "pl_orbtper"}, # in days (Not in values dataset)
        {'label': "Argument of Periastron (pl_orblper)", 'value': "pl_orblper"}, # in degrees (Not in values dataset)
        {'label': "Proj. Obliquity (pl_projobliq)", 'value': "pl_projobliq"}, # in degrees (Not in values dataset)
        {'label': "True Obliquity (pl_trueobliq)", 'value': "pl_trueobliq"}, # in degrees (Not in values dataset)
        {'label': "Radius (pl_rade)", 'value': "pl_rade"}, # in Earth radii (Not in values dataset)
        {'label': "Mass (pl_bmasse)", 'value': "pl_bmasse"}, # estimation, in Earth masses (Not in values dataset)
        {'label': "Density (pl_dens)", 'value': "pl_dens"}, # in g/cm^3 (Not in values dataset)
        {'label': "Orbital Eccentricity (pl_orbeccen)", 'value': "pl_orbeccen"}, # (Not in values dataset)
        {'label': "Insol. Flux (pl_insol)", 'value': "pl_insol"}, # in Earth flux (Not in values dataset)
        {'label': "Equil. Temp. (pl_eqt)", 'value': "pl_eqt"},  # in Kelvin (Not in values dataset)
        {'label': "Transit Duration (pl_trandur)", 'value': "pl_trandur"}, # in hours (Not in values dataset)
        {'label': "Transit Midpoint (pl_tranmid)", 'value': "pl_tranmid"}, # in days (Not in values dataset)
        {'label': "Transit Depth (pl_trandep)", 'value': "pl_trandep"}, # percentage (Not in values dataset)
        {'label': "Impact Parameter (pl_imppar)", 'value': "pl_imppar"}, # (Not in values dataset)
        {'label': "Occultation Depth (pl_occdep)", 'value': "pl_occdep"}, # percentage (Not in values dataset)
        {'label': "Rad. Velocity Amplitude (pl_rvamp)", 'value': "pl_rvamp"}, # in m/s (Not in values dataset)
        {'label': "Discovery Year (disc_year)", 'value': "disc_year"},
        {'label': "Last Update (rowupdate)", 'value': "rowupdate"}, # last update of parameters
        # {'label': "Public Release Date", 'value': "releasedate"}, # date publicly released
    ],
    'Stellar': [
        {'label': "Effec. Temp. (st_teff)", 'value': "st_teff"}, # in Kelvin
        {'label': "Radius (st_rad)", 'value': "st_rad"}, # in Solar radii
        {'label': "Mass (st_mass)", 'value': "st_mass"},
        {'label': "Density (st_dens)", 'value': "st_dens"},
        {'label': "Surface Grav. (st_logg)", 'value': "st_logg"},
        {'label': "Age (st_age)", 'value': "st_age"}, # in gigayears
        {'label': "Rot. Period (st_rotp)", 'value': "st_rotp"},
        {'label': "Rot. Velocity (st_vsin)", 'value': "st_vsin"},
        {'label': "Rad. Velocity (st_radv)", 'value': "st_radv"},
        {'label': "Metallicity (st_met)", 'value': "st_met"},
        {'label': "Luminosity (st_lum)", 'value': "st_lum"},
    ],
    'System': [
        {'label': "Parallax (sy_plx)", 'value': "sy_plx"},
        {'label': "Dist from Earth (sy_dist)", 'value': "sy_dist"}, # in parsecs
        {'label': "No. Stars (sy_snum)", 'value': "sy_snum"},
        # The labels below previously all displayed "(sy_snum)" regardless of column.
        {'label': "No. Planets (sy_pnum)", 'value': "sy_pnum"},
        {'label': "No. Moons (sy_mnum)", 'value': "sy_mnum"},
        {'label': "u (Sloan) Magnitude (sy_umag, ~354 nm)", 'value': "sy_umag"},
        {'label': "B (Johnson) Magnitude (sy_bmag, ~442 nm)", 'value': "sy_bmag"},
        {'label': "g (Sloan) Magnitude (sy_gmag, ~475 nm)", 'value': "sy_gmag"},
        {'label': "V (Johnson) Magnitude (sy_vmag, ~540 nm)", 'value': "sy_vmag"},
        {'label': "Kepler Magnitude (sy_kepmag, ~600 nm)", 'value': "sy_kepmag"},
        {'label': "r (Sloan) Magnitude (sy_rmag, ~622 nm)", 'value': "sy_rmag"},
        {'label': "Gaia Magnitude (sy_gaiamag, ~673 nm)", 'value': "sy_gaiamag"},
        {'label': "i (Sloan) Magnitude (sy_imag, ~763 nm)", 'value': "sy_imag"},
        {'label': "I (Cousins) Magnitude (sy_icmag, ~786.5 nm)", 'value': "sy_icmag"},
        {'label': "TESS Magnitude (sy_tmag, ~800 nm)", 'value': "sy_tmag"},
        {'label': "z (Sloan) Magnitude (sy_zmag, ~905 nm)", 'value': "sy_zmag"},
        {'label': "J (2MASS) Magnitude (sy_jmag, ~1.25 μm)", 'value': "sy_jmag"},
        {'label': "H (2MASS) Magnitude (sy_hmag, ~1.65 μm)", 'value': "sy_hmag"},
        {'label': "Ks (2MASS) Magnitude (sy_kmag, ~2.15 μm)", 'value': "sy_kmag"},
        {'label': "W1 (WISE) Magnitude (sy_w1mag, ~3.4 μm)", 'value': "sy_w1mag"},
        {'label': "W2 (WISE) Magnitude (sy_w2mag, ~4.6 μm)", 'value': "sy_w2mag"},
        {'label': "W3 (WISE) Magnitude (sy_w3mag, ~12 μm)", 'value': "sy_w3mag"},
        {'label': "W4 (WISE) Magnitude (sy_w4mag, ~22 μm)", 'value': "sy_w4mag"},
    ]
}
# Pre-ticked features, one list per group in the order Planet, Stellar, System.
initial_values = [['pl_rade'], ['st_mass'], ["sy_plx"]]

# Page layout for the UMAP app: dataset filters, feature pickers, colour and
# NaN-handling options, the neighbours slider, the update button and the graph.
app.layout = html.Div([
    html.H1("UMAP Analysis"),

    # --- Row filters -------------------------------------------------------
    html.H2("Data filters:"),
    dcc.Checklist(
        id='filter-checklist',
        options=[
            {'label': 'Has atmospheric data', 'value': 'atmoData'},
            {'label': 'Default parameter set', 'value': 'default'},
            {'label': 'No controversial flag', 'value': 'noControv'},
            {'label': 'Water to Metal Density', 'value': 'densRange'},
            {'label': 'In Target Star Catalog', 'value': 'target'},
            {'label': 'Include Solar System', 'value': 'solar'},
        ],
        value=['default', 'noControv', 'solar', 'target'],  # Default values to filter by
        className="checkbox-container"
    ),

    html.H3("Planet type:"),
    dcc.Checklist(
        id='pltype-checklist',
        options=[
            {'label': 'Terrestrial', 'value': 'terrestrial'},
            {'label': 'Super-Earth', 'value': 'super_earth'},
            {'label': 'Neptune-like', 'value': 'neptune_like'},
            {'label': 'Gas Giant', 'value': 'gas_giant'},
            {'label': 'Unknown', 'value': 'unknown'},
            {'label': 'TBA', 'value': 'tba'},
        ],
        value=['terrestrial', 'super_earth', 'neptune_like', 'gas_giant', 'unknown', 'tba'],  # Default values to filter by
        className="checkbox-container"
    ),
    html.Div([
        dcc.Checklist(
            id='simplification-checkbox',
            options=[
                {'label': html.Span(['Simplify planet types?'], style={'fontWeight': 'bold'}), 'value': 'enable'},
            ],
            value=[],  # Start unchecked
            className="checkbox-container"
        ),
    ], className="checkbox-container", style={'display': 'flex', 'alignItems': 'center', 'justifyContent': 'center', 'gap': '10px'}),

    html.H3("Stellar effective temp:"),
    dcc.Checklist(
        id='teff-checklist',
        options=[
            {'label': 'O type (>33000 K)', 'value': 'O'},
            {'label': 'B type (10000-33000 K)', 'value': 'B'},
            {'label': 'A type (7300-10000 K)', 'value': 'A'},
            {'label': 'F type (6000-7300 K)', 'value': 'F'},
            {'label': 'G type (5300-6000 K)', 'value': 'G'},
            {'label': 'K type (3900-5300 K, recommended)', 'value': 'K'},
            {'label': 'M type (2300-3900 K)', 'value': 'M'},
            {'label': 'Unknown', 'value': 'unknown'},
        ],
        value=['O', 'B', 'A', 'F', 'G', 'K', 'M', 'unknown'],  # Default values to filter by
        className="checkbox-container"
    ),

    html.H3("Stellar luminosity:"),
    dcc.Checklist(
        id='lum-checklist',
        options=[
            {'label': 'Hypergiant', 'value': '0'},
            {'label': 'Supergiant', 'value': 'I'},
            {'label': 'Bright Giant', 'value': 'II'},
            {'label': 'Giant', 'value': 'III'},
            {'label': 'Subgiant', 'value': 'IV'},
            {'label': 'Main-sequence/Dwarf (recommended)', 'value': 'V'},
            {'label': 'Subdwarf', 'value': 'VI'},
            {'label': 'White Dwarf', 'value': 'VII'},
            {'label': 'Unknown', 'value': 'unknown'},
        ],
        value=['0', 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'unknown'],  # Default values to filter by
        className="checkbox-container"
    ),

    html.H3("Stellar metallicity ratio:"),
    dcc.Checklist(
        id='met-checklist',
        options=[
            {'label': 'Iron abundance (recommended)', 'value': '[Fe/H]'},
            {'label': 'General metal content', 'value': '[M/H]'},
            {'label': 'Unknown', 'value': 'null'},
        ],
        value=['[Fe/H]', '[M/H]', 'null'],  # Default values to filter by
        className="checkbox-container"
    ),

    # --- Feature selection: one pattern-matching checklist per feature group
    html.H2("Select Features:"),
    *[
        html.Div([
            html.H3(group),
            dcc.Checklist(
                id={'type': 'feature-checklist', 'index': i},
                options=options,
                value=initial_values[i],
                className="checkbox-container"
            )
        ]) for i, (group, options) in enumerate(planet_features.items())
    ],

    # --- Display options ---------------------------------------------------
    html.H2("Color coding:"),
    dcc.RadioItems(
        id="colorcode-radioitems",
        options=[
            {'label': 'Planet Type', 'value': 'pl_type'},
            {'label': 'St. Effective Temp.', 'value': 'st_teffclass'},
            {'label': 'Stellar Luminosity', 'value': 'st_lumclass'},
            # {'label': 'Host Star', 'value': 'hostname'}, # too much computational power.
        ],
        value="pl_type",
        className="checkbox-container"
    ),

    html.H2("How to handle NaN values:"),
    dcc.RadioItems(
        id="nanhandle-radioitems",
        options=[
            # {'label': 'Set to 0', 'value': 'zero'},
            {'label': 'Set to mean', 'value': 'mean'},
            {'label': 'Set to median', 'value': 'median'},
            {'label': 'Remove rows', 'value': 'remove'},
        ],
        value="median",
        className="checkbox-container"
    ),
    html.Div([
        dcc.Checklist(
            id='estimates-checkbox',
            options=[
                {'label': html.Span(['Estimate certain values?'], style={'fontWeight': 'bold'}), 'value': 'enable'},
            ],
            value=[],  # Start unchecked
            className="checkbox-container"
        ),
    ], className="checkbox-container", style={'display': 'flex', 'alignItems': 'center', 'justifyContent': 'center', 'gap': '10px'}),

    html.H2("Neighbors:"),
    html.P("Recommended: Generate a 2-10 and 30-100 neighbors graph for each set of values."),
    dcc.Slider(
        id="neighbors-slider",
        min=2,
        max=100,
        step=1,
        value=15,
        # Marks stay inside [min, max]: the lower bound, then every 10 up to and
        # including 100 (range(0, 100, 10) gave a mark at 0 and none at 100).
        marks={2: "2", **{i: str(i) for i in range(10, 101, 10)}},
        tooltip={"placement": "bottom", "always_visible": True}
    ),

    # n_clicks starts at 1 rather than 0.
    html.Button('Update Graph', id='update-button', n_clicks=1),

    dcc.Graph(id="umap", figure={}),

    html.P('NaN info:', id='naninfo-box'),

    # --- Data attribution --------------------------------------------------
    html.P("Data sourced from:"),
    html.Div([
        html.A("https://www.doi.org/10.26133/NEA12", href="https://www.doi.org/10.26133/NEA12", target="_blank"),
        html.Br(),
        html.A("https://www.doi.org/10.26133/NEA36", href="https://www.doi.org/10.26133/NEA36", target="_blank"),
        html.Br(),
        html.A("https://science.nasa.gov/exoplanets/exoplanet-catalog/", href="https://science.nasa.gov/exoplanets/exoplanet-catalog/", target="_blank"),
        html.Br(),
        html.A("https://science.nasa.gov/exoplanets/target-star-catalog/", href="https://science.nasa.gov/exoplanets/target-star-catalog/", target="_blank"),
    ])
# Style keys are camelCased like the other style dicts in this layout.
], style={'color': 'black', 'fontFamily': 'Arial', 'backgroundColor': 'white', 'padding': '20px', 'textAlign': 'center'})

@app.callback(
    [Output('umap', 'figure'),
     Output('naninfo-box', 'children')],
    Input('update-button', 'n_clicks'),
    [State('filter-checklist', 'value'),
     State('simplification-checkbox', 'value'),
     State('estimates-checkbox', 'value'),
     State({'type': 'feature-checklist', 'index': ALL}, 'value'),
     State('colorcode-radioitems', 'value'),
     State('pltype-checklist', 'value'),
     State('teff-checklist', 'value'),
     State('lum-checklist', 'value'),
     State('met-checklist', 'value'),
     State('nanhandle-radioitems', 'value'),
     State('neighbors-slider', 'value')]
)
def update_graph(update, filters, simple, estimate, feature_vals, colorcode, pltype, teff, lum, met, nanhandle, neighbors):
    """Rebuild the UMAP scatter plot and the NaN summary when the update button is clicked.

    Pipeline: filter the planet table, optionally simplify/estimate class
    columns, fill or drop NaNs in the selected features, standardise them,
    project onto the first two principal components, run UMAP on that
    projection and plot the embedding.

    Args:
        update: Click count of the update button (only used for logging).
        filters: Selected values of the data-filter checklist.
        simple: Non-empty when planet types should be merged into two classes.
        estimate: Non-empty when estimated density/class columns fill gaps.
        feature_vals: One list of selected column names per feature checklist.
        colorcode: Column used to colour the points.
        pltype, teff, lum, met: Allowed values for planet type, stellar
            temperature class, luminosity class and metallicity ratio.
        nanhandle: 'mean', 'median' (per-planet-type fill) or anything else to
            drop rows with NaNs.
        neighbors: ``n_neighbors`` passed to UMAP.

    Returns:
        ``(figure, children)`` matching the two declared Outputs. On a
        validation failure the figure is empty and ``children`` is the error text.
    """
    print(f"running graph {update}")

    def _error(message):
        # The callback declares exactly two Outputs, so every early exit must
        # return one figure plus one message (the previous five-value returns
        # were rejected by Dash).
        return go.Figure(), message

    # Combines all feature checkboxes into one list.
    features = sorted(set(val for group in feature_vals for val in group))

    # Load dataset (optionally with the solar system planets appended)
    if 'solar' in filters:
        df = pd.concat([pl_s.copy(), solar_planets.copy()], ignore_index=True) # type: ignore
    else:
        df = pl_s.copy()
    # "is_solar" is used by the filters, the column selection and the marker
    # size below; make sure it exists even when no solar rows were appended.
    if "is_solar" not in df.columns:
        df["is_solar"] = False

    # Apply filters. The comparison is parenthesised explicitly: `|` binds
    # tighter than `==`, so `a | b == True` parses as `(a | b) == True`.
    if 'atmoData' in filters:
        df = df[df['pl_name'].isin(at_s['pl_name']) | (df["is_solar"] == True)]
    if 'default' in filters:
        df = df[df['default_flag'] == True]
    if 'noControv' in filters:
        df = df[df['pl_controv_flag'] == False]
    if 'densRange' in filters:
        df = df[(df['pl_dens'] > 1) & (df['pl_dens'] < 5.6)] # Only gets planets with a density between water and metallic iron
    if 'target' in filters:
        df = df[df['hostname'].isin(target_stars) | (df["is_solar"] == True)] # type: ignore
    df = df[df['pl_type'].isin(pltype)]
    df = df[df['st_teffclass'].isin(teff)]
    df = df[df['st_lumclass'].isin(lum)]
    if 'null' in met:
        df = df[df['st_metratio'].isin(met) | df['st_metratio'].isna()]
    else:
        df = df[df['st_metratio'].isin(met)]

    # Work on an independent copy before assigning columns, so the writes
    # below do not target a slice of an earlier frame.
    df = df.copy()
    if simple:
        df["pl_type"] = df["pl_type"].replace({"super_earth": "terrestrial", "neptune_like": "gas_giant"})
    if estimate:
        df["pl_dens"] = df["pl_dens"].fillna(df["pl_densest"])
        df["st_teffclass"] = df["st_teffclass"].replace("unknown", np.nan).fillna(df["st_teffclassest"])
        df["st_lumclass"] = df["st_lumclass"].replace("unknown", np.nan).fillna(df["st_lumclassest"])

    # Returns an empty graph and error message for certain anomalies
    missing_features = [f for f in features if f not in df.columns]
    if missing_features:
        # join() instead of `str + list`, which raised TypeError.
        return _error("ERROR: Missing columns " + ", ".join(missing_features))
    if df.empty:
        return _error('ERROR: filtered dataset is empty')
    if len(features) <= 2:
        return _error('ERROR: not enough features selected')

    # Select features to be evaluated (keeps certain others for hover info)
    df = df[["pl_name"] + ["pl_type"] +
        ["discoverymethod"] + ["disc_refname"] + ["disc_locale"] + ["disc_facility"] + ["disc_telescope"] + ["disc_instrument"] +
        ["hostname"] + ["is_solar"] + ["st_spectype"] + ["st_teffclass"] + ["st_lumclass"] +
        features
    ].copy()

    # Date columns become seconds since 1970-01-01 so they can be used as numbers.
    epoch = pd.Timestamp("1970-01-01")
    if "rowupdate" in df.columns:
        df["rowupdate"] = pd.to_datetime(df["rowupdate"], errors="coerce")
        df["rowupdate"] = (df["rowupdate"] - epoch).dt.total_seconds()
    if "releasedate" in df.columns:
        df["releasedate"] = pd.to_datetime(df["releasedate"], errors="coerce")
        df["releasedate"] = (df["releasedate"] - epoch).dt.total_seconds()

    # Gets information on NaNs in each column
    naninfo = [f"Feature info:\nTotal entries: {len(df)}\nRows without NaNs: {(~df[features].isna().any(axis=1)).sum()}"]
    nan_counts = df[features].isna().sum()  # Column-wise NaNs in df
    most_nans_index = nan_counts.idxmax()  # Find index of the feature with the most NaNs
    for col_name, count in nan_counts.items():
        suffix = " (highest)" if col_name == most_nans_index else ""
        naninfo.append(f"{col_name}: {count} NaNs{suffix}, mean {df[col_name].mean():.2f}, median {df[col_name].median():.2f}, variance {df[col_name].var():.2f}")
    nanstring = "\n".join(naninfo)

    # Replace NaNs with the per-planet-type mean/median, or drop incomplete rows
    if nanhandle in ['mean', 'median']:
        cleaned_groups = []
        for pl_type, df_group in df.groupby('pl_type'):
            df_sub = df_group.copy()
            fill_vals = (
                df_sub[features].mean() if nanhandle == 'mean'
                else df_sub[features].median()
            )
            fill_vals = fill_vals.fillna(0)  # fallback for all-NaN columns
            df_sub[features] = df_sub[features].fillna(fill_vals)
            cleaned_groups.append(df_sub)
        df_cleaned = pd.concat(cleaned_groups)
    else:
        df_cleaned = df.dropna(subset=features)

    # Final check
    if df_cleaned.empty:
        return _error('ERROR: filtered dataset is empty')

    X = df_cleaned[features].values

    # PCA-required data normalization
    X_mean = np.mean(X, axis=0)
    X_std = np.std(X, axis=0)
    X_std[X_std == 0] = 1  # prevents divide-by-zero
    X_normalized = (X - X_mean) / X_std

    # Numpy handles covariance matrix, eigenvalues, and eigenvectors
    cov_matrix = np.cov(X_normalized, rowvar=False)
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # Sort eigenvectors by descending eigenvalues
    idx = np.argsort(eigenvalues)[::-1]
    eigenvectors = eigenvectors[:, idx]
    eigenvalues = eigenvalues[idx]

    # make PCA array
    loadings = eigenvectors[:, :2] # Extract first 2 principal components to keep
    X_pca = np.dot(X_normalized, loadings)

    # UMAP is fitted on the 2-component PCA projection, not on the raw features
    embedding = umap.UMAP(n_neighbors=neighbors, min_dist=0.1).fit_transform(X_pca)

    # Create DataFrame for visualization by combining UMAP columns and original DataFrame
    df_umap = df_cleaned.join(pd.DataFrame(embedding, columns=["UMAP1", "UMAP2"], index=df_cleaned.index))
    df_umap = df_umap.dropna(subset=["pl_name"])  # Remove rows where pl_name is missing

    # Creates list of additional features for hover info
    exclude_cols = ["UMAP1", "UMAP2", "pl_type", "st_teffclass", "st_lumclass"]
    hover_features = [col for col in df_umap.columns if col not in exclude_cols] # Select all columns except the excluded ones
    hover_data_dict = {col: True for col in hover_features} # Convert list to dictionary format for hover_data

    color_map = {
        "terrestrial": "blue", "super_earth": "red", "neptune_like": "green", "gas_giant": "purple", "tba": "black",
        "II": "yellow", "III": "orange", "IV": "green", "V": "blue", "VI": "black",
        "B": "darkblue", "A": "royalblue", "F": "seagreen", "G": "yellow", "K": "orange", "M": "red",
        "unknown": "gray",
    }

    if colorcode in df_umap.columns and df_umap[colorcode].dtype == "object":
        # Count values
        label_counts = df_umap[colorcode].value_counts().to_dict()

        # Create a mapping: e.g., "terrestrial" → "terrestrial (42)"
        labeled_with_counts = {
            key: f"{key} ({label_counts.get(key, 0)})" for key in df_umap[colorcode].unique()
        }

        # Apply the relabeling
        df_umap["colorcode_labeled"] = df_umap[colorcode].map(labeled_with_counts)

        # Update discrete mapping to use new labels
        colorcode_use = "colorcode_labeled"
        color_map_labeled = {
            labeled_with_counts[k]: v for k, v in color_map.items() if k in labeled_with_counts
        }
    else:
        # For continuous or numeric colorcodes, no relabeling
        colorcode_use = colorcode
        color_map_labeled = color_map

    # Increases size of solar system planets
    df_umap["marker_size"] = df_umap["is_solar"].fillna(False).apply(lambda x: 3 if x else 1)

    # Scatterplot
    umap_graph = px.scatter(
        df_umap, x="UMAP1", y="UMAP2", color=colorcode_use, color_discrete_map=color_map_labeled if df_umap[colorcode_use].dtype == "object" else None, hover_data=hover_data_dict, size="marker_size"
    )
    umap_graph.update_traces(
        hoverlabel=dict(
            font=dict(color="black"),
            bgcolor="white",
        )
    )
    print("done")
    return umap_graph, html.Pre(nanstring)
    
if __name__ == '__main__':
    # Start the UMAP app on port 8052 with Dash debug mode switched on.
    app.run(port=8052, debug=True)

running graph 1
done
running graph 2
done
running graph 3
done


In [None]:
# histogram/dotplot (Credit to Microsoft Copilot)
# (Note to self: figure out how the heck this works)
import dash
import pandas as pd
import numpy as np
from dash import Dash, dcc, html, Input, Output, ctx, ALL
import plotly.graph_objects as go
import plotly.express as px

# Load the three CSV inputs with identical parsing options ('#' lines are
# treated as comments, low_memory disabled).
pl_s, at_s, solar_planets = (
    pd.read_csv(csv_path, comment="#", low_memory=False)
    for csv_path in (
        'assets/Planetary_Systems.csv',       # planetary systems table
        'assets/Atmospheric_Spectroscopy.csv',  # atmospheric list, used by one filter
        'assets/Solar_Values.csv',            # solar system planets
    )
)

# Dash app
app = Dash(__name__, external_stylesheets=["assets/style.css"])

# List of database columns to take into account for graphs
# Selectable dataset columns, grouped by category. Each entry is a Dash option
# dict: 'label' is the text shown next to the radio button and 'value' is the
# column name used to index the dataframe.
planet_features = {
    'Planet': [
        {'label': "Orbital Period (pl_orbper)", 'value': "pl_orbper"}, # in days (Not in values dataset)
        {'label': "Orbit Semi-Major Axis (pl_orbsmax)", 'value': "pl_orbsmax"}, # in Astronomical Units (Not in values dataset)
        {'label': "Epoch of Periastron (pl_orbtper)", 'value': "pl_orbtper"}, # in days (Not in values dataset)
        {'label': "Argument of Periastron (pl_orblper)", 'value': "pl_orblper"}, # in degrees (Not in values dataset)
        {'label': "Proj. Obliquity (pl_projobliq)", 'value': "pl_projobliq"}, # in degrees (Not in values dataset)
        {'label': "True Obliquity (pl_trueobliq)", 'value': "pl_trueobliq"}, # in degrees (Not in values dataset)
        {'label': "Radius (pl_rade)", 'value': "pl_rade"}, # in Earth radii (Not in values dataset)
        {'label': "Mass (pl_bmasse)", 'value': "pl_bmasse"}, # estimation, in Earth masses (Not in values dataset)
        {'label': "Density (pl_dens)", 'value': "pl_dens"}, # in g/cm^3 (Not in values dataset)
        {'label': "Orbital Eccentricity (pl_orbeccen)", 'value': "pl_orbeccen"}, # (Not in values dataset)
        {'label': "Insol. Flux (pl_insol)", 'value': "pl_insol"}, # in Earth flux (Not in values dataset)
        {'label': "Equil. Temp. (pl_eqt)", 'value': "pl_eqt"},  # in Kelvin (Not in values dataset)
        {'label': "Transit Duration (pl_trandur)", 'value': "pl_trandur"}, # in hours (Not in values dataset)
        {'label': "Transit Midpoint (pl_tranmid)", 'value': "pl_tranmid"}, # in days (Not in values dataset)
        {'label': "Transit Depth (pl_trandep)", 'value': "pl_trandep"}, # percentage (Not in values dataset)
        {'label': "Impact Parameter (pl_imppar)", 'value': "pl_imppar"}, # (Not in values dataset)
        {'label': "Occultation Depth (pl_occdep)", 'value': "pl_occdep"}, # percentage (Not in values dataset)
        {'label': "Rad. Velocity Amplitude (pl_rvamp)", 'value': "pl_rvamp"}, # in m/s (Not in values dataset)
        {'label': "Discovery Year (disc_year)", 'value': "disc_year"},
        {'label': "Last Update (rowupdate)", 'value': "rowupdate"}, # last update of parameters
        # {'label': "Public Release Date", 'value': "releasedate"}, # date publicly released
    ], # note: talk to achyutan
    'Stellar': [
        {'label': "Effec. Temp. (st_teff)", 'value': "st_teff"}, # in Kelvin
        {'label': "Radius (st_rad)", 'value': "st_rad"}, # in Solar radii
        {'label': "Mass (st_mass)", 'value': "st_mass"},
        {'label': "Density (st_dens)", 'value': "st_dens"},
        {'label': "Surface Grav. (st_logg)", 'value': "st_logg"},
        {'label': "Age (st_age)", 'value': "st_age"}, # in gigayears
        {'label': "Rot. Period (st_rotp)", 'value': "st_rotp"},
        {'label': "Rot. Velocity (st_vsin)", 'value': "st_vsin"},
        {'label': "Rad. Velocity (st_radv)", 'value': "st_radv"},
        {'label': "Metallicity (st_met)", 'value': "st_met"},
        {'label': "Luminosity (st_lum)", 'value': "st_lum"},
    ],
    'System': [
        {'label': "Parallax (sy_plx)", 'value': "sy_plx"},
        {'label': "Dist from Earth (sy_dist)", 'value': "sy_dist"}, # in parsecs
        {'label': "No. Stars (sy_snum)", 'value': "sy_snum"},
        # The labels below previously all displayed "(sy_snum)" regardless of column.
        {'label': "No. Planets (sy_pnum)", 'value': "sy_pnum"},
        {'label': "No. Moons (sy_mnum)", 'value': "sy_mnum"},
        {'label': "u (Sloan) Magnitude (sy_umag, ~354 nm)", 'value': "sy_umag"},
        {'label': "B (Johnson) Magnitude (sy_bmag, ~442 nm)", 'value': "sy_bmag"},
        {'label': "g (Sloan) Magnitude (sy_gmag, ~475 nm)", 'value': "sy_gmag"},
        {'label': "V (Johnson) Magnitude (sy_vmag, ~540 nm)", 'value': "sy_vmag"},
        {'label': "Kepler Magnitude (sy_kepmag, ~600 nm)", 'value': "sy_kepmag"},
        {'label': "r (Sloan) Magnitude (sy_rmag, ~622 nm)", 'value': "sy_rmag"},
        {'label': "Gaia Magnitude (sy_gaiamag, ~673 nm)", 'value': "sy_gaiamag"},
        {'label': "i (Sloan) Magnitude (sy_imag, ~763 nm)", 'value': "sy_imag"},
        {'label': "I (Cousins) Magnitude (sy_icmag, ~786.5 nm)", 'value': "sy_icmag"},
        {'label': "TESS Magnitude (sy_tmag, ~800 nm)", 'value': "sy_tmag"},
        {'label': "z (Sloan) Magnitude (sy_zmag, ~905 nm)", 'value': "sy_zmag"},
        {'label': "J (2MASS) Magnitude (sy_jmag, ~1.25 μm)", 'value': "sy_jmag"},
        {'label': "H (2MASS) Magnitude (sy_hmag, ~1.65 μm)", 'value': "sy_hmag"},
        {'label': "Ks (2MASS) Magnitude (sy_kmag, ~2.15 μm)", 'value': "sy_kmag"},
        {'label': "W1 (WISE) Magnitude (sy_w1mag, ~3.4 μm)", 'value': "sy_w1mag"},
        {'label': "W2 (WISE) Magnitude (sy_w2mag, ~4.6 μm)", 'value': "sy_w2mag"},
        {'label': "W3 (WISE) Magnitude (sy_w3mag, ~12 μm)", 'value': "sy_w3mag"},
        {'label': "W4 (WISE) Magnitude (sy_w4mag, ~22 μm)", 'value': "sy_w4mag"},
    ]
}
# Initial radio selection per group (Planet, Stellar, System).
initial_values = ['pl_dens', None, None]  # Only one group should have a value

# Page layout for the histogram/dotplot app: dataset filters, a single-feature
# picker, colour options, min/max sliders and the two tabbed graphs.
app.layout = html.Div([
    html.H1("Histogram/\"Dotplot\""),

    # --- Row filters -------------------------------------------------------
    html.H2("Data filters:"),
    dcc.Checklist(
        id='filter-checklist',
        options=[
            {'label': 'Has atmospheric data', 'value': 'atmoData'},
            {'label': 'Default parameter set', 'value': 'default'},
            {'label': 'No controversial flag', 'value': 'noControv'},
            {'label': 'Water to Metal Density', 'value': 'densRange'},
            {'label': 'In Target Star Catalog', 'value': 'target'},
            {'label': 'Include Solar System', 'value': 'solar'},
        ],
        value=['default', 'noControv'],  # Default values to filter by
        className="checkbox-container"
    ),
    html.Div([
        dcc.Checklist(
            id='host-star-toggle',
            options=[{'label': 'Filter by host star: ', 'value': 'enable'}],
            value=[],  # Start unchecked
            className="checkbox-container"
        ),
        dcc.Input(
            id='host-star-input',
            type='text',
            placeholder='Enter host star name',
            debounce=True,
            className="textbox-style"
        ),
    ], className="checkbox-container", style={'display': 'flex', 'alignItems': 'center', 'justifyContent': 'center', 'gap': '10px'}),

    html.H3("Planet type:"),
    dcc.Checklist(
        id='pltype-checklist',
        options=[
            {'label': 'Terrestrial', 'value': 'terrestrial'},
            {'label': 'Super-Earth', 'value': 'super_earth'},
            {'label': 'Neptune-like', 'value': 'neptune_like'},
            {'label': 'Gas Giant', 'value': 'gas_giant'},
            {'label': 'Unknown', 'value': 'unknown'},
            {'label': 'TBA', 'value': 'tba'},
        ],
        value=['terrestrial', 'super_earth', 'neptune_like', 'gas_giant', 'unknown', 'tba'],  # Default values to filter by
        className="checkbox-container"
    ),
    html.Div([
        dcc.Checklist(
            id='simplification-checkbox',
            options=[
                {'label': html.Span(['Simplify planet types?'], style={'fontWeight': 'bold'}), 'value': 'enable'},
            ],
            value=[],  # Start unchecked
            className="checkbox-container"
        ),
    ], className="checkbox-container", style={'display': 'flex', 'alignItems': 'center', 'justifyContent': 'center', 'gap': '10px'}),

    html.H3("Stellar effective temp:"),
    dcc.Checklist(
        id='teff-checklist',
        options=[
            {'label': 'O type (>33000 K)', 'value': 'O'},
            {'label': 'B type (10000-33000 K)', 'value': 'B'},
            {'label': 'A type (7300-10000 K)', 'value': 'A'},
            {'label': 'F type (6000-7300 K)', 'value': 'F'},
            {'label': 'G type (5300-6000 K)', 'value': 'G'},
            {'label': 'K type (3900-5300 K, recommended)', 'value': 'K'},
            {'label': 'M type (2300-3900 K)', 'value': 'M'},
            {'label': 'Unknown', 'value': 'unknown'},
        ],
        value=['O', 'B', 'A', 'F', 'G', 'K', 'M', 'unknown'],  # Default values to filter by
        className="checkbox-container"
    ),

    html.H3("Stellar luminosity:"),
    dcc.Checklist(
        id='lum-checklist',
        options=[
            {'label': 'Hypergiant', 'value': '0'},
            {'label': 'Supergiant', 'value': 'I'},
            {'label': 'Bright Giant', 'value': 'II'},
            {'label': 'Giant', 'value': 'III'},
            {'label': 'Subgiant', 'value': 'IV'},
            {'label': 'Main-sequence/Dwarf (recommended)', 'value': 'V'},
            {'label': 'Subdwarf', 'value': 'VI'},
            {'label': 'White Dwarf', 'value': 'VII'},
            {'label': 'Unknown', 'value': 'unknown'},
        ],
        value=['0', 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'unknown'],  # Default values to filter by
        className="checkbox-container"
    ),
    html.H3("Stellar metallicity ratio:"),
    dcc.Checklist(
        id='met-checklist',
        options=[
            {'label': 'Iron abundance (recommended)', 'value': '[Fe/H]'},
            {'label': 'General metal content', 'value': '[M/H]'},
            {'label': 'Unknown', 'value': 'null'},
        ],
        value=['[Fe/H]', '[M/H]', 'null'],  # Default values to filter by
        className="checkbox-container"
    ),

    # --- Feature selection: one pattern-matching radio group per category --
    html.H2("Select a feature:"),
    *[
        html.Div([
            html.H4(category),
            dcc.RadioItems(
                id={'type': 'feature-radioitems', 'index': i},
                options=options,
                value=initial_values[i],
                className="checkbox-container"
            )
        ]) for i, (category, options) in enumerate(planet_features.items())
    ],

    # --- Display options ---------------------------------------------------
    html.H2("Color coding:"),
    dcc.RadioItems(
        id="colorcode-radioitems",
        options=[
            {'label': 'Planet Type', 'value': 'pl_type'},
            {'label': 'St. Effective Temp.', 'value': 'st_teffclass'},
            {'label': 'Stellar Luminosity', 'value': 'st_lumclass'},
            # {'label': 'Host Star', 'value': 'hostname'}, # too much computational power.
        ],
        value="pl_type",
        className="checkbox-container"
    ),
    html.Div([
        dcc.Checklist(
            id='estimates-checkbox',
            options=[
                {'label': html.Span(['Estimate certain values?'], style={'fontWeight': 'bold'}), 'value': 'enable'},
            ],
            value=[],  # Start unchecked
            className="checkbox-container"
        ),
    ], className="checkbox-container", style={'display': 'flex', 'alignItems': 'center', 'justifyContent': 'center', 'gap': '10px'}),

    # --- Value range sliders -----------------------------------------------
    html.H2("Min Value:"),
    dcc.Slider(
        id="min-value-slider",
        min=0,
        max=100,  # Will update dynamically
        step=None,
        value=0,
        tooltip={"placement": "bottom", "always_visible": True}
    ),

    html.H2("Max Value:"),
    dcc.Slider(
        id="max-value-slider",
        min=0,
        max=100,  # Will update dynamically
        step=None,
        value=100,
        tooltip={"placement": "bottom", "always_visible": True}
    ),

    # --- Tabbed graphs: both are always in the layout, the dotplot starts hidden
    dcc.Tabs(id="tabs", value="tab-1", children=[
        dcc.Tab(label="Histogram", value="tab-1", className="tab"),
        dcc.Tab(label="Dotplot", value="tab-2", className="tab"),
    ]),
    html.Div(id="tabs-content", children=[
        dcc.Graph(id="histogram", figure={}, style={"display": "block"}),
        dcc.Graph(id="dotplot", figure={}, style={"display": "none"}),
    ]),  # This will hold the selected graph

    html.P('NaN info:', id='naninfo-box'),

    # --- Data attribution --------------------------------------------------
    html.P("Data sourced from:"),
    html.Div([
        html.A("https://www.doi.org/10.26133/NEA12", href="https://www.doi.org/10.26133/NEA12", target="_blank"),
        html.Br(),
        html.A("https://www.doi.org/10.26133/NEA36", href="https://www.doi.org/10.26133/NEA36", target="_blank"),
        html.Br(),
        html.A("https://science.nasa.gov/exoplanets/exoplanet-catalog/", href="https://science.nasa.gov/exoplanets/exoplanet-catalog/", target="_blank"),
        html.Br(),
        html.A("https://science.nasa.gov/exoplanets/target-star-catalog/", href="https://science.nasa.gov/exoplanets/target-star-catalog/", target="_blank"),
    ])
# Style keys are camelCased like the other style dicts in this layout.
], style={'color': 'black', 'fontFamily': 'Arial', 'backgroundColor': 'white', 'padding': '20px', 'textAlign': 'center'})

def _solar_mask(frame):
    """Return a boolean Series (aligned to ``frame``) marking solar-system rows.

    Missing ``is_solar`` values count as False, and a frame that has no
    ``is_solar`` column yields an all-False mask instead of raising KeyError.
    """
    if "is_solar" not in frame.columns:
        return pd.Series(False, index=frame.index)
    return frame["is_solar"].map(lambda flag: bool(flag) if pd.notna(flag) else False).astype(bool)


def _fill_from_estimate(values, estimates):
    """Replace "unknown"/missing entries of ``values`` with ``estimates`` where one exists.

    Entries without an estimate are left untouched, so an "unknown" label is
    never silently turned into NaN.
    """
    needs_fill = values.eq("unknown") | values.isna()
    return values.mask(needs_fill & estimates.notna(), estimates)


@app.callback(
    [Output('histogram', 'figure'),
     Output('dotplot', 'figure'),
     Output('naninfo-box', 'children')],
    [Input('filter-checklist', 'value'),
     Input('simplification-checkbox', 'value'),
     Input('estimates-checkbox', 'value'),
     Input({'type': 'feature-radioitems', 'index': ALL}, 'value'),
     Input('min-value-slider', 'value'),
     Input('max-value-slider', 'value'),
     Input('host-star-toggle', 'value'),
     Input('host-star-input', 'value'),
     Input('colorcode-radioitems', 'value'),
     Input('pltype-checklist', 'value'),
     Input('teff-checklist', 'value'),
     Input('lum-checklist', 'value'),
     Input('met-checklist', 'value'),]
)
def update_graph(filters, simple, estimate, feature_values, min_value, max_value, startoggle, starinput, colorcode, pltype, teff, lum, met):
    """Rebuild the histogram and dot plot for the selected feature.

    Parameters
    ----------
    filters : list[str]
        Ticked values of 'filter-checklist' ('solar', 'atmoData', 'default',
        'noControv', 'densRange', 'target' are the ones handled here).
    simple : list
        Non-empty when planet types should be merged into two groups.
    estimate : list
        Non-empty when missing/unknown values should be filled from the
        ``*est`` columns.
    feature_values : list
        Values of every feature radio list; the first non-None one is plotted.
    min_value, max_value : number
        Inclusive range kept for the selected feature.
    startoggle : list
        Non-empty when the host-star text filter is enabled.
    starinput : str | None
        Host star name typed by the user (None until something is typed).
    colorcode : str
        Column used to colour the traces.
    pltype, teff, lum, met : list[str]
        Allowed planet types, temperature classes, luminosity classes and
        metallicity ratios ('null' in ``met`` also keeps missing ratios).

    Returns
    -------
    tuple
        (histogram figure, dot plot figure, info text or ``html.Pre``).

    Raises
    ------
    dash.exceptions.PreventUpdate
        When no feature is selected.
    """
    # Only names *from* dash are imported at the top of this cell, so the
    # exception class has to be imported explicitly.
    from dash.exceptions import PreventUpdate

    # Gets the selected feature from all lists
    feature = next((f for f in feature_values if f is not None), None)
    if feature is None:
        raise PreventUpdate

    # An inverted range can never match a row. Report it before filtering,
    # otherwise the generic "dataset is empty" message would hide the cause.
    if max_value < min_value:
        return go.Figure(), go.Figure(), "ERROR: maximum is less than minimum"

    # Load dataset (optionally with the solar-system planets appended)
    if 'solar' in filters:
        df = pd.concat([pl_s, solar_planets], ignore_index=True) # type: ignore
    else:
        df = pl_s.copy()
    naninfo = f"{df[feature].isna().sum()} NaNs"
    df = df.dropna(subset=[feature]).copy()

    # Date features are plotted as whole seconds since the Unix epoch.
    if feature in ("rowupdate", "releasedate"):
        stamps = pd.to_datetime(df[feature], errors="coerce")
        parsed = stamps.notna()
        # Unparseable dates become NaT and cannot be cast to int, so drop them.
        df = df[parsed].copy()
        df[feature] = (
            (stamps[parsed] - pd.Timestamp("1970-01-01"))
            .dt.total_seconds()
            .astype("int64")
        )

    # Cuts out features below minimum or above maximum
    df = df[(df[feature] >= min_value) & (df[feature] <= max_value)]

    # Apply filters
    if 'atmoData' in filters:
        # Solar-system rows are always kept by this filter.
        df = df[df['pl_name'].isin(at_s['pl_name']) | _solar_mask(df)]
    if 'default' in filters:
        df = df[df['default_flag'] == True]
    if 'noControv' in filters:
        df = df[df['pl_controv_flag'] == False]
    if 'densRange' in filters:
        df = df[(df['pl_dens'] > 1) & (df['pl_dens'] < 5.6)] # Only gets planets with a density between water and metallic iron
    if 'target' in filters:
        df = df[df['hostname'].isin(target_stars)] # type: ignore
    df = df[df['pl_type'].isin(pltype)]
    df = df[df['st_teffclass'].isin(teff)]
    df = df[df['st_lumclass'].isin(lum)]
    if 'null' in met:
        df = df[df['st_metratio'].isin(met) | df['st_metratio'].isna()]
    else:
        df = df[df['st_metratio'].isin(met)]

    # The text box holds None until the user types, so guard before .strip().
    host_query = (starinput or "").strip().lower()
    if startoggle and host_query:
        # Normalize both sides for safe matching
        df = df[df['hostname'].str.lower() == host_query]

    # Returns empty graphs if filtered dataset is empty
    if df.empty:
        return go.Figure(), go.Figure(), "ERROR: filtered dataset is empty"

    # Work on an independent copy so the column assignments below do not
    # write into a slice of the filtered frame.
    df = df.copy()
    if simple:
        df["pl_type"] = df["pl_type"].replace({"super_earth": "terrestrial", "neptune_like": "gas_giant"})
    if estimate:
        # NOTE(review): estimates are applied after the filters above, so they
        # only affect colouring/statistics, not which rows pass the filters.
        df["pl_dens"] = df["pl_dens"].fillna(df["pl_densest"])
        df["st_teffclass"] = _fill_from_estimate(df["st_teffclass"], df["st_teffclassest"])
        df["st_lumclass"] = _fill_from_estimate(df["st_lumclass"], df["st_lumclassest"])

    naninfo = naninfo + f"\nTotal entries: {len(df)}\nMean: {df[feature].mean()}\nMedian: {df[feature].median()}\nVariance: {df[feature].var()}"
    color_map = {
        "terrestrial": "blue", "super_earth": "red", "neptune_like": "green", "gas_giant": "purple", "tba": "black",
        "II": "yellow", "III": "orange", "IV": "green", "V": "blue", "VI": "black",
        "B": "darkblue", "A": "royalblue", "F": "seagreen", "G": "yellow", "K": "orange", "M": "red",
        "unknown": "gray",
    }

    if colorcode in df.columns and df[colorcode].dtype == "object":
        # Count values
        label_counts = df[colorcode].value_counts().to_dict()

        # Create a mapping: e.g., "terrestrial" → "terrestrial (42)"
        labeled_with_counts = {
            key: f"{key} ({label_counts.get(key, 0)})" for key in df[colorcode].unique()
        }

        # Apply the relabeling
        df["colorcode_labeled"] = df[colorcode].map(labeled_with_counts)

        # Update discrete mapping to use new labels
        colorcode_use = "colorcode_labeled"
        color_map_labeled = {
            labeled_with_counts[k]: v for k, v in color_map.items() if k in labeled_with_counts
        }
    else:
        # For continuous or numeric colorcodes, no relabeling
        colorcode_use = colorcode
        color_map_labeled = color_map

    # Increases size of solar system planets
    df["marker_size"] = np.where(_solar_mask(df), 3, 1)

    # Generate Plotly graphs; the discrete map only applies to categorical colour columns
    discrete_map = color_map_labeled if df[colorcode_use].dtype == "object" else None
    histogram = px.histogram(df, x=feature, nbins=100, color=colorcode_use, color_discrete_map=discrete_map)
    dotplot = px.scatter(df, x=feature, color=colorcode_use, color_discrete_map=discrete_map, size="marker_size", hover_data=['pl_name', 'hostname', 'discoverymethod', 'disc_refname', 'disc_locale', 'disc_facility', 'disc_telescope', 'disc_instrument'])
    dotplot.update_traces(
        hoverlabel=dict(
            font=dict(color="black"),
            bgcolor="white",
        ),
    )

    return histogram, dotplot, html.Pre(naninfo)

@app.callback(
    [Output('histogram', 'style'),
     Output('dotplot', 'style'),],
    [Input("tabs", "value")]
)
def update_tabs(tab):
    """Show the graph that belongs to the active tab and hide the other one.

    Returns a two-item list of CSS style dicts: first for the histogram,
    second for the dot plot.
    """
    shown, hidden = {"display": "block"}, {"display": "none"}
    if tab == "tab-2":
        return [hidden, shown]
    # "tab-1" and any unrecognised value both fall back to the histogram.
    return [shown, hidden]
    
# Code to update slider min and max values
@app.callback(
    [Output('min-value-slider', 'min'),
     Output('min-value-slider', 'max'),
     Output('min-value-slider', 'value'),
     Output('min-value-slider', 'step'),
     Output('max-value-slider', 'min'),
     Output('max-value-slider', 'max'),
     Output('max-value-slider', 'value'),
     Output('max-value-slider', 'step'),],
    [Input({'type': 'feature-radioitems', 'index': ALL}, 'value')]
)
def update_sliders(feature_values):
    """Rescale both range sliders to the span of the selected feature.

    Parameters
    ----------
    feature_values : list
        Values of every feature radio list; the first non-None one is used.

    Returns
    -------
    tuple
        (min, max, value, step) for the min slider followed by the same four
        properties for the max slider. The min slider starts at the lower
        bound and the max slider at the upper bound.

    Raises
    ------
    dash.exceptions.PreventUpdate
        When no feature is selected or the feature has no usable values.
    """
    # Only names *from* dash are imported at the top of this cell, so the
    # exception class has to be imported explicitly.
    from dash.exceptions import PreventUpdate

    feature = next((f for f in feature_values if f is not None), None)
    # This guard must come before any column access: looking up a None
    # column would raise KeyError.
    if feature is None:
        raise PreventUpdate

    values = pl_s[feature].dropna()
    if feature in ("rowupdate", "releasedate"):
        # Dates are expressed as whole seconds since the Unix epoch;
        # unparseable entries become NaT and are dropped before the int cast.
        stamps = pd.to_datetime(values, errors="coerce").dropna()
        values = (stamps - pd.Timestamp("1970-01-01")).dt.total_seconds().astype("int64")
    if values.empty:
        # No data to derive bounds from; keep the sliders as they are.
        raise PreventUpdate

    # Get min and max values for the selected feature and round them
    feature_min = float(np.floor(values.min()))
    feature_max = float(np.ceil(values.max()))
    span = feature_max - feature_min
    # A zero step is not a valid slider step; fall back to no step when the
    # feature has a single distinct (rounded) value.
    step = span / 20 if span > 0 else None

    return (
        feature_min, feature_max, feature_min, step,
        feature_min, feature_max, feature_max, step
    )

# Makes the separate feature radio lists behave like one shared radio group.
@app.callback(
    Output({'type': 'feature-radioitems', 'index': ALL}, 'value'),
    Input({'type': 'feature-radioitems', 'index': ALL}, 'value'),
    prevent_initial_call=True
)
def enforce_single_selection(values):
    """Keep only the selection of the radio list that fired the callback.

    Returns a list as long as ``values`` that is None everywhere except at
    the triggering list's ``index``, which keeps its current value. When no
    trigger is reported, every list is cleared.
    """
    source = ctx.triggered_id
    cleared = [None for _ in values]
    if not source:
        return cleared
    # The pattern-matching id's 'index' is used directly as the list position.
    keep = source['index']
    cleared[keep] = values[keep]
    return cleared

# Start the Dash server in debug mode on port 8053 when this code is the entry point.
if __name__ == '__main__':
    app.run(debug=True, port=8053)

In [4]:
# Created a 3D exoplanet map. Credits to Microsoft Copilot for most of the code.
# To do: add representation of Earth
# 2d: change size of dots, overlay earth map for reference
# Poster: add the starmaps first, with some labels.

import pandas as pd
import numpy as np
from dash import Dash, dcc, html, Input, Output, State, Dash, no_update
import plotly.graph_objects as go
import plotly.express as px

# imports Planetary systems csv
# (lines starting with "#" in the file are treated as comments and skipped)
pl_s = pd.read_csv('assets/Planetary_Systems.csv', comment="#", low_memory=False)

# imports atmospheric list for one filter.
at_s = pd.read_csv('assets/Atmospheric_Spectroscopy.csv', comment="#", low_memory=False)

# Dash app
# NOTE: this rebinds the module-level name `app`, so the callbacks registered
# below belong to this new app object.
app = Dash(__name__, external_stylesheets=["assets/style.css"])

# Page layout for the starmap app. The component ids defined here are the ones
# the update_graph and update_tabs callbacks below read from and write to, so
# renaming an id requires updating the matching Input/Output.
app.layout = html.Div([
    html.H1("3D Exoplanet \"Starmap\""),

    # --- Row filters (read by update_graph as `filters`) ---
    html.H2("Data filters:"),
    dcc.Checklist(
        id='filter-checklist',
        options=[
            {'label': 'Has atmospheric data', 'value': 'atmoData'},
            {'label': 'Default parameter set', 'value': 'default'},
            {'label': 'No controversial flag', 'value': 'noControv'},
            {'label': 'Water to Metal Density', 'value': 'densRange'},
            {'label': 'In Target Star Catalog', 'value': 'target'},
        ],
        value=['default', 'noControv'],  # Default values to filter by
        className="checkbox-container"
    ),
    # Optional host-star name filter: a toggle plus a debounced text box.
    html.Div([
        dcc.Checklist(
            id='host-star-toggle',
            options=[{'label': 'Filter by host star: ', 'value': 'enable'}],
            value=[],  # Start unchecked
            className="checkbox-container"
        ),
        dcc.Input(
            id='host-star-input',
            type='text',
            placeholder='Enter host star name',
            debounce=True,
            className="textbox-style"
        ),
    ], className="checkbox-container", style={'display': 'flex', 'alignItems': 'center', 'justifyContent': 'center', 'gap': '10px'}),

    # --- Planet type filter; every type is ticked by default ---
    html.H3("Planet type:"),
    dcc.Checklist(
        id='pltype-checklist',
        options=[
            {'label': 'Terrestrial', 'value': 'terrestrial'},
            {'label': 'Super-Earth', 'value': 'super_earth'},
            {'label': 'Neptune-like', 'value': 'neptune_like'},
            {'label': 'Gas Giant', 'value': 'gas_giant'},
            {'label': 'Unknown', 'value': 'unknown'},
            {'label': 'TBA', 'value': 'tba'},
        ],
        value=['terrestrial', 'super_earth', 'neptune_like', 'gas_giant', 'unknown', 'tba'],  # Default values to filter by
        className="checkbox-container"
    ),
    html.Div([
        dcc.Checklist(
            id='simplification-checkbox',
            options=[
                {'label': html.Span(['Simplify planet types?'], style={'fontWeight': 'bold'}), 'value': 'enable'},
            ],
            value=[],  # Start unchecked
            className="checkbox-container"
        ),
    ], className="checkbox-container", style={'display': 'flex', 'alignItems': 'center', 'justifyContent': 'center', 'gap': '10px'}),

    # --- Host star temperature class filter ---
    html.H3("Stellar effective temp:"),
    dcc.Checklist(
        id='teff-checklist',
        options=[
            {'label': 'O type (>33000 K)', 'value': 'O'},
            {'label': 'B type (10000-33000 K)', 'value': 'B'},
            {'label': 'A type (7300-10000 K)', 'value': 'A'},
            {'label': 'F type (6000-7300 K)', 'value': 'F'},
            {'label': 'G type (5300-6000 K)', 'value': 'G'},
            {'label': 'K type (3900-5300 K, recommended)', 'value': 'K'},
            {'label': 'M type (2300-3900 K)', 'value': 'M'},
            {'label': 'Unknown', 'value': 'unknown'},
        ],
        value=['O', 'B', 'A', 'F', 'G', 'K', 'M', 'unknown'],  # Default values to filter by
        className="checkbox-container"
    ),

    # --- Host star luminosity class filter ---
    html.H3("Stellar luminosity:"),
    dcc.Checklist(
        id='lum-checklist',
        options=[
            {'label': 'Hypergiant', 'value': '0'},
            {'label': 'Supergiant', 'value': 'I'},
            {'label': 'Bright Giant', 'value': 'II'},
            {'label': 'Giant', 'value': 'III'},
            {'label': 'Subgiant', 'value': 'IV'},
            {'label': 'Main-sequence/Dwarf (recommended)', 'value': 'V'},
            {'label': 'Subdwarf', 'value': 'VI'},
            {'label': 'White Dwarf', 'value': 'VII'},
            {'label': 'Unknown', 'value': 'unknown'},
        ],
        value=['0', 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'unknown'],  # Default values to filter by
        className="checkbox-container"
    ),
    # --- Metallicity ratio filter; 'null' is the value update_graph checks to keep missing ratios ---
    html.H3("Stellar metallicity ratio:"),
    dcc.Checklist(
        id='met-checklist',
        options=[
            {'label': 'Iron abundance (recommended)', 'value': '[Fe/H]'},
            {'label': 'General metal content', 'value': '[M/H]'},
            {'label': 'Unknown', 'value': 'null'},
        ],
        value=['[Fe/H]', '[M/H]', 'null'],  # Default values to filter by
        className="checkbox-container"
    ),

    # --- Column used to colour the traces (option values are column names) ---
    html.H2("Color coding:"),
    dcc.RadioItems(
        id="colorcode-radioitems",
        options=[
            {'label': 'Planet Type', 'value': 'pl_type'},
            {'label': 'St. Effective Temp.', 'value': 'st_teffclass'},
            {'label': 'Stellar Luminosity', 'value': 'st_lumclass'},
            # {'label': 'Host Star', 'value': 'hostname'}, # too much computational power.
        ],  
        value="pl_type",
        className="checkbox-container"
    ),

    # --- How missing distances are handled. The 'zero' option is disabled
    # here, although update_graph still has a branch for it. ---
    html.H2("How to handle NaN values:"),
    dcc.RadioItems(
        id="nanhandle-radioitems",
        options=[
            # {'label': 'Set to 0', 'value': 'zero'},
            {'label': 'Set to mean', 'value': 'mean'},
            {'label': 'Set to median', 'value': 'median'},
            {'label': 'Remove rows', 'value': 'remove'},
        ],
        value="median",
        className="checkbox-container"
    ),
    html.Div([
        dcc.Checklist(
            id='estimates-checkbox',
            options=[
                {'label': html.Span(['Estimate certain values?'], style={'fontWeight': 'bold'}), 'value': 'enable'},
            ],
            value=[],  # Start unchecked
            className="checkbox-container"
        ),
    ], className="checkbox-container", style={'display': 'flex', 'alignItems': 'center', 'justifyContent': 'center', 'gap': '10px'}),

    # --- Minimum plotted radius (passed to update_graph as `minRad`) ---
    html.H2("Min Radius:"),
    dcc.Slider(
        id="min-radius-slider",
        min=0,
        max=10000,
        step=100,
        value=1000,
        marks={i: str(i) for i in range(0, 10001, 1000)},
        tooltip={"placement": "bottom", "always_visible": True}
    ),

    # --- Tabs: both graphs always exist; update_tabs toggles their CSS display ---
    dcc.Tabs(id="tabs", value="tab-1", children=[
        dcc.Tab(label="3D Starmap", value="tab-1", className="tab"),
        dcc.Tab(label="2D Starmap", value="tab-2", className="tab"),
    ]),
    html.Div(id="tabs-content", children=[
        dcc.Graph(id="3d-starmap", figure={}, style={"display": "block"}),
        dcc.Graph(id="2d-starmap", figure={}, style={"display": "none"}),
    ]),  # This will hold the selected graph
    
    # Text box that update_graph overwrites with the entry count or an error message.
    html.P('NaN info:', id='naninfo-box'),

    # --- Data source links ---
    html.P("Data sourced from:"),
    html.Div([
        html.A("https://www.doi.org/10.26133/NEA12", href="https://www.doi.org/10.26133/NEA12", target="_blank"),
        html.Br(),
        html.A("https://www.doi.org/10.26133/NEA36", href="https://www.doi.org/10.26133/NEA36", target="_blank"),
        html.Br(),
        html.A("https://science.nasa.gov/exoplanets/exoplanet-catalog/", href="https://science.nasa.gov/exoplanets/exoplanet-catalog/", target="_blank"),
        html.Br(),
        html.A("https://science.nasa.gov/exoplanets/target-star-catalog/", href="https://science.nasa.gov/exoplanets/target-star-catalog/", target="_blank"),
    ])
], style={'color': 'black', 'font-family': 'Arial', 'backgroundColor': 'white', 'padding': '20px', 'text-align': 'center'})

def _fill_from_estimate(values, estimates):
    """Replace "unknown"/missing entries of ``values`` with ``estimates`` where one exists.

    Entries without an estimate are left untouched, so an "unknown" label is
    never silently turned into NaN (NaN categories would not be plotted).
    """
    needs_fill = values.eq("unknown") | values.isna()
    return values.mask(needs_fill & estimates.notna(), estimates)


@app.callback(
    [Output('3d-starmap', 'figure'),
     Output('2d-starmap', 'figure'),
     Output('naninfo-box', 'children')],
    [Input('filter-checklist', 'value'),
     Input('simplification-checkbox', 'value'),
     Input('estimates-checkbox', 'value'),
     Input('min-radius-slider', 'value'),
     Input('nanhandle-radioitems', 'value'),
     Input('host-star-toggle', 'value'),
     Input('host-star-input', 'value'),
     Input('colorcode-radioitems', 'value'),
     Input('pltype-checklist', 'value'),
     Input('teff-checklist', 'value'),
     Input('lum-checklist', 'value'),
     Input('met-checklist', 'value'),]

)
def update_graph(filters, simple, estimate, minRad, nanhandle, startoggle, starinput, colorcode, pltype, teff, lum, met):
    """Rebuild the 3D and 2D starmaps from the current filter settings.

    Parameters
    ----------
    filters : list[str]
        Ticked values of 'filter-checklist'.
    simple : list
        Non-empty when planet types should be merged into two groups.
    estimate : list
        Non-empty when missing/unknown values should be filled from the
        ``*est`` columns.
    minRad : number
        Distances at or below this value are drawn at radius 1; larger ones
        are drawn at ``sy_dist / minRad``. 0 plots the raw distance.
    nanhandle : str
        'mean' / 'median' fill missing ``sy_dist`` per planet type, 'zero'
        fills with 0, anything else removes rows without a distance.
    startoggle : list
        Non-empty when the host-star text filter is enabled.
    starinput : str | None
        Host star name typed by the user (None until something is typed).
    colorcode : str
        Column used to group and colour the traces.
    pltype, teff, lum, met : list[str]
        Allowed planet types, temperature classes, luminosity classes and
        metallicity ratios ('null' in ``met`` also keeps missing ratios).

    Returns
    -------
    tuple
        (3D figure, 2D figure, info text or ``html.Pre``).
    """
    # Load dataset
    df = pl_s.copy()

    # Apply filters
    if 'atmoData' in filters:
        df = df[df['pl_name'].isin(at_s['pl_name'])]
    if 'default' in filters:
        df = df[df['default_flag'] == True]
    if 'noControv' in filters:
        df = df[df['pl_controv_flag'] == False]
    if 'densRange' in filters:
        df = df[(df['pl_dens'] > 1) & (df['pl_dens'] < 5.6)] # Only gets planets with a density between water and metallic iron
    if 'target' in filters:
        # NOTE(review): target_stars is not defined in this cell; presumably it
        # comes from assets/lists.ipynb being run in an earlier cell - verify.
        df = df[df['hostname'].isin(target_stars)] # type: ignore
    df = df[df['pl_type'].isin(pltype)]
    df = df[df['st_teffclass'].isin(teff)]
    df = df[df['st_lumclass'].isin(lum)]
    if 'null' in met:
        df = df[df['st_metratio'].isin(met) | df['st_metratio'].isna()]
    else:
        df = df[df['st_metratio'].isin(met)]

    # The text box holds None until the user types, so guard before .strip().
    host_query = (starinput or "").strip().lower()
    if startoggle and host_query:
        # Normalize both sides for safe matching
        df = df[df['hostname'].str.lower() == host_query]

    # Returns empty graphs if filtered dataset is empty
    if df.empty:
        return go.Figure(), go.Figure(), "ERROR: filtered dataset is empty"

    # Work on an independent copy so the column assignments below do not
    # write into a slice of the filtered frame.
    df = df.copy()
    if simple:
        df["pl_type"] = df["pl_type"].replace({"super_earth": "terrestrial", "neptune_like": "gas_giant"})
    if estimate:
        df["pl_dens"] = df["pl_dens"].fillna(df["pl_densest"])
        df["st_teffclass"] = _fill_from_estimate(df["st_teffclass"], df["st_teffclassest"])
        df["st_lumclass"] = _fill_from_estimate(df["st_lumclass"], df["st_lumclassest"])

    # Replace NaNs with the value given by nanhandle
    if nanhandle == 'zero':
        df['sy_dist'] = df['sy_dist'].fillna(0)
    elif nanhandle in ('mean', 'median'):
        # Per-planet-type mean/median, ignoring NaNs; a type with no known
        # distance at all keeps its NaNs.
        group_fill = df.groupby('pl_type')['sy_dist'].transform(nanhandle)
        df['sy_dist'] = df['sy_dist'].fillna(group_fill)
    else:
        df = df[df["sy_dist"].notna()].copy()
    if df.empty:
        return go.Figure(), go.Figure(), "ERROR: filtered dataset is empty"
    naninfo = f"Total entries: {len(df)}"

    # Distances at or below minRad are drawn at radius 1 (i.e. minRad in scaled units);
    # larger ones are scaled by 1/minRad. The sy_dist column itself is not changed.
    dist = df['sy_dist'] if minRad == 0 else np.where(df['sy_dist'] <= minRad, 1, df['sy_dist'] / minRad)
    # Convert spherical coordinates (RA, Dec, Distance) to Cartesian (X, Y, Z)
    dec_rad = np.radians(df['dec'])
    ra_rad = np.radians(df['ra'])
    df['x'] = dist * np.cos(dec_rad) * np.cos(ra_rad)
    df['y'] = dist * np.cos(dec_rad) * np.sin(ra_rad)
    df['z'] = dist * np.sin(dec_rad)

    fig2d = go.Figure()
    fig3d = go.Figure()

    # Color legend
    color_map = {
        "terrestrial": "blue", "super_earth": "red", "neptune_like": "green", "gas_giant": "purple", "tba": "black",
        "II": "yellow", "III": "orange", "IV": "green", "V": "blue", "VI": "black",
        "B": "darkblue", "A": "royalblue", "F": "seagreen", "G": "yellow", "K": "orange", "M": "red",
        "unknown": "gray",
    }
    if colorcode in df.columns and df[colorcode].dtype == "object":
        # Count values
        label_counts = df[colorcode].value_counts().to_dict()

        # Create a mapping: e.g., "terrestrial" → "terrestrial (42)"
        labeled_with_counts = {
            key: f"{key} ({label_counts.get(key, 0)})" for key in df[colorcode].unique()
        }

        # Apply the relabeling
        df["colorcode_labeled"] = df[colorcode].map(labeled_with_counts)

        # Update discrete mapping to use new labels
        colorcode_use = "colorcode_labeled"
        color_map_labeled = {
            labeled_with_counts[k]: v for k, v in color_map.items() if k in labeled_with_counts
        }
    else:
        # For continuous or numeric colorcodes, no relabeling
        colorcode_use = colorcode
        color_map_labeled = color_map

    # Categories without an entry in color_map (e.g. "O", "0", "I", "VII")
    # would otherwise never get a trace and silently vanish from both maps,
    # although they are counted in "Total entries". Give them fallback colours.
    trace_colors = dict(color_map_labeled)
    fallback_palette = px.colors.qualitative.Plotly
    unmapped = [label for label in df[colorcode_use].dropna().unique() if label not in trace_colors]
    for position, label in enumerate(sorted(unmapped, key=str)):
        trace_colors[label] = fallback_palette[position % len(fallback_palette)]

    # Marker size for the 2D map grows linearly with distance between minSize and maxSize
    dmin = df['sy_dist'].min()
    dmax = df['sy_dist'].max()
    minSize = 2
    maxSize = 5

    if dmax != dmin:
        df['marker_size'] = minSize + (df['sy_dist'] - dmin) / (dmax - dmin) * (maxSize - minSize)
    else:
        df['marker_size'] = (minSize+maxSize)/2  # fallback size
    sizeref_val = 2. * maxSize / (maxSize**2)

    def _txt(column):
        # str + NaN yields NaN for the whole hover string, so blank fields are
        # replaced by a placeholder before concatenation.
        return df[column].fillna("unknown").astype(str)

    df['hover_text'] = (
        "Planet Name: " + _txt('pl_name') +
        "<br>Planet Type: " + _txt('pl_type') +
        "<br>RA: " + df['ra'].round(2).astype(str) + "°, Dec: " + df['dec'].round(2).astype(str) + "°" +
        "<br>Dist. from Earth: " + df['sy_dist'].round(2).astype(str) + " pc" +
        "<br>Disc. Method: " + _txt('discoverymethod') +
        "<br>Disc. Reference: " + _txt('disc_refname') +
        "<br>Disc. Locale: " + _txt('disc_locale') +
        "<br>Disc. Facility: " + _txt('disc_facility') +
        "<br>Disc. Telescope: " + _txt('disc_telescope') +
        "<br>Disc. Instrument: " + _txt('disc_instrument')
    )

    # Plotting Earth at the origin of the 3D map
    fig3d.add_trace(go.Scatter3d(
        x=[0],
        y=[0],
        z=[0],
        mode="markers",
        marker=dict(
            size=10,  # Make Earth larger
            color="white",
            line=dict(color="royalblue", width=4)
        ),
        text=["Planet Name: Earth<br>Planet Type: terrestrial"],  # custom HTML hover text
        hoverinfo='text',  # tell Plotly to use 'text' for hover display
        name="Earth"
    ))

    # One trace per category in each figure so the legend can toggle them.
    for category, color in trace_colors.items():
        filtered_df = df[(df[colorcode_use] == category) & (df["pl_name"] != "Earth")]
        if filtered_df.empty:
            continue
        sizes = filtered_df['marker_size']
        hover = filtered_df['hover_text']
        fig3d.add_trace(go.Scatter3d(
            x=filtered_df["x"],
            y=filtered_df["y"],
            z=filtered_df["z"],
            mode="markers",
            marker=dict(size=2, color=color),
            name=category,
            text=hover,
            hoverinfo='text'
        ))
        fig2d.add_trace(go.Scatter(
            x=filtered_df["ra"],
            y=filtered_df["dec"],
            mode='markers',
            name=category,
            marker=dict(
                size=sizes,
                color=color,
                sizemode='diameter',
                sizeref=sizeref_val,
                sizemin=minSize,
                line=dict(width=0)
            ),
            text=hover,
            hoverinfo='text'
        ))

    # The legend title follows the active colour coding (labels match the radio options).
    legend_titles = {
        'pl_type': "Planet Type",
        'st_teffclass': "St. Effective Temp.",
        'st_lumclass': "Stellar Luminosity",
    }

    # Improve legend visibility and title
    fig3d.update_layout(
        legend=dict(
            x=0, y=1,  # Positioning
            title=legend_titles.get(colorcode, "Planet Type"),
            bgcolor="rgba(255, 255, 255, 0.5)",  # Semi-transparent background
            borderwidth=1
        ),
        scene=dict(
            xaxis=dict(showgrid=False, showticklabels=False, visible=False),
            yaxis=dict(showgrid=False, showticklabels=False, visible=False),
            zaxis=dict(showgrid=False, showticklabels=False, visible=False),
            annotations=[]
        ),
        scene_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        margin=dict(l=0, r=0, b=0, t=0)
    )
    fig3d.update_traces(
        hoverlabel=dict(
            font=dict(color="black"),
            bgcolor="white",
        ),
    )

    return fig3d, fig2d, html.Pre(naninfo)

@app.callback(
    [Output('3d-starmap', 'style'),
     Output('2d-starmap', 'style'),],
    [Input("tabs", "value")]
)
def update_tabs(tab):
    """Show the starmap that belongs to the active tab and hide the other one.

    Returns a two-item list of CSS style dicts: first for the 3D map,
    second for the 2D map.
    """
    shown, hidden = {"display": "block"}, {"display": "none"}
    if tab == "tab-2":
        return [hidden, shown]
    # "tab-1" and any unrecognised value both fall back to the 3D map.
    return [shown, hidden]

# Start the Dash server in debug mode on port 8054 when this code is the entry point.
if __name__ == '__main__':
    app.run(debug=True, port=8054)

In [7]:
# Reload the planetary systems table and list every distinct discovery facility.
pl_s = pd.read_csv('assets/Planetary_Systems.csv', comment="#", low_memory=False)
print(pl_s["disc_facility"].unique())

['Xinglong Station' 'Thueringer Landessternwarte Tautenburg'
 'Okayama Astrophysical Observatory' 'W. M. Keck Observatory'
 'Multiple Observatories' 'Lick Observatory' 'Gemini Observatory'
 'Subaru Telescope' 'Paranal Observatory'
 'Cerro Tololo Inter-American Observatory' 'Mauna Kea Observatory'
 'European Space Agency (ESA) Gaia Satellite' 'Hubble Space Telescope'
 'Kepler' 'Haute-Provence Observatory' 'McDonald Observatory'
 'Anglo-Australian Telescope' 'Bohyunsan Optical Astronomical Observatory'
 'Transiting Exoplanet Survey Satellite (TESS)'
 'Roque de los Muchachos Observatory' 'K2'
 'Wide-field Infrared Survey Explorer (WISE) Sat' 'La Silla Observatory'
 'Calar Alto Observatory' 'NASA Infrared Telescope Facility (IRTF)'
 'CoRoT' 'Multiple Facilities' 'Yunnan Astronomical Observatory'
 'Spitzer Space Telescope' 'MEarth Project' 'Very Long Baseline Array'
 'Acton Sky Portal Observatory' 'Palomar Observatory' 'HATNet' 'HATSouth'
 'Kitt Peak National Observatory' 'Las Campanas Obse