In [1]:
import pandas as pd

In [4]:
df_k2 = pd.read_csv('data/k2Data.csv')
df_kepler = pd.read_csv('data/keplerData.csv')
df_toi = pd.read_csv('data/TOIData.csv')

In [22]:
print(df_k2.shape)
print(df_kepler.shape)
print(df_toi.shape)

(3992, 94)
(9564, 49)
(7668, 65)


## Standardize exoplanet catalogs across Kepler, K2, and TESS
We'll coalesce mission-specific columns into a unified schema and concatenate into a single dataframe.

In [None]:
from typing import Optional, List, Dict
import numpy as np
import pandas as pd
from math import isnan

# Small helpers
def coalesce(series_list: List[pd.Series]) -> pd.Series:
    """Return first non-null value across the provided series, element-wise.
    Ignores None inputs and aligns indices automatically.
    """
    ss: List[pd.Series] = [s for s in series_list if s is not None]
    if not ss:
        return pd.Series(dtype=object)
    out = ss[0].copy()
    for s in ss[1:]:
        if s is None:
            continue
        s = s.reindex(out.index)
        mask = out.isna()
        out.loc[mask] = s.loc[mask]
    return out

# Return numeric series or NaN series aligned to df index
def s_or_nan(df: pd.DataFrame, col: str) -> pd.Series:
    if col in df.columns:
        return pd.to_numeric(df[col], errors='coerce')
    return pd.Series(np.nan, index=df.index)

# Return raw/object series or NaN series aligned to df index
def s_or_nan_raw(df: pd.DataFrame, col: str) -> pd.Series:
    if col in df.columns:
        return df[col]
    return pd.Series(np.nan, index=df.index)

def to_float(s: pd.Series) -> pd.Series:
    if s is None:
        # Empty aligned series; call-sites should prefer s_or_nan when alignment matters
        return pd.Series(dtype=float)
    return pd.to_numeric(s, errors='coerce')

def hours_from_hours(s: pd.Series) -> pd.Series:
    return to_float(s)

def hours_from_minutes(s: pd.Series) -> pd.Series:
    return to_float(s) / 60.0

def ppm_from_ppm(s: pd.Series) -> pd.Series:
    return to_float(s)

def normalize_disposition(vals: pd.Series, mission: str) -> pd.Series:
    mapping = {
        'CANDIDATE': 'CANDIDATE',
        'CONFIRMED': 'CONFIRMED',
        'FALSE POSITIVE': 'FALSE POSITIVE',
        'FALSE_POSITIVE': 'FALSE POSITIVE',
        'FP': 'FALSE POSITIVE',
        'RETRACTED': 'RETRACTED',
        'NOT DISPOSITIONED': 'NOT DISPOSITIONED',
    }
    mission_map: Dict[str, Dict[str, str]] = {
        'Kepler': {
            'CANDIDATE': 'CANDIDATE',
            'CONFIRMED': 'CONFIRMED',
            'FALSE POSITIVE': 'FALSE POSITIVE',
            'FALSE_NEGATIVE': 'FALSE POSITIVE',
            'KOI': 'CANDIDATE',
            'NOT DISPOSITIONED': 'NOT DISPOSITIONED',
        },
        'K2': {
            'CANDIDATE': 'CANDIDATE',
            'CONFIRMED': 'CONFIRMED',
            'FALSE POSITIVE': 'FALSE POSITIVE',
        },
        'TESS': {
            'CANDIDATE': 'CANDIDATE',
            'CONFIRMED': 'CONFIRMED',
            'FALSE POSITIVE': 'FALSE POSITIVE',
            'INTERESTING': 'CANDIDATE',
        }
    }
    if vals is None:
        return pd.Series(dtype=object)
    def norm_one(x):
        if pd.isna(x):
            return np.nan
        xs = str(x).strip().upper()
        if xs in mapping:
            return mapping[xs]
        if mission in mission_map and xs in mission_map[mission]:
            return mission_map[mission][xs]
        if 'CONF' in xs:
            return 'CONFIRMED'
        if 'CAND' in xs or 'PC' in xs:
            return 'CANDIDATE'
        if 'FALSE' in xs or xs == 'FP':
            return 'FALSE POSITIVE'
        return xs
    return vals.apply(norm_one)

# Column order for final dataset
UNIFIED_COLUMNS = [
    'planet_name','host_star_id','mission','disposition','orbital_period_days',
    'transit_epoch_bjd','transit_duration_hours','transit_depth_ppm','planet_radius_re',
    'planet_mass_me','equilibrium_temp_k','insolation_flux','eccentricity','inclination_deg',
    'impact_parameter','stellar_teff_k','stellar_radius_rsun','stellar_mass_msun','stellar_metallicity',
    'stellar_logg','ra','dec','tic_id'
 ]

In [10]:
def standardize_kepler(df: pd.DataFrame) -> pd.DataFrame:
    out = pd.DataFrame(index=df.index)
    out['planet_name'] = coalesce([s_or_nan_raw(df, 'kepler_name'), s_or_nan_raw(df, 'kepoi_name')])
    out['host_star_id'] = s_or_nan_raw(df, 'kepid')
    out['mission'] = 'Kepler'
    out['disposition'] = normalize_disposition(coalesce([s_or_nan_raw(df, 'koi_disposition'), s_or_nan_raw(df, 'koi_pdisposition')]), 'Kepler')
    out['orbital_period_days'] = s_or_nan(df, 'koi_period')
    out['transit_epoch_bjd'] = to_float(coalesce([s_or_nan_raw(df, 'koi_time0bk'), s_or_nan_raw(df, 'koi_time0')]))
    out['transit_duration_hours'] = s_or_nan(df, 'koi_duration')
    out['transit_depth_ppm'] = s_or_nan(df, 'koi_depth')
    out['planet_radius_re'] = s_or_nan(df, 'koi_prad')
    out['planet_mass_me'] = s_or_nan(df, 'pl_masse') if 'pl_masse' in df.columns else pd.Series(np.nan, index=df.index)
    out['equilibrium_temp_k'] = s_or_nan(df, 'koi_teq')
    out['insolation_flux'] = s_or_nan(df, 'koi_insol')
    out['eccentricity'] = s_or_nan(df, 'koi_eccen')
    out['inclination_deg'] = s_or_nan(df, 'koi_incl')
    out['impact_parameter'] = s_or_nan(df, 'koi_impact')
    out['stellar_teff_k'] = s_or_nan(df, 'koi_steff')
    out['stellar_radius_rsun'] = s_or_nan(df, 'koi_srad')
    out['stellar_mass_msun'] = s_or_nan(df, 'koi_smass')
    out['stellar_metallicity'] = s_or_nan(df, 'koi_smet')
    out['stellar_logg'] = s_or_nan(df, 'koi_slogg')
    out['ra'] = s_or_nan(df, 'ra') if 'ra' in df.columns else pd.Series(np.nan, index=df.index)
    out['dec'] = s_or_nan(df, 'dec') if 'dec' in df.columns else pd.Series(np.nan, index=df.index)
    out['tic_id'] = s_or_nan(df, 'tic_id') if 'tic_id' in df.columns else pd.Series(np.nan, index=df.index)
    return out[UNIFIED_COLUMNS]

def standardize_k2(df: pd.DataFrame) -> pd.DataFrame:
    out = pd.DataFrame(index=df.index)
    out['planet_name'] = coalesce([s_or_nan_raw(df, 'pl_name'), s_or_nan_raw(df, 'k2_name')])
    out['host_star_id'] = coalesce([s_or_nan_raw(df, 'hostname'), s_or_nan_raw(df, 'epic_hostname')])
    out['mission'] = 'K2'
    out['disposition'] = normalize_disposition(s_or_nan_raw(df, 'disposition'), 'K2') if 'disposition' in df.columns else pd.Series(np.nan, index=df.index)
    out['orbital_period_days'] = s_or_nan(df, 'pl_orbper')
    out['transit_epoch_bjd'] = s_or_nan(df, 'pl_tranmid')
    out['transit_duration_hours'] = s_or_nan(df, 'pl_trandur') if 'pl_trandur' in df.columns else pd.Series(np.nan, index=df.index)
    out['transit_depth_ppm'] = s_or_nan(df, 'pl_trandep') if 'pl_trandep' in df.columns else pd.Series(np.nan, index=df.index)
    out['planet_radius_re'] = s_or_nan(df, 'pl_rade')
    me = s_or_nan(df, 'pl_masse') if 'pl_masse' in df.columns else pd.Series(np.nan, index=df.index)
    mj = s_or_nan(df, 'pl_massj') if 'pl_massj' in df.columns else pd.Series(np.nan, index=df.index)
    out['planet_mass_me'] = coalesce([me, mj * 317.828])
    out['equilibrium_temp_k'] = s_or_nan(df, 'pl_eqt')
    out['insolation_flux'] = s_or_nan(df, 'pl_insol')
    out['eccentricity'] = s_or_nan(df, 'pl_orbeccen') if 'pl_orbeccen' in df.columns else pd.Series(np.nan, index=df.index)
    out['inclination_deg'] = s_or_nan(df, 'pl_orbincl') if 'pl_orbincl' in df.columns else pd.Series(np.nan, index=df.index)
    out['impact_parameter'] = s_or_nan(df, 'pl_imppar') if 'pl_imppar' in df.columns else pd.Series(np.nan, index=df.index)
    out['stellar_teff_k'] = s_or_nan(df, 'st_teff')
    out['stellar_radius_rsun'] = s_or_nan(df, 'st_rad')
    out['stellar_mass_msun'] = s_or_nan(df, 'st_mass')
    out['stellar_metallicity'] = s_or_nan(df, 'st_met') if 'st_met' in df.columns else pd.Series(np.nan, index=df.index)
    out['stellar_logg'] = s_or_nan(df, 'st_logg') if 'st_logg' in df.columns else pd.Series(np.nan, index=df.index)
    out['ra'] = s_or_nan(df, 'ra') if 'ra' in df.columns else pd.Series(np.nan, index=df.index)
    out['dec'] = s_or_nan(df, 'dec') if 'dec' in df.columns else pd.Series(np.nan, index=df.index)
    out['tic_id'] = coalesce([s_or_nan(df, 'tic_id') if 'tic_id' in df.columns else pd.Series(np.nan, index=df.index),
                              s_or_nan(df, 'tid') if 'tid' in df.columns else pd.Series(np.nan, index=df.index)])
    return out[UNIFIED_COLUMNS]

def standardize_toi(df: pd.DataFrame) -> pd.DataFrame:
    out = pd.DataFrame(index=df.index)
    out['planet_name'] = coalesce([s_or_nan_raw(df, 'ctoi_alias'), s_or_nan_raw(df, 'toi')])
    out['host_star_id'] = coalesce([s_or_nan_raw(df, 'tid'), s_or_nan_raw(df, 'tic_id')])
    out['mission'] = 'TESS'
    out['disposition'] = normalize_disposition(s_or_nan_raw(df, 'tfopwg_disp'), 'TESS') if 'tfopwg_disp' in df.columns else pd.Series(np.nan, index=df.index)
    out['orbital_period_days'] = s_or_nan(df, 'pl_orbper') if 'pl_orbper' in df.columns else (s_or_nan(df, 'Orbital Period (days)') if 'Orbital Period (days)' in df.columns else pd.Series(np.nan, index=df.index))
    out['transit_epoch_bjd'] = s_or_nan(df, 'pl_tranmid') if 'pl_tranmid' in df.columns else pd.Series(np.nan, index=df.index)
    out['transit_duration_hours'] = s_or_nan(df, 'pl_trandurh') if 'pl_trandurh' in df.columns else pd.Series(np.nan, index=df.index)
    out['transit_depth_ppm'] = s_or_nan(df, 'pl_trandep') if 'pl_trandep' in df.columns else pd.Series(np.nan, index=df.index)
    out['planet_radius_re'] = s_or_nan(df, 'pl_rade') if 'pl_rade' in df.columns else pd.Series(np.nan, index=df.index)
    out['planet_mass_me'] = s_or_nan(df, 'pl_masse') if 'pl_masse' in df.columns else pd.Series(np.nan, index=df.index)
    out['equilibrium_temp_k'] = s_or_nan(df, 'pl_eqt') if 'pl_eqt' in df.columns else pd.Series(np.nan, index=df.index)
    out['insolation_flux'] = s_or_nan(df, 'pl_insol') if 'pl_insol' in df.columns else pd.Series(np.nan, index=df.index)
    out['eccentricity'] = s_or_nan(df, 'pl_orbeccen') if 'pl_orbeccen' in df.columns else pd.Series(np.nan, index=df.index)
    out['inclination_deg'] = s_or_nan(df, 'pl_orbincl') if 'pl_orbincl' in df.columns else pd.Series(np.nan, index=df.index)
    out['impact_parameter'] = s_or_nan(df, 'pl_imppar') if 'pl_imppar' in df.columns else pd.Series(np.nan, index=df.index)
    out['stellar_teff_k'] = s_or_nan(df, 'st_teff') if 'st_teff' in df.columns else pd.Series(np.nan, index=df.index)
    out['stellar_radius_rsun'] = s_or_nan(df, 'st_rad') if 'st_rad' in df.columns else pd.Series(np.nan, index=df.index)
    out['stellar_mass_msun'] = s_or_nan(df, 'st_mass') if 'st_mass' in df.columns else pd.Series(np.nan, index=df.index)
    out['stellar_metallicity'] = s_or_nan(df, 'st_met') if 'st_met' in df.columns else pd.Series(np.nan, index=df.index)
    out['stellar_logg'] = s_or_nan(df, 'st_logg') if 'st_logg' in df.columns else pd.Series(np.nan, index=df.index)
    out['ra'] = s_or_nan(df, 'ra') if 'ra' in df.columns else pd.Series(np.nan, index=df.index)
    out['dec'] = s_or_nan(df, 'dec') if 'dec' in df.columns else pd.Series(np.nan, index=df.index)
    out['tic_id'] = coalesce([s_or_nan(df, 'tid') if 'tid' in df.columns else pd.Series(np.nan, index=df.index), s_or_nan(df, 'tic_id') if 'tic_id' in df.columns else pd.Series(np.nan, index=df.index)])
    return out[UNIFIED_COLUMNS]

In [14]:
# Apply standardization and combine
df_kepler_std = standardize_kepler(df_kepler) if 'df_kepler' in globals() else standardize_kepler(df_kepler) if 'df_kepler' in dir() else None
df_k2_std = standardize_k2(df_k2)
df_toi_std = standardize_toi(df_toi)

frames = [d for d in [df_kepler_std, df_k2_std, df_toi_std] if d is not None]
unified_df = pd.concat(frames, ignore_index=True)

# Reorder columns explicitly and add any missing ones as NaN
for col in UNIFIED_COLUMNS:
    if col not in unified_df.columns:
        unified_df[col] = np.nan
unified_df = unified_df[UNIFIED_COLUMNS]

display(unified_df.head(10))
print(f"Unified rows: {len(unified_df):,}")
print(unified_df['mission'].value_counts(dropna=False))
print(unified_df['disposition'].value_counts(dropna=False).head(10))

Unnamed: 0,planet_name,host_star_id,mission,disposition,orbital_period_days,transit_epoch_bjd,transit_duration_hours,transit_depth_ppm,planet_radius_re,planet_mass_me,...,inclination_deg,impact_parameter,stellar_teff_k,stellar_radius_rsun,stellar_mass_msun,stellar_metallicity,stellar_logg,ra,dec,tic_id
0,Kepler-227 b,10797460,Kepler,CONFIRMED,9.488036,170.53875,2.9575,616.0,2.26,,...,,0.146,5455.0,0.927,,,4.467,291.93423,48.141651,
1,Kepler-227 c,10797460,Kepler,CONFIRMED,54.418383,162.51384,4.507,875.0,2.83,,...,,0.586,5455.0,0.927,,,4.467,291.93423,48.141651,
2,K00753.01,10811496,Kepler,CANDIDATE,19.89914,175.850252,1.7822,10800.0,14.6,,...,,0.969,5853.0,0.868,,,4.544,297.00482,48.134129,
3,K00754.01,10848459,Kepler,FALSE POSITIVE,1.736952,170.307565,2.40641,8080.0,33.46,,...,,1.276,5805.0,0.791,,,4.564,285.53461,48.28521,
4,Kepler-664 b,10854555,Kepler,CONFIRMED,2.525592,171.59555,1.6545,603.0,2.75,,...,,0.701,6031.0,1.046,,,4.438,288.75488,48.2262,
5,Kepler-228 d,10872983,Kepler,CONFIRMED,11.094321,171.20116,4.5945,1520.0,3.9,,...,,0.538,6046.0,0.972,,,4.486,296.28613,48.22467,
6,Kepler-228 c,10872983,Kepler,CONFIRMED,4.134435,172.97937,3.1402,686.0,2.77,,...,,0.762,6046.0,0.972,,,4.486,296.28613,48.22467,
7,Kepler-228 b,10872983,Kepler,CONFIRMED,2.566589,179.55437,2.429,227.0,1.59,,...,,0.755,6046.0,0.972,,,4.486,296.28613,48.22467,
8,K00114.01,6721123,Kepler,FALSE POSITIVE,7.36179,132.25053,5.022,234.0,39.21,,...,,1.169,6227.0,1.958,,,3.986,298.86435,42.151569,
9,Kepler-229 c,10910878,Kepler,CONFIRMED,16.068647,173.621937,3.5347,4910.0,5.76,,...,,0.052,5031.0,0.848,,,4.485,286.99948,48.37579,


Unified rows: 21,224
mission
Kepler    9564
TESS      7668
K2        3992
Name: count, dtype: int64
disposition
CANDIDATE         8482
FALSE POSITIVE    6324
CONFIRMED         5054
CP                 679
KP                 565
FA                  98
REFUTED             22
Name: count, dtype: int64


In [15]:
# Save output
output_path = 'data/unified_exoplanets.csv'
unified_df.to_csv(output_path, index=False)
print(f"Saved unified dataset to {output_path}")

Saved unified dataset to data/unified_exoplanets.csv
