In [5]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
import kagglehub
mobinchowdhury_k2_nasa_path = kagglehub.dataset_download('mobinchowdhury/k2-nasa')
mobinchowdhury_exoplanet_dataset_path = kagglehub.dataset_download('mobinchowdhury/exoplanet-dataset')

print('Data source import complete.')


Data source import complete.


In [3]:
!pip install pandas scikit-learn torch pytorch-tabnet joblib
!pip install onnx onnxruntime



In [None]:
!pip install --upgrade scikit-learn

Step 1: Import Modules and Prep Data

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from pytorch_tabnet.tab_model import TabNetClassifier
from imblearn.over_sampling import RandomOverSampler
import torch

# ---------------------------
# Step 1: Load and merge datasets
# ---------------------------
def load_nasa_dataset(tess_file, kepler_file, k2_file):
    # Load CSVs, skip commented lines
    tess_df = pd.read_csv(tess_file, comment='#')
    kepler_df = pd.read_csv(kepler_file, comment='#')
    k2_df = pd.read_csv(k2_file, comment='#')

    # Define the "canonical" columns we want in the merged dataset
    common_cols = ['planet_name', 'disposition', 'orbital_period', 'planet_radius',
                   'equilibrium_temp', 'insolation_flux', 'transit_depth', 'transit_duration',
                   'stellar_teff', 'stellar_logg', 'stellar_radius', 'ra', 'dec']

    # Helper to map CSV-specific column names to canonical names
    def rename_columns(df, mapping):
        df = df.copy()
        for k, v in mapping.items():
            if k in df.columns:
                df.rename(columns={k: v}, inplace=True)
        return df
    def normalize_disposition(df, source='generic'):
        df = df.copy()
        if 'disposition' in df.columns:
            if source == 'tess':  # TESS uses CP/PC/FP
                mapping = {'CP': 'CONFIRMED', 'PC': 'CANDIDATE', 'FP': 'FALSE POSITIVE'}
                df['disposition'] = df['disposition'].map(mapping)
            else:  # Kepler/K2 already use text like CONFIRMED/CANDIDATE/FALSE POSITIVE
                df['disposition'] = df['disposition'].replace({
                    'CANDIDATE': 'CANDIDATE',
                    'CONFIRMED': 'CONFIRMED',
                    'FALSE POSITIVE': 'FALSE POSITIVE',
                    'REFUTED': 'FALSE POSITIVE'
                })
        return df

    # Kepler mapping
    kepler_map = {
        'kepler_name': 'planet_name',
        'koi_disposition': 'disposition',
        'koi_period': 'orbital_period',
        'koi_prad': 'planet_radius',
        'koi_teq': 'equilibrium_temp',
        'koi_insol': 'insolation_flux',
        'koi_depth': 'transit_depth',
        'koi_duration': 'transit_duration',
        'koi_steff': 'stellar_teff',
        'koi_slogg': 'stellar_logg',
        'koi_srad': 'stellar_radius'
    }

    # K2 mapping
    k2_map = {
        'pl_name': 'planet_name',
        'disposition': 'disposition',
        'pl_orbper': 'orbital_period',
        'pl_rade': 'planet_radius',
        'pl_eqt': 'equilibrium_temp',
        'pl_insol': 'insolation_flux',
        'pl_rade': 'planet_radius',
        'st_teff': 'stellar_teff',
        'st_logg': 'stellar_logg',
        'st_rad': 'stellar_radius'
    }

    # TESS mapping
    tess_map = {
        'toi': 'planet_name',
        'tfopwg_disp': 'disposition',
        'pl_orbper': 'orbital_period',
        'pl_rade': 'planet_radius',
        'pl_eqt': 'equilibrium_temp',
        'pl_insol': 'insolation_flux',
        'pl_trandep': 'transit_depth',
        'pl_trandurh': 'transit_duration',
        'st_teff': 'stellar_teff',
        'st_logg': 'stellar_logg',
        'st_rad': 'stellar_radius'
    }

    # Rename columns
    kepler_df = rename_columns(kepler_df, kepler_map)
    k2_df = rename_columns(k2_df, k2_map)
    tess_df = rename_columns(tess_df, tess_map)
    kepler_df = normalize_disposition(kepler_df, 'kepler')
    k2_df = normalize_disposition(k2_df, 'k2')
    tess_df = normalize_disposition(tess_df, 'tess')

    # Keep only columns that exist in each dataframe
    kepler_df = kepler_df[[c for c in common_cols if c in kepler_df.columns]]
    k2_df = k2_df[[c for c in common_cols if c in k2_df.columns]]
    tess_df = tess_df[[c for c in common_cols if c in tess_df.columns]]

    # Merge all datasets
    merged_df = pd.concat([kepler_df, k2_df, tess_df], ignore_index=True)

    return merged_df

# Example usage
merged_df = load_nasa_dataset(
    mobinchowdhury_exoplanet_dataset_path+"/tess.csv",
    mobinchowdhury_exoplanet_dataset_path+"/keplar.csv",
    mobinchowdhury_k2_nasa_path+"/k2panda.csv"
)

print("Merged dataset shape:", merged_df.shape)
print(merged_df.head())

Merged dataset shape: (21267, 13)
    planet_name     disposition  orbital_period  planet_radius  \
0  Kepler-227 b       CONFIRMED        9.488036           2.26   
1  Kepler-227 c       CONFIRMED       54.418383           2.83   
2           NaN       CANDIDATE       19.899140          14.60   
3           NaN  FALSE POSITIVE        1.736952          33.46   
4  Kepler-664 b       CONFIRMED        2.525592           2.75   

   equilibrium_temp  insolation_flux  transit_depth  transit_duration  \
0             793.0            93.59          615.8           2.95750   
1             443.0             9.11          874.8           4.50700   
2             638.0            39.30        10829.0           1.78220   
3            1395.0           891.96         8079.2           2.40641   
4            1406.0           926.16          603.3           1.65450   

   stellar_teff  stellar_logg  stellar_radius         ra        dec  
0        5455.0         4.467           0.927  291.93423  48

Step 2: Polish data and split for test and train

In [8]:
merged_df = merged_df.dropna(subset=['disposition'])

# Features and target

candidate_df = merged_df[merged_df['disposition'] == 'CANDIDATE'].copy()

# Drop unused columns
candidate_X = candidate_df.drop(columns=['disposition', 'planet_name'])



filtered_df = merged_df[merged_df['disposition'].isin(['CONFIRMED', 'FALSE POSITIVE'])].copy()
X = filtered_df.drop(columns=['disposition', 'planet_name'])
# Fill missing values in numeric columns with median
X = X.fillna(X.median())

# Standardize numeric features
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

# ---------------------------
# Step 3: Handle class imbalance
# ---------------------------
# ros = RandomOverSampler(random_state=42)
# X_res, y_res = ros.fit_resample(X, y)
# Encode target labels


y = filtered_df['disposition']
le = LabelEncoder()
y = le.fit_transform(y)  # e.g., 'CONFIRMED' -> 0, 'CANDIDATE' -> 1, etc.
feature_names = list(X.columns)
# Apply same preprocessing 
candidate_X = candidate_X.fillna(X.median())

# Make sure columns are in the same order as training
candidate_X = candidate_X[feature_names]

# Convert to numpy
candidate_input = candidate_X.values
# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y, test_size=0.2, random_state=42, stratify=y
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train distribution:", np.bincount(y_train))
print("y_test distribution:", np.bincount(y_test))
print(y)
candidate_df
le.classes_

X_train shape: (9675, 11)
X_test shape: (2419, 11)
y_train distribution: [4595 5080]
y_test distribution: [1149 1270]
[0 0 1 ... 1 1 1]


array(['CONFIRMED', 'FALSE POSITIVE'], dtype=object)

Step 3: Train the model & Save the pkl


In [None]:
# ============================
# 3. Train TabNet
# ============================
# ---------------------------
# Step 5: TabNet training
# ---------------------------
scheduler_params = {"step_size": 50, "gamma": 0.9}
clf = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-3),
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=scheduler_params,
    n_steps=8, n_d=16, n_a=16, gamma=1.5,
    mask_type="entmax"
)

clf.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    max_epochs=200,
    patience=200,
    batch_size=256,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

import joblib
# Save the model with map_location='cpu' to ensure it can be loaded on a CPU-only machine
joblib.dump(clf, "tabnet_exoplanet.pkl") # map_location='cpu' is not a joblib argument
joblib.dump(y, "target_encoder.pkl")

print("✅ Model and encoder saved!")



```
Evaluate the model
```






In [None]:
# ============================
# 4. Evaluation
# ============================
# Step 6: Evaluation
# ---------------------------
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [9]:
import joblib
import torch

# Load model
# Specify map_location='cpu' if the model was saved from a GPU machine
# and you are loading it on a CPU-only machine.
clf = joblib.load("/kaggle/input/tabnet_exoplannet/pytorch/default/1/tabnet_exoplanet.pkl")

**Evaluate model agaisnt candidate plannet data to figure out which candate has high possibility of being confirmed **

In [10]:
candidate_probs = clf.predict_proba(candidate_input)

In [12]:
import json

# Add probabilities to candidate_df safely
for i, class_name in enumerate(le.classes_):
    col_name = f"prob_{class_name.replace(' ', '_').lower()}"
    candidate_df[col_name] = candidate_probs[:, i]

# Print first 5 rows
print(candidate_df.head())

# Export the entire DataFrame to JSON (records = list of dicts)
candidate_json = candidate_df.to_json(orient='records')

# Optional: save to file
with open("/kaggle/working/candidate_predictions.json", "w") as f:
    f.write(candidate_json)

print("✅ Exported candidate predictions to JSON")



   planet_name disposition  orbital_period  planet_radius  equilibrium_temp  \
2          NaN   CANDIDATE       19.899140          14.60             638.0   
58         NaN   CANDIDATE       40.419504           7.51             467.0   
62         NaN   CANDIDATE        7.240661          19.45             734.0   
63         NaN   CANDIDATE        3.435916           0.55            1272.0   
84         NaN   CANDIDATE       10.181584           7.73             812.0   

    insolation_flux  transit_depth  transit_duration  stellar_teff  \
2             39.30        10829.0            1.7822        5853.0   
58            11.29         6256.0            3.3620        5446.0   
62            68.63          556.4            0.5580        5005.0   
63           617.61           23.2            3.1330        5779.0   
84           102.91         5741.1            3.5089        5988.0   

    stellar_logg  stellar_radius         ra        dec  prob_confirmed  \
2          4.544           0.8

In [15]:
import numpy as np
import torch

# Feature names used by your model (excluding planet_name and disposition)
feature_names = ['orbital_period', 'planet_radius', 'equilibrium_temp', 'insolation_flux', 
                 'transit_depth', 'transit_duration', 'stellar_teff', 'stellar_logg', 
                 'stellar_radius', 'ra', 'dec']

# Single candidate's data
candidate_data = [6.444, 2.460, 712.281, 60.831, 1121.951, 1.931, 4803.0, 4.521, 0.737, 296.004, -47.562]

# Convert to numpy array and reshape to (1, n_features)
candidate_array = np.array(candidate_data, dtype=np.float32).reshape(1, -1)

# Convert to torch tensor (if your model is on CPU)
candidate_tensor = torch.tensor(candidate_array)


# Predict probabilities using the trained TabNetClassifier
candidate_probs = clf.predict_proba(candidate_tensor.numpy())  # clf expects numpy array

# Assuming you already have your LabelEncoder `le` to map classes
for i, class_name in enumerate(le.classes_):
    print(f"Probability of {class_name}: {candidate_probs[0, i]:.4f}")

# Specifically for CONFIRMED
confirmed_index = np.where(le.classes_ == "CONFIRMED")[0][0]
print(f"\nProbability of CONFIRMED: {candidate_probs[0, confirmed_index]:.4f}")


Probability of CONFIRMED: 0.9181
Probability of FALSE POSITIVE: 0.0819

Probability of CONFIRMED: 0.9181


In [None]:
import json

# Convert the entire candidate_df to list of dictionaries
candidate_list = candidate_df.to_dict(orient="records")

# Export to JSON
with open("/kaggle/working/top-candidate_data.json", "w") as f:
    json.dump(candidate_list, f, indent=4)

# Optional: print first 5 entries nicely
print(json.dumps(candidate_list[:5], indent=4))



In [None]:
print("Candidate cols:", candidate_df.columns.tolist())
print("Train cols:", X.columns.tolist())
print("Shape candidate:", candidate_df.shape, "train:", X_train.shape[1])
print(candidate_df.head())


In [None]:
import matplotlib.pyplot as plt
import pandas as pd


# Get TabNet feature importances
importances = clf.feature_importances_

# Combine into DataFrame
feat_imp = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values(by="importance", ascending=False)

print(feat_imp)

# Plot top 15 features
plt.figure(figsize=(10, 6))
plt.barh(feat_imp["feature"].head(15), feat_imp["importance"].head(15))
plt.gca().invert_yaxis()  # so most important is on top
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("TabNet Feature Importances")
plt.show()


In [None]:
import json
import pandas as pd

# Suppose X is your DataFrame of features
feature_names = X.columns.tolist()
importances = clf.feature_importances_  # TabNet feature importances

# Combine names and importances into a dictionary
feat_dict = {name: float(imp) for name, imp in zip(feature_names, importances)}

# Export to JSON
with open("tabnet_feature_importances.json", "w") as f:
    json.dump(feat_dict, f, indent=4)

print("Feature importances saved to tabnet_feature_importances.json")


In [None]:
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
import joblib

# Load your TabNet model
clf = joblib.load("tabnet_exoplanet.pkl")

# Take a small batch from your data (replace X_test with your test data)
sample = torch.tensor(X_test[:5], dtype=torch.float32)

# Get the model output
output_proba = clf.predict_proba(sample)  # probabilities
output_label = clf.predict(sample)        # class labels

print("Output shape (predict_proba):", output_proba.shape)
print("Sample output (predict_proba):\n", output_proba)

print("Output shape (predict):", output_label.shape)
print("Sample output (predict):\n", output_label)

# Determine format
if output_proba.shape[1] == 1:
    print("Model outputs single probability per sample (sigmoid).")
elif output_proba.shape[1] == 2:
    print("Model outputs two probabilities (softmax over two classes).")
else:
    print("Unknown output format!")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Get explanations (feature masks) for X_test
explain_matrix, masks = clf.explain(X_test)

# Example: check the first test sample
sample_idx = 7
sample_explain = explain_matrix[sample_idx]

# Plot local feature importance for that sample
plt.figure(figsize=(10,6))
plt.barh(range(len(feature_names)), sample_explain, align="center")
plt.yticks(range(len(feature_names)), feature_names)
plt.xlabel("Importance for this sample")
plt.title(f"TabNet Local Explanation for Sample {sample_idx}")
plt.show()


In [None]:
import pandas as pd

# Average importance across all samples
global_importance = np.mean(explain_matrix, axis=0)

pd.DataFrame({
    "feature": feature_names,
    "mean_importance": global_importance
}).sort_values("mean_importance", ascending=False)


In [None]:
import json

# Compute min and max for each feature
feature_ranges = {}
for col in X.columns:
    min_val = X[col].min()
    max_val = X[col].max()
    feature_ranges[col] = {"range": [float(min_val), float(max_val)]}

# Save as JSON
with open("feature_ranges.json", "w") as f:
    json.dump(feature_ranges, f, indent=4)

print(feature_ranges)

In [13]:
import torch

# Make sure the model is in eval mode

# Suppose X_train is your training data (numpy or pandas)
# If X_train is a DataFrame:

# If X_train is a numpy array and you have a separate feature_names list, use it
# feature_names = ["feat1", "feat2", ..., "featN"]

# Create a dummy input with the same number of features
device = "cuda" if torch.cuda.is_available() else "cpu"

# Move the model to device
clf.network.to(device)
clf.network.eval()  # important for export

# Create dummy input on the same device
dummy_input = torch.tensor(np.zeros((1, X.shape[1])), dtype=torch.float32).to(device)

# Export to ONNX
torch.onnx.export(
    clf.network, # Export the underlying PyTorch network
    dummy_input,
    "/kaggle/working/tabnet_exoplanet.onnx",
    export_params=True,  # store the trained parameters inside the model file
    opset_version=11,    # the ONNX version to export the model to
    do_constant_folding=True, # whether to execute constant folding for optimization
    input_names=['input'],   # the model's input names
    output_names=['output'], # the model's output names
    dynamic_axes={'input': {0: 'batch_size'},    # variable length axes
                  'output': {0: 'batch_size'}})

print("Export completed. ONNX file ready for web use.")


  chunks = x.chunk(int(np.ceil(x.shape[0] / self.virtual_batch_size)), 0)


Export completed. ONNX file ready for web use.


In [21]:
import onnxruntime as ort

# Load the model
session = ort.InferenceSession("/kaggle/working/tabnet_exoplanet.onnx")
input_name = session.get_inputs()[0].name
input_name
outputs = session.run(None, {input_name: candidate_tensor.numpy()})
outputs

[array([[2.9333153, 0.5169577]], dtype=float32),
 array(-0.5776607, dtype=float32)]

In [24]:
print(outputs[0])
session.get_outputs()

[[2.9333153 0.5169577]]


[<onnxruntime.capi.onnxruntime_pybind11_state.NodeArg at 0x7da264df8b70>,
 <onnxruntime.capi.onnxruntime_pybind11_state.NodeArg at 0x7da264e10db0>]

In [25]:
import numpy as np

# The raw output (logits) from your ONNX session
onnx_output_logits = outputs[0]

# Apply softmax function
exp_logits = np.exp(onnx_output_logits)
softmax_probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

print("Raw ONNX output (logits):\n", onnx_output_logits)
print("Softmax probabilities:\n", softmax_probs)

# Assuming le (LabelEncoder) is still available
if 'le' in globals():
    print("\nProbabilities with class names:")
    for i, class_name in enumerate(le.classes_):
        print(f"{class_name}: {softmax_probs[0, i]:.4f}")
else:
    print("\nLabelEncoder 'le' not found. Cannot display class names.")

Raw ONNX output (logits):
 [[2.9333153 0.5169577]]
Softmax probabilities:
 [[0.9180662  0.08193382]]

Probabilities with class names:
CONFIRMED: 0.9181
FALSE POSITIVE: 0.0819
