# Deep Neural Network for Exoplanet Discovery Method Classification

In [5]:
import numpy as np
import tensorflow as tf
import pandas as pd

## Data

In [9]:
# The data is already preprocessed: standardized and one-hot encoded.
# NOTE(review): the preview below shows an 'Unnamed: 0' column counting 0, 1, 2, ... —
# it appears to be a row index that was written into the CSV; confirm before modelling.

composite_csv_path = 'Composite_preprocessed_NO_MV_BALANCED.csv'
composite_preprocessed = pd.read_csv(composite_csv_path)
composite_preprocessed.head(5)

Unnamed: 0,Number of Stars,Number of Planets,Number of Moons,Circumbinary Flag,Discovery Year,Detected by Radial Velocity Variations,Detected by Pulsar Timing Variations,Detected by Pulsation Timing Variations,Detected by Transits,Detected by Astrometric Variations,...,Controversial Flag,Galactic Latitude [deg],Galactic Longitude [deg],Ecliptic Latitude [deg],Ecliptic Longitude [deg],Number of Photometry Time Series,Number of Radial Velocity Time Series,Number of Stellar Spectra Measurements,Number of Emission Spectroscopy Measurements,Number of Transmission Spectroscopy Measurements
0,2,1,0,0,2007,1,0,0,0,0,...,0,78.28058,264.13775,18.33392,177.4179,1,2,0,0,0
1,1,1,0,0,2009,1,0,0,0,0,...,0,41.04437,108.719,74.95821,141.64699,1,1,0,0,0
2,1,1,0,0,2008,1,0,0,0,0,...,0,-21.05141,106.41269,38.22901,11.95935,1,1,0,0,0
3,1,2,0,0,2002,1,0,0,0,0,...,0,46.94447,69.16849,62.87885,223.24717,1,4,1,0,0
4,3,1,0,0,1996,1,0,0,0,0,...,0,13.20446,83.33558,69.46803,321.21176,1,4,3,0,0


### There are 2 possible results: either the exoplanet was discovered by transits or it wasn't,
### so we have 2 output units.
### We will use the features from the XGBoost refined features model 
### This keeps the models consistent, reduces complexity, and reuses the feature selection that was already validated with my refined-features XGBoost model
### It also allows a fair comparison between the models and is likely to result in better performance for the DNN

# Remove unimportant features

In [None]:
# Features dropped so the DNN uses the same refined feature set as the XGBoost model.
remove = ['Number of Radial Velocity Time Series',
          'Number of Stellar Spectra Measurements',
          'Controversial Flag',
          'Circumbinary Flag']

# The CSV preview shows an 'Unnamed: 0' column holding 0, 1, 2, ... — it looks like the
# row index that was written out when the file was saved (TODO confirm). It carries no
# physical information, and if left in place it is fed to the network as a feature, which
# risks leaking row order into the predictions. Drop any such column here; the list is
# empty (and the drop unchanged) when the file has no index artifact.
index_artifacts = [col for col in composite_preprocessed.columns
                   if str(col).startswith('Unnamed:')]

# `columns=` is the explicit form of `axis=1`; a new frame is returned, the input is untouched.
composite_unnecessary_removed = composite_preprocessed.drop(columns=remove + index_artifacts)

# Train Test Split

In [None]:
# Model inputs: every remaining column except the label column.
features = composite_unnecessary_removed.drop(columns=['Detected by Transits'])
# Label to predict: whether the exoplanet was detected by transits (0 or 1).
targets = composite_unnecessary_removed['Detected by Transits']

## Observe Feature Correlations

In [None]:
# Pairwise Pearson correlation between feature columns (pandas' default method, spelled out).
features.corr(method='pearson')

## Split data

In [None]:
# Holding out a test set lets us detect overfitting: the model is scored on rows it never saw.
# Rows are shuffled before splitting (the default) to remove dependencies that come from the
# order of the data; the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    features,
    targets,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

# Sanity check: feature rows and label rows in the training split must line up.
(x_train.shape, y_train.shape)

## DNN Configuration

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

### Create Model Container

In [None]:
# Create an empty Sequential model: a linear stack of layers.
# This is the simplest way to build a model in Keras — layers are appended one after
# another with `dnn_model.add(...)` in the cells that follow.

dnn_model = Sequential()

### Learn and represent complex relationships in the data

In [None]:
# First hidden layer: a dense (fully connected) layer with 64 neurons.
# It is common for hidden layers to start at a high number of neurons to capture complex patterns.
# We use Rectified Linear Unit (ReLU) as the activation function because ReLU helps mitigate
# the vanishing gradient problem and allows the model to learn complex patterns.
#
# The input width (x_train.shape[1] = number of input features) is declared with an explicit
# Input object added first, instead of passing `input_dim=` to the Dense layer: `input_dim`
# is the legacy spelling and current Keras warns against passing it to a layer in a
# Sequential model. The resulting network is the same.

dnn_model.add(tf.keras.Input(shape=(x_train.shape[1],)))
dnn_model.add(Dense(64, activation='relu'))

### Measure to prevent overfitting

In [None]:
# Dropout layer with a rate of 0.5: during training, each unit coming out of the previous
# layer is randomly zeroed with probability 0.5.
# Dropout is a regularization technique — randomly dropping units keeps the network from
# relying too heavily on any single neuron, which helps prevent overfitting.

dnn_model.add(Dropout(rate=0.5))

### Add another layer to learn more complex patterns
### Deeper networks (more layers) often capture more complex relationships 

In [None]:
# Second hidden layer: 32 fully connected neurons with ReLU activation.
dnn_model.add(Dense(units=32, activation='relu'))

# Evaluate Model performance

In [None]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import matplotlib.pyplot as plt