# Deep Neural Network for Exoplanet Discovery Method Classification

In [5]:
import numpy as np
import tensorflow as tf
import pandas as pd

## Data

In [9]:
# Read the preprocessed composite table from a CSV addressed by relative path
composite_preprocessed = pd.read_csv(
    filepath_or_buffer='Composite_preprocessed_NO_MV.csv',
)
# Preview the first five rows as the cell's rich output
composite_preprocessed.head(n=5)

Unnamed: 0,Number of Stars,Number of Planets,Number of Moons,Circumbinary Flag,Discovery Year,Detected by Radial Velocity Variations,Detected by Pulsar Timing Variations,Detected by Pulsation Timing Variations,Detected by Transits,Detected by Astrometric Variations,...,Controversial Flag,Galactic Latitude [deg],Galactic Longitude [deg],Ecliptic Latitude [deg],Ecliptic Longitude [deg],Number of Photometry Time Series,Number of Radial Velocity Time Series,Number of Stellar Spectra Measurements,Number of Emission Spectroscopy Measurements,Number of Transmission Spectroscopy Measurements
0,2,1,0,0,2007,1,0,0,0,0,...,0,78.28058,264.13775,18.33392,177.4179,1,2,0,0,0
1,1,1,0,0,2009,1,0,0,0,0,...,0,41.04437,108.719,74.95821,141.64699,1,1,0,0,0
2,1,1,0,0,2008,1,0,0,0,0,...,0,-21.05141,106.41269,38.22901,11.95935,1,1,0,0,0
3,1,2,0,0,2002,1,0,0,0,0,...,0,46.94447,69.16849,62.87885,223.24717,1,4,1,0,0
4,3,1,0,0,1996,1,0,0,0,0,...,0,13.20446,83.33558,69.46803,321.21176,1,4,3,0,0


In [10]:
# Print a structural summary of the frame: index range, column names,
# non-null counts and dtypes (useful to confirm no column has missing values)
composite_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5602 entries, 0 to 5601
Data columns (total 25 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Number of Stars                                   5602 non-null   int64  
 1   Number of Planets                                 5602 non-null   int64  
 2   Number of Moons                                   5602 non-null   int64  
 3   Circumbinary Flag                                 5602 non-null   int64  
 4   Discovery Year                                    5602 non-null   int64  
 5   Detected by Radial Velocity Variations            5602 non-null   int64  
 6   Detected by Pulsar Timing Variations              5602 non-null   int64  
 7   Detected by Pulsation Timing Variations           5602 non-null   int64  
 8   Detected by Transits                              5602 non-null   int64  
 9   Detected by Astrome

In [2]:


# NOTE(review): `mnist_dataset` and `mnist_info` are not defined in any cell
# visible here — presumably they come from a dataset-loading cell that was
# removed. Confirm this notebook survives Restart Kernel -> Run All.
mnist_train = mnist_dataset['train']
mnist_test = mnist_dataset['test']

# The dataset ships with train and test splits only, so a validation set has
# to be carved out of the training split by hand.
# Reserve 10% of the training examples for validation; the example count is
# read from the dataset metadata. The product is a float, so it is cast to
# an integer tensor before being used as a sample count.
num_validation_samples = tf.cast(
    0.1 * mnist_info.splits['train'].num_examples,
    tf.int64,
)

# Size of the test split, also as an int64 tensor
num_test_samples = tf.cast(mnist_info.splits['test'].num_examples, tf.int64)


# scale our data in some way to make the result more numerically stable
# have inputs between 0 and 1
# function scale that takes an MNIST image and its label as input
def scale(image, label):
    """Normalise an image's pixel values and pass its label through untouched.

    The image is cast to float32 and divided by 255. For 8-bit greyscale
    input (values 0..255) this maps every element into the range [0, 1].

    Returns:
        A 2-tuple ``(scaled_image, label)``.
    """
    normalised = tf.cast(image, tf.float32) / 255.
    return normalised, label

# .map() applies a custom transformation to every element of a dataset;
# here it rescales each (image, label) pair with `scale`
scaled_train_and_validation_data = mnist_train.map(scale)

# scale the test data too, so it has the same magnitude as train and validation
test_data = mnist_test.map(scale)


# shuffle the data 10000 at a time
BUFFER_SIZE = 10000

# Shuffle once before splitting. reshuffle_each_iteration=False is essential:
# with the default (reshuffle on every iteration) the order changes each time
# the dataset is iterated, so .take()/.skip() below would select a different
# partition on every epoch and validation samples would leak into training.
shuffled_train_and_validation_data = scaled_train_and_validation_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False
)

# validation data is the first `num_validation_samples` elements (10% of the
# training split) of the fixed shuffled order
validation_data = shuffled_train_and_validation_data.take(num_validation_samples)
# the train data is everything else, so we skip exactly the validation samples
train_data = shuffled_train_and_validation_data.skip(num_validation_samples)

# Re-shuffle only the training partition so every epoch still sees the
# training samples in a new order (the split above stays fixed).
train_data = train_data.shuffle(BUFFER_SIZE)


BATCH_SIZE = 100

# batch the train data so training can iterate over mini-batches;
# validation and test are each put into a single batch holding the whole set
train_data = train_data.batch(BATCH_SIZE)
validation_data = validation_data.batch(num_validation_samples)
test_data = test_data.batch(num_test_samples)

# take the next batch (it is the only batch) of the validation data;
# each element is an (inputs, targets) 2-tuple — presumably because the
# dataset was loaded with as_supervised=True (load call not visible here)
validation_inputs, validation_targets = next(iter(validation_data))

## Model

### Outline the model

In [6]:
input_size = 784
output_size = 10
hidden_layer_size = 50

# Build the feed-forward network layer by layer.
model = tf.keras.Sequential()

# Input layer: each observation is a 28x28x1 tensor (rank 3); Flatten unrolls
# it into a single vector of 28*28*1 = 784 values so dense layers can consume it.
model.add(tf.keras.layers.Flatten(input_shape=(28, 28, 1)))

# Two hidden layers. Dense computes: output = activation(dot(input, weight) + bias)
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))

# Output layer: one unit per class, activated with softmax
model.add(tf.keras.layers.Dense(output_size, activation='softmax'))

### Choose the optimizer and the loss function

In [7]:
model.compile(
    # Adam: an adaptive-learning-rate optimizer, used here as a general-purpose default
    optimizer='adam',
    # The sparse variant of categorical cross-entropy takes integer class
    # indices as targets, so the labels do not need to be one-hot encoded
    # (assumes the targets are integer labels — TODO confirm against the dataset)
    loss='sparse_categorical_crossentropy',
    # Metric(s) reported throughout training and testing
    metrics=['accuracy'],
)

### Training

In [8]:
# Upper bound on the number of passes over the training data
NUM_EPOCHS = 5

# Train on the batched training set and evaluate on the single validation
# batch after each epoch; verbose=2 prints one summary line per epoch.
model.fit(
    train_data,
    epochs=NUM_EPOCHS,
    validation_data=(validation_inputs, validation_targets),
    verbose=2,
)

Epoch 1/5
540/540 - 6s - loss: 0.3206 - accuracy: 0.9105 - val_loss: 0.0000e+00 - val_accuracy: 0.0000e+00
Epoch 2/5
540/540 - 5s - loss: 0.1344 - accuracy: 0.9604 - val_loss: 0.1131 - val_accuracy: 0.9680
Epoch 3/5
540/540 - 5s - loss: 0.0969 - accuracy: 0.9710 - val_loss: 0.0972 - val_accuracy: 0.9745
Epoch 4/5
540/540 - 5s - loss: 0.0752 - accuracy: 0.9776 - val_loss: 0.0841 - val_accuracy: 0.9745
Epoch 5/5
540/540 - 5s - loss: 0.0609 - accuracy: 0.9812 - val_loss: 0.0681 - val_accuracy: 0.9790


<tensorflow.python.keras.callbacks.History at 0x229f22494a8>

## Test the model

In [9]:
# Evaluate the trained model on the test data; with the model compiled with a
# single 'accuracy' metric, evaluate() yields the loss followed by the accuracy
test_loss, test_accuracy = model.evaluate(test_data)

      1/Unknown - 1s 1s/step - loss: 0.0897 - accuracy: 0.97 - 1s 1s/step - loss: 0.0897 - accuracy: 0.9728

In [10]:
# Report the test metrics to two decimals, with accuracy shown as a percentage
test_report = 'Test loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.)
print(test_report)

Test loss: 0.09. Test accuracy: 97.28%
