In [2]:
# !pip install tensorflow==2.0.0

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split


print(tf.__version__)

# https://www.tensorflow.org/tutorials/load_data/pandas_dataframe

2.0.0


In [3]:
csv_file = "./mast-info-unified-000004000.csv"
# csv_file = "./mast-info-unified.csv"
df = pd.read_csv(csv_file, index_col=0)
display(df.head())
display(df.dtypes)
display(df.shape)

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,flux_000000000,flux_000000001,flux_000000002,flux_000000003,...,flux_000003991,flux_000003992,flux_000003993,flux_000003994,flux_000003995,flux_000003996,flux_000003997,flux_000003998,flux_000003999,flux_000004000
0,757450.0,K00889.01,Kepler-75 b,CONFIRMED,CANDIDATE,0.999,11305.5,11333.433,11384.334,11288.82,...,11298.839,11279.101,11280.896,11347.004,11364.274,11293.777,11339.508,11247.401,11266.217,11344.707
1,1025986.0,K07621.01,,CANDIDATE,CANDIDATE,0.0,1256772.9,1256532.6,1256749.2,1256607.1,...,1256640.5,1256648.8,1256964.0,1256802.1,1256561.6,1256788.0,1256436.4,1256765.4,1256475.5,1256605.0
2,1026957.0,K00958.01,,CANDIDATE,CANDIDATE,1.0,127373.03,127471.375,127322.2,127416.195,...,127308.69,127347.94,127338.64,127412.79,127345.91,127514.53,127492.25,127435.23,127390.78,127481.14
3,1027438.0,K01010.01,,FALSE POSITIVE,FALSE POSITIVE,,46625.54,46659.367,46551.5,46562.5,...,46627.324,46606.805,46678.12,46634.367,46576.465,46628.56,46730.277,46681.21,46699.383,46630.242
4,1161345.0,K00984.01,,CANDIDATE,CANDIDATE,0.711,281025.47,281107.12,281115.75,281128.88,...,279630.28,279669.38,279746.53,279712.88,279650.66,279706.4,279592.78,279419.28,279650.66,279554.78


kepid               float64
kepoi_name           object
kepler_name          object
koi_disposition      object
koi_pdisposition     object
                     ...   
flux_000003996      float64
flux_000003997      float64
flux_000003998      float64
flux_000003999      float64
flux_000004000      float64
Length: 4007, dtype: object

(2052, 4007)

https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html

### Columna **koi_disposition**
La categoría de el KOI dentro de "Exoplanet Archive". Contempla los valores:
- (0) CANDIDATE
- (1) CONFIRMED
- (3) FALSE POSITIVE
- (4) NOT DISPOSITIONED 

Todos los KOI marcados como CONFIRMED están validados como exoplanetas válidos.<br>
Aquellos que tengan otra clasificación, siguen el criterio de la columna *koi_pdisposition* (Disposition Using Kepler Data)


### Columna **koi_pdisposition**
Valor alternativo que designa la clasificación mas probable de un KOI no confimado.<br>
Contempla los valores:

- (0) CANDIDATE: elementos que han superado todas las pruebas previas realizadas para identificar los falsos positivos, aunque esto no significa a priori que se hayan realizado todas las pruebas posibles. Una prueba futura puede confirmar este KOI como un falso positivo.
- (1) FALSE POSITIVE: elementos que han fallado al menos un test de los descritos en Batalha et al (2012). Pueden ocurrir cuando:
    - El KOI es en realidad una estrella binaria eclipsante
    - La curva de luz está contaminada por una estrella binaria eclipsante que emite luz de fondo
    - la variabilidad estelar se confunde con la coherencia planetaria
- (2) NOT DISPOSITIONED: objetos en los que las pruebas de disposición aún no han concluido.

El valor de esta clasificacion del KOI puede cambiar a lo largo del tiempo según avance en mayor profundidad el análisis de las imágenes, curvas de luz y futuras observaciones del KOI.

Teniendo clara las disposiciones de las columnas **koi_disposition** y **koi_pdisposition** unificaremos ambas clasificaciones en una única columna.<br>
Como **koi_disposition** es la primera columna a consultar si el KOI es un exoplaneta o no, y, en caso negativo, se consultará **koi_pdisposition**; haremos que los valores CONFIRMED de **koi_disposition** sustituyan los valores de **koi_pdisposition**

In [4]:
# donde koi_disposition = "CONFIRMED", sustituimos koi_pdisposition a "CONFIRMED"
df.loc[df['koi_disposition'] == "CONFIRMED", "koi_pdisposition"] = "CONFIRMED"

In [5]:
pd.Categorical(df['koi_pdisposition'])

[CONFIRMED, CANDIDATE, CANDIDATE, FALSE POSITIVE, CANDIDATE, ..., FALSE POSITIVE, CONFIRMED, FALSE POSITIVE, CONFIRMED, CANDIDATE]
Length: 2052
Categories (3, object): [CANDIDATE, CONFIRMED, FALSE POSITIVE]

In [6]:
# pasamos la clasificacion a categórico
df['koi_pdisposition'] = pd.Categorical(df['koi_pdisposition'])
df['koi_pdisposition'] = df.koi_pdisposition.cat.codes

# eliminamos columnas innecesarias
df = df.drop(['kepid', 'kepoi_name','kepler_name', 'koi_disposition','koi_score'], axis=1)

# eliminamos los flux que sean NaN
# df = df.dropna(axis='columns')

In [7]:
df.insert(1, "confirmed", 0)
df.loc[df['koi_pdisposition'] == 2, "confirmed"] = 1

In [8]:
df

Unnamed: 0,koi_pdisposition,confirmed,flux_000000000,flux_000000001,flux_000000002,flux_000000003,flux_000000004,flux_000000005,flux_000000006,flux_000000007,...,flux_000003991,flux_000003992,flux_000003993,flux_000003994,flux_000003995,flux_000003996,flux_000003997,flux_000003998,flux_000003999,flux_000004000
0,1,0,11305.500,11333.433,11384.334,1.128882e+04,1.134004e+04,11302.961,11320.389,1.132288e+04,...,11298.839,11279.101,11280.896,11347.004,1.136427e+04,11293.777,11339.508,11247.401,1.126622e+04,1.134471e+04
1,0,0,1256772.900,1256532.600,1256749.200,1.256607e+06,1.256606e+06,1256651.000,1257024.900,1.256846e+06,...,1256640.500,1256648.800,1256964.000,1256802.100,1.256562e+06,1256788.000,1256436.400,1256765.400,1.256476e+06,1.256605e+06
2,0,0,127373.030,127471.375,127322.200,1.274162e+05,1.274061e+05,127498.875,127394.766,1.273083e+05,...,127308.690,127347.940,127338.640,127412.790,1.273459e+05,127514.530,127492.250,127435.230,1.273908e+05,1.274811e+05
3,2,1,46625.540,46659.367,46551.500,4.656250e+04,4.670499e+04,46577.070,46612.170,4.665443e+04,...,46627.324,46606.805,46678.120,46634.367,4.657646e+04,46628.560,46730.277,46681.210,4.669938e+04,4.663024e+04
4,0,0,281025.470,281107.120,281115.750,2.811289e+05,2.810300e+05,281087.970,281110.100,2.811821e+05,...,279630.280,279669.380,279746.530,279712.880,2.796507e+05,279706.400,279592.780,279419.280,2.796507e+05,2.795548e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2047,2,1,20196.771,20187.938,20217.463,2.021479e+04,2.023160e+04,20238.190,20151.418,2.019932e+04,...,20213.924,20268.210,20211.129,20240.803,2.015694e+04,20161.758,20205.070,20243.924,2.017517e+04,2.017169e+04
2048,1,0,7444.926,7415.429,7396.656,7.374800e+03,7.459314e+03,7386.945,7463.753,7.412065e+03,...,7405.416,7410.583,7378.180,7434.868,7.441580e+03,7390.238,7387.744,7424.705,7.411852e+03,7.361608e+03
2049,2,1,268170.120,268132.560,268216.940,2.681775e+05,2.681086e+05,268116.160,268145.300,2.680695e+05,...,268170.340,268273.800,268150.530,268022.200,2.680848e+05,268157.900,268151.160,268067.000,2.680373e+05,2.682637e+05
2050,1,0,104238.190,104252.164,104253.120,1.042484e+05,1.042650e+05,104293.390,104218.190,1.043329e+05,...,104252.414,104212.050,104245.260,104257.380,1.041969e+05,104219.110,104255.380,104321.020,1.044072e+05,1.041903e+05


# --------------------------------

In [9]:
confirmed = df.pop('confirmed')

In [10]:
dataset = tf.data.Dataset.from_tensor_slices((df.values, confirmed.values))

In [11]:
for feat, targ in dataset.take(5):
  print ('Features: {}, Target: {}'.format(feat, targ))

Features: [1.0000000e+00 1.1305500e+04 1.1333433e+04 ... 1.1247401e+04 1.1266217e+04
 1.1344707e+04], Target: 0
Features: [      0.  1256772.9 1256532.6 ... 1256765.4 1256475.5 1256605. ], Target: 0
Features: [     0.    127373.03  127471.375 ... 127435.23  127390.78  127481.14 ], Target: 0
Features: [2.0000000e+00 4.6625540e+04 4.6659367e+04 ... 4.6681210e+04 4.6699383e+04
 4.6630242e+04], Target: 1
Features: [     0.   281025.47 281107.12 ... 279419.28 279650.66 279554.78], Target: 0


In [12]:
tf.constant(df['koi_pdisposition'])

<tf.Tensor: id=21, shape=(2052,), dtype=int32, numpy=array([1, 0, 0, ..., 2, 1, 0])>

In [13]:
train_dataset = dataset.shuffle(len(df)).batch(1)

In [14]:
def get_compiled_model(df):
  network = tf.keras.Sequential()

  network.add(tf.keras.layers.Dense(512,activation='relu',input_shape=(df.shape[1],)))
  network.add(tf.keras.layers.Dense(256,activation='relu'))
  network.add(tf.keras.layers.Dense(128,activation='relu'))
  network.add(tf.keras.layers.Dense(64,activation='relu'))
  network.add(tf.keras.layers.Dense(32,activation='relu'))
  network.add(tf.keras.layers.Dense(32,activation='relu'))
  network.add(tf.keras.layers.Dense(2,activation='softmax'))

  network.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy']
  )
  return network

In [15]:
model = get_compiled_model(df)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               2049536   
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_5 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 6

In [None]:
model.fit(train_dataset, epochs=15)

Epoch 1/15
