In [1]:
import pandas as pd


# Load the training set and preview its first rows
df = pd.read_csv('data/training_set.csv')
df.head(n=5)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [None]:
# TODO: see https://arxiv.org/pdf/1711.10609.pdf

In [2]:
import itertools


def generate_series(paths, prepare, chunk_size=1e6):
    """Stream CSV files and yield one prepared series per ``object_id``.

    The files are read chunk by chunk, in the order given by ``paths``. The
    rows that share the ``object_id`` of the last row of a chunk are held
    back ("orphans") and prepended to the next chunk, because that object may
    continue there. Once every file is exhausted the remaining orphans are
    yielded as well, so the very last object is not lost.

    Parameters
    ----------
    paths : iterable of str
        CSV files containing an ``object_id`` column.
    prepare : callable
        Called with the DataFrame of rows belonging to a single object; its
        return value is yielded as is.
    chunk_size : int or integer-valued float
        Number of rows per chunk, forwarded to ``pd.read_csv(chunksize=...)``.

    Yields
    ------
    tuple
        ``(object_id, prepare(rows_of_that_object))``.
    """

    orphans = None

    # Open the readers one at a time instead of all up front
    for path in paths:

        for chunk in pd.read_csv(path, chunksize=chunk_size):

            # Add the previous orphans to the chunk
            if orphans is not None:
                chunk = pd.concat((orphans, chunk))

            # Nothing to split yet (e.g. a file that only holds a header)
            if chunk.empty:
                continue

            # Rows of the last object may continue in the next chunk
            last_val = chunk['object_id'].iloc[-1]
            is_orphan = chunk['object_id'] == last_val

            # Put the new orphans aside
            chunk, orphans = chunk[~is_orphan], chunk[is_orphan]

            # Yield one series per complete object
            for object_id, g in chunk.groupby('object_id'):
                yield object_id, prepare(g)

    # No more data: whatever is still held back is now a complete object
    if orphans is not None and not orphans.empty:
        for object_id, g in orphans.groupby('object_id'):
            yield object_id, prepare(g)
                
                
def generate_batch_series(paths, prepare, batch_size=16, chunk_size=1e6):
    """Endlessly yield ``(inputs, targets)`` batches of prepared series.

    Each pass streams every series from ``generate_series`` and groups them
    into batches of exactly ``batch_size``. Inputs and targets hold the same
    values (autoencoder training). A trailing, partially filled batch at the
    end of a pass is discarded before the next pass starts.

    Parameters
    ----------
    paths : iterable of str
        CSV files forwarded to ``generate_series``.
    prepare : callable
        Forwarded to ``generate_series``; must return an array with a
        ``reshape`` method.
    batch_size : int
        Number of series per yielded batch.
    chunk_size : int or integer-valued float
        Rows per CSV chunk, forwarded to ``generate_series``.

    Yields
    ------
    tuple of numpy.ndarray
        Two equal arrays built from ``batch_size`` series, each reshaped to
        ``(-1, 1)``.

    Raises
    ------
    ValueError
        If a full pass over the data cannot fill a single batch; looping
        again would never yield anything.
    """

    while True:

        batch = []
        yielded_any = False

        # Forward the caller's chunk_size instead of a hard-coded value
        for object_id, series in generate_series(paths, prepare, chunk_size=chunk_size):
            batch.append(series.reshape(-1, 1))

            if len(batch) == batch_size:
                inputs = np.array(batch)
                yield inputs, inputs.copy()
                yielded_any = True
                batch = []

        if not yielded_any:
            raise ValueError(
                'Not enough series to fill a single batch of size {}'.format(batch_size)
            )

In [3]:
from keras import layers as l
from keras import models as m

# Length of every resampled series and size of the learned encoding
timesteps = 180
encoding_dim = 16

# Encoder: normalise the input, then compress the whole sequence into a
# single vector of size `encoding_dim`
inputs = l.Input(shape=(timesteps, 1))
encoded = l.BatchNormalization()(inputs)
encoded = l.LSTM(units=encoding_dim)(encoded)

# Decoder: repeat the encoding once per timestep and unroll it back into a
# sequence with one value per timestep
# NOTE(review): LSTM uses a tanh activation by default, which bounds the
# reconstruction to [-1, 1] while the targets are signed-log fluxes --
# confirm this is intended.
decoded = l.RepeatVector(timesteps)(encoded)
decoded = l.LSTM(1, return_sequences=True)(decoded)

# `autoencoder` is trained end to end; `encoder` shares its layers and is
# used afterwards to extract the encodings
autoencoder = m.Model(inputs, decoded)
encoder = m.Model(inputs, encoded)

autoencoder.compile(optimizer='adam', loss='mean_squared_error')

Using TensorFlow backend.


In [6]:
import numpy as np

def log(x):
    """Signed logarithm: ``log(1 + |x|)`` carrying the sign of ``x``.

    Negative entries map to ``-log1p(|x|)``, all others to ``log1p(|x|)``.
    ``x`` must support ``x < 0`` elementwise (e.g. a numpy array or scalar).
    """
    magnitude = np.log1p(np.abs(x))
    is_negative = x < 0
    return np.where(is_negative, -magnitude, magnitude)


def prepare_series(g):
    """Resample one object's flux onto a fixed time grid and log-scale it.

    The ``flux`` values of ``g`` are linearly interpolated over ``mjd`` at
    ``timesteps`` evenly spaced points between 59580 and 60674, then passed
    through ``log``. Only the ``mjd`` and ``flux`` columns are read, so all
    rows of ``g`` are interpolated together. Grid points outside the
    observed ``mjd`` range take the first / last flux value (``np.interp``
    default).

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of a single object with ``mjd`` and ``flux`` columns.

    Returns
    -------
    numpy.ndarray
        Array of length ``timesteps``.
    """
    mjd = np.asarray(g['mjd'], dtype=float)
    flux = np.asarray(g['flux'], dtype=float)

    # np.interp does not check that xp is increasing and returns nonsense
    # otherwise, so order the samples by time first (no-op if already sorted)
    order = np.argsort(mjd, kind='stable')

    grid = np.linspace(59580, 60674, timesteps)
    return log(np.interp(x=grid, xp=mjd[order], fp=flux[order]))


batch_size = 16

# Endless stream of (inputs, targets) batches built from both data sets
generator = generate_batch_series(
    paths=['data/training_set.csv', 'data/test_set.csv'],
    prepare=prepare_series,
    batch_size=batch_size,
)

# Train the autoencoder to reconstruct its own input
autoencoder.fit_generator(
    generator,
    steps_per_epoch=1000,
    epochs=10,
);

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2df5441358>

In [38]:
# Single pass over every object of both data sets
generator = generate_series(paths=['data/training_set.csv', 'data/test_set.csv'], prepare=prepare_series)

# Maps object_id -> encoding produced by `encoder`
features = {}
ids = []
batch = []

for object_id, series in generator:

    ids.append(object_id)
    batch.append(series.reshape(-1, 1))

    # Encode a full batch at once and start a new one
    if len(batch) == batch_size:
        encodings = encoder.predict(np.array(batch))
        features.update(zip(ids, encodings))
        ids = []
        batch = []

# Encode the final, partially filled batch so that no object is left out
if batch:
    encodings = encoder.predict(np.array(batch))
    features.update(zip(ids, encodings))
    ids = []
    batch = []

In [45]:
# One row per object, one `auto_`-prefixed column per encoding dimension
(
    pd.DataFrame.from_dict(features, orient='index')
    .add_prefix('auto_')
    .to_hdf('data/features.h5', key='autoencoder')
)