In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
# import tensorflow_probability as tfp

In [3]:
# Load the metadata tables for the training and test sets.
dftrain = pd.read_csv("data/plasticc_train_metadata.csv")

# Only the first 20,000 rows of the test metadata are used. Their label is
# copied from 'true_target' into 'target' so that both tables can be filtered
# on the same column below. .assign() returns a new frame, so no value is
# written into a slice of the freshly read table (that pattern raises
# SettingWithCopyWarning).
dftest = (
    pd.read_csv("data/plasticc_test_metadata.csv")
    .iloc[:20000]
    .assign(target=lambda frame: frame['true_target'])
)

dftrain = pd.concat([dftrain, dftest], axis=0, ignore_index=True)

# Numeric class code -> class name.
model_nums = {90:'SN Ia', 67:'SNIa-91bg', 52:'SNIax', 42:'SNII', 62:'SNIbc', 95:'SLSN-I', 15:'TDE', 64:'KN', 88:'AGN', 92:'RRL', 65:'M-dwarf', 16:'EB', 53:'Mira', 6:'muLens-Single', 991:'muLens-Binary', 992:'ILOT', 993:'CaRT', 994:'PISN', 995:'muLens-String'}

# Class codes that are kept; every other object is dropped from dftrain.
sn_models = [90, 67, 52, 42, 62, 95, 15]

# Keep the selected classes and attach the readable class name. The original
# row labels are kept (no index reset), exactly as before.
dftrain = (
    dftrain[dftrain['target'].isin(sn_models)]
    .assign(target_names=lambda frame: frame['target'].map(model_nums))
)

In [4]:
# Light-curve observations: the complete training file followed by the first
# 3,000,000 rows of the first test light-curve file, renumbered 0..n-1.
lcs_test = pd.read_csv("data/plasticc_test_lightcurves_01.csv")
lcs = pd.concat(
    [pd.read_csv("data/plasticc_train_lightcurves.csv"), lcs_test.iloc[:3_000_000]],
    ignore_index=True,
)

In [5]:
# Light-curve columns with and without the object identifier.
columns = ['object_id', 'mjd', 'passband', 'flux', 'flux_err']
columns2 = ['mjd', 'passband', 'flux', 'flux_err']
# filter wavelengths in angstroms, keyed by the passband index stored in `lcs`
wavelengths = {
    0: 3671.0,
    1: 4827.0,
    2: 6223.0,
    3: 7546.0,
    4: 8691.0,
    5: 9712.0
}

# Replace every passband index with its wavelength. The replacement is skipped
# when the column already holds nothing but wavelengths: mapping a second time
# finds no matching key and would silently turn the whole column into NaN,
# which is what used to happen when this cell was executed twice on one kernel.
if not lcs['passband'].isin(list(wavelengths.values())).all():
    lcs['passband'] = lcs['passband'].map(wavelengths)

In [6]:
# Prepare Train Data
x_data = []
y_data = []

# Light curves with fewer rows than this are padded with all-zero observations
# up to this length. Longer curves are kept at their full length (no
# truncation happens here).
max_length = 352

# Split the light-curve table into one frame per object in a single pass,
# instead of scanning all of `lcs` again for every object id.
selected = lcs.loc[lcs['object_id'].isin(dftrain['object_id']), columns]
curve_by_object = dict(tuple(selected.groupby('object_id', sort=False)))

# [object_id, true_peakmjd] of the first metadata row of each object. Going
# through `.values` converts both entries to the common dtype of the two
# columns, which is what the per-object lookup produced before.
target_by_object = {}
for key, row in zip(dftrain['object_id'],
                    dftrain[['object_id', 'true_peakmjd']].values.tolist()):
    target_by_object.setdefault(key, row)

for object_id in dftrain['object_id']:
    # Objects without any observation in `lcs` are skipped entirely.
    if object_id not in curve_by_object:
        continue

    # reset_index returns a fresh frame, so every entry of x_data is an
    # independent object that can later be modified in place.
    curve = curve_by_object[object_id].reset_index(drop=True)

    # Expand light curve data to max length with 0s (object_id keeps the id).
    missing = max_length - len(curve)
    if missing > 0:
        padding = pd.DataFrame(0, index=range(missing), columns=columns)
        padding['object_id'] = object_id
        curve = pd.concat([curve, padding], ignore_index=True)

    x_data.append(curve)
    # A new list per entry: the target is modified in place further down.
    y_data.append(list(target_by_object[object_id]))


In [7]:
# Normalize Data
# Rescales, in place, the peak time of every target and the 'mjd' / 'passband'
# columns of every light curve. Because x_data and y_data are modified in
# place, running this cell twice applies the scaling twice.
#
# NOTE(review): the target is shifted by 58999 while the observation times are
# shifted by 59000. This looks like a one-day inconsistency, but it is kept
# as-is because code that converts predictions back may rely on it -- confirm.
for curve, target in zip(x_data, y_data):
    target[1] = (target[1] - 58999) / 2000

    # Rows whose mjd is exactly 0 (the zero padding) are left untouched.
    observed = curve['mjd'] != 0
    curve.loc[observed, 'mjd'] = (curve.loc[observed, 'mjd'] - 59000) / 2000
    curve.loc[observed, 'passband'] = (curve.loc[observed, 'passband'] - 3670) / 6041

In [8]:
x_train, x_test, y_train, y_test = train_test_split(
     x_data, y_data, test_size=0.2, random_state=34)

In [9]:
x_test, x_val, y_test, y_val = train_test_split(
    x_test, y_test, test_size=0.5, random_state=53)

In [10]:
def _feature_rows(curves):
    """Convert light-curve frames into nested lists of per-row arrays.

    For every frame in ``curves`` the first column (``object_id``) is dropped
    and each remaining row is returned as its own 1-D numpy array, so the
    result is ``[[row, row, ...], [row, row, ...], ...]``.
    """
    return [list(curve.values[:, 1:]) for curve in curves]


def _peak_targets(labels):
    """Return the second entry (the peak time) of every label as a 0-d array."""
    return [np.array(label[1]) for label in labels]


# The same conversion is applied to each of the three splits.
x_train_values = _feature_rows(x_train)
y_train_values = _peak_targets(y_train)

x_test_values = _feature_rows(x_test)
y_test_values = _peak_targets(y_test)

x_val_values = _feature_rows(x_val)
y_val_values = _peak_targets(y_val)

In [11]:
# Write the prepared splits to disk, one pickle file per object.
# y_test / y_val are the labels that still carry the object id; the *_values
# objects are the model inputs and targets.
# Fix: 'x_val_values.pickle' used to be written with x_test_values, so the
# validation inputs were never saved and the file duplicated the test inputs.
pickle_outputs = {
    'x_train_values.pickle': x_train_values,
    'y_train_values.pickle': y_train_values,
    'y_test.pickle': y_test,
    'y_val.pickle': y_val,
    'x_test_values.pickle': x_test_values,
    'y_test_values.pickle': y_test_values,
    'x_val_values.pickle': x_val_values,
    'y_val_values.pickle': y_val_values,
}

for path, payload in pickle_outputs.items():
    with open(path, 'wb') as file:
        pickle.dump(payload, file)