## Simple model using Bx, By, Bz, jy, vz
 classifications: separatrices, o_structures, null
 
 fixed length timeseries informed by d_per_de ~ 4
 
 The structure will start out the same as alpha, but the end goal is to change it to make use of the fixed-length series,
 maybe using a purely convolutional network with pooling, dropout, etc.

In [None]:
import sys
import datetime as dt
import h5py
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from sklearn.model_selection import train_test_split
# get functions from other notebooks
%run /tigress/kendrab/analysis-notebooks/loss_fns.ipynb
%run /tigress/kendrab/analysis-notebooks/metrics.ipynb
%run /tigress/kendrab/analysis-notebooks/preproc_utils.ipynb
# Capture a timezone-aware UTC start time once: it is the reference point for the
# runtime measurement and the source of the timestamp used to tag outputs.
start = dt.datetime.now(tz=dt.timezone.utc)
startstr = f"{start:%d-%m-%y_%H%M%S}"  # day-month-year_HHMMSS

### Assemble a model

In [None]:
model_name = "gamma"

# ---- hyperparameters -------------------------------------------------------
learning_rate = 0.01
filters = 16
kernel_size = 3
pool_size = 2
segment_length = 30
stride = 10
mask_value = -10  # integer mask value; not part of the logged hyperparameters
epochs = 10
thinning_factor = 0.9
# Collected in one dict so they can be written to the run log later.
hyperparams = dict(
    learning_rate=learning_rate,
    filters=filters,
    kernel_size=kernel_size,
    pool_size=pool_size,
    segment_length=segment_length,
    stride=stride,
    epochs=epochs,
    thinning_factor=thinning_factor,
)

# ---- inputs: one (segment_length, 1) series per physical quantity ----------
channel_names = ("bx", "by", "bz", "jy", "vz")
channel_inputs = [
    keras.Input(shape=(segment_length, 1), name=channel) for channel in channel_names
]
bx_input, by_input, bz_input, jy_input, vz_input = channel_inputs

# ---- per-channel branch: each input gets its own Conv1D, then its own pooling
# All convolutions are created before any pooling layer, channel by channel.
channel_convs = [
    keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='valid')(tensor)
    for tensor in channel_inputs
]
bx_conv, by_conv, bz_conv, jy_conv, vz_conv = channel_convs

channel_pools = [
    keras.layers.MaxPooling1D(pool_size=pool_size)(tensor) for tensor in channel_convs
]
bx_pool, by_pool, bz_pool, jy_pool, vz_pool = channel_pools

# ---- merge: element-wise average of the five pooled branches ---------------
avg = keras.layers.Average()(channel_pools)

# Second convolution (twice as many filters) and pooling on the merged features.
# Note the pooling here uses a literal 2 rather than `pool_size`.
avg_conv = keras.layers.Conv1D(filters=2*filters, kernel_size=kernel_size, padding='valid')(avg)
avg_pool = keras.layers.MaxPooling1D(pool_size=2)(avg_conv)

# ---- output head: dense layer reshaped to 4 class scores per time step -----
flat_pool = keras.layers.Flatten()(avg_pool)
# NOTE(review): ReLU is applied right before the softmax, so the scores fed to it
# are non-negative — confirm this is intended rather than a linear output.
flat_logits = keras.layers.Dense(segment_length*4, activation='relu')(flat_pool)
logits = keras.layers.Reshape((segment_length, 4))(flat_logits)
probs = keras.layers.Softmax()(logits)

# ---- assemble the functional model -----------------------------------------
model = keras.Model(inputs=channel_inputs, outputs=[probs])

# Print the layer summary and save a diagram of the graph (with tensor shapes).
model.summary()
keras.utils.plot_model(model, f"/scratch/gpfs/kendrab/model_{model_name}.png", show_shapes=True)

### Get 1d sampling (If training/testing only, not building!)
Generated by [1d_sampling](./1d_sampling.ipynb)

In [None]:
# TODO: take these paths from command-line args (or a config file) instead of hardcoding them here
readpaths = ['/tigress/kendrab/03082021/'+"1000samples_idx31_bxbybzjyvz.hdf5",
             '/tigress/kendrab/03082021/'+"1000samples_idx26_bxbybzjyvz.hdf5",
             '/tigress/kendrab/03082021/'+"1000samples_idx22_bxbybzjyvz.hdf5",
             '/tigress/kendrab/03082021/'+"1000samples_idx18_bxbybzjyvz.hdf5",
             '/tigress/kendrab/03082021/'+"1000samples_idx15_bxbybzjyvz.hdf5"]

# One list per quantity; every entry is one sampled series read from a file.
bx_list = []
by_list = []
bz_list = []
jy_list = []
vz_list = []
topo_list = []  # per-sample labels, one-hot encoded over 4 classes

for filepath in readpaths:
    # The context manager closes the HDF5 file even when a dataset is missing
    # or a read fails part-way through, so no handle is leaked.
    with h5py.File(filepath, 'r') as h5file:
        # `[:]` reads the whole dataset into memory; extending appends it row by row.
        bx_list.extend(h5file['bx_smooth'][:])
        by_list.extend(h5file['by'][:])
        bz_list.extend(h5file['bz_smooth'][:])
        jy_list.extend(h5file['jy'][:])
        vz_list.extend(h5file['vz'][:])
        # One-hot encode each sample's label series separately (one call per row).
        topo_list.extend(
            keras.utils.to_categorical(labels, num_classes=4)
            for labels in h5file['topo'][:]
        )

### Preprocess data

In [None]:
# Cut every series into sliding windows of `segment_length` points, advancing by
# `stride`, then add a trailing axis of size 1 to match the model inputs.
# The set of inputs is spelled out by hand: adding a new input quantity means
# adding it to the tuples below (and to the model definition).
bx_segs, by_segs, bz_segs, jy_segs, vz_segs = (
    batch_subsects(series, segment_length, stride).reshape(-1, segment_length, 1)
    for series in (bx_list, by_list, bz_list, jy_list, vz_list)
)
# Labels are windowed the same way but are not reshaped.
topo_segs = batch_subsects(topo_list, segment_length, stride)

# A single train_test_split call keeps all channels and the labels aligned;
# it returns a (train, test) pair for each array, in argument order.
split_arrays = train_test_split(bx_segs, by_segs, bz_segs, jy_segs, vz_segs, topo_segs)
(bx_train, bx_test,
 by_train, by_test,
 bz_train, bz_test,
 jy_train, jy_test,
 vz_train, vz_test,
 topo_train, topo_test) = split_arrays

# Try to rebalance the training set only, with [1,0,0,0] as the null label;
# the test set is left untouched.
balanced_inputs, topo_train = rebalance_ctrl_group(
    [bx_train, by_train, bz_train, jy_train, vz_train],
    topo_train,
    null_label=[1,0,0,0],
    thinning_factor=thinning_factor,
)
bx_train, by_train, bz_train, jy_train, vz_train = balanced_inputs

### Compile and train model

In [None]:
# Adam optimizer at the configured learning rate.
opt = keras.optimizers.Adam(learning_rate=learning_rate)

# Categorical cross-entropy, wrapped by gen_loss_per_pt (presumably provided by the
# loss_fns notebook pulled in via %run).
# Alternative kept for reference:
#   loss_fn = tfa.losses.SigmoidFocalCrossEntropy(gamma=10)  # gamma must be an integer apparently (in int form)
loss_fn = keras.losses.CategoricalCrossentropy()
loss = gen_loss_per_pt(loss_fn=loss_fn)

# Built with default arguments; this object is not passed to compile() below.
metric = gen_metric_per_cat()
metrics = ["acc"]
# Per-category metrics, currently disabled:
# for i in range(4):
#     metrics.append(gen_metric_per_cat(mask_layer=mask_layer, cat_idx=i))

# Eager execution is enabled so tensors expose the .numpy() method.
compile_options = dict(optimizer=opt, loss=loss, metrics=metrics, run_eagerly=True)
model.compile(**compile_options)

In [None]:
# Training inputs keyed by quantity name: one array per channel.
train_inputs = {
    'bx': bx_train,
    'by': by_train,
    'bz': bz_train,
    'jy': jy_train,
    'vz': vz_train,
}
model.fit(x=train_inputs, y=topo_train, epochs=epochs)

### Observe the results, dump information to file

In [None]:
# Write a run summary (hyperparameters plus per-category accuracy on the training
# and test sets) to a timestamped log file, echoing the key numbers to stdout.
log_path = "/scratch/gpfs/kendrab/model_outs/"+model_name+'log'+startstr

# The file is opened exactly once, inside a context manager, so the handle is
# always closed (an extra bare open() of the same path would never be closed).
with open(log_path, 'w') as log:
    log.write(f"Model {model_name} trained on {startstr}\n")
    log.write(f"loss function \t\t{loss.name}\n")
    log.write("Hyperparameters:\n")
    for key, value in hyperparams.items():
        log.write(f"{key}\t\t{value}\n")

    # The same report is produced for both splits: (title, model inputs, true labels).
    eval_splits = (
        ("Training",
         {'bx': bx_train, 'by': by_train, 'bz': bz_train, 'jy': jy_train, 'vz': vz_train},
         topo_train),
        ("Testing",
         {'bx': bx_test, 'by': by_test, 'bz': bz_test, 'jy': jy_test, 'vz': vz_test},
         topo_test),
    )
    for split_name, split_inputs, topo_true in eval_splits:
        log.write(f"{split_name} performance\n")
        print(f"{split_name} performance")
        # Forward pass in inference mode (training=False).
        topo_pred = model(inputs=split_inputs, training=False)

        # Number of points labelled with each of the 4 categories.
        num_per_cat = [np.sum(topo_true[..., i] == 1) for i in range(4)]
        log.write(f"cat_breakdown\t\t{num_per_cat}\n")
        print(f"cat_breakdown\t\t{num_per_cat}")

        # Sanity check: maximum label and maximum predicted value per category.
        print([np.max(topo_true[:, :, i]) for i in range(4)])
        print([np.max(topo_pred[:, :, i]) for i in range(4)])

        # Convert the labels once per split instead of once per category.
        topo_true_tensor = tf.convert_to_tensor(topo_true)
        for i in range(4):
            acc = gen_metric_per_cat(cat_idx=i)(topo_true_tensor, topo_pred)
            log.write(f"cat{i}acc\t\t{acc}\n")
            print(f"Category {i} had accuracy {acc}")

    # Total wall-clock time since `start` was recorded.
    end = dt.datetime.now(dt.timezone.utc)
    log.write(f"runtime_seconds\t\t{(end-start).total_seconds()}")