## Simple model using Bx, By, Bz, jy, vz
 classifications: current sheets, o_structures, null
 
 fixed length timeseries informed by d_per_de ~ 4
 
Same as model epsilon, but posed as a binary problem (plasmoid vs. not plasmoid)

In [None]:
import sys
import datetime as dt
import h5py
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib
matplotlib.use('svg')
import matplotlib.pyplot as plt
# get functions from other notebooks
%run /tigress/kendrab/analysis-notebooks/loss_fns.ipynb
%run /tigress/kendrab/analysis-notebooks/metrics.ipynb
%run /tigress/kendrab/analysis-notebooks/preproc_utils.ipynb
%run /tigress/kendrab/analysis-notebooks/eval_utils.ipynb
# Capture the launch moment in UTC; it is used later for the runtime report
# and to build the names of the output files.
start = dt.datetime.now(tz=dt.timezone.utc)
date_str, time_str = start.strftime("%d-%m-%y"), start.strftime("%H%M%S")
start_str = f"{date_str}{time_str}"  # DD-MM-YY immediately followed by HHMMSS

### Assemble a model

In [None]:
model_name = "zeta"
# hyperparameters
learning_rate = 0.01
filters = 32  # feature maps of each per-channel Conv1D; the merged stage uses 2*filters
kernel_size = 3
pool_size = 2
padding_length = 10  # amount of data on each side of each segment for additional info
stride = 1  # size (and therefore spacing) of each segment
input_length = stride + 2*padding_length
mask_value = int(-10.0)  # NOTE(review): not referenced by any layer built in this cell
epochs = 10
thinning_factor = [.99, None]  # only element 0 is used in the visible cells (training-set rebalancing)
# everything in this dict is dumped to the log file after training
hyperparams = {'learning_rate':learning_rate, 'filters':filters, 'kernel_size':kernel_size, 'pool_size':pool_size,
              'padding_length':padding_length, 'stride':stride, 'input_length':input_length, 'epochs':epochs,
               'thinning_factor':thinning_factor}

# input: one single-feature series of length input_length per physical quantity
bx_input = keras.Input(shape=(input_length,1), name="bx")
by_input = keras.Input(shape=(input_length,1), name="by")
bz_input = keras.Input(shape=(input_length,1), name="bz")
jy_input = keras.Input(shape=(input_length,1), name="jy")
vz_input = keras.Input(shape=(input_length,1), name="vz")


# convolve and pool separately (each channel gets its own, unshared Conv1D weights)
bx_conv = keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='valid')(bx_input)
by_conv = keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='valid')(by_input)
bz_conv = keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='valid')(bz_input)
jy_conv = keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='valid')(jy_input)
vz_conv = keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='valid')(vz_input)

bx_pool = keras.layers.MaxPooling1D(pool_size=pool_size)(bx_conv)
by_pool = keras.layers.MaxPooling1D(pool_size=pool_size)(by_conv)
bz_pool = keras.layers.MaxPooling1D(pool_size=pool_size)(bz_conv)
jy_pool = keras.layers.MaxPooling1D(pool_size=pool_size)(jy_conv)
vz_pool = keras.layers.MaxPooling1D(pool_size=pool_size)(vz_conv)

# merge the layers together (element-wise mean of the five pooled feature maps)
avg = keras.layers.Average()([bx_pool, by_pool, bz_pool, jy_pool, vz_pool])
# convolve and pool
avg_conv = keras.layers.Conv1D(filters=2*filters, kernel_size=kernel_size, padding='valid')(avg)
avg_pool = keras.layers.MaxPooling1D(pool_size=2)(avg_conv)


# use dense layer to output
flat_pool = keras.layers.Flatten()(avg_pool)
# Logits must stay linear: a ReLU here would clamp them to >= 0 before the softmax,
# and if both units die the output is stuck at 0.5/0.5 with no gradient through this layer.
flat_logits = keras.layers.Dense(stride*2)(flat_pool)
logits = keras.layers.Reshape((stride, 2))(flat_logits)  # two class scores per output point
probs = keras.layers.Softmax()(logits)  # softmax over the last (class) axis
# throw together the model
model = keras.Model(
    inputs=[bx_input, by_input, bz_input, jy_input, vz_input],
    outputs=[probs])

# show the model
model.summary()
keras.utils.plot_model(model, "/scratch/gpfs/kendrab/model_"+model_name+".png", show_shapes=True)

### Get 1d sampling (If training/testing only, not building!)
Generated by [1d_sampling](./1d_sampling.ipynb)

In [None]:
# TODO use command line args or something easier than throwing it here
readpaths = ['/tigress/kendrab/03082021/'+"1000samples_idx22_bxbybzjyvz.hdf5",
             '/tigress/kendrab/03082021/'+"1000samples_idx18_bxbybzjyvz.hdf5",
             '/tigress/kendrab/03082021/'+"1000samples_idx15_bxbybzjyvz.hdf5"]

# Per-sample accumulators; every file appends its samples in the same order,
# so position k refers to the same sample in all of these lists.
idx_list = []  # to keep track of which file what sample came from
s_list = []
bx_list = []
by_list = []
bz_list = []
jy_list = []
vz_list = []
x0_list = []
x1_list = []
topo_list = []

for idx, filepath in enumerate(readpaths):
    # The context manager releases the HDF5 handle even if a dataset is missing
    # or a read fails part-way through.
    with h5py.File(filepath, 'r') as h5file:
        bx_samples = list(h5file['bx_smooth'][:])  # read once, reused for the index bookkeeping
        # one array per sample, as long as that sample, filled with this file's position in readpaths
        idx_list += [np.full(len(bx), idx) for bx in bx_samples]
        s_list += list(h5file['s'][:])
        bx_list += bx_samples
        by_list += list(h5file['by'][:])
        bz_list += list(h5file['bz_smooth'][:])
        jy_list += list(h5file['jy'][:])
        vz_list += list(h5file['vz'][:])
        x0_list += list(h5file['x0'][:])
        x1_list += list(h5file['x1'][:])
        # cat 0,2 are not plasmoids, cat 1,3 are: the parity gives the binary label,
        # which is then one-hot encoded into two classes
        topo_list += [keras.utils.to_categorical(topo % 2, num_classes=2)
                      for topo in h5file['topo'][:]]

### Preprocess data

In [None]:
# Cut every series into sliding windows (helper functions come from the %run notebooks).
# NOTE: the unpadded quantities (idx, x0, x1, topo) have different segment lengths
# than the model inputs (stride vs. 2*padding_length + stride).
idx_segs = batch_unpadded_subsects(idx_list, padding_length, stride)

# padded windows, reshaped to (n_windows, input_length, 1)
s_segs, bx_segs, by_segs, bz_segs, jy_segs, vz_segs = (
    batch_subsects(series, input_length, stride).reshape(-1, input_length, 1)
    for series in (s_list, bx_list, by_list, bz_list, jy_list, vz_list)
)

# unpadded windows for the coordinates and the labels
x0_segs, x1_segs, topo_segs = (
    batch_unpadded_subsects(series, padding_length, stride)
    for series in (x0_list, x1_list, topo_list)
)

# one joint split so that all arrays stay aligned sample-by-sample
(idx_train, idx_test,
 s_train, s_test,
 bx_train, bx_test,
 by_train, by_test,
 bz_train, bz_test,
 jy_train, jy_test,
 vz_train, vz_test,
 x0_train, x0_test,
 x1_train, x1_test,
 topo_train, topo_test) = train_test_split(
    idx_segs, s_segs, bx_segs, by_segs, bz_segs, jy_segs, vz_segs,
    x0_segs, x1_segs, topo_segs)

# Rebalance the training set only: the model is struggling on plasmoids,
# which are underrepresented, so the [1,0] ("null") class is passed to the
# rebalancing helper with the first thinning factor.
train_arrays = [idx_train, s_train, bx_train, by_train, bz_train,
                jy_train, vz_train, x0_train, x1_train]
train_arrays, topo_train = rebalance_ctrl_group(
    train_arrays, topo_train, null_label=[1,0], thinning_factor=thinning_factor[0])
(idx_train, s_train, bx_train, by_train, bz_train,
 jy_train, vz_train, x0_train, x1_train) = train_arrays

### Compile and train model

In [None]:
# Loss: plain categorical cross-entropy handed to gen_loss_per_pt
# (presumably defined in one of the %run notebooks).
# Alternative kept for reference:
# loss_fn = tfa.losses.SigmoidFocalCrossEntropy(gamma=10)  # gamma must be an integer apparently (in int form)
loss_fn = keras.losses.CategoricalCrossentropy()
loss = gen_loss_per_pt(loss_fn=loss_fn)

# Metric object built with the helper's defaults; it is not passed to compile below.
metric = gen_metric_per_cat()
metrics = ["acc"]
# Per-category metrics are currently disabled:
# for i in range(4):
#     metrics.append(gen_metric_per_cat(mask_layer=mask_layer, cat_idx=i))

opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# run eagerly to get .numpy() method
compile_kwargs = dict(optimizer=opt, loss=loss, metrics=metrics, run_eagerly=True)
model.compile(**compile_kwargs)

In [None]:
# Inputs are keyed by the names given to the keras.Input layers.
train_inputs = {
    'bx': bx_train,
    'by': by_train,
    'bz': bz_train,
    'jy': jy_train,
    'vz': vz_train,
}
model.fit(x=train_inputs, y=topo_train, epochs=epochs)

### Make output directories if they do not exist and set up output file names

In [None]:
# Root directory for all model outputs; the helper returns the log-file path,
# the confusion-matrix path and the prefix used for the per-sample plots.
output_root = "/scratch/gpfs/kendrab/model_outs/"
log_file, cf_file, samplefile_start = generic_outputs_structure(
    output_root, model_name, date_str, time_str)

### Observe the results, dump information to file

In [None]:
def _report_split(log, label, inputs, topo_true):
    """Run the trained model on one data split and log/print its summary.

    Uses the module-level ``model``.

    Args:
        log: open, writable text file receiving the log lines.
        label: split name used in the headings ("Training" or "Testing").
        inputs: dict mapping the model's input names to their arrays.
        topo_true: one-hot ground-truth labels with two classes on the last axis.

    Returns:
        Tuple ``(topo_pred, true_1d, pred_1d)``: the raw model output, and the
        flattened true / predicted class indices (for the confusion matrix).
    """
    log.write(f"{label} performance\n")
    print(f"{label} performance")
    topo_pred = model(inputs=inputs, training=False)
    true_1d = np.argmax(topo_true.reshape(-1,2), axis=1)  # for confusion matrix
    pred_1d = np.argmax(topo_pred.numpy().reshape(-1,2), axis=1)
    num_per_cat = [np.sum(topo_true[...,i] == 1) for i in range(2)]
    log.write(f"cat_breakdown\t\t{num_per_cat}\n")
    print(f"cat_breakdown\t\t{num_per_cat}")
    # sanity check: largest label value and largest predicted probability per class
    print([np.max(topo_true[:,:,i]) for i in range(2)])
    print([np.max(topo_pred[:,:,i]) for i in range(2)])
    topo_true_tensor = tf.convert_to_tensor(topo_true)  # converted once, reused for both categories
    for i in range(2):
        recall = gen_metric_per_cat(cat_idx=i)(topo_true_tensor, topo_pred)
        log.write(f"cat{i}recall\t\t{recall}\n")
        print(f"Category {i} had recall {recall}")
    return topo_pred, true_1d, pred_1d


with open(log_file, 'w') as log:
    log.write(f"Model {model_name} trained on {start_str}\n")
    log.write(f"loss function \t\t{loss.name}\n")
    log.write("Hyperparameters:\n")
    for key, value in hyperparams.items():
        log.write(f"{key}\t\t{value}\n")

    # the returned predictions and class indices are reused by the plotting cells below
    train_topo_pred, train_1d, train_1d_pred = _report_split(
        log, "Training",
        {'bx': bx_train, 'by': by_train, 'bz': bz_train, 'jy': jy_train, 'vz': vz_train},
        topo_train)

    test_topo_pred, test_1d, test_1d_pred = _report_split(
        log, "Testing",
        {'bx': bx_test, 'by': by_test, 'bz': bz_test, 'jy': jy_test, 'vz': vz_test},
        topo_test)

    # wall-clock time since the notebook's start timestamp
    end = dt.datetime.now(dt.timezone.utc)
    log.write(f"runtime_seconds\t\t{(end-start).total_seconds()}")

### Save confusion matrices

In [None]:
# true / predicted class indices for both splits, as computed in the evaluation cell
cf_inputs = (train_1d, train_1d_pred, test_1d, test_1d_pred)
plt_traintest_cf_matrices(*cf_inputs, cf_file)

### Plot summaries of a selection of segments

In [None]:
# training inputs and their display names, in matching order
channel_names = ['bx','by','bz','jy','vz']
channel_data = [bx_train, by_train, bz_train, jy_train, vz_train]
# pair up the x0/x1 coordinates of every point along a new last axis
train_coords = np.stack([x0_train, x1_train], axis=-1)
plot_reps(channel_data, channel_names, s_train, topo_train, train_topo_pred,
          samplefile_start, inputs_padding=padding_length, true_coords=train_coords,
          exs_per_cat=5)

### Overplot trained data onto 2d contour

In [None]:
# readpaths = ['/tigress/kendrab/03082021/'+"idx22_bxbybzjyvz.hdf5",
#              '/tigress/kendrab/03082021/'+"idx18_bxbybzjyvz.hdf5",
#              '/tigress/kendrab/03082021/'+"idx15_bxbybzjyvz.hdf5"]
# flux_fn_meshes = []
# flux_fns = []
# # read in the flux function data
# for filepath in readpaths:
#     file = h5py.File(filepath, 'r')    
#     flux_fn_meshes.append([file['default_x'][:], file['default_z'][:]])
#     flux_fns.append(file['flux_fn'][0])
# """training data"""
# # recombine all slices into one long array (predictions, x and z locations, file#s)
# all_training_pred = train_1d_pred
# all_training_x =  np.concatenate(x0_train)
# all_training_z =  np.concatenate(x1_train)
# all_training_idxs = np.concatenate(idx_train)
# # split based on file# index, plot all predictions for each file# into a separate visualization
# for i in range(len(readpaths)):
#     idxs = np.nonzero(all_training_idxs == i)
#     """ This part is experimental and not part of the official generic file structure yet."""
#     fname = "/scratch/gpfs/kendrab/model_outs/"+date_str+'/'+model_name+f'_{i}train_2dvis'+time_str # note no extension
#     show_2d_success(all_training_pred[idxs], all_training_x[idxs], all_training_z[idxs],
#                     flux_fn_meshes[i], flux_fns[i], fname)
# """test data"""
# # recombine all slices into one long array (predictions, x and z locations, file#s)
# all_test_pred = test_1d_pred
# all_test_x =  np.concatenate(x0_test)
# all_test_z =  np.concatenate(x1_test)
# all_test_idxs = np.concatenate(idx_test)
# # split based on file# index, plot all predictions for each file# into a separate visualization
# for i in range(len(readpaths)):
#     idxs = np.nonzero(all_test_idxs == i)
#     fname = "/scratch/gpfs/kendrab/model_outs/"+date_str+'/'+model_name+f'_{i}test_2dvis'+time_str # note no extension
#     show_2d_success(all_test_pred[idxs], all_test_x[idxs], all_test_z[idxs],
#                     flux_fn_meshes[i], flux_fns[i], fname)