## Simple model using Bx, By, Bz, jy, vz
 classifications: o_structures, null
 
 fixed length timeseries informed by d_per_de ~ 4
 
 similar to gamma, but with only plasmoid/non-plasmoid classification

In [1]:
import sys
import datetime as dt
import h5py
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils import shuffle
import matplotlib
matplotlib.use('svg')
import matplotlib.pyplot as plt
# get functions from other notebooks
# Each %run executes the named notebook so that the names it defines become available in this kernel.
# The helpers called later without a module prefix (gen_loss_per_pt, gen_metric_per_cat, batch_subsects,
# batch_unpadded_subsects, rebalance_ctrl_group, generic_outputs_structure, plt_traintest_cf_matrices,
# plot_reps) are presumably defined in these notebooks -- TODO confirm which notebook owns which helper.
%run /tigress/kendrab/analysis-notebooks/loss_fns.ipynb
%run /tigress/kendrab/analysis-notebooks/metrics.ipynb
%run /tigress/kendrab/analysis-notebooks/preproc_utils.ipynb
%run /tigress/kendrab/analysis-notebooks/eval_utils.ipynb
# Wall-clock start in UTC: the reference point for the runtime written to the log, and the source of
# the date/time strings handed to the output-path helper.
start = dt.datetime.now(tz=dt.timezone.utc)
date_str, time_str = (start.strftime(fmt) for fmt in ("%d-%m-%y", "%H%M%S"))
start_str = "".join((date_str, time_str))  # day-month-year directly followed by HHMMSS (no separator)

 The versions of TensorFlow you are currently using is 2.3.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


### Assemble a model

In [2]:
model_name = "delta"

# ---- hyperparameters ----
learning_rate = 0.01
filters = 32
kernel_size = 3
pool_size = 2
padding_length = 10  # extra points of context kept on each side of every segment
stride = 10  # length of each labelled segment, which is also the spacing between segments
input_length = 2*padding_length + stride
epochs = 5
thinning_factor = [0.8, None]
# NOTE(review): padding_length is not stored here, so it never reaches the log (only input_length and stride do).
hyperparams = dict(learning_rate=learning_rate, filters=filters, kernel_size=kernel_size, pool_size=pool_size,
                   input_length=input_length, stride=stride, epochs=epochs, thinning_factor=thinning_factor)

# ---- inputs: one single-channel series of length input_length per field ----
field_names = ["bx", "by", "bz", "jy", "vz"]
bx_input, by_input, bz_input, jy_input, vz_input = field_inputs = [
    keras.Input(shape=(input_length, 1), name=field_name) for field_name in field_names]

# ---- per-field branches: each field gets its own convolution, then its own max-pool ----
# (all convolutions are created before any pooling layer, in field order)
bx_conv, by_conv, bz_conv, jy_conv, vz_conv = field_convs = [
    keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='valid', activation='relu')(field_input)
    for field_input in field_inputs]
bx_pool, by_pool, bz_pool, jy_pool, vz_pool = field_pools = [
    keras.layers.MaxPooling1D(pool_size=pool_size)(field_conv) for field_conv in field_convs]

# ---- merge: element-wise average of the five branches, followed by a second conv/pool stage ----
avg = keras.layers.Average()(field_pools)
avg_conv = keras.layers.Conv1D(filters=2*filters, kernel_size=kernel_size, padding='valid', activation='relu')(avg)
# NOTE(review): this pool size is the literal 2, not the pool_size hyperparameter -- confirm that is intended.
avg_pool = keras.layers.MaxPooling1D(pool_size=2)(avg_conv)

# ---- output head: dense layer reshaped to (stride, 2), i.e. two class scores per point of the segment ----
flat_pool = keras.layers.Flatten()(avg_pool)
# NOTE(review): the ReLU here clips the scores fed to the softmax at zero -- confirm this is wanted.
flat_logits = keras.layers.Dense(2*stride, activation='relu')(flat_pool)
logits = keras.layers.Reshape((stride, 2))(flat_logits)
probs = keras.layers.Softmax()(logits)

# ---- assemble, then show the model (text summary + diagram written to scratch) ----
model = keras.Model(inputs=field_inputs, outputs=[probs])
model.summary()
keras.utils.plot_model(model, "/scratch/gpfs/kendrab/model_"+model_name+".png", show_shapes=True)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
bx (InputLayer)                 [(None, 30, 1)]      0                                            
__________________________________________________________________________________________________
by (InputLayer)                 [(None, 30, 1)]      0                                            
__________________________________________________________________________________________________
bz (InputLayer)                 [(None, 30, 1)]      0                                            
__________________________________________________________________________________________________
jy (InputLayer)                 [(None, 30, 1)]      0                                            
_______________________________________________________________________________________

### Get 1d sampling (If training/testing only, not building!)
Generated by [1d_sampling](./1d_sampling.ipynb)

In [3]:
# TODO use command line args or something easier than throwing it here
basedir = '/tigress/kendrab/21032023/'
readpaths = []

for i in range(10):
    totdir = basedir+str(i)+'/'
    for j in range(5,60,5):
        readpaths.append(totdir+f"100samples_idx{j}_bxbybzjyvz.hdf5")

# sample quantity -> name of the hdf5 dataset it is read from (bx and bz are read from the *_smooth datasets)
sample_datasets = {'s': 's', 'bx': 'bx_smooth', 'by': 'by', 'bz': 'bz_smooth', 'jy': 'jy', 'vz': 'vz',
                   'x0': 'x0', 'x1': 'x1'}
# order in which the per-quantity lists are unpacked into the *_list variables below
sample_keys = ['idx'] + list(sample_datasets) + ['topo']


def read_sample_file(filepath, file_idx):
    """Read every sample stored in one hdf5 file.

    Parameters
    ----------
    filepath : str
        Path of the hdf5 file to read.
    file_idx : int
        Identifier recorded for every point of every sample in this file, to keep track of
        which file a sample came from.

    Returns
    -------
    dict
        Maps each entry of `sample_keys` to a list with one element per sample in the file.
        'idx' holds an array filled with `file_idx`, as long as the matching 'bx' sample.
        'topo' holds the one-hot (2 class) encoding of `topo % 2`.
    """
    with h5py.File(filepath, 'r') as file:
        samples = {key: list(file[dataset][:]) for key, dataset in sample_datasets.items()}
        topo_raw = list(file['topo'][:])
    samples['idx'] = [np.full(len(bx), file_idx) for bx in samples['bx']]
    # cat 0,2 are not plasmoids, cat 1,3 are; encoded sample by sample (as in the original loop)
    samples['topo'] = [keras.utils.to_categorical(topo % 2, num_classes=2) for topo in topo_raw]
    return samples


def extend_samples(target, new_samples):
    """Append the per-quantity lists of `new_samples` to the matching lists of `target`, in place."""
    for key in sample_keys:
        target[key] += new_samples[key]


all_samples = {key: [] for key in sample_keys}
train_idx = None
# Samples up to and including this file go to the training set (trying for 70-30 train test split).
# NOTE(review): with 10*11 = 110 files this puts 71 files (~65%) in training -- confirm the intended cut.
last_train_file_idx = 70

for idx, filepath in enumerate(readpaths):
    extend_samples(all_samples, read_sample_file(filepath, idx))
    if idx == last_train_file_idx:
        train_idx = len(all_samples['bx'])

(idx_list, s_list, bx_list, by_list, bz_list, jy_list, vz_list, x0_list, x1_list, topo_list) = \
    (all_samples[key] for key in sample_keys)
print(len(bx_list))

# Slicing with train_idx=None would silently put ALL samples in both the training and the testing set.
if train_idx is None:
    raise ValueError(f"train/test split point was never set: file index {last_train_file_idx} "
                     f"was not reached ({len(readpaths)} files listed)")

# do train test split
train_samples = {key: values[:train_idx] for key, values in all_samples.items()}
test_samples = {key: values[train_idx:] for key, values in all_samples.items()}

# BUT WAIT THERE'S MORE! Include the slices from plain ol current sheets. Split 50-50 between train and test
# lots of magic numbers here but we don't have time to make the code nice rn
noplasmoids_dir = '/tigress/kendrab/06022023/'
noplasmoids_paths = []
for j in range(5,55,5):
    noplasmoids_paths.append(noplasmoids_dir+f"100samples_idx{j}_bxbybzjyvz.hdf5")

for k in range(5):
    # File ids continue after the plasmoid files, so every no-plasmoid file gets its own id.
    # (Previously the stale loop variable `idx` tagged all of them with the id of the last plasmoid file.)
    # training part
    extend_samples(train_samples, read_sample_file(noplasmoids_paths[k], len(readpaths) + k))
    # testing part
    extend_samples(test_samples, read_sample_file(noplasmoids_paths[k+5], len(readpaths) + k + 5))

(idx_train_list, s_train_list, bx_train_list, by_train_list, bz_train_list, jy_train_list, vz_train_list,
 x0_train_list, x1_train_list, topo_train_list) = (train_samples[key] for key in sample_keys)
(idx_test_list, s_test_list, bx_test_list, by_test_list, bz_test_list, jy_test_list, vz_test_list,
 x0_test_list, x1_test_list, topo_test_list) = (test_samples[key] for key in sample_keys)

11000


### Preprocess data

In [5]:
def _segment_split(idx_l, s_l, bx_l, by_l, bz_l, jy_l, vz_l, x0_l, x1_l, topo_l):
    """Chunk one split (train or test) into sliding windows.

    s and the five model input fields go through batch_subsects and are reshaped to
    (-1, input_length, 1). idx, x0, x1 and topo go through batch_unpadded_subsects instead.
    NOTE TOPO HAS DIFFERENT SEGMENT LENGTHS THAN THE INPUTS (stride vs. 2*padding+stride).

    Returns the ten results as a tuple, in the same order as the arguments.
    """
    def padded(series_list):
        return batch_subsects(series_list, input_length, stride).reshape(-1, input_length, 1)

    def unpadded(series_list):
        return batch_unpadded_subsects(series_list, padding_length, stride)

    return (unpadded(idx_l), padded(s_l), padded(bx_l), padded(by_l), padded(bz_l), padded(jy_l),
            padded(vz_l), unpadded(x0_l), unpadded(x1_l), unpadded(topo_l))


(idx_train, s_train, bx_train, by_train, bz_train, jy_train, vz_train, x0_train, x1_train, topo_train) = \
    _segment_split(idx_train_list, s_train_list, bx_train_list, by_train_list, bz_train_list, jy_train_list,
                   vz_train_list, x0_train_list, x1_train_list, topo_train_list)

print(bx_train.shape)

(idx_test, s_test, bx_test, by_test, bz_test, jy_test, vz_test, x0_test, x1_test, topo_test) = \
    _segment_split(idx_test_list, s_test_list, bx_test_list, by_test_list, bz_test_list, jy_test_list,
                   vz_test_list, x0_test_list, x1_test_list, topo_test_list)

# Shuffle each split; passing all arrays in one call applies the same permutation to each of them.
# (An earlier version pooled all segments and used sklearn's train_test_split here instead.)
(idx_train, s_train, bx_train, by_train, bz_train, jy_train, vz_train, x0_train, x1_train, topo_train) = \
    shuffle(idx_train, s_train, bx_train, by_train, bz_train, jy_train, vz_train, x0_train, x1_train, topo_train)
(idx_test, s_test, bx_test, by_test, bz_test, jy_test, vz_test, x0_test, x1_test, topo_test) = \
    shuffle(idx_test, s_test, bx_test, by_test, bz_test, jy_test, vz_test, x0_test, x1_test, topo_test)

# try to do some rebalancing in the training set
# model is struggling on plasmoids, which are underrepresented
train_features = [idx_train, s_train, bx_train, by_train, bz_train, jy_train, vz_train, x0_train, x1_train]
train_features, topo_train = rebalance_ctrl_group(train_features, topo_train, null_label=[1,0],
                                                  thinning_factor=thinning_factor[0])
idx_train, s_train, bx_train, by_train, bz_train, jy_train, vz_train, x0_train, x1_train = train_features


(406293, 30, 1)
Total batch: 406293




AxisError: axis 1 is out of bounds for array of dimension 0

### Compile and train model

In [5]:
# Loss: categorical cross-entropy wrapped by gen_loss_per_pt.
# loss_fn = tfa.losses.SigmoidFocalCrossEntropy(gamma=10)  # gamma must be an integer apparently (in int form)
loss_fn = keras.losses.CategoricalCrossentropy()
loss = gen_loss_per_pt(loss_fn=loss_fn)

# Metrics: only plain accuracy is handed to compile(); `metric` is built but not passed on.
metric = gen_metric_per_cat()  # loss_fn keyword left default
metrics = ["acc"]
# for i in range(4):
#     metrics.append(gen_metric_per_cat(mask_layer=mask_layer, cat_idx=i))

opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

compile_kwargs = dict(optimizer=opt, loss=loss, metrics=metrics,
                      run_eagerly=True)  # run eagerly to get .numpy() method
model.compile(**compile_kwargs)

In [6]:
# Feed each training array to the model input of the same name.
train_inputs = dict(zip(('bx', 'by', 'bz', 'jy', 'vz'),
                        (bx_train, by_train, bz_train, jy_train, vz_train)))
model.fit(x=train_inputs, y=topo_train, epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2ad8efe2e5d0>

### Make output directories if they do not exist and set up output file names

In [7]:
# Output locations for this run: log file, confusion-matrix file, and the prefix for per-sample files.
model_outs_dir = "/tigress/kendrab/analysis-notebooks/model_outs/"
log_file, cf_file, samplefile_start = generic_outputs_structure(
    model_outs_dir, model_name, date_str, time_str)

### Observe the results, dump information to file

In [8]:
def _report_split(log, split_name, split_inputs, topo_true):
    """Evaluate the model on one split, writing the results to `log` and echoing them to stdout.

    Parameters
    ----------
    log : open text file the report lines are written to
    split_name : "Training" or "Testing", used in the section header
    split_inputs : dict mapping each model input name to its array
    topo_true : one-hot true labels for the split

    Returns
    -------
    (topo_pred, true_1d, pred_1d) : the model output, plus the true and predicted class index of
    every point flattened to 1d (for the confusion matrix).
    """
    log.write(f"{split_name} performance\n")
    print(f"{split_name} performance")
    topo_pred = model(inputs=split_inputs, training=False)
    true_1d = np.argmax(topo_true.reshape(-1,2), axis=1)  # for confusion matrix
    pred_1d = np.argmax(topo_pred.numpy().reshape(-1,2), axis=1)
    num_per_cat = [np.sum(topo_true[...,cat] == 1) for cat in range(2)]
    log.write(f"cat_breakdown\t\t{num_per_cat}\n")
    print(f"cat_breakdown\t\t{num_per_cat}")
    # largest true / predicted value seen for each category
    print([np.max(topo_true[:,:,cat]) for cat in range(2)])
    print([np.max(topo_pred[:,:,cat]) for cat in range(2)])
    for cat in range(2):
        recall = gen_metric_per_cat(cat_idx=cat)(tf.convert_to_tensor(topo_true), topo_pred)
        log.write(f"cat{cat}recall\t\t{recall}\n")
        print(f"Category {cat} had recall {recall}")
    return topo_pred, true_1d, pred_1d


with open(log_file, 'w') as log:
    # run header: model name, start stamp, loss function and every hyperparameter
    log.write(f"Model {model_name} trained on {start_str}\n")
    log.write(f"loss function \t\t{loss.__name__}\n")
    log.write("Hyperparameters:\n")
    for key, value in hyperparams.items():
        log.write(f"{key}\t\t{value}\n")

    train_topo_pred, train_1d, train_1d_pred = _report_split(
        log, "Training",
        {'bx': bx_train, 'by': by_train, 'bz': bz_train, 'jy': jy_train, 'vz': vz_train}, topo_train)
    test_topo_pred, test_1d, test_1d_pred = _report_split(
        log, "Testing",
        {'bx': bx_test, 'by': by_test, 'bz': bz_test, 'jy': jy_test, 'vz': vz_test}, topo_test)

    # total runtime, measured from the timestamp taken in the first cell
    end = dt.datetime.now(dt.timezone.utc)
    log.write(f"runtime_seconds\t\t{(end-start).total_seconds()}")

Training performance
cat_breakdown		[748348, 413022]
[1.0, 1.0]
[1.0, 1.0]
Category 0 had recall 0.8930243849754333
Category 1 had recall 0.7645403742790222
Testing performance
cat_breakdown		[2106305, 252175]
[1.0, 1.0]
[1.0, 1.0]
Category 0 had recall 0.913077175617218
Category 1 had recall 0.7661465406417847


### Save confusion matrices

In [9]:
# Inputs are the flattened per-point true / predicted class indices computed in the evaluation cell
# (argmax over the two classes). Presumably the figure is written to cf_file -- confirm in eval_utils.
plt_traintest_cf_matrices(train_1d, train_1d_pred, test_1d, test_1d_pred, cf_file)

### Save model

In [10]:
# Store the trained model next to the other outputs of this run, under the shared file-name prefix.
model_file = f"{samplefile_start}_modelfile.h5"
model.save(model_file)

### Plot summaries of a selection of segments

In [11]:
# Plot example training segments: the five input fields with their labels, the true and predicted
# topology, and the (x0, x1) coordinates stacked along a new last axis.
rep_fields = [bx_train, by_train, bz_train, jy_train, vz_train]
rep_field_names = ['bx','by','bz','jy','vz']
rep_coords = np.stack([x0_train, x1_train], axis=-1)
plot_reps(rep_fields, rep_field_names, s_train, topo_train, train_topo_pred, samplefile_start,
          inputs_padding=padding_length, true_coords=rep_coords, exs_per_cat=5)

### Put temporary stuff here, delete later

In [33]:
# fig, ax = plt.subplots(1,2)
# ax[0].set(title="Training Confusion")
# ax[1].set(title="Testing Confusion")
# cf_train_p = ConfusionMatrixDisplay(confusion_matrix(train_1d, train_1d_pred))
# cf_test_p = ConfusionMatrixDisplay(confusion_matrix(test_1d, test_1d_pred))
# cf_train_p.plot(ax=ax[0], colorbar=None, values_format='d', cmap='Greys')
# cf_test_p.plot(ax=ax[1], colorbar=None, values_format='d', cmap='Greys')

# fig.tight_layout()
# fig.savefig("/tigress/kendrab/analysis-notebooks/model_outs/" + date_str + "/deltacfmatrix"+time_str+"_nonnormal.svg")
# plt.close(fig='all')

In [2]:
# Scratch arithmetic on the cat_breakdown counts printed by the evaluation cell
# (numbers are copied by hand from that output, so they go stale if the run changes).
print(748348+413022)  # total labelled points in the training set (category 0 + category 1)
print(2106305+252175)  # total labelled points in the testing set (category 0 + category 1)
2106305/252175  # testing set imbalance: category 0 (not plasmoid) points per category 1 (plasmoid) point

1161370
2358480


8.352552790720729