In [33]:
# no ADDA or critic, just distance

In [34]:
import sys
import os
import psutil  # :( mem leak
import h5py
import copy
from torch.utils.data import TensorDataset, ConcatDataset, DataLoader
from  torch.nn.functional import one_hot
from sklearn.utils import shuffle
import torch
from torch import nn
from torchvision.models.feature_extraction import get_graph_node_names
from torchvision.models.feature_extraction import create_feature_extractor
import torchinfo
import optuna
from datetime import datetime, timezone
import matplotlib.pyplot as plt

device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using {device} device")
dtype = torch.double

# Get functions from other notebooks
%run /tigress/kendrab/analysis-notebooks/torch_models/utils.ipynb
%run /tigress/kendrab/analysis-notebooks/preproc_utils.ipynb
%run /tigress/kendrab/analysis-notebooks/eval_utils.ipynb
%run /tigress/kendrab/analysis-notebooks/torch_models/import_mms_data.ipynb
%run /tigress/kendrab/analysis-notebooks/torch_models/ndb.ipynb

Using cpu device
Using cpu device


### Helupful functions and classes

In [35]:
def train(dl_source, dl_target, feat_extract_source, feat_extract_target, loss_fn, optimizer, iter_source=None):
    # note feat_extract_source is a special boye that returns a dictionary that we need to get a value from
    inter_iteration_stuff = {}  # whatever I need to escape this function (losses over time, the current iterator, etc.)
    feat_extract_source.eval()
    batch_size = dl_source.batch_size  # the length of a tensordataset is the batch size (shared first dim)
    
    # let's iterate enough that the mms dataset is completely used. 
    samples_source = len(dl_source.dataset)
    samples_target = len(dl_target.dataset)
    total_batches =samples_target//batch_size
    print(f"training on {samples_source} source samples and {samples_target} target samples")
    # iterate
    if iter_source is None:  # make the source iterator if it doesn't already exist
        print("Making source iterator")  # shouldn't ever fire with current setup / needs from the code
        iter_source = iter(dl_source)
    iter_target = iter(dl_target)
    for batch in range(total_batches):
        ds_source, iter_source = iter_or_restart_dl(dl_source, iter_source)
        ds_target, iter_target = iter_or_restart_dl(dl_target, iter_target)
        loss_feat = []
        # unpack values
        _, _, bx_s, by_s, bz_s, ex_s, ey_s, ez_s, jy_s, _, _, _ = ds_source
        bx_t, by_t, bz_t, ex_t, ey_t, ez_t, _, jy_t, _, _ = ds_target
        # calculate features, add labels
        feat_source = feat_extract_source(bx_s, by_s, bz_s, ex_s, ey_s, ez_s, jy_s)["features"].detach()  # don't calc gradient
        feat_target = feat_extract_target(bx_t, by_t, bz_t, ex_t, ey_t, ez_t, jy_t)
        
        feat_extract_target.train()
        optimizer.zero_grad()
        # put features into spaces the loss fn expects
        logsm_feat_target = nn.functional.log_softmax(feat_target, dim=-1)
        logsm_feat_source = nn.functional.log_softmax(feat_source, dim=-1)
        lossF = loss_fn(logsm_feat_target, logsm_feat_source)  # pred, true
        lossF.backward()
        optimizer.step() 
        loss_feat.append(lossF.item())

        if (batch+1) % 50 == 0:
            current_sample = (batch+1)*batch_size
            print(f"feat extract loss: {lossF}, sample {current_sample}/{total_batches*batch_size}")
    # configure function outputs 
    inter_iteration_stuff["iter_source"] = iter_source
    if "loss_feat" not in inter_iteration_stuff:
        inter_iteration_stuff["loss_feat"] = [] 
    inter_iteration_stuff["loss_feat"] += loss_feat
    
    return inter_iteration_stuff

In [36]:
def repeat_layers_n_times(layer_list, n):  # this instead of something simpler to be absolutely sure the layers are different objects and not repeating the same one
    new_layer_list = []
    for i in range(n):
        for layer in layer_list:
            new_layer_list.append(copy.deepcopy(layer))
    return new_layer_list

In [37]:
class MMSFeatExtract(nn.Module): #TODO: strided convolution instead of pooling?
    """ 1D CNN Model """
    def __init__(self, mms_num_conv, mms_kp_limit, mms_kernel_size, mms_pool_size, mms_out_channels, mms_learning_rate, mms_dropout_fraction, feat_shape):
        super().__init__()
        # define these all separately because they will get different weights
        # consider smooshing these together into one convolution with in_channels=6. Idk if a good idea
        feat_shape_nobatch = feat_shape[1:]
        self.bx_layers = nn.Sequential(*repeat_layers_n_times([nn.LazyConv1d(mms_out_channels, mms_kernel_size, padding='valid'),
                                           nn.LeakyReLU(), nn.AvgPool1d(mms_pool_size), nn.Dropout(p=mms_dropout_fraction)], mms_num_conv))
        self.by_layers = nn.Sequential(*repeat_layers_n_times([nn.LazyConv1d(mms_out_channels, mms_kernel_size, padding='valid'),
                                           nn.LeakyReLU(), nn.AvgPool1d(mms_pool_size), nn.Dropout(p=mms_dropout_fraction)], mms_num_conv))
        self.bz_layers = nn.Sequential(*repeat_layers_n_times([nn.LazyConv1d(mms_out_channels, mms_kernel_size, padding='valid'),
                                           nn.LeakyReLU(), nn.AvgPool1d(mms_pool_size), nn.Dropout(p=mms_dropout_fraction)], mms_num_conv))
        self.ex_layers = nn.Sequential(*repeat_layers_n_times([nn.LazyConv1d(mms_out_channels, mms_kernel_size, padding='valid'),
                                           nn.LeakyReLU(), nn.AvgPool1d(mms_pool_size), nn.Dropout(p=mms_dropout_fraction)], mms_num_conv))
        self.ey_layers = nn.Sequential(*repeat_layers_n_times([nn.LazyConv1d(mms_out_channels, mms_kernel_size, padding='valid'),
                                           nn.LeakyReLU(), nn.AvgPool1d(mms_pool_size), nn.Dropout(p=mms_dropout_fraction)], mms_num_conv))
        self.ez_layers = nn.Sequential(*repeat_layers_n_times([nn.LazyConv1d(mms_out_channels, mms_kernel_size, padding='valid'),
                                           nn.LeakyReLU(), nn.AvgPool1d(mms_pool_size), nn.Dropout(p=mms_dropout_fraction)], mms_num_conv))
        self.jy_layers = nn.Sequential(*repeat_layers_n_times([nn.LazyConv1d(mms_out_channels, mms_kernel_size, padding='valid'),
                                           nn.LeakyReLU(), nn.AvgPool1d(mms_pool_size), nn.Dropout(p=mms_dropout_fraction)], mms_num_conv))
        
        self.post_merge_layers = nn.Sequential(nn.Conv1d(mms_out_channels, mms_out_channels*2, mms_kernel_size,
                                                         padding='valid'),
                                               nn.LeakyReLU(),
                                               nn.AvgPool1d(mms_pool_size))
        self.resize_features = nn.Sequential(nn.Flatten(), nn.LazyLinear(np.prod(feat_shape_nobatch)), nn.Unflatten(-1, feat_shape_nobatch))
                                               

    def forward(self, bx, by, bz, ex, ey, ez, jy):
        bx_proc = self.bx_layers(bx)
        by_proc = self.by_layers(by)
        bz_proc = self.bz_layers(bz)
        ex_proc = self.ex_layers(ex)
        ey_proc = self.ey_layers(ey)
        ez_proc = self.ez_layers(ez)
        jy_proc = self.jy_layers(jy)
        combined = (bx_proc + by_proc + bz_proc + ex_proc + ey_proc + ez_proc + jy_proc)/6.
        mms_features = self.post_merge_layers(combined)
        features = self.resize_features(mms_features)
        
        return features

### Main

In [38]:
# parameters
mms_epochs = 1  # number of times to loop through the entire mms dataset (start with 1 lmao)
ndb_samples = 10000
name='da_dist'

rank = os.environ.get("OMPI_COMM_WORLD_RANK")


In [39]:
# load a model
%run /tigress/kendrab/analysis-notebooks/torch_models/import_model.ipynb
batch_size = 64
# TODO?: enforce that key variables that need to exist later down the pipeline
# are populated by import_model?



In [40]:
# extract the model's features (before classification step)
# get_graph_node_names(model)
return_nodes = {'post_merge_layers.2' : 'features'}
feat_sim = create_feature_extractor(model, return_nodes=return_nodes)
mock_data = torch.ones((batch_size, 1, input_length), dtype=dtype)
tsum = torchinfo.summary(feat_sim, input_data = [mock_data for i in range(7)])
feat_shape = tsum.summary_list[-1].output_size
# extract the classifier part
all_classifier = nn.Sequential(*list(model.children())[-1][-2:])
torchinfo.summary(all_classifier)

Layer (type:depth-idx)                   Param #
Sequential                               --
├─Flatten: 1-1                           --
├─LazyLinear: 1-2                        2,387
Total params: 2,387
Trainable params: 2,387
Non-trainable params: 0

In [41]:
# Load the sim data
%run /tigress/kendrab/analysis-notebooks/torch_models/import_sim_data.ipynb

(425291, 1, 89)
(425291, 2, 11)
torch.float64


In [42]:
# Load the mms data locations and shuffle the files to not be chronological
global_mms_filenames = get_filenames()
debug_filename = "2021-08-18T04-48-00_2021-08-20T04-30-30.h5"

### Time to do some training 

In [43]:
def objective(trial):
    # timing stuff
    start = datetime.now(timezone.utc)  # for timing
    time_str = start.strftime("%H%M%S")
    date_str = start.strftime("%d-%m-%y")
    start_str = date_str + time_str

    # make suggestions
    mms_num_conv = trial.suggest_int('num_conv', 1, 2)  # for separate layers but also for end layer (so really there are num_conv*2 layers)
    mms_kp_limit = int(input_length**(1/(mms_num_conv+1)))  # a non-maximal upper bound on sizes to avoid running out of length
    print(f"kernel eqn limit kp={kp_limit}")
    min_pool_size=2
    mms_kernel_size = trial.suggest_int('kernel_size', 2, min(mms_kp_limit - min_pool_size, 10))  # max size 10 or lower
    mms_pool_size = trial.suggest_int('pool_size', min_pool_size, min(mms_kp_limit - mms_kernel_size, 5))
    mms_out_channels = trial.suggest_int('out_channels', 8, 56)  # like 'filters' in keras
    mms_learning_rate = trial.suggest_float('learning_rate', 0.001, 0.003, log=True)  # CANT CHANGE LOG NOW >:(
    mms_dropout_fraction = trial.suggest_float('dropout', 0, 0.3)    

    hyperparams = {'mms_num_conv': mms_num_conv, 'mms_kp_limit': mms_kp_limit, 'mms_kernel_size':mms_kernel_size, 'mms_pool_size':mms_pool_size,
                        'mms_out_channels':mms_out_channels, 'mms_learning_rate':mms_learning_rate, 'mms_dropout_fraction':mms_dropout_fraction,
                        'feat_shape':feat_shape}

    mms_filenames = global_mms_filenames
    
    # make the boyes
    feat_mms = MMSFeatExtract(mms_num_conv, mms_kp_limit, mms_kernel_size, mms_pool_size, mms_out_channels, mms_learning_rate,
                              mms_dropout_fraction, feat_shape).to(device=device, dtype=torch.double)
    loss_fn = torch.nn.KLDivLoss(reduction= 'batchmean', log_target=True)
    opt = torch.optim.Adam(model.parameters(),lr=learning_rate)
    sum_loss_feat = 0
    
    for epoch in range(mms_epochs):
        loss_feat = []
        print(f"Starting Epoch {epoch+1}")
        mms_filenames = shuffle(mms_filenames)
        sim_iter = iter(sim_dl)  # to make sure we loop through the whole dataset before starting over even as we switch between mms files
        for i, mms_file in enumerate(mms_filenames):
            print(f"getting MMS file {mms_file}, number {i+1} of {len(mms_filenames)}")
            mms_data_dict = get_mms_data(mms_file)
            if len(mms_data_dict) == 0:  # data was not loaded, skip this file
                continue
            mms_dl = format_mms_data(mms_data_dict)
            
            training_step = train(sim_dl, mms_dl, feat_sim, feat_mms, loss_fn, opt, iter_source=sim_iter)
            sim_iter = training_step["iter_source"]
            loss_feat += training_step["loss_feat"]
            
        sum_loss_feat = sum(loss_feat)  

    # calculate ndb score
    mms_features, _ = get_mms_features(feature_extractor=feat_mms, n=ndb_samples)
    ndb_sim_dl = DataLoader(sim_dset, batch_size = ndb_samples, shuffle=True, drop_last=True)
    ndb_sim_samples = next(iter(ndb_sim_dl))
    sim_features = get_sim_features(feature_extractor = feat_sim, samples = ndb_sim_samples, n = ndb_samples)["features"]
    mms_features_flat = torch.flatten(mms_features, start_dim=1).detach()
    sim_features_flat = torch.flatten(sim_features, start_dim=1).detach()
    ndb = ndb_score(sim_features_flat, mms_features_flat)
    
    # save if we are retrieving a specific trial
    if study.user_attrs['save']:
        print("Saving model...")  # DEBUG

        if rank is not None:
            time_str += f"_{rank}"  # differentiate between mpi ranks that started at same second
        log_file, _, _, file_start = generic_outputs_structure("/tigress/kendrab/analysis-notebooks/model_outs/",
                                                                name, date_str, time_str)
        # Dump information to file
        with open(log_file, 'w') as log:
            log.write(f"MMS model {name} domain adapted on {start_str}\n")
            log.write(f"using model file {model_file}\n")
            log.write(f"trial number: {trial.number}\n")
            log.write(f"Feature extractor loss: {sum_loss_feat}\n")
            log.write(f"NDB score: {ndb}\n")
            log.write("Hyperparameters:\n")
            for key in hyperparams.keys():
                log.write(f"{key}\t\t{hyperparams[key]}\n")
                
        # save the mms classifier
        class MMSModel(nn.Module):
            def __init__(self, feat_extract, classifier):
                super().__init__()
                self.feat_extract = feat_extract
                self.classifier = classifier
            
            def forward(self, bx, by, bz, ex, ey, ez, jy):
                features = self.feat_extract(bx, by, bz, ex, ey, ez, jy)
                logits = self.classifier(features)
                return logits
            
        mms_classifier = MMSModel(feat_mms, all_classifier)
        print(torchinfo.summary(mms_classifier))
        torch.save(mms_classifier.state_dict(), file_start+"mms_classifier_statedict.tar")             
    
    # trial ended
    end = datetime.now(timezone.utc)
    print(f"trial execution time (s): {(end-start).total_seconds()}")
    return ndb

In [44]:
# assume study already made
""" study = optuna.create_study(study_name='da_discrep2',storage="mysql+mysqldb://optunauser:Frikkenoptuna@stellar-intel.princeton.edu:47793/da_discrep", direction='minimize')"""
study = optuna.load_study(study_name='da_discrep2',storage="mysql+mysqldb://optunauser:Frikkenoptuna@stellar-intel.princeton.edu:47793/da_discrep")
# regular training with saving models
study.set_user_attr('save', True)
study.optimize(objective, n_trials=1)
# # bringing back the best one to train a new model
# trial_num = study.user_attrs['knee_trial_num']
# best_params = study.trials[trial_num].params
# study.enqueue_trial(best_params, skip_if_exists=False)
# study.set_user_attr('save',True)
# study.optimize(objective, n_trials=4)
# objective(study.trials[trial_num])

kernel eqn limit kp=9
Starting Epoch 1
getting MMS file 2017-07-07T19-35-30_2017-07-08T01-30-30.h5, number 1 of 252




components loaded in order ['B', 'E', 'j', 'time']
training on 425291 source samples and 1704 target samples


[I 2024-10-24 10:48:09,102] Trial 2 finished with value: 0.96 and parameters: {'num_conv': 1, 'kernel_size': 7, 'pool_size': 2, 'out_channels': 19, 'learning_rate': 0.0012192174762584518, 'dropout': 0.2637361201158373}. Best is trial 1 with value: 0.92.


Bin number 0 found statistically different with z-score 20.433762886797137 > 1.96
Bin number 1 found statistically different with z-score 14.742332445504958 > 1.96
Bin number 2 found statistically different with z-score 26.632560811275656 > 1.96
Bin number 3 found statistically different with z-score 24.50519766019962 > 1.96
Bin number 4 found statistically different with z-score 25.504714253411414 > 1.96
Bin number 5 found statistically different with z-score 21.91488415108522 > 1.96
Bin number 6 found statistically different with z-score 18.14759572885496 > 1.96
Bin number 7 found statistically different with z-score 16.23035793640016 > 1.96
Bin number 8 found statistically different with z-score 25.837226264274413 > 1.96
Bin number 9 found statistically different with z-score 13.702045035498896 > 1.96
Bin number 10 found statistically different with z-score 129.074595187125 > 1.96
Bin number 11 found statistically different with z-score 12.937902900046966 > 1.96
Bin number 12 found 

In [45]:
# # plot the loss
# fig, ax = plt.subplots(2)
# ax[0].plot(loss_disc)
# ax[1].plot(loss_feat)
# ax[0].set(title="Discriminator loss", xlabel="training iteration", ylabel="loss")
# ax[1].set(title="MMS Feature Extractor loss", xlabel="training iteration", ylabel="loss")
# fig.savefig("/tigress/kendrab/analysis-notebooks/model_outs/scratchwork/training_losses.svg")  # TODO: save model and training info to its own folder
# plt.close(fig='all')