## Inspecting Data
- Reference: the code in these cells is based on `src/mist/train_mist.py`.

In [1]:
import argparse
import logging
import pickle
from pathlib import Path

import yaml

from mist import parsing, utils
from mist.data import datasets, featurizers, splitter
from mist.models import mist_model

In [2]:
import os
# Display the current working directory. The file paths in `kwargs`
# ("save_dir", "split_file") are written as relative paths, so presumably they
# are resolved against this directory — TODO confirm.
os.getcwd()

'/Users/michaelvolk/Documents/projects/mist'

In [3]:
# Keyword arguments that, per the original note, are passed through argparse;
# they are hard-coded here so the notebook cells can be run directly.
# NOTE(review): the section headings below are a reading aid inferred from key
# names only — they are not verified against the mist code. Key order is kept.
kwargs = {
    # --- run setup ---
    "debug": False,
    "seed": 1,
    "save_dir": "results/model_train_demos/mist_fp_model",
    # --- dataset and split ---
    "dataset_name": "canopus_train_public",
    "splitter_name": "preset",
    "reshuffle_val": False,
    "split_sizes": [0.8, 0.1, 0.1],
    "split_file": "data/paired_spectra/canopus_train_public/splits/canopus_hplus_100_0.csv",
    # --- augmentation ---
    "augment_data": True,
    "augment_prob": 0.5,
    "remove_weights": "exp",
    "inten_prob": 0.1,
    "remove_prob": 0.5,
    "add_forward_specs": False,
    "forward_aug_folder": None,
    "frac_orig": 0.4,
    # --- optimizer and schedule ---
    "learning_rate": 0.00077,
    "weight_decay": 1e-07,
    "min_lr": 0.0001,
    "lr_decay_frac": 0.95,
    "scheduler": False,
    "lr_decay_time": 10000,
    "patience": 30,
    "optim_name": "radam",
    # --- training loop and data loading ---
    "ckpt_file": None,
    "min_epochs": None,
    "gpus": 0,
    "max_epochs": 600,
    "batch_size": 128,
    "num_workers": 0,
    "persistent_workers": False,
    "cache_featurizers": True,
    "gradient_clip_val": 5,
    # --- loss and prediction head ---
    "loss_fn": "cosine",
    "worst_k_weight": None,
    "top_layers": 1,
    "fp_names": ["morgan4096"],
    "shuffle_train": False,
    "iterative_preds": "growing",
    "iterative_loss_weight": 0.4,
    "refine_layers": 4,
    # --- model architecture ---
    "hidden_size": 256,
    "max_peaks": None,
    "spectra_dropout": 0.1,
    "frag_fps_loss_lambda": 8.0,
    "magma_modulo": 512,
    "magma_aux_loss": True,
    "peak_attn_layers": 2,
    "num_heads": 8,
    "additive_attn": False,
    "pairwise_featurization": True,
    "use_cls": True,
    "single_form_encoder": True,
    "recycle_form_encoder": True,
    "cls_type": "ms1",
    "set_pooling": "cls",
    "max_count": None,
    # --- model / feature identifiers (reassigned from the model class in a
    # later cell of this notebook) ---
    "model": "MistNet",
    "spec_features": "peakformula",
    "mol_features": "fingerprint",
    "dataset_type": "default",
}


In [4]:
# Build the splitter from the shared kwargs.
my_splitter = splitter.get_splitter(**kwargs)

# Fix the model class, then copy its name and its feature / dataset-type
# identifiers into kwargs before anything else is built from kwargs.
model_class = mist_model.MistNet
kwargs.update(
    model=model_class.__name__,
    spec_features=model_class.spec_features(),
    mol_features=model_class.mol_features(),
    dataset_type=model_class.dataset_type(),
)

# Featurizer selected by the kwargs updated above.
paired_featurizer = featurizers.get_paired_featurizer(**kwargs)

# Load the paired spectra and transpose the returned sequences with zip(*...)
# so that each list element is one tuple.
spectra_mol_pairs = list(zip(*datasets.get_paired_spectra(**kwargs)))

# Original note: "Redefine splitter s.t. this splits three times and remove
# subsetting". NOTE(review): the intent of that note is not clear from this
# notebook — confirm against the training script.
split_name, (train, val, test) = my_splitter.get_splits(spectra_mol_pairs)


10709it [00:00, 151019.93it/s]
10709it [00:01, 9080.16it/s]
10709it [00:00, 5143341.52it/s]


In [5]:
# Wrap each split in a SpectraMolDataset. All three are built with the same
# featurizer and kwargs, in train, val, test order.
train_dataset, val_dataset, test_dataset = [
    datasets.SpectraMolDataset(
        spectra_mol_list=split_pairs, featurizer=paired_featurizer, **kwargs
    )
    for split_pairs in (train, val, test)
]

# Hand the three datasets to a single SpecDataModule.
spec_dataloader_module = datasets.SpecDataModule(
    train_dataset, val_dataset, test_dataset, **kwargs
)

In [119]:
# Report the number of examples in each split's dataset, one per line.
print(
    f"Train len: {len(train_dataset)}",
    f"Val len: {len(val_dataset)}",
    f"Test len: {len(test_dataset)}",
    sep="\n",
)

Train len: 6141
Val len: 1070
Test len: 819


In [120]:
# Display the first test example. In the recorded (truncated) output it is a
# dict whose 'spec' entry is a list of dicts holding NumPy arrays.
test_dataset[0]

{'spec': [{'peak_type': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3]),
   'form_vec': array([[0.27160494, 0.        , 0.        , 0.29411765, 0.        ,
           0.        , 0.        , 0.12658228, 0.        , 0.        ,
           0.        , 0.        , 0.        , 0.        , 0.        ,
           0.        , 0.30210197],
          [0.14814815, 0.        , 0.        , 0.14705882, 0.        ,
           0.        , 0.        , 0.06329114, 0.        , 0.        ,
           0.        , 0.        , 0.        , 0.        , 0.        ,
           0.        , 0.15921618],
          [0.13580247, 0.        , 0.        , 0.11764706, 0.        ,
           0.        , 0.        , 0.06329114, 0.        , 0.        ,
           0.        , 0.        , 0.        , 0.        , 0.        ,
           0.        , 0.14017471],
          [0.13580247, 0.        , 0.        , 0.11764706, 0.        

In [121]:
# List the top-level keys of one dataset example.
test_dataset[0].keys()

dict_keys(['spec', 'mol', 'spec_indices', 'mol_indices', 'matched'])

In [122]:
# List the keys of the first entry in the example's 'spec' list.
test_dataset[0]["spec"][0].keys()

dict_keys(['peak_type', 'form_vec', 'frag_intens', 'name', 'magma_fps', 'magma_aux_loss'])

In [123]:
# Integer array; in the recorded output every entry is 0 except the last,
# which is 3. NOTE(review): presumably one type code per peak — confirm.
test_dataset[0]["spec"][0]["peak_type"]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3])

In [124]:
# Length of peak_type (43 in the recorded run); this matches the first
# dimension of the form_vec, frag_intens and magma_fps shapes shown further down.
len(test_dataset[0]["spec"][0]["peak_type"])

43

In [126]:
# Length of the 'spec' list for every test example. Index-based access is kept
# because the dataset's iteration behaviour is not visible in this notebook.
spec_lens = [len(test_dataset[i]["spec"]) for i in range(len(test_dataset))]

In [127]:
import pandas as pd
# Distinct 'spec' list lengths across the test set. The recorded output is
# array([1]), i.e. every test example had exactly one 'spec' entry in that run.
pd.Series(spec_lens).unique()

array([1])

In [128]:
# Shape of form_vec: (43, 17) in the recorded run.
test_dataset[0]["spec"][0]["form_vec"].shape

(43, 17)

In [129]:
# Shape of frag_intens: (43,) in the recorded run, one value per form_vec row.
test_dataset[0]["spec"][0]["frag_intens"].shape

(43,)

In [130]:
# String stored under 'name' for this spectrum ('CCMSLIB00000001645' in the
# recorded run).
test_dataset[0]["spec"][0]["name"]

'CCMSLIB00000001645'

In [131]:
# Shape of magma_fps: (43, 2048) in the recorded run.
test_dataset[0]["spec"][0]["magma_fps"].shape

(43, 2048)

In [132]:
# Boolean flag stored on the spectrum entry (True in the recorded run).
# NOTE(review): presumably carried over from kwargs["magma_aux_loss"] — confirm.
test_dataset[0]["spec"][0]["magma_aux_loss"]

True

In [133]:
# Length of the first 'mol' entry (4096 in the recorded run).
# NOTE(review): presumably the "morgan4096" fingerprint from kwargs["fp_names"] — confirm.
len(test_dataset[0]["mol"][0])

4096