In [1]:
import os

import ipsuite as ips
import numpy as np
from apax.nodes import Apax, ApaxBatchPrediction
from ipsuite import OrcaSinglePoint

from src import ShuffleAndSelect, FixEnergy

2024-12-05 11:21:09,165 (DEBUG): Welcome to IPS - the Interatomic Potential Suite!


  from .autonotebook import tqdm as notebook_tqdm
I0000 00:00:1733394075.505165 1132165 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [2]:
# Workflow container: every node created inside a `project.group(...)` block
# in the cells below is attached to this project.
project = ips.Project(
    # NOTE(review): presumably lets nodes of the same class get distinct,
    # auto-generated names — confirm against the ZnTrack docs.
    automatic_node_names=True,
    # NOTE(review): presumably discards a previously built graph so the
    # notebook starts from a clean state — confirm.
    remove_existing_graph=True,
)

# Data Generation

In [3]:
# Location of the ORCA executable. Set the ORCA_SHELL environment variable to
# run this notebook on another machine; the fallback is the path that was
# previously hard-coded here.
ORCA_SHELL = os.environ.get(
    "ORCA_SHELL", "/data/fzills/tools/orca_5_0_4/orca"
)
# Number of configurations drawn by each random selection below.
N_CONFIGURATIONS = 500

with project.group("DataGeneration"):
    # Load the full MD trajectories from their H5MD files.
    traditional_md = ips.AddDataH5MD(file="data/traditional_md.h5")
    enhanced_sampling_md = ips.AddDataH5MD(file="data/enhanced_md.h5")
    # TODO: METAD MD

    # Draw random configurations from the trajectories. The second selection
    # draws from the frames the first one excluded.
    trad_dataset = ips.RandomSelection(
        data=traditional_md.frames, n_configurations=N_CONFIGURATIONS
    )
    rand_trad_dataset = ips.RandomSelection(
        data=trad_dataset.excluded_frames, n_configurations=N_CONFIGURATIONS
    )
    es_dataset = ips.RandomSelection(
        data=enhanced_sampling_md.frames, n_configurations=N_CONFIGURATIONS
    )

    # Flatten energies: run the project-local FixEnergy node on each selection.
    raw_datasets = [trad_dataset, rand_trad_dataset, es_dataset]
    datasets = [FixEnergy(data=selection.frames) for selection in raw_datasets]

    # One ORCA single-point calculation node per energy-fixed dataset.
    dft = [
        OrcaSinglePoint(
            data=fixed.frames,
            orcasimpleinput="PBE def2-TZVP TightSCF EnGrad",
            orcablocks="%pal nprocs 8 end",
            orca_shell=ORCA_SHELL,
        )
        for fixed in datasets
    ]

    # Split each DFT dataset into train, test and validation frames.
    # Later cells index this list: 0 = traditional, 1 = random traditional,
    # 2 = enhanced sampling (same order as `raw_datasets`).
    split_datasets = [
        ShuffleAndSelect(
            data=labelled.frames,
            n_train=20,
            n_test=400,
            n_validate=20,
        )
        for labelled in dft
    ]

# Model Training

In [None]:
with project.group("ModelTraining"):
    # Alias the three splits once instead of indexing `split_datasets` inline.
    base_split = split_datasets[0]
    random_split = split_datasets[1]
    es_split = split_datasets[2]

    # Base model, trained on the first split only.
    base_model = Apax(
        config="configs/base_train.yaml",
        data=base_split.train_frames,
        validation_data=base_split.validate_frames,
    )

    # The next two models receive `base_model` as their `model` input.
    # NOTE(review): presumably this makes them transfer-learning runs that
    # start from the base model (the config names suggest so) — confirm.
    r_trad_model = Apax(
        config="configs/random_transfer.yaml",
        model=base_model,
        data=random_split.train_frames,
        validation_data=random_split.validate_frames,
    )

    es_model = Apax(
        config="configs/es_transfer.yaml",
        model=base_model,
        data=es_split.train_frames,
        validation_data=es_split.validate_frames,
    )

    # Collected for the metrics cell, which loops over all three models.
    models = [base_model, r_trad_model, es_model]

# Model Metrics

In [5]:
with project.group("ModelMetrics"):
    # Every model is tested on the ES dataset, as this data covers most of
    # the CV space.
    es_split = split_datasets[2]
    for model in models:
        prediction = ApaxBatchPrediction(
            data=es_split.test_frames,
            model=model,
            batch_size=10,
        )
        # Compare the reference test frames (x) with the model output (y).
        ips.PredictionMetrics(x=es_split.test_frames, y=prediction.frames)

In [6]:
# Build the workflow graph assembled in the cells above. The captured output
# of this cell shows it saving params.yaml and processing 23 items.
# NOTE(review): presumably this only writes the workflow definition and does
# not run the ORCA or training nodes themselves — confirm.
project.build()

2024-12-05 11:21:19,496 - INFO: Saving params.yaml


100%|██████████| 23/23 [00:00<00:00, 270.75it/s]
