In [1]:
import torch

# Make newly created tensors land on the GPU when one is available; otherwise
# fall back to the CPU so tensor creation does not fail on machines without CUDA.
torch.set_default_device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import pandas as pd
from tqdm import tqdm
from pymatgen.core.structure import Structure, Molecule
from megnet.models import MEGNetModel
from megnet.data.crystal import CrystalGraph
import numpy as np
from sklearn.model_selection import train_test_split
import random

In [3]:
from MEGNetSparse.model import MEGNet

In [4]:
# Read the table of molecules; only the 'xyz' and 'U_0' columns are used below.
data = pd.read_csv('../megnet_test/data/data_all_new.csv')

molecules = []
targets = []

# Walk the two needed columns in lockstep instead of DataFrame.iterrows():
# iterrows() builds a full Series for every row (slow, and it coerces dtypes),
# and its index value was never used here.
for xyz_block, u0_total in tqdm(zip(data['xyz'], data['U_0']), total=len(data)):
    molecule = Molecule.from_str(xyz_block, 'xyz')
    molecules.append(molecule)
    # The regression target is U_0 divided by len(molecule), i.e. a per-site value.
    targets.append(u0_total / len(molecule))

100%|██████████| 122381/122381 [00:43<00:00, 2813.89it/s]


In [5]:
def create_structure_dict(structures: list, targets: list):
    """Bucket ``(structure, target)`` pairs by the structure's ``formula``.

    Args:
        structures: Objects exposing a ``formula`` attribute.
        targets: Target values, positionally aligned with ``structures``.

    Returns:
        dict mapping each formula to the list of ``(structure, target)`` tuples
        that share it, in the order they were encountered.
    """
    grouped = {}

    for pair in tqdm(zip(structures, targets), total=len(structures)):
        # pair[0] is the structure, pair[1] its target; the whole tuple is stored.
        grouped.setdefault(pair[0].formula, []).append(pair)

    return grouped

def train_test_split_for_structures(structure_dict: dict, test_size: float = 0.1,
                                    random_state: int = 42):
    """Split grouped ``(structure, target)`` pairs into train and test sets per group.

    Every group with at least two entries is split with ``train_test_split``;
    groups with a single entry go entirely to the training side. Both sides are
    then shuffled so entries from the same group are not contiguous.

    Args:
        structure_dict: Mapping from a group key to a list of
            ``(structure, target)`` pairs.
        test_size: Fraction of each group handed to ``train_test_split`` as the
            test share.
        random_state: Seed used both for the per-group split and for the final
            shuffles, so repeated calls return the same ordering.

    Returns:
        Tuple ``(structures_train, targets_train, structures_test, targets_test)``
        of four lists; the structure and target lists of each side are aligned.
    """
    train_all, test_all = [], []

    for entries in tqdm(structure_dict.values()):
        if len(entries) < 2:
            # A single entry cannot be split; keep it for training.
            train_all += entries
        else:
            train_part, test_part = train_test_split(
                entries, test_size=test_size, random_state=random_state
            )
            train_all += train_part
            test_all += test_part

    # Use a private, seeded generator: the global `random` state is left
    # untouched and the shuffle order is reproducible across runs.
    rng = random.Random(random_state)
    rng.shuffle(train_all)
    rng.shuffle(test_all)

    structures_train = [pair[0] for pair in train_all]
    targets_train = [pair[1] for pair in train_all]
    structures_test = [pair[0] for pair in test_all]
    targets_test = [pair[1] for pair in test_all]

    return structures_train, targets_train, structures_test, targets_test

In [6]:
# Stage 1: group all molecules by formula and hold out the default share
# (test_size=0.1) of every formula group as a combined validation+test pool.
structure_dict = create_structure_dict(molecules, targets)
(
    molecules_train,
    targets_train,
    molecules_val_test,
    targets_val_test,
) = train_test_split_for_structures(structure_dict)

# Stage 2: regroup the held-out pool and split it in half per formula group;
# the "train" side of this second call is used as the validation set.
structure_dict_val_test = create_structure_dict(molecules_val_test, targets_val_test)
(
    molecules_val,
    targets_val,
    molecules_test,
    targets_test,
) = train_test_split_for_structures(structure_dict_val_test, test_size=0.5)

100%|██████████| 122381/122381 [00:08<00:00, 13737.46it/s]
100%|██████████| 134/134 [00:00<00:00, 2488.04it/s]
100%|██████████| 12293/12293 [00:00<00:00, 13101.31it/s]
100%|██████████| 117/117 [00:00<00:00, 4189.79it/s]


In [7]:
# Settings dictionary handed to MEGNetTrainer: a 'model' section and an
# 'optim' section.
# NOTE(review): 'werespecies' is kept verbatim; confirm it is a value the
# library accepts for 'atom_features'.
config = dict(
    model=dict(
        train_batch_size=54,
        test_batch_size=54,
        add_z_bond_coord=False,
        atom_features='werespecies',
        state_input_shape=2,
        cutoff=5,
        edge_embed_size=10,
        vertex_aggregation='mean',
        global_aggregation='mean',
        embedding_size=32,
        nblocks=3,
    ),
    optim=dict(
        factor=0.5,
        patience=30,
        threshold=5e-2,
        min_lr=1e-5,
        lr_initial=1e-3,
        scheduler='ReduceLROnPlateau',
    ),
)

In [8]:
from MEGNetSparse import MEGNetTrainer

# Build the trainer from the settings dict. The second argument is presumably
# the device to train on (the recorded run prints "device: cuda") — TODO confirm
# against the MEGNetTrainer signature.
trainer = MEGNetTrainer(config, 'cuda')



In [9]:
# Hand the training and validation molecules with their targets to the trainer.
# The last argument is presumably the target's name (the recorded run prints
# "target: U0") — TODO confirm. The test split is not passed here.
trainer.prepare_data(molecules_train, targets_train, molecules_val, targets_val, "U0")

adding targets to data




converting data


100%|██████████| 110088/110088 [00:41<00:00, 2657.62it/s]
100%|██████████| 6140/6140 [00:01<00:00, 3319.93it/s]


In [10]:
# Run a single training epoch.
# NOTE(review): the recorded run of this cell raised
#   RuntimeError: stack expects each tensor to be equal size, ...
# The root cause is not visible in this notebook; it looks like differently
# sized per-molecule tensors being stacked into one batch — TODO investigate
# how the trainer batches the converted data before relying on this cell.
trainer.train_one_epoch()

target: U0 device: cuda


RuntimeError: stack expects each tensor to be equal size, but got [336] at entry 0 and [90] at entry 1