In [1]:
import pandas as pd
import numpy as np
from pymatgen.core import Structure, Molecule
from tqdm import tqdm

import torch
from torch import nn
import dgl
from dgl.data.utils import split_dataset

from baseNet.layers import MLP
from baseNet.models import MLPNet
from baseNet.graph.data import myDataset, myDataLoader, collate_fn
from baseNet.graph.converters import get_element_list, Molecule2Graph

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the QM9 sample table; each row carries a struct_id, a stringified
# pymatgen structure dict and an energy value.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which
# presumably is an index that was written out when the CSV was saved -- confirm.
data = pd.read_csv("qm9_sample.csv")
data

Unnamed: 0,struct_id,structure,energy
0,624,"{'@module': 'pymatgen.core.structure', '@class...",-307.529289
1,1028,"{'@module': 'pymatgen.core.structure', '@class...",-359.350089
2,3039,"{'@module': 'pymatgen.core.structure', '@class...",-324.556404
3,3176,"{'@module': 'pymatgen.core.structure', '@class...",-345.632155
4,3990,"{'@module': 'pymatgen.core.structure', '@class...",-402.159485
...,...,...,...
123,127070,"{'@module': 'pymatgen.core.structure', '@class...",-415.378890
124,128346,"{'@module': 'pymatgen.core.structure', '@class...",-432.865261
125,129530,"{'@module': 'pymatgen.core.structure', '@class...",-505.948830
126,131787,"{'@module': 'pymatgen.core.structure', '@class...",-475.012138


In [3]:
def load_dataset(
    dataframe,
) -> tuple[list[Molecule], list, list[float], list[list[list[float]]]]:
    """Turn a table of serialized molecules into parallel Python lists.

    Args:
        dataframe: pandas DataFrame with the columns ``"structure"`` (string
            form of a pymatgen ``Molecule`` dict), ``"struct_id"`` and
            ``"energy"``.

    Returns:
        A 4-tuple ``(structures, mol_ids, energy, stress)`` where

        * ``structures`` holds one ``Molecule`` per row, rebuilt from the
          species strings and float32 Cartesian coordinates only,
        * ``mol_ids`` holds the ``struct_id`` values as stored in the frame,
        * ``energy`` holds the energies converted to Python ``float``,
        * ``stress`` holds a 3x3 all-zero nested list per row (placeholder).
    """
    structures = []
    mol_ids = []
    energy = []
    stress = []

    # Iterate over the columns positionally instead of `dataframe[col][i]`:
    # the latter is a label lookup and breaks (or picks the wrong row) as soon
    # as the frame does not have a default 0..n-1 index, e.g. after filtering.
    rows = zip(dataframe["structure"], dataframe["struct_id"], dataframe["energy"])
    for structure_repr, struct_id, mol_energy in tqdm(rows, total=len(dataframe)):
        # NOTE(security): eval() executes arbitrary code found in the CSV cell.
        # Only use this on trusted files; consider ast.literal_eval or storing
        # the structures as JSON instead.
        mol = Molecule.from_dict(eval(structure_repr))

        # Rebuild the molecule from species + float32 coordinates only.
        eles = [site.species_string for site in mol]
        coords = mol.cart_coords.astype("float32")
        mol = Molecule(eles, coords)

        structures.append(mol)
        mol_ids.append(struct_id)
        energy.append(float(mol_energy))
        stress.append(np.zeros((3, 3)).tolist())

    return structures, mol_ids, energy, stress


# Parse every row of the table. `mol_ids` and `stress` are returned by
# load_dataset but are not used anywhere else in this cell.
molecules, mol_ids, energy, stress = load_dataset(data)

# Element types gathered from the loaded molecules; passed to the converter below.
elem_list = get_element_list(molecules)
# Set up the molecule-to-graph converter.
# NOTE(review): cutoff=4.0 is presumably a distance cutoff in angstrom -- confirm.
converter = Molecule2Graph(element_types=elem_list, cutoff=4.0)
# Wrap the molecules and their energy labels in a myDataset that uses the converter.
mp_dataset = myDataset(
    structures=molecules,
    labels={
        "energies": energy,
    },
    converter=converter,
)

# 90% / 5% / 5% train / validation / test split, shuffled with a fixed
# random_state so the split is the same on every run.
train_data, val_data, test_data = split_dataset(
    mp_dataset,
    frac_list=[0.9, 0.05, 0.05],
    shuffle=True,
    random_state=42,
)

# One loader per split, batching 16 samples at a time via collate_fn
# (num_workers=0 is passed through to the loader).
train_loader, val_loader, test_loader = myDataLoader(
    train_data=train_data,
    val_data=val_data,
    test_data=test_data,
    collate_fn=collate_fn,
    batch_size=16,
    num_workers=0,
)

100%|█████████████████████████████████████████████████████████████████| 128/128 [00:00<00:00, 1112.59it/s]


In [4]:
# Grab the first training batch for inspection. next(iter(...)) raises
# StopIteration when the loader is empty, whereas a `for ... break` loop would
# silently leave `g` undefined (or stale from an earlier run of this cell).
batch = next(iter(train_loader))
# Each batch unpacks into four items; only the batched graph `g` is printed here.
g, lat, state_attr, labels = batch
print(g)

Graph(num_nodes=283, num_edges=3468,
      ndata_schemes={'frac_coords': Scheme(shape=(3,), dtype=torch.float32), 'node_type': Scheme(shape=(), dtype=torch.int32)}
      edata_schemes={'bond_vec': Scheme(shape=(3,), dtype=torch.float32), 'bond_dist': Scheme(shape=(), dtype=torch.float32), 'pbc_offset': Scheme(shape=(3,), dtype=torch.float32)})


In [5]:
# Build the network with dropout probability 0.05; ending the cell with
# `model` displays its module structure.
# NOTE(review): the list [128, 1024, 100] presumably sets the layer widths of
# the MLP blocks -- confirm against baseNet.models.MLPNet.
model = MLPNet([128, 1024, 100], dropout=0.05)
model

MLPNet(
  (MLPblock): ModuleList(
    (0): MLP(128 → 1024, 1024 → 100)
    (1): MLP(128 → 1024, 1024 → 100)
    (2): MLP(128 → 1024, 1024 → 100)
  )
  (dropout): Dropout(p=0.05, inplace=False)
  (init0): MLP(3 → 128)
  (init1): MLP(100 → 128)
  (out): MLP(128 → 1)
)

In [8]:
# Forward pass of the model on the batched graph `g`.
# NOTE(review): the result has 3468 entries, which equals num_edges of the
# printed batched graph rather than the number of molecules in the batch, so
# the model appears to emit one value per edge. Presumably a readout/pooling
# step is still needed before comparing with per-molecule energies -- confirm.
out = model(g)
out

tensor([0.0974, 0.1096, 0.0912,  ..., 0.0810, 0.0926, 0.0926],
       grad_fn=<SqueezeBackward0>)

In [9]:
# Number of values returned by the forward pass.
len(out)

3468