In [4]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=3

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=3


In [5]:
import os
import pymatgen
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import trange, tqdm

In [6]:
structures = pd.read_csv("datasets/dichalcogenides_innopolis_202105/defects.csv", index_col=0)

In [7]:
defects_descriptors = pd.read_csv("datasets/dichalcogenides_innopolis_202105/descriptors.csv", index_col=0)

In [8]:
import pymatgen.io.cif
STRUCTURES_FOLDER = "datasets/dichalcogenides_innopolis_202105/initial/"
initial_structures = dict()
for structure_file in tqdm(os.listdir(STRUCTURES_FOLDER)):
  this_file = pymatgen.io.cif.CifParser(os.path.join(STRUCTURES_FOLDER, structure_file))
  initial_structures[os.path.splitext(structure_file)[0]] = this_file.get_structures()[0]

  0%|          | 0/3480 [00:00<?, ?it/s]


Issues encountered while parsing CIF: Some fractional co-ordinates rounded to ideal values to avoid issues with finite precision.



In [9]:
structures["initial_structure"] = structures.apply(lambda row: initial_structures[row._id], axis=1)

In [17]:
import matplotlib
import pylab

from math import floor, ceil
from random import shuffle, seed
from megnet.models import MEGNetModel
from megnet.data.graph import GaussianDistance
from megnet.data.crystal import CrystalGraphWithBondTypes
from megnet.data.molecule import MolecularGraph
from megnet.utils.preprocessing import StandardScaler
from megnet.callbacks import ModelCheckpointMAE
from pymatgen.core import Lattice, Structure, Molecule

cm = pylab.get_cmap('Set1')

import tensorflow as tf
import numpy as np

In [11]:
import wandb
from wandb.keras import WandbCallback

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
train, test = train_test_split(structures, test_size=0.25, random_state=42)

In [14]:
wandb.init(project='ai4material_design', entity='kazeev')

In [15]:
config = wandb.config
config.targets = ["homo", "energy_per_atom"]

In [24]:
gc = CrystalGraph(bond_converter=GaussianDistance(np.linspace(0, 5, 10), 0.5), cutoff=6.)
model = MEGNetModel(nfeat_edge=100, nfeat_global=2, ntarget=len(config.targets), graph_converter=gc)

In [25]:
scaler = StandardScaler.from_training_data(train["initial_structure"],
                                           train[config.targets].values, is_intensive=True)
model.target_scaler = scaler

In [26]:
model.train(train["initial_structure"], train[config.targets],
            validation_structures=test["initial_structure"],
            validation_targets=test[config.targets],
            callbacks=[WandbCallback()],
            epochs=100, verbose=1, patience=1000)

ValueError: The data dimension for bond features is (1, 3334, 10) and does not match model required shape of (None, None, 100)

In [None]:
# TODO(kazeevn) speedup
test.loc[:, "predicted"] = model.predict_structures(test["initial_structure"].values)

In [None]:
fig, ax = plt.subplots()
ax.scatter(test[config.target], test["predicted"])
ax.set_xlabel("True energy per atom, eV")
ax.set_ylabel("Predicted energy per atom, eV");