In [1]:
from jarvis.db.figshare import data
from jarvis.core.atoms import Atoms
import pandas as pd
from modnet.featurizers.presets import DeBreuck2020Featurizer
import os
# os.environ["OMP_NUM_THREADS"] = "1"
import numpy as np
from modnet.preprocessing import MODData
from modnet.models import MODNetModel
from pymatgen.core import Composition
import warnings
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
import json

  from .autonotebook import tqdm as notebook_tqdm
2023-09-07 21:40:02.875112: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-07 21:40:04.173649: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-07 21:40:04.173747: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:

class BasicFeaturizer(DeBreuck2020Featurizer):
    """Featurizer preset derived from ``DeBreuck2020Featurizer``.

    The class body overrides three class attributes of the parent preset:
    ``oxid_composition_featurizers`` (set to an empty tuple),
    ``composition_featurizers`` and ``site_featurizers``.

    The matminer / pymatgen imports are executed inside the class body, so
    every imported name is also bound as a class attribute of
    ``BasicFeaturizer``.
    """

    # Neighbour-finding strategy passed to the bond angle / bond length
    # site featurizers below.
    from pymatgen.analysis.local_env import VoronoiNN

    from matminer.featurizers.composition import (
        AtomicOrbitals,
        ElementFraction,
        ElementProperty,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
    )

    # NOTE(review): these structure featurizers are imported but never
    # referenced in this class body; any structure-level featurizers in use
    # are presumably inherited from the parent preset -- confirm.
    from matminer.featurizers.structure import (
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )

    # Empty tuple: no oxidation-state composition featurizers are configured
    # here (presumably this disables that stage of the parent preset --
    # TODO confirm against the modnet implementation).
    oxid_composition_featurizers = ()

    # Composition-level featurizers, instantiated once at class creation.
    composition_featurizers = (
        AtomicOrbitals(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
    )

    # Site-level featurizers, instantiated once at class creation.
    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

# Number of parallel workers requested from the featurizer.
N_FEATURIZER_JOBS = 20

# Shared featurizer instance used by every MODData object in this notebook.
basic_featurizer = BasicFeaturizer()
basic_featurizer.set_n_jobs(N_FEATURIZER_JOBS)
# Alternative kept for reference (not executed): basic_featurizer._n_jobs = None

Newer versions of matminer will not work, and older versions may not be compatible with newer MODNet versions due to other conflicts.
To use this featurizer robustly, please install `modnet==0.1.13` with its pinned dependencies.

This preset will now be initialised without importing matminer featurizers to enable use with existing previously featurized data, but attempts to perform further featurization will result in an error.


In [32]:

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')


def iterate_dataset(folder_path):
    """Return the names of the immediate sub-directories of ``folder_path``.

    Only the first level is inspected: ``os.walk`` is lazy and top-down, so
    taking its first item avoids traversing the whole directory tree.

    Parameters
    ----------
    folder_path : str or os.PathLike
        Directory whose direct sub-directories are listed.

    Returns
    -------
    list of str
        Sub-directory names in the order reported by ``os.walk``. Hidden
        directories are included. An empty list is returned when
        ``folder_path`` does not exist or is not a directory (in that case
        ``os.walk`` yields nothing).
    """
    # Fall back to an "empty directory" triple when os.walk yields nothing,
    # instead of failing with an IndexError.
    _, subfolders, _ = next(os.walk(folder_path), (folder_path, [], []))
    return subfolders


# Root folder holding one sub-folder per target property plus shared caches.
file_path = "/scratch/yll6162/modnet/ibrnet_data"

# Registry mapping a cache key to the target properties that used that cache.
registry_path = os.path.join(file_path, "feature_grp.json")
with open(registry_path, 'r') as registry_file:
    feature_grp = json.load(registry_file)

datasets = iterate_dataset(file_path)
# LOOP
target_name = "target"
mae_dic = {}
print(datasets)

# Hide the first physical GPU from TensorFlow.
# NOTE(review): TensorFlow documents that visible devices cannot be changed
# once the runtime has been initialised, so re-running this cell in a live
# kernel may raise -- confirm.
# NOTE(review): after hiding a device the remaining GPUs are presumably
# renumbered from 0, so a later '/device:GPU:1' may not be the GPU intended
# here -- confirm.
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(physical_devices[1:], 'GPU')

# Target selection; other choices tried previously:
# target_property = datasets[0]
# target_property = "jarvis_bulk_modulus"
target_property = "jarvis_shear_modulus"

# Train / test / validation splits live next to each other per target.
split_dir = os.path.join(file_path, target_property)
df_train = pd.read_csv(os.path.join(split_dir, "train.csv"))
df_test = pd.read_csv(os.path.join(split_dir, "test.csv"))
df_val = pd.read_csv(os.path.join(split_dir, "val.csv"))

# The cache key is the number of training rows, stored as a string.
key = str(len(df_train))
# Featurize from scratch only when no cache is registered under this key.
featurize = key not in feature_grp

# Convert every formula string into a pymatgen Composition object.
# NOTE(review): pandas' default NA parsing turns strings such as "NaN" into
# missing values; a missing formula would presumably make this mapping fail --
# confirm the inputs are clean.
for split_frame in (df_train, df_test, df_val):
    split_frame["composition"] = split_frame["formula"].map(Composition)
# Build (or reload from CSV) the featurized train / validation data, fit the
# MODNet model, and prepare the featurized test data for prediction.
#
# Feature caches are CSV files keyed by `key` (the training-set row count).
# NOTE(review): a cache is reused for any target property whose training set
# has the same number of rows; this presumably assumes those datasets contain
# the same formulas in the same order -- confirm.
train_cache = os.path.join(file_path, f"df_train_featurized_{key}.csv")
val_cache = os.path.join(file_path, f"df_val_featurized_{key}.csv")
pred_cache = os.path.join(file_path, f"df_pred_featurized_{key}.csv")

with tf.device('/device:GPU:1'):
    if featurize:
        # First use of this key: register it and compute the features.
        feature_grp[key] = [target_property]
        data_train = MODData(materials=df_train["composition"],
                             targets=df_train[target_name],
                             target_names=[target_name],
                             featurizer=basic_featurizer,
                             structure_ids=df_train.formula)

        data_train.featurize()
        data_train.feature_selection(n=200)
        data_train.df_featurized.to_csv(train_cache)

        data_val = MODData(materials=df_val["composition"],
                           targets=df_val[target_name],
                           target_names=[target_name],
                           featurizer=basic_featurizer,
                           structure_ids=df_val.formula)
        data_val.featurize()
        data_val.df_featurized.to_csv(val_cache)

    else:
        # Register the target only once: an unconditional append would add a
        # duplicate entry on every re-run, and the registry is later written
        # back to feature_grp.json.
        if target_property not in feature_grp[key]:
            feature_grp[key].append(target_property)

        # Reuse previously computed features; the CSV index holds the ids.
        df_train_featurized = pd.read_csv(train_cache, index_col=0)
        data_train = MODData(materials=df_train["composition"],
                             targets=df_train[target_name],
                             target_names=[target_name],
                             df_featurized=df_train_featurized,
                             structure_ids=df_train_featurized.index)
        data_train.feature_selection(n=200)

        df_val_featurized = pd.read_csv(val_cache, index_col=0)
        data_val = MODData(materials=df_val["composition"],
                           targets=df_val[target_name],
                           target_names=[target_name],
                           df_featurized=df_val_featurized,
                           structure_ids=df_val_featurized.index)

    # Single-target model with a four-block layer layout.
    model = MODNetModel([[[target_name]]],
                        weights={target_name: 1},
                        num_neurons=[[256], [64], [64], [32]],
                        )
    model.fit(data_train,
              val_data=data_val,
              epochs=250,
              verbose=1
              )

    # Test data carries no targets; it is only featurized (or reloaded).
    if featurize:
        data_to_predict = MODData(materials=df_test["composition"],
                                  featurizer=basic_featurizer,
                                  structure_ids=df_test.formula)
        data_to_predict.featurize()
        data_to_predict.df_featurized.to_csv(pred_cache)

    else:
        df_pred_featurized = pd.read_csv(pred_cache, index_col=0)
        data_to_predict = MODData(materials=df_test["composition"],
                                  df_featurized=df_pred_featurized,
                                  structure_ids=df_pred_featurized.index)
    

# Predict on the test set and align predictions with the true targets.
# Predictions are indexed by formula, so they are joined onto df_test via its
# "formula" column; the overlapping target column receives the
# '_true' / '_pred' suffixes.
df_predictions = model.predict(data_to_predict)
df_test_pred = df_test.merge(
    df_predictions,
    how='left',
    left_on="formula",
    right_index=True,
    suffixes=('_true', '_pred'),
)
true_values = df_test_pred[target_name + '_true'].values
predicted_values = df_test_pred[target_name + '_pred'].values
mae = mean_absolute_error(true_values, predicted_values)
print("-" * 40)
print(f"{target_property}: {mae}")

# Append this run to the shared MAE table, creating the table on first use.
mae_csv_path = os.path.join(file_path, "mae_all.csv")
if not os.path.exists(mae_csv_path):
    mae_dic = {'target': [target_property], 'mae': [mae]}
    df_mae_all = pd.DataFrame.from_dict(mae_dic)
else:
    df_mae_all = pd.read_csv(mae_csv_path, index_col=0)
    new_entry = {'target': target_property, 'mae': mae}
    # The new row is stored under the label equal to the current row count.
    df_mae_all.loc[len(df_mae_all)] = new_entry
df_mae_all.to_csv(mae_csv_path)

# Write the (possibly updated) feature-cache registry back to disk.
feature_grp_str = json.dumps(feature_grp)
registry_path = os.path.join(file_path, "feature_grp.json")
with open(registry_path, 'w') as json_file:
    json_file.write(feature_grp_str)
print(feature_grp)

['aflow_density', 'aflow_Egap', 'aflow_enthalpy_formation_atom', 'aflow_volume_atom', 'jarvis_bulk_modulus', 'jarvis_e_form', 'jarvis_gap_opt', 'jarvis_gap_tbmbj', 'jarvis_shear_modulus', 'mp_band_gap', 'mp_density', 'mp_e_above_hull', 'mp_formation_energy_per_atom', 'mp_total_magnetization', 'mp_volume', 'oqmd_band_gap', 'oqmd_e_formation_energy', 'oqmd_stability', 'oqmd_volume', '.ipynb_checkpoints']
2023-09-07 00:15:31,958 - modnet - INFO - Multiprocessing on 1 workers.
2023-09-07 00:15:31,961 - modnet - INFO - Computing "self" MI (i.e. information entropy) of features


100%|████████████████████████████████████████| 257/257 [00:00<00:00, 618.30it/s]

2023-09-07 00:15:32,403 - modnet - INFO - Computing cross NMI between all features...



100%|████████████████████████████████████| 12561/12561 [00:18<00:00, 667.95it/s]

2023-09-07 00:15:51,357 - modnet - INFO - Starting target 1/1: target ...
2023-09-07 00:15:51,358 - modnet - INFO - Computing mutual information between features and target...





2023-09-07 00:15:51,967 - modnet - INFO - Computing optimal features...
2023-09-07 00:15:54,525 - modnet - INFO - Selected 50/153 features...
2023-09-07 00:15:56,154 - modnet - INFO - Selected 100/153 features...
2023-09-07 00:15:56,817 - modnet - INFO - Selected 150/153 features...
2023-09-07 00:15:56,826 - modnet - INFO - Done with target 1/1: target.
2023-09-07 00:15:56,827 - modnet - INFO - Merging all features...
2023-09-07 00:15:56,827 - modnet - INFO - Done.
epoch 0: loss: 3710.969, val_loss:2159.019 val_mae:37.709
epoch 1: loss: 3697.459, val_loss:2147.741 val_mae:37.565
epoch 2: loss: 3683.234, val_loss:2136.312 val_mae:37.416
epoch 3: loss: 3668.765, val_loss:2124.414 val_mae:37.259
epoch 4: loss: 3653.584, val_loss:2111.759 val_mae:37.091
epoch 5: loss: 3637.249, val_loss:2097.997 val_mae:36.906
epoch 6: loss: 3619.516, val_loss:2082.836 val_mae:36.703
epoch 7: loss: 3600.091, val_loss:2066.117 val_mae:36.478
epoch 8: loss: 3578.537, val_loss:2047.610 val_mae:36.228
epoch 9:

In [3]:
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')


def iterate_dataset(folder_path):
    """Return the names of the immediate sub-directories of ``folder_path``.

    Only the first level is inspected: ``os.walk`` is lazy and top-down, so
    taking its first item avoids traversing the whole directory tree.

    Parameters
    ----------
    folder_path : str or os.PathLike
        Directory whose direct sub-directories are listed.

    Returns
    -------
    list of str
        Sub-directory names in the order reported by ``os.walk``. Hidden
        directories are included. An empty list is returned when
        ``folder_path`` does not exist or is not a directory (in that case
        ``os.walk`` yields nothing).
    """
    # Fall back to an "empty directory" triple when os.walk yields nothing,
    # instead of failing with an IndexError.
    _, subfolders, _ = next(os.walk(folder_path), (folder_path, [], []))
    return subfolders


# Root folder holding one sub-folder per target property plus shared caches.
file_path = "/scratch/yll6162/modnet/ibrnet_data"

# Registry mapping a cache key to the target properties that used that cache.
registry_path = os.path.join(file_path, "feature_grp.json")
with open(registry_path, 'r') as registry_file:
    feature_grp = json.load(registry_file)

datasets = iterate_dataset(file_path)
# LOOP
target_name = "target"
mae_dic = {}
print(datasets)

# Hide the first physical GPU from TensorFlow.
# NOTE(review): TensorFlow documents that visible devices cannot be changed
# once the runtime has been initialised, so re-running this cell in a live
# kernel may raise -- confirm.
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(physical_devices[1:], 'GPU')

# Target selection; other choices tried previously:
# target_property = datasets[0]
# target_property = "jarvis_bulk_modulus"
target_property = "jarvis_shear_modulus"

# Only the training split is loaded in this cell.
train_csv_path = os.path.join(file_path, target_property, "train.csv")
df_train = pd.read_csv(train_csv_path)

['aflow_density', 'aflow_Egap', 'aflow_enthalpy_formation_atom', 'aflow_volume_atom', 'jarvis_bulk_modulus', 'jarvis_e_form', 'jarvis_gap_opt', 'jarvis_gap_tbmbj', 'jarvis_shear_modulus', 'mp_band_gap', 'mp_density', 'mp_e_above_hull', 'mp_formation_energy_per_atom', 'mp_total_magnetization', 'mp_volume', 'oqmd_band_gap', 'oqmd_e_formation_energy', 'oqmd_stability', 'oqmd_volume', '.ipynb_checkpoints']


In [5]:
# Boolean mask flagging every training row with at least one missing value;
# the masked frame is the cell's displayed result.
nan_rows = df_train.isnull().any(axis="columns")
df_train.loc[nan_rows]

Unnamed: 0,formula,target
1760,,0.76


In [25]:
# Predict on the test set and align predictions with the true targets.
# Predictions are indexed by formula, so they are joined onto df_test via its
# "formula" column; the overlapping target column receives the
# '_true' / '_pred' suffixes.
df_predictions = model.predict(data_to_predict)
df_test_pred = df_test.merge(
    df_predictions,
    how='left',
    left_on="formula",
    right_index=True,
    suffixes=('_true', '_pred'),
)
true_values = df_test_pred[target_name + '_true'].values
predicted_values = df_test_pred[target_name + '_pred'].values
mae = mean_absolute_error(true_values, predicted_values)
print("-" * 40)
print(f"{target_property}: {mae}")

# Disabled persistence of per-target results, kept for reference.
# mae_dic[target_property] = mae
# df_test_pred.to_csv(os.path.join(file_path, target_property, "test_pred.csv"))
# df_mae_all = pd.from_dict(mae_dic)
# df_mae_all.to_csv(os.path.join(file_path, "mae_all.csv"))

# Write the feature-cache registry back to disk.
feature_grp_str = json.dumps(feature_grp)
registry_path = os.path.join(file_path, "feature_grp.json")
with open(registry_path, 'w') as json_file:
    json_file.write(feature_grp_str)
print(feature_grp)

----------------------------------------
jarvis_bulk_modulus: 26.92448291236877
{'100': ['jarvis_bulk_modulus'], 100: ['jarvis_bulk_modulus']}


In [8]:
# Display the training frame currently held in the kernel as this cell's output.
df_train

Unnamed: 0,formula,target
0,Si1W2Yb1,63.6023
1,Be2Li2Zn3,84.9053
2,Be1Mn1Si1V1,42.7211
3,Pa1Sb1V2,71.8008
4,Ho1Nd1Pd1,85.1632
...,...,...
279551,Na1Ni2Pa1,68.6927
279552,Ba1Sb2Tb1,129.4200
279553,Ac1Pb2Re1,99.8546
279554,Nb2Ni1Ru1,61.6680


In [13]:
# Reload the accumulated MAE table; the first CSV column is the row index.
mae_csv_path = os.path.join(file_path, "mae_all.csv")
df_mae_all = pd.read_csv(mae_csv_path, index_col=0)

# Target properties expected to have an entry in the MAE table.
lst = [
    'aflow_density',
    'aflow_Egap',
    'aflow_enthalpy_formation_atom',
    'aflow_volume_atom',
    'jarvis_bulk_modulus',
    'jarvis_e_form',
    'jarvis_gap_opt',
    'jarvis_gap_tbmbj',
    'jarvis_shear_modulus',
    'mp_band_gap',
    'mp_density',
    'mp_e_above_hull',
    'mp_formation_energy_per_atom',
    'mp_total_magnetization',
    'mp_volume',
    'oqmd_band_gap',
    'oqmd_e_formation_energy',
    'oqmd_stability',
    'oqmd_volume',
]
df_mae_all

Unnamed: 0,target,mae
0,aflow_density,0.244915
1,aflow_Egap,0.122328
2,aflow_enthalpy_formation_atom,0.072389
3,aflow_volume_atom,0.844885
4,jarvis_gap_tbmbj,0.533603
5,oqmd_band_gap,0.076796
6,oqmd_e_formation_energy,0.097109
7,oqmd_stability,0.087722
8,oqmd_volume,25.060371
9,jarvis_bulk_modulus,13.108102


In [14]:
# Print, wrapped in single quotes, every expected target that has no row in
# the MAE table yet.
recorded_targets = df_mae_all.target.values
for dataset_name in lst:
    if dataset_name in recorded_targets:
        continue
    print("'" + dataset_name + "'")