In [1]:
from jarvis.db.figshare import data
from jarvis.core.atoms import Atoms
import pandas as pd
from modnet.featurizers.presets import DeBreuck2020Featurizer
import os
# os.environ["OMP_NUM_THREADS"] = "1"
import numpy as np
from modnet.preprocessing import MODData
from modnet.models import MODNetModel
from pymatgen.core import Composition
import warnings
from sklearn.metrics import mean_absolute_error
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm
2023-09-08 12:37:25.272442: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-08 12:37:26.689282: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-08 12:37:26.689398: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:

class BasicFeaturizer(DeBreuck2020Featurizer):
    from pymatgen.analysis.local_env import VoronoiNN

    from matminer.featurizers.composition import (
        AtomicOrbitals,
        ElementFraction,
        ElementProperty,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
    )

    from matminer.featurizers.structure import (
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )

    oxid_composition_featurizers = ()

    composition_featurizers = (
        AtomicOrbitals(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
    )

    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

basic_featurizer = BasicFeaturizer()
basic_featurizer.set_n_jobs(20)
# basic_featurizer._n_jobs = None

Newer versions of matminer will not work, and older versions may not be compatible with newer MODNet versions due to other conflicts.
To use this featurizer robustly, please install `modnet==0.1.13` with its pinned dependencies.

This preset will now be initialised without importing matminer featurizers to enable use with existing previously featurized data, but attempts to perform further featurization will result in an error.


In [47]:

warnings.filterwarnings('ignore')

def iterate_dataset(folder_path):
    dataset = []
    for root, subfolders, files in os.walk(folder_path):
        dataset.append(subfolders)
    return dataset[0]


file_path = "/scratch/yll6162/modnet/materials_data"

datasets = iterate_dataset(file_path)
# LOOP 
target_name = "target"
mae_dic = {}
print(datasets)
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(physical_devices[1:], 'GPU')
with tf.device('/device:GPU:1'):
    for target_property in datasets[:1]:
        df_train = pd.read_csv(os.path.join(file_path, target_property, "train.csv"))
        df_test = pd.read_csv(os.path.join(file_path, target_property, "test.csv"))
        df_val = pd.read_csv(os.path.join(file_path, target_property, "val.csv"))
                              
        df_train["composition"] = df_train["formula"].map(Composition) # maps composition to a pymatgen composition object
    
        # Creating MODData
        data_train = MODData(materials = df_train["composition"],
                       targets = df_train[target_name],
                       target_names=[target_name],
                       featurizer=basic_featurizer,
                       structure_ids=df_train.index, )
    
        data_train.featurize()
        data_train.feature_selection(n=200)
        df_val["composition"] = df_val["formula"].map(Composition) # maps composition to a pymatgen composition object
        data_val = MODData(materials = df_val["composition"],
                       targets = df_val[target_name],
                       target_names=[target_name],
                       featurizer=basic_featurizer,
                       structure_ids=df_val.index, )
    
        data_val.featurize()
        # data_val.feature_selection(n=200)
        
        # Creating MODNetModel
        model = MODNetModel([[[target_name]]],
                            weights={target_name:1},
                            num_neurons=[[256],[64],[64],[32]],
                           )
        
        model.fit(data_train,
                  val_data = data_val,
                  epochs = 250,
                  verbose = 0
                 )
        
        # # Predicting on unlabeled data
        df_test["composition"] = df_test["formula"].map(Composition)
        data_to_predict = MODData(materials = df_test["composition"],
                       featurizer=basic_featurizer,
                       structure_ids=df_test.index)
        data_to_predict.featurize()
        # data_to_predict.feature_selection(n=200)
        df_predictions = model.predict(data_to_predict)
        df_test_pred = df_test.merge(df_predictions, how = 'left', left_index = True, right_index = True, suffixes=('_true', '_pred'))
        mae = mean_absolute_error(df_test_pred[target_name+'_true'].values,df_test_pred[target_name+'_pred'].values)
        print("-" * 40)
        print(f"{target_property}: {mae}")
        # mae_dic[target_property] = mae
        # df_test_pred.to_csv(os.path.join(file_path, target_property, "test_pred.csv"))   
    # mae_dic['mepsx'] = 35.89810254446666
    # mae_dic['et_c55'] = 17.68120662339898
    # mae_dic['n-powerfact'] = 589.4109630783031
    # mae_dic['mbj_bandgap'] = 0.4537690369214026
    # mae_dic['mepsy'] = 36.51089946686116
    # df_mae_all = pd.from_dict(mae_dic)
    # df_mae_all.to_csv(os.path.join(file_path, "mae_all.csv"))   

['mepsx', 'et_c55', 'n-powerfact', 'mbj_bandgap', 'mepsy', 'n-Seebeck', 'exfoliation_energy', 'p-powerfact', 'max_ir_mode', 'p_em300k', 'avg_elec_mass', 'bulk_modulus_kv', 'magmom_oszicar', 'encut', 'n_em300k', 'dfpt_piezo_max_dielectric_ionic', 'p-Seebeck', 'density', 'epsy', 'et_c13', 'et_c22', 'shear_modulus_gv', 'magmom_outcar', 'dfpt_piezo_max_dij', 'epsx', 'et_c12', 'min_mode', 'et_c33', 'dfpt_piezo_max_dielectric', 'et_c66', 'formation_energy_peratom', 'mepsz', 'optb88vdw_bandgap', 'slme', 'poisson', 'max_mode', 'dfpt_piezo_max_eij', 'dfpt_piezo_max_dielectric_electronic', 'min_ir_mode', 'epsz', 'max_efg', 'et_c44', 'ehull', 'avg_hole_mass', 'et_c11', 'kpoint_length_unit', 'optb88vdw_total_energy', 'spillage', '.ipynb_checkpoints']
2023-09-06 17:55:27,265 - modnet - INFO - Loaded BasicFeaturizer featurizer.
2023-09-06 17:55:27,281 - modnet - INFO - Computing features, this can take time...
2023-09-06 17:55:27,283 - modnet - INFO - Applying composition featurizers...
2023-09-06 1

MultipleFeaturizer: 100%|██████████████████| 9104/9104 [00:33<00:00, 275.21it/s]


2023-09-06 17:56:01,419 - modnet - INFO - Applying site featurizers...


SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:30<00:00, 297.40it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:29<00:00, 304.58it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:30<00:00, 295.08it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:30<00:00, 296.99it/s]

SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:52<00:00, 172.25it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:29<00:00, 309.16it/s]

SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:31<00:00, 285.15it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:28<00:00, 317.60it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:29<00:00, 308.34it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:34<00:00, 260.26it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:31<00:00, 285.39it/s]

SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:29<00:00, 311.52it/s]



2023-09-06 18:02:44,435 - modnet - INFO - Data has successfully been featurized!
2023-09-06 18:02:44,502 - modnet - INFO - Multiprocessing on 1 workers.
2023-09-06 18:02:44,506 - modnet - INFO - Computing "self" MI (i.e. information entropy) of features


100%|█████████████████████████████████████████| 257/257 [00:06<00:00, 38.39it/s]

2023-09-06 18:02:51,244 - modnet - INFO - Computing cross NMI between all features...



100%|█████████████████████████████████████| 18336/18336 [08:33<00:00, 35.74it/s]

2023-09-06 18:11:24,519 - modnet - INFO - Starting target 1/1: target ...
2023-09-06 18:11:24,521 - modnet - INFO - Computing mutual information between features and target...





2023-09-06 18:11:39,010 - modnet - INFO - Computing optimal features...
2023-09-06 18:11:42,138 - modnet - INFO - Selected 50/188 features...
2023-09-06 18:11:44,514 - modnet - INFO - Selected 100/188 features...
2023-09-06 18:11:45,956 - modnet - INFO - Selected 150/188 features...
2023-09-06 18:11:46,352 - modnet - INFO - Done with target 1/1: target.
2023-09-06 18:11:46,353 - modnet - INFO - Merging all features...
2023-09-06 18:11:46,354 - modnet - INFO - Done.
2023-09-06 18:11:46,367 - modnet - INFO - Loaded BasicFeaturizer featurizer.
2023-09-06 18:11:46,374 - modnet - INFO - Computing features, this can take time...
2023-09-06 18:11:46,375 - modnet - INFO - Applying composition featurizers...
2023-09-06 18:11:46,398 - modnet - INFO - Applying featurizers (AtomicOrbitals(), ElementFraction(), ElementProperty(data_source=<matminer.utils.data.MagpieData object at 0x7f6457401f10>,
                features=['Number', 'MendeleevNumber', 'AtomicWeight',
                          'Melti

MultipleFeaturizer: 100%|██████████████████| 1138/1138 [00:04<00:00, 272.63it/s]


2023-09-06 18:11:51,073 - modnet - INFO - Applying site featurizers...


SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 423.15it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 395.71it/s]

SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 455.59it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 474.03it/s]

SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:06<00:00, 181.59it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 455.50it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 383.50it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 448.38it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 436.00it/s]

SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 334.72it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 378.58it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 450.29it/s]


2023-09-06 18:12:40,193 - modnet - INFO - Data has successfully been featurized!


AttributeError: 'MODData' object has no attribute 'df_targets'

In [49]:
data_val = MODData(materials = df_val["composition"],
               targets = df_val[target_name],
               target_names=[target_name],
               featurizer=basic_featurizer,
               structure_ids=df_val.index, )

data_val.featurize()
# data_val.feature_selection(n=200)

# Creating MODNetModel
model = MODNetModel([[[target_name]]],
                    weights={target_name:1},
                    num_neurons=[[256],[64],[64],[32]],
                   )

model.fit(data_train,
          val_data = data_val,
          epochs = 250,
          verbose = 0
         )

# # Predicting on unlabeled data
df_test["composition"] = df_test["formula"].map(Composition)
data_to_predict = MODData(materials = df_test["composition"],
               featurizer=basic_featurizer,
               structure_ids=df_test.index)
data_to_predict.featurize()
# data_to_predict.feature_selection(n=200)
df_predictions = model.predict(data_to_predict)
df_test_pred = df_test.merge(df_predictions, how = 'left', left_index = True, right_index = True, suffixes=('_true', '_pred'))
mae = mean_absolute_error(df_test_pred[target_name+'_true'].values,df_test_pred[target_name+'_pred'].values)
print("-" * 40)
print(f"{target_property}: {mae}")

2023-09-06 18:34:35,846 - modnet - INFO - Loaded BasicFeaturizer featurizer.
2023-09-06 18:34:35,854 - modnet - INFO - Computing features, this can take time...
2023-09-06 18:34:35,855 - modnet - INFO - Applying composition featurizers...
2023-09-06 18:34:35,860 - modnet - INFO - Applying featurizers (AtomicOrbitals(), ElementFraction(), ElementProperty(data_source=<matminer.utils.data.MagpieData object at 0x7f6457401f10>,
                features=['Number', 'MendeleevNumber', 'AtomicWeight',
                          'MeltingT', 'Column', 'Row', 'CovalentRadius',
                          'Electronegativity', 'NsValence', 'NpValence',
                          'NdValence', 'NfValence', 'NValence', 'NsUnfilled',
                          'NpUnfilled', 'NdUnfilled', 'NfUnfilled', 'NUnfilled',
                          'GSvolume_pa', 'GSbandgap', 'GSmagmom',
                          'SpaceGroupNumber'],
                stats=['minimum', 'maximum', 'range', 'mean', 'avg_dev',
           

MultipleFeaturizer: 100%|██████████████████| 1138/1138 [00:03<00:00, 295.44it/s]


2023-09-06 18:34:40,141 - modnet - INFO - Applying site featurizers...


SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 437.42it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 466.08it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 470.52it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 465.95it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:06<00:00, 185.21it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 430.52it/s]

SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 368.94it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 456.18it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 455.89it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 350.04it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 377.16it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 451.13it/s]


2023-09-06 18:35:27,861 - modnet - INFO - Data has successfully been featurized!
2023-09-06 18:36:33,661 - modnet - INFO - Loaded BasicFeaturizer featurizer.
2023-09-06 18:36:33,671 - modnet - INFO - Computing features, this can take time...
2023-09-06 18:36:33,673 - modnet - INFO - Applying composition featurizers...
2023-09-06 18:36:33,693 - modnet - INFO - Applying featurizers (AtomicOrbitals(), ElementFraction(), ElementProperty(data_source=<matminer.utils.data.MagpieData object at 0x7f6457401f10>,
                features=['Number', 'MendeleevNumber', 'AtomicWeight',
                          'MeltingT', 'Column', 'Row', 'CovalentRadius',
                          'Electronegativity', 'NsValence', 'NpValence',
                          'NdValence', 'NfValence', 'NValence', 'NsUnfilled',
                          'NpUnfilled', 'NdUnfilled', 'NfUnfilled', 'NUnfilled',
                          'GSvolume_pa', 'GSbandgap', 'GSmagmom',
                          'SpaceGroupNumber'],
   

MultipleFeaturizer: 100%|██████████████████| 1138/1138 [00:04<00:00, 280.69it/s]


2023-09-06 18:36:38,395 - modnet - INFO - Applying site featurizers...


SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 421.72it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 422.13it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 444.59it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 431.15it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:07<00:00, 154.71it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 429.63it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 345.68it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 421.39it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 416.53it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 322.59it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 358.82it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 421.64it/s]


2023-09-06 18:37:31,693 - modnet - INFO - Data has successfully been featurized!
----------------------------------------
mepsx: 34.07253679257838


In [80]:
data_val = MODData(materials = df_val["composition"],
               targets = df_val[target_name],
               target_names=[target_name],
               featurizer=basic_featurizer,
               structure_ids=df_val.formula)
data_val.featurize()
data_val.df_featurized

2023-09-06 22:44:34,004 - modnet - INFO - Loaded BasicFeaturizer featurizer.
2023-09-06 22:44:34,012 - modnet - INFO - Computing features, this can take time...
2023-09-06 22:44:34,013 - modnet - INFO - Applying composition featurizers...
2023-09-06 22:44:34,026 - modnet - INFO - Applying featurizers (AtomicOrbitals(), ElementFraction(), ElementProperty(data_source=<matminer.utils.data.MagpieData object at 0x7f6457401f10>,
                features=['Number', 'MendeleevNumber', 'AtomicWeight',
                          'MeltingT', 'Column', 'Row', 'CovalentRadius',
                          'Electronegativity', 'NsValence', 'NpValence',
                          'NdValence', 'NfValence', 'NValence', 'NsUnfilled',
                          'NpUnfilled', 'NdUnfilled', 'NfUnfilled', 'NUnfilled',
                          'GSvolume_pa', 'GSbandgap', 'GSmagmom',
                          'SpaceGroupNumber'],
                stats=['minimum', 'maximum', 'range', 'mean', 'avg_dev',
           

MultipleFeaturizer: 100%|██████████████████| 1138/1138 [00:04<00:00, 265.44it/s]


2023-09-06 22:44:38,876 - modnet - INFO - Applying site featurizers...


SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 430.56it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 439.38it/s]

SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 436.48it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 423.40it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:06<00:00, 171.84it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 423.98it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 359.57it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 436.96it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 407.55it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 326.94it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 363.87it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 415.25it/s]



2023-09-06 22:45:32,019 - modnet - INFO - Data has successfully been featurized!


Unnamed: 0_level_0,AtomicOrbitals|HOMO_character,AtomicOrbitals|HOMO_element,AtomicOrbitals|HOMO_energy,AtomicOrbitals|LUMO_character,AtomicOrbitals|LUMO_element,AtomicOrbitals|LUMO_energy,AtomicOrbitals|gap_AO,ElementFraction|H,ElementFraction|He,ElementFraction|Li,...,Stoichiometry|10-norm,TMetalFraction|transition metal fraction,ValenceOrbital|avg s valence electrons,ValenceOrbital|avg p valence electrons,ValenceOrbital|avg d valence electrons,ValenceOrbital|avg f valence electrons,ValenceOrbital|frac s valence electrons,ValenceOrbital|frac p valence electrons,ValenceOrbital|frac d valence electrons,ValenceOrbital|frac f valence electrons
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sc2ZnGa,3.0,21,-0.131080,3.0,21,-0.131080,0.000000,0.000000,0,0.000000,...,0.500098,0.000000,2.000000,0.250000,5.500,0.000000,0.258065,0.032258,0.709677,0.000000
U3P2S,4.0,92,-0.366543,4.0,92,-0.366543,0.000000,0.000000,0,0.000000,...,0.500861,0.000000,2.000000,1.666667,0.500,1.500000,0.352941,0.294118,0.088235,0.264706
AcMnO3,3.0,25,-0.266540,3.0,25,-0.266540,0.000000,0.000000,0,0.000000,...,0.600002,0.200000,2.000000,2.400000,1.200,0.000000,0.357143,0.428571,0.214286,0.000000
K2NaGaAs2,2.0,33,-0.197497,2.0,33,-0.197497,0.000000,0.000000,0,0.000000,...,0.357293,0.000000,1.500000,1.166667,5.000,0.000000,0.195652,0.152174,0.652174,0.000000
YAuO2,3.0,79,-0.304738,1.0,79,-0.162334,0.142404,0.000000,0,0.000000,...,0.500098,0.000000,1.750000,2.000000,2.750,3.500000,0.175000,0.200000,0.275000,0.350000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TmAl3,2.0,13,-0.102545,2.0,13,-0.102545,0.000000,0.000000,0,0.000000,...,0.750001,0.000000,2.000000,0.750000,0.000,3.250000,0.333333,0.125000,0.000000,0.541667
AgGe3,2.0,32,-0.149882,2.0,32,-0.149882,0.000000,0.000000,0,0.000000,...,0.750001,0.250000,1.750000,1.500000,10.000,0.000000,0.132075,0.113208,0.754717,0.000000
ReSbO6,2.0,8,-0.338381,2.0,8,-0.338381,0.000000,0.000000,0,0.000000,...,0.750000,0.125000,2.000000,3.375000,1.875,1.750000,0.222222,0.375000,0.208333,0.194444
LiBePt,1.0,78,-0.161308,1.0,78,-0.161308,0.000000,0.000000,0,0.333333,...,0.372041,0.333333,1.333333,0.000000,3.000,4.666667,0.148148,0.000000,0.333333,0.518519


In [57]:
df_test["composition"] = df_test["formula"].map(Composition)
data_to_predict_new = MODData(materials = df_test["composition"],
                              df_featurized = data_to_predict.df_featurized,
                              structure_ids=df_test.index)

# data_to_predict_new.feature_selection(n=200)
df_predictions = model.predict(data_to_predict_new)
df_test_pred = df_test.merge(df_predictions, how = 'left', left_index = True, right_index = True, suffixes=('_true', '_pred'))
mae = mean_absolute_error(df_test_pred[target_name+'_true'].values,df_test_pred[target_name+'_pred'].values)
print("-" * 40)
print(f"{target_property}: {mae}")

----------------------------------------
mepsx: 34.07253679257838


In [46]:
    mae = mean_absolute_error(df_test_pred[target_name+'_true'].values,df_test_pred[target_name+'_pred'].values)
    print(f"{target_property}: {mae}")
    mae_dic[target_property] = mae
    df_test_pred.to_csv(os.path.join(file_path, target_property, "test_pred.csv"))   
# df_mae_all = pd.from_dict(mae_dic)

mepsx: 34.622218064060945


In [58]:
data_to_predict.df_featurized

Unnamed: 0_level_0,AtomicOrbitals|HOMO_character,AtomicOrbitals|HOMO_element,AtomicOrbitals|HOMO_energy,AtomicOrbitals|LUMO_character,AtomicOrbitals|LUMO_element,AtomicOrbitals|LUMO_energy,AtomicOrbitals|gap_AO,ElementFraction|H,ElementFraction|He,ElementFraction|Li,...,Stoichiometry|10-norm,TMetalFraction|transition metal fraction,ValenceOrbital|avg s valence electrons,ValenceOrbital|avg p valence electrons,ValenceOrbital|avg d valence electrons,ValenceOrbital|avg f valence electrons,ValenceOrbital|frac s valence electrons,ValenceOrbital|frac p valence electrons,ValenceOrbital|frac d valence electrons,ValenceOrbital|frac f valence electrons
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.0,46,-0.160771,3.0,46,-0.160771,0.000000,0.00,0,0.0,...,0.500098,0.750000,0.750000,0.000000,10.000000,0.00,0.069767,0.000000,0.930233,0.000000
1,2.0,7,-0.266297,2.0,7,-0.266297,0.000000,0.50,0,0.0,...,0.500010,0.000000,1.500000,1.700000,1.000000,0.00,0.357143,0.404762,0.238095,0.000000
2,1.0,81,-0.285020,1.0,21,-0.156478,0.128542,0.00,0,0.0,...,0.600001,0.000000,1.900000,3.200000,2.100000,2.80,0.190000,0.320000,0.210000,0.280000
3,2.0,7,-0.266297,2.0,7,-0.266297,0.000000,0.00,0,0.0,...,0.378291,0.235294,1.764706,1.764706,2.058824,0.00,0.315789,0.315789,0.368421,0.000000
4,2.0,8,-0.338381,2.0,8,-0.338381,0.000000,0.25,0,0.0,...,0.500049,0.000000,1.625000,2.375000,0.000000,0.00,0.406250,0.593750,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1133,3.0,41,-0.125252,3.0,41,-0.125252,0.000000,0.00,0,0.0,...,0.500098,0.250000,1.250000,0.000000,1.000000,0.00,0.555556,0.000000,0.444444,0.000000
1134,1.0,25,-0.191136,1.0,43,-0.183636,0.007500,0.00,0,0.0,...,0.666667,0.333333,2.000000,0.000000,1.666667,0.00,0.545455,0.000000,0.454545,0.000000
1135,4.0,61,-0.200159,4.0,61,-0.200159,0.000000,0.00,0,0.0,...,0.750001,0.000000,2.000000,0.000000,7.500000,11.75,0.094118,0.000000,0.352941,0.552941
1136,1.0,41,-0.144272,1.0,41,-0.144272,0.000000,0.00,0,0.0,...,0.666668,0.111111,1.666667,3.333333,0.444444,0.00,0.306122,0.612245,0.081633,0.000000


In [60]:
data_to_predict.__dict__.keys()

dict_keys(['__modnet_version__', 'df_featurized', 'featurizer', 'cross_nmi', '_composition_only', 'df_structure'])

In [66]:
df = data_to_predict.df_featurized

In [69]:
df.index = df_test.index
df['formula'] = df_test['formula']

Unnamed: 0,AtomicOrbitals|HOMO_character,AtomicOrbitals|HOMO_element,AtomicOrbitals|HOMO_energy,AtomicOrbitals|LUMO_character,AtomicOrbitals|LUMO_element,AtomicOrbitals|LUMO_energy,AtomicOrbitals|gap_AO,ElementFraction|H,ElementFraction|He,ElementFraction|Li,...,TMetalFraction|transition metal fraction,ValenceOrbital|avg s valence electrons,ValenceOrbital|avg p valence electrons,ValenceOrbital|avg d valence electrons,ValenceOrbital|avg f valence electrons,ValenceOrbital|frac s valence electrons,ValenceOrbital|frac p valence electrons,ValenceOrbital|frac d valence electrons,ValenceOrbital|frac f valence electrons,formula
0,3.0,46,-0.160771,3.0,46,-0.160771,0.000000,0.00,0,0.0,...,0.750000,0.750000,0.000000,10.000000,0.00,0.069767,0.000000,0.930233,0.000000,ZnCuPd2
1,2.0,7,-0.266297,2.0,7,-0.266297,0.000000,0.50,0,0.0,...,0.000000,1.500000,1.700000,1.000000,0.00,0.357143,0.404762,0.238095,0.000000,InH5(NF)2
2,1.0,81,-0.285020,1.0,21,-0.156478,0.128542,0.00,0,0.0,...,0.000000,1.900000,3.200000,2.100000,2.80,0.190000,0.320000,0.210000,0.280000,NaScTl2F6
3,2.0,7,-0.266297,2.0,7,-0.266297,0.000000,0.00,0,0.0,...,0.235294,1.764706,1.764706,2.058824,0.00,0.315789,0.315789,0.368421,0.000000,KMnAg3(CN)6
4,2.0,8,-0.338381,2.0,8,-0.338381,0.000000,0.25,0,0.0,...,0.000000,1.625000,2.375000,0.000000,0.00,0.406250,0.593750,0.000000,0.000000,RbP(HO2)2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1133,3.0,41,-0.125252,3.0,41,-0.125252,0.000000,0.00,0,0.0,...,0.250000,1.250000,0.000000,1.000000,0.00,0.555556,0.000000,0.444444,0.000000,K2BeNb
1134,1.0,25,-0.191136,1.0,43,-0.183636,0.007500,0.00,0,0.0,...,0.333333,2.000000,0.000000,1.666667,0.00,0.545455,0.000000,0.454545,0.000000,Ba4MnTc
1135,4.0,61,-0.200159,4.0,61,-0.200159,0.000000,0.00,0,0.0,...,0.000000,2.000000,0.000000,7.500000,11.75,0.094118,0.000000,0.352941,0.552941,PmHg3
1136,1.0,41,-0.144272,1.0,41,-0.144272,0.000000,0.00,0,0.0,...,0.111111,1.666667,3.333333,0.444444,0.00,0.306122,0.612245,0.081633,0.000000,Rb2NbF6


In [6]:
warnings.filterwarnings('ignore')

def iterate_dataset(folder_path):
    dataset = []
    for root, subfolders, files in os.walk(folder_path):
        dataset.append(subfolders)
    return dataset[0]


file_path = "/scratch/yll6162/modnet/materials_data"
# with open(os.path.join(file_path, "feature_grp.json"), 'r') as json_file:
#     feature_grp = json.load(json_file)

datasets = iterate_dataset(file_path)
feature_grp = {}
df_train_all = pd.read_csv(os.path.join(file_path, 'optb88vdw_bandgap', "train.csv")) 
for target_property in datasets:
    if not target_property.startswith(".") and target_property != 'optb88vdw_bandgap':
        print(target_property)
        df_train = pd.read_csv(os.path.join(file_path, target_property, "train.csv"))
        print(df_train[~df_train.formula.isin(df_train_all.formula)])
        
        print(df_train.shape[0])
    
    # df_test = pd.read_csv(os.path.join(file_path, target_property, "test.csv"))
    # df_val = pd.read_csv(os.path.join(file_path, target_property, "val.csv"))

mepsx
       formula    target
0        KClO4    2.0750
2       AsIrBr  124.6429
3      Te3MoWS   12.7271
4       TeAsIr   54.2954
7     Al2CdTe4    6.6264
...        ...       ...
9074     K2Se3    4.5072
9079   Ba4NiBr   87.6977
9083    K3AlF6    1.6857
9088      KH2N    2.1382
9103   HoTlTe2    8.3508

[1855 rows x 2 columns]
9104
et_c55
      formula  target
0     Cd3AsI3     3.0
3     NaTlHg2     8.9
5     TmMg2Sc    46.1
6       BaEu3    19.5
8       NdGe2    14.1
...       ...     ...
9310  Ce(BC)2   106.5
9321     CdO3    10.4
9323    BaBr2    11.7
9334   AlCrGe    59.4
9335   LiGaS2    25.0

[1849 rows x 2 columns]
9339
n-powerfact
        formula       target
0      Cr(PO3)2   716.723333
4        Ba3As2   368.680000
16           KI   580.850000
17     Li2FePO5   302.110000
23     Al2HgSe4   513.040000
...         ...          ...
13096   PmAgAu2   345.790000
13110   Rb2PtS2  2116.346667
13117  Nd2MgNi2    60.630000
13124     AlPS4  1181.576667
13126       NaS   878.786667

[2