In [4]:
# from jarvis.db.figshare import data
# from jarvis.core.atoms import Atoms
import pandas as pd
import tensorflow as tf
from modnet.featurizers.presets import DeBreuck2020Featurizer
import os
# os.environ["OMP_NUM_THREADS"] = "1"
import numpy as np
from modnet.preprocessing import MODData
from modnet.models import MODNetModel
from pymatgen.core import Composition
import warnings
from sklearn.metrics import mean_absolute_error


2023-09-09 11:32:57.175778: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-09 11:32:58.004856: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-09 11:32:58.004959: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
  from .autonotebook import tqdm as notebook_tqdm


In [6]:

class BasicFeaturizer(DeBreuck2020Featurizer):
    from pymatgen.analysis.local_env import VoronoiNN

    from matminer.featurizers.composition import (
        AtomicOrbitals,
        ElementFraction,
        ElementProperty,
        Stoichiometry,
        TMetalFraction,
        ValenceOrbital,
    )

    from matminer.featurizers.structure import (
        BondFractions,
        ChemicalOrdering,
        CoulombMatrix,
        DensityFeatures,
        EwaldEnergy,
        GlobalSymmetryFeatures,
        MaximumPackingEfficiency,
        RadialDistributionFunction,
        SineCoulombMatrix,
        StructuralHeterogeneity,
        XRDPowderPattern,
    )

    from matminer.featurizers.site import (
        AGNIFingerprints,
        AverageBondAngle,
        AverageBondLength,
        BondOrientationalParameter,
        ChemEnvSiteFingerprint,
        CoordinationNumber,
        CrystalNNFingerprint,
        GaussianSymmFunc,
        GeneralizedRadialDistributionFunction,
        LocalPropertyDifference,
        OPSiteFingerprint,
        VoronoiFingerprint,
    )

    oxid_composition_featurizers = ()

    composition_featurizers = (
        AtomicOrbitals(),
        ElementFraction(),
        ElementProperty.from_preset("magpie"),
        Stoichiometry(),
        TMetalFraction(),
        ValenceOrbital(),
    )

    site_featurizers = (
        AGNIFingerprints(),
        AverageBondAngle(VoronoiNN()),
        AverageBondLength(VoronoiNN()),
        BondOrientationalParameter(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        CrystalNNFingerprint.from_preset("ops"),
        GaussianSymmFunc(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        LocalPropertyDifference(),
        OPSiteFingerprint(),
        VoronoiFingerprint(),
    )

basic_featurizer = BasicFeaturizer()
basic_featurizer.set_n_jobs(20)
# basic_featurizer._n_jobs = None

Newer versions of matminer will not work, and older versions may not be compatible with newer MODNet versions due to other conflicts.
To use this featurizer robustly, please install `modnet==0.1.13` with its pinned dependencies.

This preset will now be initialised without importing matminer featurizers to enable use with existing previously featurized data, but attempts to perform further featurization will result in an error.


In [47]:

warnings.filterwarnings('ignore')

def iterate_dataset(folder_path):
    dataset = []
    for root, subfolders, files in os.walk(folder_path):
        dataset.append(subfolders)
    return dataset[0]


file_path = "/scratch/yll6162/modnet/materials_data"

datasets = iterate_dataset(file_path)
# LOOP 
target_name = "target"
mae_dic = {}
print(datasets)
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(physical_devices[1:], 'GPU')
with tf.device('/device:GPU:1'):
    for target_property in datasets[:1]:
        df_train = pd.read_csv(os.path.join(file_path, target_property, "train.csv"))
        df_test = pd.read_csv(os.path.join(file_path, target_property, "test.csv"))
        df_val = pd.read_csv(os.path.join(file_path, target_property, "val.csv"))
                              
        df_train["composition"] = df_train["formula"].map(Composition) # maps composition to a pymatgen composition object
    
        # Creating MODData
        data_train = MODData(materials = df_train["composition"],
                       targets = df_train[target_name],
                       target_names=[target_name],
                       featurizer=basic_featurizer,
                       structure_ids=df_train.index, )
    
        data_train.featurize()
        data_train.feature_selection(n=200)
        df_val["composition"] = df_val["formula"].map(Composition) # maps composition to a pymatgen composition object
        data_val = MODData(materials = df_val["composition"],
                       targets = df_val[target_name],
                       target_names=[target_name],
                       featurizer=basic_featurizer,
                       structure_ids=df_val.index, )
    
        data_val.featurize()
        # data_val.feature_selection(n=200)
        
        # Creating MODNetModel
        model = MODNetModel([[[target_name]]],
                            weights={target_name:1},
                            num_neurons=[[256],[64],[64],[32]],
                           )
        
        model.fit(data_train,
                  val_data = data_val,
                  epochs = 250,
                  verbose = 0
                 )
        
        # # Predicting on unlabeled data
        df_test["composition"] = df_test["formula"].map(Composition)
        data_to_predict = MODData(materials = df_test["composition"],
                       featurizer=basic_featurizer,
                       structure_ids=df_test.index)
        data_to_predict.featurize()
        # data_to_predict.feature_selection(n=200)
        df_predictions = model.predict(data_to_predict)
        df_test_pred = df_test.merge(df_predictions, how = 'left', left_index = True, right_index = True, suffixes=('_true', '_pred'))
        mae = mean_absolute_error(df_test_pred[target_name+'_true'].values,df_test_pred[target_name+'_pred'].values)
        print("-" * 40)
        print(f"{target_property}: {mae}")
        # mae_dic[target_property] = mae
        # df_test_pred.to_csv(os.path.join(file_path, target_property, "test_pred.csv"))   
    # mae_dic['mepsx'] = 35.89810254446666
    # mae_dic['et_c55'] = 17.68120662339898
    # mae_dic['n-powerfact'] = 589.4109630783031
    # mae_dic['mbj_bandgap'] = 0.4537690369214026
    # mae_dic['mepsy'] = 36.51089946686116
    # df_mae_all = pd.from_dict(mae_dic)
    # df_mae_all.to_csv(os.path.join(file_path, "mae_all.csv"))   

['mepsx', 'et_c55', 'n-powerfact', 'mbj_bandgap', 'mepsy', 'n-Seebeck', 'exfoliation_energy', 'p-powerfact', 'max_ir_mode', 'p_em300k', 'avg_elec_mass', 'bulk_modulus_kv', 'magmom_oszicar', 'encut', 'n_em300k', 'dfpt_piezo_max_dielectric_ionic', 'p-Seebeck', 'density', 'epsy', 'et_c13', 'et_c22', 'shear_modulus_gv', 'magmom_outcar', 'dfpt_piezo_max_dij', 'epsx', 'et_c12', 'min_mode', 'et_c33', 'dfpt_piezo_max_dielectric', 'et_c66', 'formation_energy_peratom', 'mepsz', 'optb88vdw_bandgap', 'slme', 'poisson', 'max_mode', 'dfpt_piezo_max_eij', 'dfpt_piezo_max_dielectric_electronic', 'min_ir_mode', 'epsz', 'max_efg', 'et_c44', 'ehull', 'avg_hole_mass', 'et_c11', 'kpoint_length_unit', 'optb88vdw_total_energy', 'spillage', '.ipynb_checkpoints']
2023-09-06 17:55:27,265 - modnet - INFO - Loaded BasicFeaturizer featurizer.
2023-09-06 17:55:27,281 - modnet - INFO - Computing features, this can take time...
2023-09-06 17:55:27,283 - modnet - INFO - Applying composition featurizers...
2023-09-06 1

MultipleFeaturizer: 100%|██████████████████| 9104/9104 [00:33<00:00, 275.21it/s]


2023-09-06 17:56:01,419 - modnet - INFO - Applying site featurizers...


SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:30<00:00, 297.40it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:29<00:00, 304.58it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:30<00:00, 295.08it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:30<00:00, 296.99it/s]

SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:52<00:00, 172.25it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:29<00:00, 309.16it/s]

SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:31<00:00, 285.15it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:28<00:00, 317.60it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:29<00:00, 308.34it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:34<00:00, 260.26it/s]
SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:31<00:00, 285.39it/s]

SiteStatsFingerprint: 100%|████████████████| 9104/9104 [00:29<00:00, 311.52it/s]



2023-09-06 18:02:44,435 - modnet - INFO - Data has successfully been featurized!
2023-09-06 18:02:44,502 - modnet - INFO - Multiprocessing on 1 workers.
2023-09-06 18:02:44,506 - modnet - INFO - Computing "self" MI (i.e. information entropy) of features


100%|█████████████████████████████████████████| 257/257 [00:06<00:00, 38.39it/s]

2023-09-06 18:02:51,244 - modnet - INFO - Computing cross NMI between all features...



100%|█████████████████████████████████████| 18336/18336 [08:33<00:00, 35.74it/s]

2023-09-06 18:11:24,519 - modnet - INFO - Starting target 1/1: target ...
2023-09-06 18:11:24,521 - modnet - INFO - Computing mutual information between features and target...





2023-09-06 18:11:39,010 - modnet - INFO - Computing optimal features...
2023-09-06 18:11:42,138 - modnet - INFO - Selected 50/188 features...
2023-09-06 18:11:44,514 - modnet - INFO - Selected 100/188 features...
2023-09-06 18:11:45,956 - modnet - INFO - Selected 150/188 features...
2023-09-06 18:11:46,352 - modnet - INFO - Done with target 1/1: target.
2023-09-06 18:11:46,353 - modnet - INFO - Merging all features...
2023-09-06 18:11:46,354 - modnet - INFO - Done.
2023-09-06 18:11:46,367 - modnet - INFO - Loaded BasicFeaturizer featurizer.
2023-09-06 18:11:46,374 - modnet - INFO - Computing features, this can take time...
2023-09-06 18:11:46,375 - modnet - INFO - Applying composition featurizers...
2023-09-06 18:11:46,398 - modnet - INFO - Applying featurizers (AtomicOrbitals(), ElementFraction(), ElementProperty(data_source=<matminer.utils.data.MagpieData object at 0x7f6457401f10>,
                features=['Number', 'MendeleevNumber', 'AtomicWeight',
                          'Melti

MultipleFeaturizer: 100%|██████████████████| 1138/1138 [00:04<00:00, 272.63it/s]


2023-09-06 18:11:51,073 - modnet - INFO - Applying site featurizers...


SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 423.15it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 395.71it/s]

SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 455.59it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 474.03it/s]

SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:06<00:00, 181.59it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 455.50it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 383.50it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 448.38it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 436.00it/s]

SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 334.72it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 378.58it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 450.29it/s]


2023-09-06 18:12:40,193 - modnet - INFO - Data has successfully been featurized!


AttributeError: 'MODData' object has no attribute 'df_targets'

In [49]:
data_val = MODData(materials = df_val["composition"],
               targets = df_val[target_name],
               target_names=[target_name],
               featurizer=basic_featurizer,
               structure_ids=df_val.index, )

data_val.featurize()
# data_val.feature_selection(n=200)

# Creating MODNetModel
model = MODNetModel([[[target_name]]],
                    weights={target_name:1},
                    num_neurons=[[256],[64],[64],[32]],
                   )

model.fit(data_train,
          val_data = data_val,
          epochs = 250,
          verbose = 0
         )

# # Predicting on unlabeled data
df_test["composition"] = df_test["formula"].map(Composition)
data_to_predict = MODData(materials = df_test["composition"],
               featurizer=basic_featurizer,
               structure_ids=df_test.index)
data_to_predict.featurize()
# data_to_predict.feature_selection(n=200)
df_predictions = model.predict(data_to_predict)
df_test_pred = df_test.merge(df_predictions, how = 'left', left_index = True, right_index = True, suffixes=('_true', '_pred'))
mae = mean_absolute_error(df_test_pred[target_name+'_true'].values,df_test_pred[target_name+'_pred'].values)
print("-" * 40)
print(f"{target_property}: {mae}")

2023-09-06 18:34:35,846 - modnet - INFO - Loaded BasicFeaturizer featurizer.
2023-09-06 18:34:35,854 - modnet - INFO - Computing features, this can take time...
2023-09-06 18:34:35,855 - modnet - INFO - Applying composition featurizers...
2023-09-06 18:34:35,860 - modnet - INFO - Applying featurizers (AtomicOrbitals(), ElementFraction(), ElementProperty(data_source=<matminer.utils.data.MagpieData object at 0x7f6457401f10>,
                features=['Number', 'MendeleevNumber', 'AtomicWeight',
                          'MeltingT', 'Column', 'Row', 'CovalentRadius',
                          'Electronegativity', 'NsValence', 'NpValence',
                          'NdValence', 'NfValence', 'NValence', 'NsUnfilled',
                          'NpUnfilled', 'NdUnfilled', 'NfUnfilled', 'NUnfilled',
                          'GSvolume_pa', 'GSbandgap', 'GSmagmom',
                          'SpaceGroupNumber'],
                stats=['minimum', 'maximum', 'range', 'mean', 'avg_dev',
           

MultipleFeaturizer: 100%|██████████████████| 1138/1138 [00:03<00:00, 295.44it/s]


2023-09-06 18:34:40,141 - modnet - INFO - Applying site featurizers...


SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 437.42it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 466.08it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 470.52it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 465.95it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:06<00:00, 185.21it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 430.52it/s]

SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 368.94it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 456.18it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 455.89it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 350.04it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 377.16it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 451.13it/s]


2023-09-06 18:35:27,861 - modnet - INFO - Data has successfully been featurized!
2023-09-06 18:36:33,661 - modnet - INFO - Loaded BasicFeaturizer featurizer.
2023-09-06 18:36:33,671 - modnet - INFO - Computing features, this can take time...
2023-09-06 18:36:33,673 - modnet - INFO - Applying composition featurizers...
2023-09-06 18:36:33,693 - modnet - INFO - Applying featurizers (AtomicOrbitals(), ElementFraction(), ElementProperty(data_source=<matminer.utils.data.MagpieData object at 0x7f6457401f10>,
                features=['Number', 'MendeleevNumber', 'AtomicWeight',
                          'MeltingT', 'Column', 'Row', 'CovalentRadius',
                          'Electronegativity', 'NsValence', 'NpValence',
                          'NdValence', 'NfValence', 'NValence', 'NsUnfilled',
                          'NpUnfilled', 'NdUnfilled', 'NfUnfilled', 'NUnfilled',
                          'GSvolume_pa', 'GSbandgap', 'GSmagmom',
                          'SpaceGroupNumber'],
   

MultipleFeaturizer: 100%|██████████████████| 1138/1138 [00:04<00:00, 280.69it/s]


2023-09-06 18:36:38,395 - modnet - INFO - Applying site featurizers...


SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 421.72it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 422.13it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 444.59it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 431.15it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:07<00:00, 154.71it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 429.63it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 345.68it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 421.39it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 416.53it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 322.59it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 358.82it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 421.64it/s]


2023-09-06 18:37:31,693 - modnet - INFO - Data has successfully been featurized!
----------------------------------------
mepsx: 34.07253679257838


In [80]:
data_val = MODData(materials = df_val["composition"],
               targets = df_val[target_name],
               target_names=[target_name],
               featurizer=basic_featurizer,
               structure_ids=df_val.formula)
data_val.featurize()
data_val.df_featurized

2023-09-06 22:44:34,004 - modnet - INFO - Loaded BasicFeaturizer featurizer.
2023-09-06 22:44:34,012 - modnet - INFO - Computing features, this can take time...
2023-09-06 22:44:34,013 - modnet - INFO - Applying composition featurizers...
2023-09-06 22:44:34,026 - modnet - INFO - Applying featurizers (AtomicOrbitals(), ElementFraction(), ElementProperty(data_source=<matminer.utils.data.MagpieData object at 0x7f6457401f10>,
                features=['Number', 'MendeleevNumber', 'AtomicWeight',
                          'MeltingT', 'Column', 'Row', 'CovalentRadius',
                          'Electronegativity', 'NsValence', 'NpValence',
                          'NdValence', 'NfValence', 'NValence', 'NsUnfilled',
                          'NpUnfilled', 'NdUnfilled', 'NfUnfilled', 'NUnfilled',
                          'GSvolume_pa', 'GSbandgap', 'GSmagmom',
                          'SpaceGroupNumber'],
                stats=['minimum', 'maximum', 'range', 'mean', 'avg_dev',
           

MultipleFeaturizer: 100%|██████████████████| 1138/1138 [00:04<00:00, 265.44it/s]


2023-09-06 22:44:38,876 - modnet - INFO - Applying site featurizers...


SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 430.56it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 439.38it/s]

SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 436.48it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 423.40it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:06<00:00, 171.84it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 423.98it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 359.57it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 436.96it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 407.55it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 326.94it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:03<00:00, 363.87it/s]
SiteStatsFingerprint: 100%|████████████████| 1138/1138 [00:02<00:00, 415.25it/s]



2023-09-06 22:45:32,019 - modnet - INFO - Data has successfully been featurized!


Unnamed: 0_level_0,AtomicOrbitals|HOMO_character,AtomicOrbitals|HOMO_element,AtomicOrbitals|HOMO_energy,AtomicOrbitals|LUMO_character,AtomicOrbitals|LUMO_element,AtomicOrbitals|LUMO_energy,AtomicOrbitals|gap_AO,ElementFraction|H,ElementFraction|He,ElementFraction|Li,...,Stoichiometry|10-norm,TMetalFraction|transition metal fraction,ValenceOrbital|avg s valence electrons,ValenceOrbital|avg p valence electrons,ValenceOrbital|avg d valence electrons,ValenceOrbital|avg f valence electrons,ValenceOrbital|frac s valence electrons,ValenceOrbital|frac p valence electrons,ValenceOrbital|frac d valence electrons,ValenceOrbital|frac f valence electrons
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sc2ZnGa,3.0,21,-0.131080,3.0,21,-0.131080,0.000000,0.000000,0,0.000000,...,0.500098,0.000000,2.000000,0.250000,5.500,0.000000,0.258065,0.032258,0.709677,0.000000
U3P2S,4.0,92,-0.366543,4.0,92,-0.366543,0.000000,0.000000,0,0.000000,...,0.500861,0.000000,2.000000,1.666667,0.500,1.500000,0.352941,0.294118,0.088235,0.264706
AcMnO3,3.0,25,-0.266540,3.0,25,-0.266540,0.000000,0.000000,0,0.000000,...,0.600002,0.200000,2.000000,2.400000,1.200,0.000000,0.357143,0.428571,0.214286,0.000000
K2NaGaAs2,2.0,33,-0.197497,2.0,33,-0.197497,0.000000,0.000000,0,0.000000,...,0.357293,0.000000,1.500000,1.166667,5.000,0.000000,0.195652,0.152174,0.652174,0.000000
YAuO2,3.0,79,-0.304738,1.0,79,-0.162334,0.142404,0.000000,0,0.000000,...,0.500098,0.000000,1.750000,2.000000,2.750,3.500000,0.175000,0.200000,0.275000,0.350000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TmAl3,2.0,13,-0.102545,2.0,13,-0.102545,0.000000,0.000000,0,0.000000,...,0.750001,0.000000,2.000000,0.750000,0.000,3.250000,0.333333,0.125000,0.000000,0.541667
AgGe3,2.0,32,-0.149882,2.0,32,-0.149882,0.000000,0.000000,0,0.000000,...,0.750001,0.250000,1.750000,1.500000,10.000,0.000000,0.132075,0.113208,0.754717,0.000000
ReSbO6,2.0,8,-0.338381,2.0,8,-0.338381,0.000000,0.000000,0,0.000000,...,0.750000,0.125000,2.000000,3.375000,1.875,1.750000,0.222222,0.375000,0.208333,0.194444
LiBePt,1.0,78,-0.161308,1.0,78,-0.161308,0.000000,0.000000,0,0.333333,...,0.372041,0.333333,1.333333,0.000000,3.000,4.666667,0.148148,0.000000,0.333333,0.518519


In [57]:
df_test["composition"] = df_test["formula"].map(Composition)
data_to_predict_new = MODData(materials = df_test["composition"],
                              df_featurized = data_to_predict.df_featurized,
                              structure_ids=df_test.index)

# data_to_predict_new.feature_selection(n=200)
df_predictions = model.predict(data_to_predict_new)
df_test_pred = df_test.merge(df_predictions, how = 'left', left_index = True, right_index = True, suffixes=('_true', '_pred'))
mae = mean_absolute_error(df_test_pred[target_name+'_true'].values,df_test_pred[target_name+'_pred'].values)
print("-" * 40)
print(f"{target_property}: {mae}")

----------------------------------------
mepsx: 34.07253679257838


In [46]:
    mae = mean_absolute_error(df_test_pred[target_name+'_true'].values,df_test_pred[target_name+'_pred'].values)
    print(f"{target_property}: {mae}")
    mae_dic[target_property] = mae
    df_test_pred.to_csv(os.path.join(file_path, target_property, "test_pred.csv"))   
# df_mae_all = pd.from_dict(mae_dic)

mepsx: 34.622218064060945


In [15]:
warnings.filterwarnings('ignore')

def iterate_dataset(folder_path):
    dataset = []
    for root, subfolders, files in os.walk(folder_path):
        dataset.append(subfolders)
    return dataset[0]


file_path = "/scratch/yll6162/modnet/materials_data"
# with open(os.path.join(file_path, "feature_grp.json"), 'r') as json_file:
#     feature_grp = json.load(json_file)

datasets = iterate_dataset(file_path)
feature_grp = {}
total_num = 0
df_lst = []
count = 0
for target_property in datasets:
    if not target_property.startswith("."):
        print(target_property)
        df_train = pd.read_csv(os.path.join(file_path, target_property, "test.csv"))
        df_lst.append(df_train)
        total_num += df_train.shape[0]
        count += 1
combined_df = pd.concat(df_lst, ignore_index=True)
combined_df = combined_df.drop_duplicates(subset = 'formula', keep = 'first')
print(count)
print(combined_df.shape[0])
print(total_num)
    
    # df_test = pd.read_csv(os.path.join(file_path, target_property, "test.csv"))
    # df_val = pd.read_csv(os.path.join(file_path, target_property, "val.csv"))

avg_elec_mass
avg_hole_mass
bulk_modulus_kv
dfpt_piezo_max_dielectric
density
dfpt_piezo_max_dij
dfpt_piezo_max_eij
dfpt_piezo_max_dielectric_electronic
encut
epsx
epsy
epsz
et_c11
dfpt_piezo_max_dielectric_ionic
ehull
et_c12
et_c13
et_c22
et_c33
et_c66
et_c44
et_c55
exfoliation_energy
formation_energy_peratom
kpoint_length_unit
magmom_oszicar
magmom_outcar
max_efg
max_ir_mode
max_mode
mbj_bandgap
mepsx
mepsy
min_mode
n-Seebeck
min_ir_mode
n-powerfact
n_em300k
mepsz
optb88vdw_bandgap
optb88vdw_total_energy
p-Seebeck
p-powerfact
poisson
shear_modulus_gv
p_em300k
slme
spillage
48
28934
80936


In [5]:

# LOOP 
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler

target_name = "target"
mae_dic = {}
epochs = 600
# physical_devices = tf.config.list_physical_devices('GPU')
# tf.config.set_visible_devices(physical_devices[1:], 'GPU')

file_path = "/scratch/yll6162/modnet/materials_data"
target_property = "avg_elec_mass"

# with tf.device('/device:GPU:1'):
df_train = pd.read_csv(os.path.join(file_path, target_property, "train.csv"))
# df_test = pd.read_csv(os.path.join(file_path, target_property, "test.csv"))
# df_val = pd.read_csv(os.path.join(file_path, target_property, "val.csv"))

df_train_featurized = pd.read_csv(os.path.join(file_path, f"df_train_all_featurized.csv"), index_col = 0).drop(columns = 'target', errors='ignore') 
# df_val_featurized = pd.read_csv(os.path.join(file_path, f"df_val_all_featurized.csv"), index_col = 0).drop(columns = 'target', errors='ignore')
# df_test_featurized = pd.read_csv(os.path.join(file_path, f"df_test_all_featurized.csv"), index_col = 0).drop(columns = 'target', errors='ignore')
# print(df_train_featurized)                        
# df_train["composition"] = df_train["formula"].map(Composition) # maps composition to a pymatgen composition object

df_train_featurized = df_train.merge(df_train_featurized, how='left', left_on = 'formula', right_index = True)
# df_val_featurized = df_val.merge(df_val_featurized, how='left', left_on = 'formula', right_index = True)
# df_test_featurized = df_test.merge(df_test_featurized, how='left', left_on = 'formula', right_index = True)
# Creating MODData
df_train_featurized["composition"] = df_train_featurized["formula"].map(Composition)
df_train_featurized[df_train_featurized.isna().any(axis=1)]

scaler = MinMaxScaler(feature_range=(-0.5, 0.5))
x = df_train_featurized.drop(columns = ["formula")

Unnamed: 0,formula,target,AtomicOrbitals|HOMO_character,AtomicOrbitals|HOMO_element,AtomicOrbitals|HOMO_energy,AtomicOrbitals|LUMO_character,AtomicOrbitals|LUMO_element,AtomicOrbitals|LUMO_energy,AtomicOrbitals|gap_AO,ElementFraction|H,...,TMetalFraction|transition metal fraction,ValenceOrbital|avg s valence electrons,ValenceOrbital|avg p valence electrons,ValenceOrbital|avg d valence electrons,ValenceOrbital|avg f valence electrons,ValenceOrbital|frac s valence electrons,ValenceOrbital|frac p valence electrons,ValenceOrbital|frac d valence electrons,ValenceOrbital|frac f valence electrons,composition
49,NpSi2,0.0,,-1,,,-1,,,0.0,...,0.000000,2.000000,1.333333,0.333333,1.333333,0.400000,0.266667,0.066667,0.266667,"(Np, Si)"
107,Np(FeGe)2,0.0,,-1,,,-1,,,0.0,...,0.400000,2.000000,0.800000,6.600000,0.800000,0.196078,0.078431,0.647059,0.078431,"(Np, Fe, Ge)"
111,NpGa5Co,0.0,,-1,,,-1,,,0.0,...,0.142857,2.000000,0.714286,8.285714,0.571429,0.172840,0.061728,0.716049,0.049383,"(Np, Ga, Co)"
204,SrBe13,0.0,,-1,,,-1,,,0.0,...,0.000000,2.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,"(Sr, Be)"
206,YbCd,0.0,,-1,,,-1,,,0.0,...,0.000000,2.000000,0.000000,5.000000,7.000000,0.142857,0.000000,0.357143,0.500000,"(Yb, Cd)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9792,NpIO,0.0,,-1,,,-1,,,0.0,...,0.000000,2.000000,3.000000,3.666667,1.333333,0.200000,0.300000,0.366667,0.133333,"(Np, I, O)"
9893,Zr2Np,0.0,,-1,,,-1,,,0.0,...,0.000000,2.000000,0.000000,1.666667,1.333333,0.400000,0.000000,0.333333,0.266667,"(Zr, Np)"
9923,PuClO,0.0,,-1,,,-1,,,0.0,...,0.000000,2.000000,3.000000,0.000000,2.000000,0.285714,0.428571,0.000000,0.285714,"(Pu, Cl, O)"
9998,YbBe13,0.0,,-1,,,-1,,,0.0,...,0.000000,2.000000,0.000000,0.000000,1.000000,0.666667,0.000000,0.000000,0.333333,"(Yb, Be)"


In [9]:
# len(df_train_featurized.columns)
len(df_train_featurized.dropna(axis=1, how='all').columns)

260

In [10]:
df_train_featurized.dropna(axis=1, how='any')

Unnamed: 0,formula,target,AtomicOrbitals|HOMO_element,AtomicOrbitals|LUMO_element,ElementFraction|H,ElementFraction|He,ElementFraction|Li,ElementFraction|Be,ElementFraction|B,ElementFraction|C,...,TMetalFraction|transition metal fraction,ValenceOrbital|avg s valence electrons,ValenceOrbital|avg p valence electrons,ValenceOrbital|avg d valence electrons,ValenceOrbital|avg f valence electrons,ValenceOrbital|frac s valence electrons,ValenceOrbital|frac p valence electrons,ValenceOrbital|frac d valence electrons,ValenceOrbital|frac f valence electrons,composition
0,Pr3(BN2)2,0.00,59,59,0.0,0,0.0,0.0,0.222222,0.0,...,0.000000,2.0,1.555556,0.000000,1.000000,0.439024,0.341463,0.000000,0.219512,"(Pr, B, N)"
1,Ho3Lu,0.00,67,71,0.0,0,0.0,0.0,0.000000,0.0,...,0.000000,2.0,0.000000,0.250000,11.750000,0.142857,0.000000,0.017857,0.839286,"(Ho, Lu)"
2,RbInO3,0.00,8,8,0.0,0,0.0,0.0,0.000000,0.0,...,0.000000,1.8,2.600000,2.000000,0.000000,0.281250,0.406250,0.312500,0.000000,"(Rb, In, O)"
3,Pm5Mg,0.00,61,61,0.0,0,0.0,0.0,0.000000,0.0,...,0.000000,2.0,0.000000,0.000000,4.166667,0.324324,0.000000,0.000000,0.675676,"(Pm, Mg)"
4,YbSb2,0.00,51,51,0.0,0,0.0,0.0,0.000000,0.0,...,0.000000,2.0,2.000000,6.666667,4.666667,0.130435,0.130435,0.434783,0.304348,"(Yb, Sb)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10106,Fe(NiS2)2,0.00,16,16,0.0,0,0.0,0.0,0.000000,0.0,...,0.428571,2.0,2.285714,3.142857,0.000000,0.269231,0.307692,0.423077,0.000000,"(Fe, Ni, S)"
10107,Yb4Sb3,0.00,51,51,0.0,0,0.0,0.0,0.000000,0.0,...,0.000000,2.0,1.285714,4.285714,8.000000,0.128440,0.082569,0.275229,0.513761,"(Yb, Sb)"
10108,InSiIr,0.00,77,14,0.0,0,0.0,0.0,0.000000,0.0,...,0.333333,2.0,1.000000,5.666667,4.666667,0.150000,0.075000,0.425000,0.350000,"(In, Si, Ir)"
10109,CaP,0.29,15,15,0.0,0,0.0,0.0,0.000000,0.0,...,0.000000,2.0,1.500000,0.000000,0.000000,0.571429,0.428571,0.000000,0.000000,"(Ca, P)"


In [12]:
s = df_train_featurized['AtomicOrbitals|HOMO_character']
nan_percentage = (s.isnull().sum() / len(s)) * 100
print(nan_percentage)

1.8000197804371478
