## ElemNet: A formation energy prediction tool from elemental composition

### A formation energy prediction tool using a 17-layer deep neural network that achieves an accuracy of 0.042 on the Open Quantum Materials Database (OQMD).
### Input: Takes a 2D numpy array in which each row represents a compound and each column holds the fraction of one of the 86 elements in the set `elements` = ['H', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu']. The elemental composition must not contain any element from ['He', 'Ne', 'Ar', 'Po', 'At','Rn','Fr','Ra'].
### Output: Returns a 1D numpy array with the predicted formation energy

In [1]:
import tensorflow as tf
import numpy as np
import tensorflow.contrib.slim as slim
import time, os, re
from collections import OrderedDict, defaultdict

In [2]:
# Element symbols recognised by the model.  An element's position in this list
# is used as its column index when compositions are encoded, so the order
# must not be changed.
elements = (
    'H Li Be B C N O F Na Mg Al Si P S Cl K Ca Sc Ti V '
    'Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Br Kr Rb Sr Y Zr Nb '
    'Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I Xe Cs Ba La Ce Pr '
    'Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu Hf Ta W Re Os Ir '
    'Pt Au Hg Tl Pb Bi Ac Th Pa U Np Pu'
).split()

In [3]:
# One formula term: an element symbol (capital letter plus optional lower-case
# letters) followed by an optional amount.  The amount may be an integer or a
# decimal ("2", "0.5", ".5", "2."), so fractional compositions are accepted.
formulare = re.compile(r'([A-Z][a-z]*)((?:\d+\.?\d*|\.\d+)?)')
def parse_formula(formula):
    """Parse a chemical formula string into a mapping of element -> amount.

    Args:
        formula: Formula such as ``'H2SO4'`` or ``'Fe0.5Ni0.5'``.  It must
            consist solely of element symbols, each optionally followed by a
            numeric amount; no spaces, brackets or charges.

    Returns:
        A ``defaultdict(int)`` keyed by element symbol.  Explicit amounts are
        stored as ``float``; a missing amount counts as ``1``.  Amounts of a
        symbol that occurs several times are summed.

    Raises:
        AssertionError: If some part of ``formula`` is not covered by the
            symbol/amount pattern (lower-case start, spaces, brackets, ...).
    """
    pairs = formulare.findall(formula)
    # findall silently skips text that does not match, so compare the matched
    # length with the input length to detect unparsed characters.
    matched = sum(len(symbol) + len(amount) for symbol, amount in pairs)
    assert matched == len(formula), 'cannot parse formula %r' % (formula,)
    formula_dict = defaultdict(int)
    for el, sub in pairs:
        formula_dict[el] += float(sub) if sub else 1
    return formula_dict

In [4]:
# Example compounds to predict, given as plain formula strings.
formulas = ['H2O','NaCl', 'H2SO4']

In [5]:
# Parse each formula string into an element -> amount mapping.
# This cell rebinds `formulas`, so entries that are already parsed are passed
# through unchanged; that keeps the cell safe to run more than once.
formulas = [parse_formula(x) if isinstance(x, str) else x for x in formulas]
print(formulas)

[defaultdict(<class 'int'>, {'H': 2.0, 'O': 1}), defaultdict(<class 'int'>, {'Na': 1, 'Cl': 1}), defaultdict(<class 'int'>, {'H': 2.0, 'S': 1, 'O': 4.0})]


In [6]:
# Encode every parsed formula as one row of element fractions.  Column order
# follows `elements`; each row is normalised by the total amount in the formula.
column_of = {symbol: col for col, symbol in enumerate(elements)}
data = np.zeros(shape=(len(formulas), len(elements)), dtype=np.float32)
for row, formula in enumerate(formulas):
    unsupported = sorted(symbol for symbol in formula if symbol not in column_of)
    if unsupported:
        # Same exception type as a failed list.index() lookup, but the message
        # names the offending formula and symbols.
        raise ValueError('formula %d contains unsupported element(s): %s'
                         % (row, ', '.join(unsupported)))
    total = float(sum(formula.values()))
    for symbol, amount in formula.items():
        data[row, column_of[symbol]] = amount / total

In [7]:
# The evaluation inputs are the encoded compositions themselves.
test_X = data
# Placeholder labels, all zeros.
# NOTE(review): the length is the literal 86 rather than the number of rows in
# `data`, and `test_y` is not read anywhere else in this notebook - confirm intent.
test_y = np.zeros(86, dtype=np.float32)

In [8]:
# Number of rows fed to the network per session run; it fixes the first
# dimension of the input placeholders and is the step of the prediction loop.
batch_size = 1

In [9]:
# Network description consumed by model_slim: '-'-separated stages of the form
# <units>x<layers>, where a 'D' after the layer count adds dropout after that
# stage; a stage without 'x' is the output layer.
architecture = '1024x4D-512x3D-256x3D-128x3D-64x2-32x1-1'
# Not passed to model_slim in this notebook; model_slim's own default is also 'relu'.
activation = 'relu'
# One value per stage, looked up by stage index and handed to tf.nn.dropout as
# its second positional argument.
# NOTE(review): in TF 1.x that argument is the keep probability - confirm
# against the installed TensorFlow version.
dropouts = [0.8, 0.9, 0.7, 0.8]
# Seed passed to every tf.nn.dropout op created by model_slim.
SEED = 66478
# Width of the input placeholders; equals the number of element columns (86).
num_input = 86

In [10]:
    def model_slim(data, architecture, train=True, num_labels=1, activation='relu', dropouts=dropouts):
        """Build a fully connected TF-Slim network from an architecture string.

        ``architecture`` is a '-'-separated list of stages.  A stage containing
        'x' has the form ``<width spec>x<depth spec>``: the first number in the
        width spec is the number of units per layer and the first number in the
        depth spec is the number of stacked layers.  Upper-case letters are flags:

        * first letter of the width spec is 'B': every layer is followed by
          batch norm; with 'A' as second letter the activation is applied after
          batch norm instead of inside the fully connected layer;
        * 'R' among the width-spec letters: the previous layer's output is added
          to each layer (through a linear projection unless the previous layer
          had the same width);
        * 'D' among the depth-spec letters: dropout is added after the stage,
          using ``dropouts[stage_index]``.

        A stage without 'x' adds an output layer with ``num_labels`` units; it
        uses ReLU if the stage string contains 'R' and no activation otherwise.
        A line is printed for every layer that is added.

        Args:
            data: Input tensor of the network.
            architecture: Architecture string as described above.
            train: When true, variables are created (``reuse=None``); otherwise
                the existing variables of the same scopes are reused.
            num_labels: Number of units of the output layer.
            activation: ``'relu'``, a callable applied to a tensor, or ``None``.
            dropouts: Per-stage values handed to ``tf.nn.dropout``.

        Returns:
            The output of the last layer after ``tf.squeeze``.

        Raises:
            AssertionError: If ``architecture`` contains no '-'.
            TypeError: If ``activation`` is neither ``'relu'``, a callable nor ``None``.
        """
        reuse = None if train else True

        if activation == 'relu':
            activation = tf.nn.relu
        elif activation is not None and not callable(activation):
            # Fail before any layer is added instead of with "'str' object is
            # not callable" in the middle of graph construction.
            raise TypeError("activation must be 'relu', a callable or None, got %r" % (activation,))
        assert '-' in architecture
        archs = architecture.strip().split('-')
        net = data
        prev_layer = net
        prev_num_outputs = None
        for i, arch in enumerate(archs):
            if 'x' in arch:
                arch = arch.split('x')
                num_outputs = int(re.findall(r'\d+', arch[0])[0])
                layers = int(re.findall(r'\d+', arch[1])[0])
                aux_layers = re.findall(r'[A-Z]', arch[0])
                use_batch_norm = bool(aux_layers) and aux_layers[0] == 'B'
                act_after_norm = use_batch_norm and len(aux_layers) > 1 and aux_layers[1] == 'A'
                for j in range(layers):
                    # Scope names identify the variables, e.g. when a
                    # checkpoint is restored, so they must stay stable.
                    fc_scope = 'fc' + str(i) + '_' + str(j)
                    bn_scope = 'fc_bn' + str(i) + '_' + str(j)
                    if use_batch_norm:
                        if act_after_norm:
                            print('adding fully connected layers with %d outputs followed by batch_norm and act' % num_outputs)
                            net = slim.layers.fully_connected(net, num_outputs=num_outputs,
                                                              scope=fc_scope,
                                                              activation_fn=None, reuse=reuse)
                            net = slim.layers.batch_norm(net, center=True, scale=True, reuse=reuse,
                                                         scope=bn_scope)
                            net = activation(net)
                        else:
                            print('adding fully connected layers with %d outputs followed by batch_norm' % num_outputs)
                            net = slim.layers.fully_connected(net, num_outputs=num_outputs,
                                                              scope=fc_scope,
                                                              activation_fn=activation, reuse=reuse)
                            net = slim.layers.batch_norm(net, center=True, scale=True, reuse=reuse,
                                                         scope=bn_scope)
                    else:
                        print('adding fully connected layers with %d outputs' % num_outputs)
                        net = slim.layers.fully_connected(net, num_outputs=num_outputs,
                                                          scope=fc_scope,
                                                          activation_fn=activation, reuse=reuse)
                    if 'R' in aux_layers:
                        if prev_num_outputs and prev_num_outputs == num_outputs:
                            print('adding residual, both sizes are same')
                            net = net + prev_layer
                        else:
                            # Widths differ (or this is the very first layer):
                            # project the skip input to the new width first.
                            print('adding residual with fc as the size are different')
                            net = net + slim.layers.fully_connected(prev_layer, num_outputs=num_outputs,
                                                                    scope='fc' + str(i) + '_' + 'dim_' + str(j),
                                                                    activation_fn=None, reuse=reuse)
                    prev_num_outputs = num_outputs
                    prev_layer = net
                aux_layers_sub = re.findall(r'[A-Z]', arch[1])
                # NOTE(review): because of `num_labels == 1`, dropout is also
                # added when train=False for single-output models, so the
                # evaluation graph contains dropout too - confirm this is intended.
                if 'D' in aux_layers_sub and (train or num_labels == 1) and len(dropouts) > i:
                    print('adding dropout', dropouts[i])
                    net = tf.nn.dropout(net, dropouts[i], seed=SEED)
                prev_layer = net

            else:
                if 'R' in arch:
                    act_fun = tf.nn.relu
                    print('using ReLU at last layer')
                else:
                    act_fun = None
                print('adding final layer with ' + str(num_labels) + ' output')
                net = slim.layers.fully_connected(net, num_outputs=num_labels, scope='fc' + str(i),
                                                  activation_fn=act_fun, reuse=reuse)

        net = tf.squeeze(net)
        return net

In [11]:
# Start from a fresh default graph before building the network.
tf.reset_default_graph()
# Input placeholders with a fixed shape of (batch_size, num_input).
train_data_node = tf.placeholder(tf.float32, shape=(batch_size, num_input))
eval_data = tf.placeholder(tf.float32, shape=(batch_size, num_input))
# First pass (train=True by default) is built with reuse=None.
logits = model_slim(train_data_node, architecture)
# Label placeholder; not used anywhere else in this notebook.
train_labels_node = tf.placeholder(tf.float32, shape=(batch_size))
# Second pass is built with reuse=True, i.e. on the same variable scopes as
# the first pass, so it has to come after it.
eval_prediction = model_slim(eval_data, architecture,train=False)

adding fully connected layers with 1024 outputs
adding fully connected layers with 1024 outputs
adding fully connected layers with 1024 outputs
adding fully connected layers with 1024 outputs
adding dropout 0.8
adding fully connected layers with 512 outputs
adding fully connected layers with 512 outputs
adding fully connected layers with 512 outputs
adding dropout 0.9
adding fully connected layers with 256 outputs
adding fully connected layers with 256 outputs
adding fully connected layers with 256 outputs
adding dropout 0.7
adding fully connected layers with 128 outputs
adding fully connected layers with 128 outputs
adding fully connected layers with 128 outputs
adding dropout 0.8
adding fully connected layers with 64 outputs
adding fully connected layers with 64 outputs
adding fully connected layers with 32 outputs
adding final layer with 1 output
adding fully connected layers with 1024 outputs
adding fully connected layers with 1024 outputs
adding fully connected layers with 1024 ou

In [12]:
sess = tf.Session()
# Give every variable an initial value; the restore below then loads the
# values stored in the checkpoint.
sess.run(tf.global_variables_initializer())
# Writes the graph to ./summary (e.g. for inspection in TensorBoard).
train_writer = tf.summary.FileWriter('summary', graph=sess.graph)
saver = tf.train.Saver()

# Checkpoint prefix, resolved relative to the current working directory.
model_path = os.path.join(os.getcwd(), 'sample', 'sample_model')
print('Restoring model from %s' % model_path)
saver.restore(sess, model_path)

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Restoring model from /raid/dkj755/git-repos/ElemNet/elemnet/sample/sample_model
INFO:tensorflow:Restoring parameters from /raid/dkj755/git-repos/ElemNet/elemnet/sample/sample_model


In [13]:
# Run the evaluation network over `data` in chunks of exactly `batch_size` rows
# (the input placeholder has a fixed batch dimension).
size = data.shape[0]
if 0 < size < batch_size:
    # A short batch cannot be fed into the fixed-shape placeholder; say so
    # clearly instead of failing inside sess.run.
    raise ValueError('need at least batch_size=%d rows to predict, got %d' % (batch_size, size))
predictions = np.empty(shape=(size,), dtype=np.float32)
for begin in range(0, size, batch_size):
    # When the final chunk would be short, slide the window back so that it
    # ends at the last row; a few rows are then simply predicted twice.
    start = min(begin, size - batch_size)
    stop = start + batch_size
    outputs = sess.run(eval_prediction, feed_dict={eval_data: data[start:stop, ...]})
    predictions[start:stop] = outputs

In [14]:
# One predicted value per row of `data`, in the same order as `formulas`.
print(predictions)

[-0.33150914 -1.911143   -1.3807236 ]
