# MAE on Proposed Method

In [1]:
from collections import Counter
import re, os
import math
import operator
import numpy as np
import matplotlib.pyplot as plt
import csv
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
# Element vocabulary: 86 symbols in order of increasing atomic number, H through Pu.
# He, Ne and Ar are left out (they do not form compounds under normal conditions),
# and Po, At, Rn, Fr and Ra are absent as well; note that Kr and Xe ARE included.
# The position of a symbol in this list is used as its column index when
# compositions are turned into fixed-width vectors.
elements = ['H','Li','Be', 'B', 'C', 'N', 'O', 'F', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl',
            'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe','Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge',
            'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd',
            'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd',
            'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er','Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 
            'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu']

In [3]:
# Regex to choose from element symbol, integer multiplier and either bracket.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
token = re.compile(r'[A-Z][a-z]?|\d+|[()]')

# Create a dictionary with the Name of the Element as Key and No. of elements as Value
def count_elements(formula):
    """Count the atoms of each element in a (possibly nested) chemical formula.

    The formula is tokenised into element symbols, integer multipliers and
    parentheses; any other character is ignored by the tokenizer. A multiplier
    applies to the element or parenthesised group immediately before it.

    Parameters
    ----------
    formula : str
        Formula such as ``"Ca3(Co(CO3)3)2"``.

    Returns
    -------
    dict
        Maps element symbol -> integer atom count, in order of first appearance.

    Raises
    ------
    ValueError
        If a multiplier has nothing to apply to (start of the formula, right
        after ``(``, or right after another multiplier) or if the parentheses
        are unbalanced.
    """
    stack = [[]]   # one atom list per currently open parenthesis level
    last = None    # atoms the next multiplier applies to (None = nothing)
    for t in token.findall(formula):
        if t.isalpha():
            last = [t]
            stack[-1].append(t)
        elif t.isdigit():
            if last is None:
                raise ValueError(
                    "multiplier %r has no element or group to apply to in %r"
                    % (t, formula))
            # `last` is already present once, so add the remaining copies.
            stack[-1].extend(last * (int(t) - 1))
            last = None
        elif t == '(':
            stack.append([])
            # A multiplier right after '(' must not reuse the previous element.
            last = None
        elif t == ')':
            if len(stack) == 1:
                raise ValueError("unmatched ')' in %r" % (formula,))
            last = stack.pop()
            stack[-1].extend(last)
    if len(stack) != 1:
        raise ValueError("unclosed '(' in %r" % (formula,))
    return dict(Counter(stack[-1]))

In [4]:
#Normalize the Value of the Dictionary
def normalize_elements(dictionary):
    """Rescale the values of ``dictionary`` so that they sum to 1.

    The dictionary is modified IN PLACE and the same object is returned, so
    the caller's original counts are overwritten by fractions.

    Parameters
    ----------
    dictionary : dict
        Maps key -> numeric amount (e.g. the output of ``count_elements``).

    Returns
    -------
    dict
        The same dictionary object, with every value divided by the total.
        If the total is zero (for example an empty dictionary) there is
        nothing to rescale and the dictionary is returned unchanged instead
        of raising ``ZeroDivisionError``.
    """
    total = sum(dictionary.values())
    if not total:
        return dictionary
    factor = 1.0 / total
    for k in dictionary:
        dictionary[k] = dictionary[k] * factor
    return dictionary

In [5]:
#Optional Function
def vectorize_elements(dictionary):
    """Return the dictionary as a 2-column array with one (key, value) row per item.

    Keys and values are stacked into a single NumPy array, so both columns
    share one dtype; with string keys the numeric values are therefore
    stored as strings.
    """
    symbols = np.array(list(dictionary))
    amounts = np.array(list(dictionary.values()))
    stacked = np.vstack((symbols, amounts))
    return stacked.T

In [6]:
# Sanity check: parse a deeply nested formula and print the per-element atom counts.
# `elem` is kept as a global because the next cell normalizes it.
elem = count_elements("HF(Mg(H2F3)2As(H2F3)3(Cl((H2F3)3(Br(H2F3)4I)2)3Na)4Cs(H2F3)2)5")
print(elem)

{'H': 1391, 'F': 2086, 'Mg': 5, 'As': 5, 'Cl': 20, 'Br': 120, 'I': 120, 'Na': 20, 'Cs': 5}


In [7]:
# Check the normalization of the values.
# normalize_elements rescales `elem` in place and returns that same dict, so
# print the returned value instead of relying on the hidden side effect.
norm_elem = normalize_elements(elem)
print(norm_elem)

{'H': 0.36876988335100747, 'F': 0.5530222693531284, 'Mg': 0.001325556733828208, 'As': 0.001325556733828208, 'Cl': 0.005302226935312832, 'Br': 0.03181336161187699, 'I': 0.03181336161187699, 'Na': 0.005302226935312832, 'Cs': 0.001325556733828208}


In [8]:
# Formulas of the compounds to featurize
compound_formulas = ['HF(Mg(H2F3)2As(H2F3)3(Cl((H2F3)3(Br(H2F3)4I)2)3Na)4Cs(H2F3)2)5','Ca3(Co(CO3)3)2']

# Parse every formula into atom counts, then rescale the counts to fractions.
# `compounds` ends up as a list of {element symbol: fraction} dictionaries.
compounds = [normalize_elements(count_elements(formula)) for formula in compound_formulas]

In [9]:
# Turn the composition dictionaries into one fixed-width row per compound
# (suitable as DNN input): column j holds the fraction of elements[j],
# and elements that do not occur in a compound stay at 0.
in_elements = np.zeros(shape=(len(compounds), len(elements)))

for comp_no, compound in enumerate(compounds):
    for key, fraction in compound.items():
        in_elements[comp_no][elements.index(key)] = fraction

data = in_elements

In [10]:
# import training data
def load_data(csvname):
    """Read a CSV file and split it into feature columns and a target column.

    The file is read with ``pd.read_csv`` using its default options and then
    converted to a NumPy array.

    Parameters
    ----------
    csvname : path or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    X : ndarray, shape (n_rows, n_columns - 1)
        Every column except the last one.
    y : ndarray, shape (n_rows, 1)
        The last column, as a column vector.
    """
    table = np.asarray(pd.read_csv(csvname))

    # everything but the last column is input, the last column is the target
    X = table[:, :-1]
    y = table[:, -1].reshape(len(table), 1)

    return X, y

In [11]:
def convert(lst):
    """Pair up a flat ``[key, value, key, value, ...]`` sequence into a dict.

    Items at even positions become keys and the item that follows each one
    becomes its value. A repeated key keeps the value of its last
    occurrence. A sequence of odd length raises ``IndexError`` because the
    final key has no value.
    """
    res_dct = {}
    for pos in range(0, len(lst), 2):
        res_dct[lst[pos]] = lst[pos + 1]
    return res_dct

In [12]:
# Element symbol, or an amount with an optional fractional part of any length.
# (The previous pattern `\d+\.\d` kept only ONE digit after the decimal point,
# so e.g. "0.25" was read as "0.2"; it was also a non-raw string literal.)
separate = re.compile(r'[A-Z][a-z]?|\d+(?:\.\d+)?')

def correction(x_train):
    """Convert formula strings into fixed-width element-amount vectors.

    Parameters
    ----------
    x_train : sequence of rows
        Only the first item of each row is used; it must be a formula string
        made of alternating element symbols and amounts, e.g. ``"H2.0O1.0"``.

    Returns
    -------
    ndarray, shape (n_rows, len(elements))
        Row i holds, in the column of each element (its position in the
        module-level ``elements`` list), the amount given for that element
        in formula i; all other entries are 0.

    Raises
    ------
    ValueError
        If a formula contains a symbol that is not in ``elements``.
    IndexError
        Raised by ``convert`` when a symbol has no amount after it.
    """
    # Symbol -> column index, first occurrence wins (same as list.index).
    symbol_to_column = {}
    for col, symbol in enumerate(elements):
        symbol_to_column.setdefault(symbol, col)

    in_elements = np.zeros(shape=(len(x_train), len(elements)))

    # Each row is processed directly; no intermediate NumPy array of token
    # lists is built, because that only worked for ragged rows on old NumPy
    # and failed outright when all rows had the same number of tokens.
    for comp_no, row in enumerate(x_train):
        compound = convert(separate.findall(row[0]))
        for key, amount in compound.items():
            if key not in symbol_to_column:
                raise ValueError(
                    "unknown element %r in formula %r" % (key, row[0]))
            in_elements[comp_no][symbol_to_column[key]] = float(amount)

    return in_elements

In [13]:
# Load the train and test splits; the file names are resolved relative to the
# current working directory. Each x_* holds every CSV column but the last,
# each y_* holds the last column as an (n, 1) array.
x_train, y_train = load_data('train_set.csv')
x_test, y_test = load_data('test_set.csv')

In [14]:
# Replace the formula string in the first column of each row by a numeric
# vector with one entry per symbol in `elements`.
new_x_train = correction(x_train)
new_x_test = correction(x_test)

In [15]:
# NOTE: these are aliases, not copies. new_y_train / y_train (and
# new_y_test / y_test) name the same array object, so any in-place change
# made through one name is visible through the other.
new_y_train = y_train
new_y_test = y_test

In [16]:
# Flatten the targets from (n, 1) to (n,). reshape() returns a new array
# object, so y_train / y_test keep their own shape; assigning to .shape
# would have reshaped them too (the names alias the same arrays), and NumPy
# discourages setting .shape in place.
new_y_train = new_y_train.reshape(len(new_y_train))
new_y_test = new_y_test.reshape(len(new_y_test))

## Method

In [None]:
def elemnet_model(input_dim=86, learning_rate=0.0001):
    """Build and compile the fully connected regression network.

    The network is a stack of 16 ReLU ``Dense`` layers
    (4 x 1024, 3 x 512, 3 x 256, 3 x 128, 2 x 64, 1 x 32) followed by a single
    linear output unit. It is compiled with the Adam optimizer and mean
    absolute error as both loss and reported metric.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. The default, 86, equals the length of the
        ``elements`` list defined in this notebook.
    learning_rate : float, optional
        Learning rate handed to Adam (default 0.0001).

    Returns
    -------
    The compiled ``Sequential`` model.

    NOTE(review): ``Sequential``, ``Dense``, ``optimizers`` and ``tf`` are not
    imported anywhere in the visible notebook; they must be in scope before
    this cell runs or it fails with ``NameError`` on a fresh kernel.
    """
    # Width of every hidden layer, in order from input to output.
    hidden_units = [1024] * 4 + [512] * 3 + [256] * 3 + [128] * 3 + [64] * 2 + [32]

    # create model
    model = Sequential()
    model.add(Dense(hidden_units[0], activation='relu', input_dim=input_dim))
    for units in hidden_units[1:]:
        model.add(Dense(units, activation='relu'))
    model.add(Dense(1, activation='linear'))

    # Compile model
    # NOTE(review): newer Keras releases name this argument `learning_rate`;
    # `lr` is kept as in the original - confirm against the installed version.
    adam = optimizers.Adam(lr=learning_rate)
    model.compile(loss=tf.keras.losses.mean_absolute_error, optimizer=adam, metrics=['mean_absolute_error'])
    return model
# build the model
model = elemnet_model()

# Stop training once val_loss has not improved for 50 consecutive epochs.
# NOTE(review): the test split is passed as validation data, so early stopping
# is selected on the same data the model is later evaluated on; a separate
# validation split would give an unbiased test error.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
# Fit the model
model.fit(new_x_train, new_y_train,verbose=2, validation_data=(new_x_test, new_y_test), epochs=1000, batch_size=32, callbacks=[es])
y_predict = model.predict(new_x_test)
# Save the predictions as text, one row per test sample. A text file's
# write() only accepts str, so passing the prediction array to it raised
# TypeError; np.savetxt formats the array and closes the file itself.
np.savetxt('resultElemNet.txt', y_predict)
model.save_weights("model1.h5")