In [149]:
from collections import Counter
import re
import math
import operator
import numpy as np

In [150]:
# Element symbols recognised by this notebook, ordered by atomic number.
# The position of a symbol in this list is used as its column index when the
# composition matrix is filled in, so the order must not be changed.
# Note that the list is not the full periodic table: e.g. He, Ne and Ar are
# absent, as are the elements between Bi and Ac and everything after Pu.
elements = [
    # period 1
    'H',
    # period 2
    'Li', 'Be', 'B', 'C', 'N', 'O', 'F',
    # period 3
    'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl',
    # period 4
    'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
    'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr',
    # period 5
    'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd',
    'In', 'Sn', 'Sb', 'Te', 'I', 'Xe',
    # period 6 (including the lanthanides)
    'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy',
    'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt',
    'Au', 'Hg', 'Tl', 'Pb', 'Bi',
    # period 7 (early actinides only)
    'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu',
]

In [151]:
# Formula tokens: an element symbol (capital letter plus optional lowercase
# letter), a run of digits (a multiplier), or a single parenthesis.
# A raw string is used so that the backslash-d reaches the regex engine
# without Python warning about an invalid string escape.
token = re.compile(r'[A-Z][a-z]?|\d+|[()]')


def count_elements(formula):
    """Count the atoms of each element in a chemical formula string.

    The formula may contain nested parenthesised groups and integer
    multipliers, e.g. ``"Ca3(Co(CO3)3)2"``. A multiplier applies to the
    element symbol or the parenthesised group immediately before it.
    Characters that do not form a token (whitespace, lowercase-only text,
    punctuation other than parentheses) are ignored by the tokenizer.

    Parameters
    ----------
    formula : str
        The chemical formula to parse.

    Returns
    -------
    dict
        Maps element symbol to its total count. Keys appear in the order in
        which the symbols first occur in the formula. An empty formula gives
        an empty dict.

    Raises
    ------
    ValueError
        If a multiplier has no element or group directly before it, or if
        the parentheses are unbalanced.

    Notes
    -----
    A multiplier of 0 is treated like 1 (the element or group is kept once).
    """
    # One Counter per open nesting level; stack[0] is the whole formula.
    # Keeping counts instead of expanded symbol lists makes the cost
    # independent of how large the multipliers are.
    stack = [Counter()]
    # Counts of the element/group the next multiplier would apply to.
    last = None
    for t in token.findall(formula):
        if t.isalpha():
            last = Counter({t: 1})
            stack[-1][t] += 1
        elif t.isdigit():
            if last is None:
                raise ValueError(
                    "multiplier %r in %r does not follow an element or group"
                    % (t, formula)
                )
            # The element/group was already counted once when it was read,
            # so only the remaining (n - 1) copies are added here.
            extra = int(t) - 1
            if extra > 0:
                for symbol, amount in last.items():
                    stack[-1][symbol] += amount * extra
        elif t == '(':
            stack.append(Counter())
            # A multiplier directly after '(' has nothing to apply to.
            last = None
        elif t == ')':
            if len(stack) == 1:
                raise ValueError("unmatched ')' in %r" % (formula,))
            last = stack.pop()
            # Fold the closed group into its parent once; a following
            # multiplier adds the remaining copies.
            for symbol, amount in last.items():
                stack[-1][symbol] += amount
    if len(stack) != 1:
        raise ValueError("unclosed '(' in %r" % (formula,))
    return dict(stack[-1])

In [152]:
def normalize_elements(dictionary):
    """Rescale the values of ``dictionary`` so that they sum to 1.

    The dictionary is modified in place and the same object is returned, so
    the caller's original mapping holds the normalized values afterwards.

    Parameters
    ----------
    dictionary : dict
        Maps keys (element symbols) to numeric amounts.

    Returns
    -------
    dict
        The same ``dictionary`` object, with every value multiplied by
        ``1 / sum(values)``. If the values sum to zero (which includes the
        empty dictionary) it is returned unchanged, because no fractions can
        be formed.
    """
    total = sum(dictionary.values())
    if total == 0:
        # Nothing to scale; avoids a ZeroDivisionError on empty compositions.
        return dictionary
    factor = 1.0 / total
    for k in dictionary:
        dictionary[k] = dictionary[k] * factor
    return dictionary

In [153]:
def vectorize_elements(dictionary):
    """Turn a ``{key: value}`` mapping into a two-column NumPy array.

    Each row of the result corresponds to one dictionary entry, in the
    dictionary's iteration order: column 0 holds the key and column 1 the
    value.

    NOTE(review): keys are expected to be element symbols (strings) and the
    values numbers, so NumPy has to coerce both columns to a single dtype --
    presumably a string dtype; confirm before doing arithmetic on column 1.
    """
    symbols = np.array([symbol for symbol in dictionary])
    amounts = np.array([amount for amount in dictionary.values()])
    # Stack as two rows, then flip so that each entry becomes one row.
    rows = np.vstack((symbols, amounts))
    return np.transpose(rows)

In [154]:
# Exercise the parser on a deeply nested formula and show the raw atom counts.
elem = count_elements(
    "HF(Mg(H2F3)2As(H2F3)3(Cl((H2F3)3(Br(H2F3)4I)2)3Na)4Cs(H2F3)2)5"
)
print(elem)

{'H': 1391, 'F': 2086, 'Mg': 5, 'As': 5, 'Cl': 20, 'Br': 120, 'I': 120, 'Na': 20, 'Cs': 5}


In [155]:
# normalize_elements rescales its argument in place and returns that same
# dict, so `elem` now holds fractions as well. Print the returned value so the
# cell shows what it computed instead of relying on that side effect.
norm_elem = normalize_elements(elem)
print(norm_elem)

{'H': 0.36876988335100747, 'F': 0.5530222693531284, 'Mg': 0.001325556733828208, 'As': 0.001325556733828208, 'Cl': 0.005302226935312832, 'Br': 0.03181336161187699, 'I': 0.03181336161187699, 'Na': 0.005302226935312832, 'Cs': 0.001325556733828208}


In [156]:
# Parse each formula into atom counts and convert the counts to fractions.
# `compounds` ends up as a list with one {symbol: fraction} dict per formula.
compounds = [
    normalize_elements(count_elements(formula))
    for formula in (
        'HF(Mg(H2F3)2As(H2F3)3(Cl((H2F3)3(Br(H2F3)4I)2)3Na)4Cs(H2F3)2)5',
        'Ca3(Co(CO3)3)2',
    )
]

In [157]:
# Build the composition matrix: one row per compound, one column per entry of
# `elements`; a cell holds that element's value in the compound (0 if absent).

# Symbol -> column lookup, built once instead of scanning `elements` for
# every entry. setdefault keeps the first position if a symbol were repeated,
# which is what list.index would return.
element_column = {}
for column, symbol in enumerate(elements):
    element_column.setdefault(symbol, column)

in_elements = np.zeros(shape=(len(compounds), len(elements)))
for comp_no, compound in enumerate(compounds):
    for key, fraction in compound.items():
        if key not in element_column:
            # Same exception type list.index raised, but with a message that
            # says which compound and symbol caused it.
            raise ValueError(
                "element {!r} in compound #{} is not listed in `elements`".format(key, comp_no)
            )
        in_elements[comp_no, element_column[key]] = fraction
data = in_elements

In [158]:
# Show the composition matrix (one row per compound, one column per entry of `elements`).
data

array([[0.36876988, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.55302227, 0.00530223, 0.00132556,
        0.        , 0.        , 0.        , 0.        , 0.00530223,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.00132556,
        0.        , 0.03181336, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.03181336,
        0.        , 0.00132556, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  