In [8]:
import time
import networkx as nx
from nfp.preprocessing import features_graph
import numpy as np
import pandas as pd

In [9]:
dataIni = pd.read_csv('Oads_Mo2C_catalysts_graphml.csv')
dataIni['graphFileName'] = dataIni['graphFileName'].str.slice_replace(0,0,repl='Oads_Mo2C_graphml/')
print(dataIni.graphFileName)

0            Oads_Mo2C_graphml/Mo2C_101_1_O_0_19_1.graphml
1            Oads_Mo2C_graphml/Mo2C_110_4_O_0_19_2.graphml
2            Oads_Mo2C_graphml/Mo2C_011_8_O_0_31_5.graphml
3            Oads_Mo2C_graphml/Mo2C_101_5_O_0_23_6.graphml
4            Oads_Mo2C_graphml/Mo2C_110_2_O_0_18_7.graphml
                               ...                        
20172    Oads_Mo2C_graphml/Mo2C_100_4_Au_1_O_0_6_26624....
20173    Oads_Mo2C_graphml/Mo2C_100_0_Ru_1_O_0_8_26626....
20174    Oads_Mo2C_graphml/Mo2C_110_2_Ir_0_O_0_6_26627....
20175    Oads_Mo2C_graphml/Mo2C_110_1_Ru_0_O_0_3_26631....
20176    Oads_Mo2C_graphml/Mo2C_101_7_Os_0_O_0_5_26632....
Name: graphFileName, Length: 20177, dtype: object


In [4]:
# Prepare graph information for faster preprocessing
def construct_graph_data(graphDF,numOfShell=2):
    """ 
    Returns
    dict with entries
    'n_atom' : number of atoms in the molecule
    'n_bond' : number of bonds in the molecule 
    'connectivity' : (n_bond, 2) array of source atom, target atom pairs.
    """
    dataList = []
    for index,row in graphDF.iterrows():
        graph = row.graphFileName
        s1 = graph.split("/")[-1]
        shortName = s1.split(".")[0]
        G = nx.read_graphml(graph)
        if numOfShell==2:
            nodes = (
                     node
                     for node, data
                     in G.nodes(data=True)
                     if data.get("type") != "thirdCoordinationShell"
                     )
            G = G.subgraph(nodes)

        elif numOfShell==1:
            nodes = (
                     node
                     for node, data
                     in G.nodes(data=True)
                     if data.get("type") != "thirdCoordinationShell" and data.get("type") != 'secondCoordinationShell'
                     )
            G = G.subgraph(nodes)

        n_atom = G.number_of_nodes()
        n_bond = 2 * G.number_of_edges()

        # If its an isolated atom, add a self-link
        if n_bond == 0:
            n_bond = 1

        connectivity = np.zeros((n_bond, 2), dtype='int')
        nodeList = []
        edgeList = []
        revList = []
        atomFeatList = []
        bondFeatList = []
        bond_index = 0
        for n,node in enumerate(G.nodes):
            # Atom Classes
            start_index = list(G.nodes).index(node)
            nodeList.append(node)
            atomFeat = features_graph.atom_features_ver1(G.nodes[node])
            atomFeatList.append(atomFeat)
            for m,edge in enumerate(G.edges):
                if node in edge:
                    # Is the bond pointing at the target atom     
                    rev = list(G.nodes).index(list(G.edges)[m][0]) != start_index
                    bondFeat = features_graph.bond_features_v1(G.edges[edge],flipped=rev)
                    bondFeatList.append(bondFeat)
                    edgeList.append(edge)
                    revList.append(rev)
                    # Connectivity
                    if not rev:  # Original direction
                        connectivity[bond_index, 0] = list(G.nodes).index(list(G.edges)[m][0])
                        connectivity[bond_index, 1] = list(G.nodes).index(list(G.edges)[m][1])
                    else:  # Reversed
                        connectivity[bond_index, 0] = list(G.nodes).index(list(G.edges)[m][1])
                        connectivity[bond_index, 1] = list(G.nodes).index(list(G.edges)[m][0])
                    bond_index += 1      
        connectivity = connectivity.tolist()
        dataList.append([shortName,n_atom, n_bond, nodeList, edgeList, atomFeatList, bondFeatList, revList, connectivity])
    return dataList

In [10]:
start_time = time.time()
structList = construct_graph_data(dataIni,numOfShell=2)
dfGS = pd.DataFrame(structList,columns=['graphName','nAtoms','nBonds','nodes','edges','atomFeatures','bondFeatures','revBool','connectivity'])
print('Finished in (s):',time.time()-start_time)
#dfGS.to_csv('graph_structure_2ndNN_ini.csv.gz', index=None, compression='gzip')

Finished in (s): 58491.90770316124
