In [None]:
import numpy as np
import mdtraj as md
import sklearn.metrics as skl

from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse.csgraph import shortest_path

import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms import isomorphism

In [None]:
def data_2_graph(molecule_data, draw_graph=False, cyclic_dist_threshold=1.6):
    """Build a directed bond graph for one molecule from its atom records.

    Bonds are inferred purely from geometry: the minimum spanning tree (MST)
    of the inter-atomic distance matrix supplies a connected skeleton, and
    every atom pair closer than ``cyclic_dist_threshold`` is added on top so
    that ring-closing bonds (which an MST necessarily drops) are present too.

    Parameters
    ----------
    molecule_data : sequence of atom records
        Each record is indexed like the rows built by ``pdb_loader``: fields
        3, 4 and 5 are the x, y, z coordinates and the last field is truthy
        when the atom is a hydrogen.
    draw_graph : bool, optional
        When True, draw the graph with ``nx.draw`` using a graphviz layout.
    cyclic_dist_threshold : float, optional
        Distance (in the coordinate units, angstroms for PDB input) below
        which two atoms are always connected, whether or not the MST picked
        that edge. Defaults to 1.6.

    Returns
    -------
    networkx.DiGraph
        One node per atom, in input order. Bonds between heavy atoms appear
        in both directions; bonds between a heavy atom and a hydrogen appear
        only in the heavy -> hydrogen direction.
    """
    #per-atom hydrogen flags and cartesian coordinates
    is_hydrogen = np.array([bool(atom[-1]) for atom in molecule_data], dtype=bool)
    crd_coords = np.array([atom[3:6] for atom in molecule_data], dtype=float)

    #calculate distance matrix
    distance_matrix = skl.pairwise_distances(crd_coords)

    #do not allow hydrogen-hydrogen graph edges: give every H-H pair a huge
    # distance so neither the MST nor the threshold test will pick it
    distance_matrix[np.ix_(is_hydrogen, is_hydrogen)] = 999

    #calculate minimum spanning tree
    mst_mat = minimum_spanning_tree(distance_matrix).toarray()

    #scipy stores each (undirected) tree edge in a single, arbitrary direction.
    # mirror it so that a bond longer than the threshold (which only the MST can
    # supply) is bidirectional like every other heavy-atom bond, rather than a
    # one-way edge whose direction depends on fine details of the input
    mst_mat = np.maximum(mst_mat, mst_mat.T)

    #add edges for all pairs closer than the threshold regardless as to whether
    # they're part of the MST; needed for processing cyclic molecules for which
    # the MST is degenerate or nearly so and hence dependent on fine details of
    # molecular coordinates that can break the symmetry in different ways for
    # different instances of the same molecule
    close_pairs = distance_matrix < cyclic_dist_threshold
    mst_mat[close_pairs] = distance_matrix[close_pairs]

    #digitize the adjacency matrix so it contains binary information about which
    # atoms are bonded: 0 stays 0 and any positive distance becomes 1
    # (10**10 is an arbitrary upper bin edge much larger than the '999' above)
    mst_mat_bin = np.digitize(mst_mat, [0, 10**10], right=True)

    #drop the hydrogen -> heavy atom direction of every X-H bond, leaving only
    # heavy -> hydrogen, so that hydrogens are distinguishable from terminal
    # heavy atoms (carbonyl / alkoxide oxygens), which would otherwise lead to
    # unwanted graph degeneracies
    mst_mat_bin[np.ix_(is_hydrogen, ~is_hydrogen)] = 0

    #make nx graph object (not the same as the mst object above)
    #digraph must be specified for directed graphs
    mst_graph = nx.from_numpy_array(mst_mat_bin, create_using=nx.DiGraph)

    #draw mst graph
    if draw_graph:
        pos = nx.nx_agraph.graphviz_layout(mst_graph)
        nx.draw(mst_graph, pos, node_size=5)

    return mst_graph

In [None]:
#pdb reader
#pdb file format: https://www.wwpdb.org/documentation/file-format
def pdb_loader(file_path, resi, record_types=("HETATM",)):
    """Read the atoms of one residue from a PDB file.

    Parameters
    ----------
    file_path : str or path-like
        Path of the PDB file to read.
    resi : int
        Residue sequence number (PDB columns 23-26) of the atoms to keep.
    record_types : str or sequence of str, optional
        Record names (PDB columns 1-6, compared without padding) to accept.
        Defaults to ``("HETATM",)``.

    Returns
    -------
    list of list
        One row per matching atom, in file order, with the format
        ``[index, atom number, atom name, x, y, z, whether the atom is hydrogen]``
        where ``index`` counts the returned rows from 0 and ``atom name`` is
        the raw 4-character name field.

    Raises
    ------
    ValueError
        If a matching record has a non-numeric residue number, serial number
        or coordinate field.
    """
    #accept a bare string as a single record name
    if isinstance(record_types, str):
        record_types = (record_types,)

    #format: [[index, atom number, atom name, x, y, z, whether the atom is hydrogen],...]
    file_contents = []

    with open(file_path, "r") as f:
        for line in f:
            if line[0:6].strip() not in record_types:
                continue
            if int(line[22:26]) != resi:
                continue
            file_contents.append([
                len(file_contents),
                int(line[6:11]),
                line[12:16],
                float(line[30:38]),
                float(line[38:46]),
                float(line[46:54]),
                _is_hydrogen_record(line),
            ])

    return file_contents


def _is_hydrogen_record(line):
    """Return True when a PDB ATOM/HETATM line describes a hydrogen atom."""
    element = line[76:78].strip()
    if element:
        return element == "H"
    #element columns (77-78) are blank or missing, which would otherwise flag
    # every atom as non-hydrogen; fall back to the atom name with any leading
    # digits removed. this is a heuristic: without an element column, names of
    # two-letter elements starting with H (e.g. "HG") are read as hydrogen too
    atom_name = line[12:16].strip().lstrip("0123456789")
    return atom_name.startswith("H")