## Load PDB as Graph

- edge: [C-C bond]
- node: [coord,aa]

or refer to: https://blog.csdn.net/C20180602_csq/article/details/138327140

Todo: Make the graph fully connected, and add features to each edge: [L2_distance_of_ATOM_C > ??, C-C bond]

In [1]:
from Bio.PDB.PDBParser import PDBParser
import numpy as np
import pandas as pd
import scipy.linalg as linalg
import torch
import dgl
import os
## Shared parser instance, passed to load_pdb() for every PDB file read below.
## NOTE(review): PERMISSIVE=1 presumably makes Biopython tolerate malformed
## records instead of raising -- confirm against the Bio.PDB documentation.
p = PDBParser(PERMISSIVE=1)

## Three-letter residue name -> one-letter amino-acid code for the 20 standard
## residues. The matrix symbols B, Z, J and X have no entry here; '*' is added
## separately as the catch-all symbol in load_embd_dict's default selection.
SHORTEN_dict= {'VAL':'V', 'ILE':'I', 'LEU':'L', 'GLU':'E', 'GLN':'Q',
            'ASP':'D', 'ASN':'N', 'HIS':'H', 'TRP':'W', 'PHE':'F', 
            'TYR':'Y', 'ARG':'R', 'LYS':'K', 'SER':'S', 'THR':'T', 
            'MET':'M', 'ALA':'A', 'GLY':'G', 'PRO':'P', 'CYS':'C'}   ## BZJX*

def load_embd_dict(AA_selc = list(''.join(SHORTEN_dict.values()) + '*'), file = "BLOSUM62.txt"):
    """Build an embedding vector per symbol from a substitution-matrix file.

    The matrix is restricted to the symbols in ``AA_selc``, sorted by label on
    both axes, passed element-wise through ``2**x`` and eigendecomposed. Each
    eigenvector is sign-normalised (first component non-negative) and scaled
    by the square root of its eigenvalue.

    Args:
        AA_selc: labels to keep; every label must be both a row and a column
            of the matrix. Defaults to the 20 one-letter codes plus '*'.
        file: path of a whitespace-separated matrix whose first column holds
            the row labels and whose header holds the column labels.

    Returns:
        dict mapping each selected label (in sorted order) to a 1-D float
        array of length ``len(AA_selc)``.

    Raises:
        KeyError: if a label in ``AA_selc`` is missing from the matrix.
    """
    ## Read from the caller-supplied path rather than a hard-coded file name.
    df = pd.read_csv(file, sep="\\s+", index_col=0)
    df = df.loc[AA_selc, AA_selc]
    df = df.sort_index(axis=0).sort_index(axis=1)       ## Sort rows and cols by name

    ## NOTE(review): eigh treats the input as symmetric -- confirm the matrix is.
    w, v = linalg.eigh(np.exp2(df.to_numpy(dtype=float)))
    v = v * np.sign(v[0])              ## v[:,i] is i-th eigen vector; make its first component positive
    ## NOTE(review): a negative eigenvalue turns into NaN here -- confirm that
    ## 2**matrix is positive semi-definite for the matrices actually used.
    v = v @ np.diag(w**0.5)            ## scale v by sqrt of eigen value
    ## NOTE(review): label i is paired with column i of v (the i-th scaled
    ## eigenvector), not with row i -- confirm this is the intended embedding.
    return dict(zip(df.columns.values, v.T))

## Embedding table built once with load_embd_dict()'s defaults, so the default
## matrix file "BLOSUM62.txt" must exist in the current working directory.
EMBD_dict = load_embd_dict()


def embedAA(aa):
    """Return the embedding vector for a three-letter residue name.

    Names that have no entry in SHORTEN_dict are looked up under the
    catch-all symbol '*' in EMBD_dict.
    """
    return EMBD_dict[SHORTEN_dict.get(aa, '*')]

def load_pdb(p,id,file):
    """Read the first model of a PDB file into per-chain residue lists.

    Args:
        p: parser exposing ``get_structure(id, file)`` (this notebook passes a
            Bio.PDB ``PDBParser``).
        id: structure identifier forwarded to the parser.
        file: path of the PDB file.

    Returns:
        dict mapping each chain id to ``{'aa': [...], 'coord': [...]}``, where
        ``aa`` holds residue names and ``coord`` the coordinates of each
        residue's atom named 'C'. The two lists are index-aligned.

    Raises:
        StopIteration: if the parsed structure contains no model.
    """
    model = next(p.get_structure(id, file).get_models())  ## Get the first model from the structure
    chains = {}
    for chain in model.get_chains():           ## Usually only one chain as: 'COMPND   3 CHAIN: A'
        names, coords = [], []
        for residue in chain.get_residues():
            ## Residues without an atom named 'C' (e.g. waters, ions, ligands or
            ## truncated residues) are skipped instead of raising KeyError.
            if 'C' not in residue:
                continue
            names.append(residue.get_resname())
            coords.append(residue['C'].get_coord())
        chains[chain.get_id()] = {
            'aa': names,
            'coord': coords
        }
    return chains

def chains_to_graph(chains):
    """Turn the per-chain residue lists from load_pdb() into one DGL graph.

    Every residue becomes a node carrying ``coord`` (its stored coordinate)
    and ``aa`` (the vector returned by embedAA). Within each chain,
    consecutive residues are joined by a directed edge i-1 -> i carrying the
    feature ``'C-C' = 1``. No edge is created between different chains.

    Args:
        chains: dict of chain id -> {'aa': [...], 'coord': [...]}.

    Returns:
        A dgl graph containing the nodes of all chains, in iteration order.
    """
    g = dgl.graph([])
    for chain in chains.values():
        ## zip() keeps the two lists aligned and stops at the shorter one.
        residues = list(zip(chain['aa'], chain['coord']))
        if not residues:
            continue

        ## Node ids are global across chains, so edges must be offset by the
        ## number of nodes already in the graph, not restart at 0 per chain.
        first = g.num_nodes()
        count = len(residues)
        g.add_nodes(count, {
            'coord': torch.stack([torch.tensor(coord) for _, coord in residues]),
            'aa': torch.stack([torch.tensor(embedAA(aa)) for aa, _ in residues])
        })
        if count > 1:
            node_ids = torch.arange(first, first + count)
            g.add_edges(node_ids[:-1], node_ids[1:], {
                ## others can be S-S bonds / H bonds, etc. This time we don't calculate them
                'C-C': torch.ones(count - 1, dtype=torch.int64)
            })
    return g



In [2]:
## Collect every entry of the 'pdb' directory. os.listdir() gives no ordering
## guarantee, so which file ends up first may differ between machines.
pdb_names = os.listdir('pdb')
ids = [name.split('-model_')[0] for name in pdb_names]      ## id = file name up to '-model_'
files = [os.path.join('pdb', name) for name in pdb_names]

## Use the first file as the working example.
file, id = files[0], ids[0]

chains = load_pdb(p, id, file)
g = chains_to_graph(chains)

id, g

('AF-A0A016HVY8-F1',
 Graph(num_nodes=226, num_edges=225,
       ndata_schemes={'coord': Scheme(shape=(3,), dtype=torch.float32), 'aa': Scheme(shape=(21,), dtype=torch.float64)}
       edata_schemes={'C-C': Scheme(shape=(), dtype=torch.int64)}))

## Try Conv

https://docs.dgl.ai/en/1.1.x/api/python/nn-pytorch.html

In [3]:
from dgl.nn import GraphConv


## Sanity check: the 'coord' feature should have one row per node.
g.ndata['coord'].size(), g.num_nodes()

(torch.Size([226, 3]), 226)

In [4]:
## Smoke-test one GraphConv layer on the residue graph using dummy features.
in_feats, out_feats = 10, 2
feat = torch.ones(g.num_nodes(), in_feats)      ## constant placeholder input, one row per node

## allow_zero_in_degree=True: edges only run from residue i-1 to residue i, so
## the first node of a chain has no incoming edge.
convLayer = GraphConv(
    in_feats,
    out_feats,
    norm='both',
    weight=True,
    bias=True,
    allow_zero_in_degree=True,
)

convLayer(g, feat).size()

torch.Size([226, 2])