# Structural Assessment of Latent Space
We want to test whether the latent space has any structural ordering. This is rather complicated because it requires finding or predicting 3D structures of short(ish) peptides from their sequences. We first write code to check the PDB for an existing experimental structure. We then need to interface with a prediction tool: there are several prediction servers for short peptides, and AlphaFold2 or similar for longer peptides. Ultimately, it seems we would want a function that queries structural similarity "around" a specific point in the latent space.

In [129]:
from pypdb import *
import Bio
from Bio.PDB import PDBList
import mdtraj as md
import nglview as nv
import pdb
import time

  and should_run_async(code)


In [None]:
class Peptide:
    """Early draft of a peptide record: a sequence plus a structure placeholder.

    NOTE(review): a fuller class with the same name is defined further down
    in this notebook and replaces this one once that cell is run.
    """

    def __init__(self,seq):
        # Sequence string used to build the PDB sequence query.
        self.sequence = seq
        # Placeholder for a structure trajectory; never assigned in this draft.
        self.structuretraj = None

    def strucFromPDB(self):
        """Build a PDB sequence query for this peptide.

        Draft only: the query object is created but the search is not run
        and nothing is stored or returned.
        """
        # query_type is a keyword argument of Query (as in the demo query cell
        # below), not a function; the original called an undefined name.
        q = Query(self.sequence,query_type="sequence",return_type="polymer_entity")

### Access sequence data

In [2]:
# Path to the text file of peptide sequences, one sequence per line.
data = './data/peptides/datasets/starpep_maxlength100_dataset/peptide_combined_no_shuff.txt'
# Use a context manager so the file handle is closed as soon as the lines
# have been read (the original left it open for the life of the kernel).
with open(data,'r') as seqf:
    dirtyseqs = seqf.readlines()
# Strip the trailing newline / whitespace from every line.
seqs = [seq.rstrip() for seq in dirtyseqs]

  and should_run_async(code)


### Demo query PDB for existence of experimental structure

In [98]:
# Search the PDB for polymer entities matching the first sequence in the
# dataset; `q` is the query object and `sq` the raw search response.
first_sequence = seqs[0]
q = Query(
    first_sequence,
    query_type="sequence",
    return_type="polymer_entity",
)
sq = q.search()

  and should_run_async(code)


In [4]:
#get all sequence/entity IDs with 100% match
# The Peptide class later in this notebook guards against the search result
# being None, so do the same here instead of failing on sq['result_set'].
hits = sq['result_set'] if sq is not None else []
pdbids = [
    result['identifier']
    for result in hits
    # sequence_identity of 1.0 means the aligned region matches exactly
    if result['services'][0]['nodes'][0]['match_context'][0]['sequence_identity'] == 1.0
]
print(pdbids)

['1BH1_1', '2MW6_1', '6DST_1', '2MLT_1', '3QRX_2', '6O4M_2']


  and should_run_async(code)


In [20]:
#download structures from PDB
pdbl = PDBList()
for i in pdbids:
    # Identifiers look like '1BH1_1'; the part before '_' is the PDB code.
    pdb_code = i.split('_')[0]
    pdb_file = get_pdb_file(pdb_code,filetype='pdb',compression=False)
    if pdb_file is None:
        # NOTE(review): pypdb appears to return None when the download
        # fails -- confirm; skipping avoids a TypeError in write() below.
        print('could not download {}'.format(pdb_code))
        continue
    # Context manager guarantees the file is flushed and closed even if
    # the write raises.
    with open(pdb_code+'.pdb','w') as newf:
        newf.write(pdb_file)

  and should_run_async(code)


Sending GET request to https://files.rcsb.org/download/1BH1.pdb to fetch 1BH1's pdb file as a string.




Sending GET request to https://files.rcsb.org/download/2MW6.pdb to fetch 2MW6's pdb file as a string.




Sending GET request to https://files.rcsb.org/download/6DST.pdb to fetch 6DST's pdb file as a string.




Sending GET request to https://files.rcsb.org/download/2MLT.pdb to fetch 2MLT's pdb file as a string.




Sending GET request to https://files.rcsb.org/download/3QRX.pdb to fetch 3QRX's pdb file as a string.




Sending GET request to https://files.rcsb.org/download/6O4M.pdb to fetch 6O4M's pdb file as a string.


In [21]:
# Load the 3QRX.pdb file written by the download cell above with mdtraj.
t = md.load('3QRX.pdb')

  and should_run_async(code)


In [22]:
# Build an nglview widget from the loaded trajectory; the bare `view`
# expression makes the notebook render it as the cell output.
view = nv.show_mdtraj(t)
view

  and should_run_async(code)


NGLWidget()

In [55]:
# String labels (e.g. 'ILE2-CA') for every atom in chain index 1 of `t`.
ats = list(map(str, t.topology.chain(1).atoms))

  and should_run_async(code)


In [56]:
ats

  and should_run_async(code)


['ILE2-N',
 'ILE2-CA',
 'ILE2-C',
 'ILE2-O',
 'ILE2-CB',
 'GLY3-N',
 'GLY3-CA',
 'GLY3-C',
 'GLY3-O',
 'ALA4-N',
 'ALA4-CA',
 'ALA4-C',
 'ALA4-O',
 'ALA4-CB',
 'VAL5-N',
 'VAL5-CA',
 'VAL5-C',
 'VAL5-O',
 'VAL5-CB',
 'VAL5-CG1',
 'VAL5-CG2',
 'LEU6-N',
 'LEU6-CA',
 'LEU6-C',
 'LEU6-O',
 'LEU6-CB',
 'LEU6-CG',
 'LEU6-CD1',
 'LEU6-CD2',
 'LYS7-N',
 'LYS7-CA',
 'LYS7-C',
 'LYS7-O',
 'LYS7-CB',
 'VAL8-N',
 'VAL8-CA',
 'VAL8-C',
 'VAL8-O',
 'VAL8-CB',
 'VAL8-CG1',
 'VAL8-CG2',
 'LEU9-N',
 'LEU9-CA',
 'LEU9-C',
 'LEU9-O',
 'LEU9-CB',
 'LEU9-CG',
 'LEU9-CD1',
 'LEU9-CD2',
 'THR10-N',
 'THR10-CA',
 'THR10-C',
 'THR10-O',
 'THR10-CB',
 'THR10-OG1',
 'THR10-CG2',
 'THR11-N',
 'THR11-CA',
 'THR11-C',
 'THR11-O',
 'THR11-CB',
 'THR11-OG1',
 'THR11-CG2',
 'GLY12-N',
 'GLY12-CA',
 'GLY12-C',
 'GLY12-O',
 'LEU13-N',
 'LEU13-CA',
 'LEU13-C',
 'LEU13-O',
 'LEU13-CB',
 'LEU13-CG',
 'LEU13-CD1',
 'LEU13-CD2',
 'PRO14-N',
 'PRO14-CA',
 'PRO14-C',
 'PRO14-O',
 'PRO14-CB',
 'PRO14-CG',
 'PRO14-CD',
 'ALA15-

In [64]:
# Atom indices matched by the selection string 'chainid 2' in `t`.
t.topology.select('chainid 2')

  and should_run_async(code)


array([1293, 1294, 1295, 1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303,
       1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314,
       1315, 1316, 1317, 1318, 1319, 1320, 1321, 1322, 1323, 1324, 1325,
       1326, 1327])

In [41]:
len(seqs[0])

  and should_run_async(code)


26

In [87]:
import pdb


  and should_run_async(code)


In [88]:
pdbids

  and should_run_async(code)


['1BH1_1', '2MW6_1', '6DST_1', '2MLT_1', '3QRX_2', '6O4M_2']

In [91]:
# Pull chain index 0 out of 1BH1.pdb for the first sequence; the result is
# None when extractStructure rejects the chain.
# NOTE: extractStructure is defined in a cell further down the notebook.
t = extractStructure(seqs[0],'1BH1.pdb',0)

  and should_run_async(code)


<mdtraj.Trajectory with 20 frames, 436 atoms, 27 residues, without unitcells>


In [92]:
print(t)

<mdtraj.Trajectory with 20 frames, 436 atoms, 27 residues, without unitcells>


  and should_run_async(code)


In [93]:
# Same extraction for 6O4M.pdb, this time using chain index 1
# (the matching search hit listed above is '6O4M_2').
t = extractStructure(seqs[0],'6O4M.pdb',1)

<mdtraj.Trajectory with 1 frames, 1948 atoms, 276 residues, and unitcells>


  and should_run_async(code)


In [94]:
print(t)

<mdtraj.Trajectory with 1 frames, 436 atoms, 27 residues, and unitcells>


  and should_run_async(code)


In [191]:
# Create a Peptide class with a query feature that checks the PDB for
# experimental structures, downloads whatever it finds, and stores the
# structures that match the full sequence as trajectories on the peptide.
# extractStructure (below) is the helper that pulls one chain from a PDB file.
def extractStructure(sequence,pdbfile,chainid):
    """
    From a downloaded pdb file and chain index, pull an md trajectory if the
    chain matches the full sequence.

    The original check compared only the *lengths* of the two sequences, so a
    same-length chain with different residues was accepted. The chain's
    sequence is now compared with `sequence` directly.

    Inputs

    sequence : string
        expected one-letter sequence of the chain
    pdbfile : string
        path of a pdb file readable by md.load
    chainid : int
        chain index handed to Topology.to_fasta and to the 'chainid'
        selection (callers in this file pass the PDB entity number minus one;
        NOTE(review): entity number and chain index need not coincide when an
        entity has several chains -- confirm)

    Returns:

    traj: mdtraj object or None
        the protein atoms of the chain, or None when the chain's sequence
        differs from `sequence`
    """
    p = md.load(pdbfile)
    fasta = p.topology.to_fasta(chainid)
    if fasta != sequence:
        return None
    # Keep only the protein atoms belonging to the requested chain.
    chain_atoms = p.topology.select('protein and chainid {}'.format(chainid))
    return p.atom_slice(chain_atoms)

class Peptide:
    """A peptide sequence plus any experimental structures found for it in the PDB."""

    def __init__(self,seq):
        # Sequence string used for the PDB sequence search.
        self.sequence = seq
        # Search-hit identifiers with sequence identity 1.0 (e.g. '1BH1_1');
        # filled in by hasExperimentalStructure.
        self.pdbids = []
        # List of trajectories returned by extractStructure, or None until
        # strucFromPDB has stored at least one.
        self.strucstable = None
        # Not assigned anywhere in this class; presumably reserved for
        # predicted structures -- TODO confirm.
        self.strucpredict = None

    def __str__(self):
        return self.sequence

    @staticmethod
    def _splitEntityId(identifier):
        """Split a hit identifier such as '3QRX_2' into ('3QRX', 1).

        The second element is the number after '_' minus one, which
        strucFromPDB passes to extractStructure as the chain index.
        """
        parts = identifier.split('_')
        return parts[0], int(parts[1]) - 1

    def hasExperimentalStructure(self):
        """Return True if the PDB search yields a hit with sequence identity 1.0.

        Hits are cached in self.pdbids, so the search is skipped once at
        least one hit is known. An empty result is not cached: the search is
        repeated on the next call.
        """
        if self.pdbids:
            return True
        q = Query(self.sequence,query_type="sequence",return_type="polymer_entity")
        sq = q.search()
        # The search result may be None; treat that as "no hits".
        hits = sq['result_set'] if sq is not None else []
        self.pdbids = [
            result['identifier']
            for result in hits
            if result['services'][0]['nodes'][0]['match_context'][0]['sequence_identity'] == 1.0
        ]
        return bool(self.pdbids)

    def strucFromPDB(self):
        """Download every matching PDB entry and store the chains that match.

        Each entry is written to '<code>.pdb' in the working directory and
        passed to extractStructure. Only non-None results are appended to
        self.strucstable (the original also appended None for rejected
        chains). self.strucstable stays None if nothing matched.
        """
        if not self.hasExperimentalStructure():
            return
        for identifier in self.pdbids:
            pdb_code, chainid = self._splitEntityId(identifier)
            pdb_file = get_pdb_file(pdb_code,filetype='pdb',compression=False)
            if pdb_file is None:
                # NOTE(review): pypdb appears to return None on a failed
                # download -- confirm; skip rather than crash in write().
                continue
            fname = pdb_code+'.pdb'
            # Context manager closes the file before it is read back.
            with open(fname,'w') as newf:
                newf.write(pdb_file)

            structraj = extractStructure(self.sequence,fname,chainid)
            if structraj is None:
                continue
            if self.strucstable is None:
                self.strucstable = []
            self.strucstable.append(structraj)

  and should_run_async(code)


In [192]:
# Time how long it takes to build a Peptide for the first sequence and
# ask whether it has an experimental structure.
start = time.time()
Melittin = Peptide(seqs[0])
has_structure = Melittin.hasExperimentalStructure()
print(has_structure)
end = time.time()
elapsed = end - start
print('time to establish: {}'.format(elapsed))

  and should_run_async(code)


True
time to establish: 0.2820441722869873


In [193]:
# Time construction of a Peptide alone (no structure lookup is made here).
start = time.time()
Melittin = Peptide(seqs[0])
end = time.time()
elapsed = end - start
print('time to establish: {}'.format(elapsed))

time to establish: 0.00010514259338378906


  and should_run_async(code)


In [194]:
# To compare two different structures we will need PyMOL's align, super, or
# cealign, depending on the sequence identity between them.
# https://pymolwiki.org/index.php/Align

  and should_run_async(code)


In [195]:
# One Peptide object per sequence, in the same order as `seqs`.
peptidelibrary = list(map(Peptide, seqs))

  and should_run_async(code)


In [196]:
print(peptidelibrary[10_000])

IKWEYVLLLFLL


  and should_run_async(code)


In [197]:
# Check every 100th peptide for an experimental structure, counting how
# many were checked and how many had one, and time the whole pass.
start = time.time()
checkcount = 0
structurecount = 0
sample_indices = range(0, len(seqs), 100)
for i in sample_indices:
    found = peptidelibrary[i].hasExperimentalStructure()
    structurecount += int(found)
    checkcount += 1
end = time.time()
elapsed = end - start
print('time: {}'.format(elapsed))

  and should_run_async(code)


time: 339.0192632675171


In [198]:
checkcount

  and should_run_async(code)


424

In [199]:
structurecount

  and should_run_async(code)


42