In [77]:
import Bio
from lib import DihedralAdherence
import os
from lib.constants import AMINO_ACID_CODES
from Bio import PDB
from Bio.Align import PairwiseAligner

In [78]:
INCLUSION_RADIUS = 15 # Angstroms
THRESHOLDS = [0.5, 1, 2, 4] # Angstroms

In [79]:
PDBMINE_URL = os.getenv("PDBMINE_URL")
PROJECT_DIR = 'casp_da'
proteins = [
  'T1024', 'T1030', 'T1030-D2', 'T1024-D1', 'T1032-D1', 'T1053-D1', 'T1027-D1', 'T1029-D1',
  'T1025-D1', 'T1028-D1', 'T1030-D1', 'T1053-D2', 'T1057-D1','T1058-D1', 'T1058-D2'
]
da = DihedralAdherence(proteins[1], [4,5,6,7], PDBMINE_URL, PROJECT_DIR, kdews=[1,32,64,128], 
                      mode='ml', weights_file='ml_runs/best_model-kde_16-32_383.pt', device='cpu')

Initializing T1030 ...
Results already exist
Casp ID: T1030 	PDB: 6poo
Structure exists: 'pdb/pdb6poo.ent' 


UniProt ID: Q8DWZ6


In [84]:
da.xray_phi_psi

Unnamed: 0,pos,seq_ctxt,res,phi,psi,protein_id
0,3,DQELGKQ,L,-36.030187,-59.888005,6poo
1,4,QELGKQS,G,-39.019123,-48.178986,6poo
2,5,ELGKQSR,K,-78.704852,-45.558103,6poo
3,6,LGKQSRR,Q,-64.760956,-42.420846,6poo
4,7,GKQSRRS,S,-65.489395,-45.226074,6poo
...,...,...,...,...,...,...
262,265,LQDLTRG,L,-75.336014,-26.474826,6poo
263,266,QDLTRGT,T,-103.125925,2.885212,6poo
264,267,DLTRGTK,R,-62.648810,124.920915,6poo
265,268,LTRGTKE,G,92.642641,-34.052816,6poo


In [74]:
Path(da.xray_fn).open().readlines()

['HEADER    IMMUNE SYSTEM                           04-JUL-19   6POO              \n',
 'TITLE     NOVEL STRUCTURE OF THE N-TERMINAL HELICAL DOMAIN OF BIBA, A GROUP B   \n',
 'TITLE    2 STREPTOCOCCUS IMMUNOGENIC BACTERIAL ADHESIN                          \n',
 'COMPND    MOL_ID: 1;                                                            \n',
 'COMPND   2 MOLECULE: BIBA;                                                      \n',
 'COMPND   3 CHAIN: A;                                                            \n',
 'COMPND   4 SYNONYM: PUTATIVE CELL-WALL ANCHORED SURFACE ADHESIN;                \n',
 'COMPND   5 ENGINEERED: YES                                                      \n',
 'SOURCE    MOL_ID: 1;                                                            \n',
 'SOURCE   2 ORGANISM_SCIENTIFIC: STREPTOCOCCUS AGALACTIAE;                       \n',
 'SOURCE   3 ORGANISM_TAXID: 1311;                                                \n',
 'SOURCE   4 GENE: GBS2018;                

In [31]:
parser = PDB.PDBParser()
xray_structure = parser.get_structure(da.casp_protein_id, da.xray_fn)
pred_structure = parser.get_structure(da.alphafold_id, da.predictions_dir / da.alphafold_id)

Exception ignored.
Some atoms or residues may be missing in the data structure.


In [58]:
chainA = next(iter((xray_structure[0].get_chains())))
chainB = next(iter((pred_structure[0].get_chains())))

In [75]:
[(i,r.resname,AMINO_ACID_CODES.get(r.resname, 'X')) for i,r in enumerate(chainA.get_residues()) if AMINO_ACID_CODES.get(r.resname, 'X') == 'X']

[(62, 'MSE', 'X'), (134, 'MSE', 'X'), (159, 'MSE', 'X'), (251, 'MSE', 'X')]

In [67]:
[(r.resname,AMINO_ACID_CODES.get(r.resname, 'X')) for r in chainB.get_residues() if AMINO_ACID_CODES.get(r.resname, 'X') == 'M']

[('MET', 'M'), ('MET', 'M'), ('MET', 'M'), ('MET', 'M')]

In [33]:
# align
residuesA = ''.join([AMINO_ACID_CODES.get(r.resname, 'X') for r in chainA.get_residues()])
residuesB = ''.join([AMINO_ACID_CODES.get(r.resname, 'X') for r in chainB.get_residues()])

In [41]:
alignments[0].aligned

array([[[  0,  62],
        [ 63, 134],
        [135, 159],
        [160, 251],
        [252, 273]],

       [[  0,  62],
        [ 63, 134],
        [135, 159],
        [160, 251],
        [252, 273]]])

In [57]:
aligner = PairwiseAligner()
aligner.mode = 'global'
alignments =  aligner.align(residuesA, residuesB)
print(alignments[0])
aligned = alignments[0].aligned

# remove breaks that occur due to X in the sequence
for i in range(aligned.shape[1]):
    (t1,t2), (q1,q2) = aligned[:,i]
    # check if next residue is X
    print(t1, t2, q1, q2)
    if i < aligned.shape[1] and (residuesA[t2] == 'X' or residuesB[q2] == 'X'): 
        print(residuesA[t2], residuesB[q2])
        # check if next residues are same
        if (residuesA[t2+1] == residuesB[q2+1]):
            print(residuesA[t2+1], residuesB[q2+1])
# for i,((t1,t2),(q1,q2)) in enumerate(zip(*alignments[0].aligned)):
#     print(i, t1, t2, q1, q2)
#     print(residuesA[t1], residuesB[q1])
#     print(residuesA[t2], residuesB[q2])
#     print(residuesA[t2+1], residuesB[q2+1])

target            0 DQELGKQSRRSQDIIKSLGFLSSDQKDILVKSISSSKDSQLILKFVTQATQLNNAESTKA
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query             0 DQELGKQSRRSQDIIKSLGFLSSDQKDILVKSISSSKDSQLILKFVTQATQLNNAESTKA

target           60 KQX-AQNDVALIKNISPEVLEEYKEKIQRASTKSQVDEFVAEAKKVVNSNKETLVNQANG
                 60 ||--||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            60 KQ-MAQNDVALIKNISPEVLEEYKEKIQRASTKSQVDEFVAEAKKVVNSNKETLVNQANG

target          119 KKQEIAKLENLSNDEX-LRYNTAIDNVVKQYNEGKLNITAAX-NALNSIKQAAQEVAQKN
                120 |||||||||||||||--||||||||||||||||||||||||--|||||||||||||||||
query           119 KKQEIAKLENLSNDE-MLRYNTAIDNVVKQYNEGKLNITAA-MNALNSIKQAAQEVAQKN

target          177 LQKQYAKKIERISSKGLALSKKAKEIYEKHKSILPTPGYYADSVGTYLNRFRDKQTFGNR
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           177 LQKQYAKKIERISSKGLALSKKAKEIYEKHKSILPTPGYYADSVGTYLNRFRDKQTFGNR

target          237 SVWT

IndexError: string index out of range