In [5]:
# Lookup table: three-letter residue name -> one-letter amino-acid code
# for the 20 standard amino acids.
amino_acid_codes = dict(
    ALA="A", ARG="R", ASN="N", ASP="D", CYS="C",
    GLN="Q", GLU="E", GLY="G", HIS="H", ILE="I",
    LEU="L", LYS="K", MET="M", PHE="F", PRO="P",
    SER="S", THR="T", TRP="W", TYR="Y", VAL="V",
)

In [19]:
from Bio import SeqIO

protein_id_pdb = '4qyz'


def _report_record(record):
    """Print the id, chain annotation and sequence length of one SEQRES record."""
    print("Record id %s, chain %s" % (record.id, record.annotations["chain"]))
    print(f'Sequence Length: {len(record.seq)}')


# AlphaFold model (UniProt accession Q46901): report every SEQRES record.
print('=== Alphafold Prediction ===')
alphafold_prediction_fn = 'AF-Q46901-F1-alphafold.pdb'
for af_record in SeqIO.parse(alphafold_prediction_fn, "pdb-seqres"):
    _report_record(af_record)

# Experimental structure (PDB accession 4qyz): report the first record only.
print('\n=== X-ray cryst. structure ===')
xray_crystal_structure_fn = '4qyz-pdb.pdb'
for pdb_record in SeqIO.parse(xray_crystal_structure_fn, "pdb-seqres"):
    _report_record(pdb_record)
    # Stop after the first record so `pdb_record` stays bound to it;
    # the PDBMine query cell reads pdb_record.seq later on.
    break

=== Alphafold Prediction ===
Record id XXXX:A, chain A
Sequence Length: 502

=== X-ray cryst. structure ===
Record id 4QYZ:A, chain A
Sequence Length: 502


In [41]:
import pandas as pd
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.ic_rebuild import structure_rebuild_test
import numpy as np

def get_phi_psi(protein_id, pdb_file, window_size, chain_id):
    """Extract per-residue phi/psi angles for one chain of a PDB file.

    The chain is split into consecutive, non-overlapping windows of
    ``window_size`` residues. Every residue is labelled with the one-letter
    sequence of the window it belongs to (``seq``) and its offset inside that
    window (``idx_in_seq``), so ``seq[idx_in_seq]`` is always that residue's
    own one-letter code. A trailing window shorter than ``window_size`` is
    labelled with its own (shorter) sequence.

    Args:
        protein_id: Identifier passed to ``PDBParser.get_structure``; it is
            also what ends up in the ``protein_id`` column.
        pdb_file: Path to the PDB file to parse.
        window_size: Number of residues per window (must be non-zero).
        chain_id: Id of the chain to read from the first model.

    Returns:
        A DataFrame with one row per residue and the columns ``res``, ``phi``,
        ``psi``, ``seq``, ``idx_in_seq``, ``chain``, ``pos_in_chain`` and
        ``protein_id``. ``phi``/``psi`` hold the values returned by
        ``internal_coord.get_angle`` and are NaN where the residue has no
        internal coordinates or the angle is undefined.

    Raises:
        Exception: If ``structure_rebuild_test`` does not pass for the chain.
    """
    columns = ['res', 'phi', 'psi', 'seq', 'idx_in_seq', 'chain', 'pos_in_chain', 'protein_id']

    parser = PDBParser()
    protein = parser.get_structure(protein_id, pdb_file)
    chain = protein[0][chain_id]
    chain.atom_to_internal_coordinates(verbose=True)
    resultDict = structure_rebuild_test(chain)
    if not resultDict['pass']:
        print(f"Failed {protein_id} {chain.id}")
        raise Exception('Failed to rebuild')

    residues = list(chain.get_residues())
    # Convert 3 char codes to 1 char codes once; unknown residue names map to 'X'.
    codes = [amino_acid_codes.get(r.resname, 'X') for r in residues]

    rows = []
    seq = ''
    for i, residue in enumerate(residues):
        idx_in_seq = i % window_size
        if idx_in_seq == 0:
            # Start of a new window. Slicing clamps at the end of the chain,
            # so the final full window is labelled correctly (the previous
            # `i + window_size < len(residues)` guard skipped it) and a short
            # trailing window gets its own label instead of a stale one.
            seq = ''.join(codes[i:i + window_size])

        ric = residue.internal_coord
        phi = ric.get_angle("phi") if ric else None
        psi = ric.get_angle("psi") if ric else None
        # Only a missing angle becomes NaN; a truthiness test would also
        # discard a legitimate angle of exactly 0.0.
        phi = np.nan if phi is None else phi
        psi = np.nan if psi is None else psi

        rows.append([
            codes[i], phi, psi, seq, idx_in_seq,
            chain.id, residue.get_full_id()[3][1], protein.id,
        ])

    # Build the frame once rather than growing it one row at a time.
    return pd.DataFrame(rows, columns=columns)

In [42]:
# Window length handed to get_phi_psi and, later, to the PDBMine query.
WINDOW_SIZE = 3
chain_id = 'A'

# Both frames are tagged with the PDB accession (protein_id_pdb); only the
# input file differs between the experimental and the predicted structure.
phi_psi_pdb = get_phi_psi(
    protein_id=protein_id_pdb,
    pdb_file=xray_crystal_structure_fn,
    window_size=WINDOW_SIZE,
    chain_id=chain_id,
)
phi_psi_af = get_phi_psi(
    protein_id=protein_id_pdb,
    pdb_file=alphafold_prediction_fn,
    window_size=WINDOW_SIZE,
    chain_id=chain_id,
)



chain break at GLU  137  due to MaxPeptideBond (1.4 angstroms) exceeded
chain break at ASN  323  due to MaxPeptideBond (1.4 angstroms) exceeded
chain break at LEU  358  due to MaxPeptideBond (1.4 angstroms) exceeded


In [None]:
from dotenv import load_dotenv
import os
import requests
import time
import pandas as pd
load_dotenv()
PDBMINE_URL = os.getenv("PDBMINE_URL")
if not PDBMINE_URL:
    # Fail with a clear message instead of a TypeError on `None + str` below.
    raise RuntimeError("PDBMINE_URL is not set; define it in the environment or in a .env file")

REQUEST_TIMEOUT_S = 60    # per-request timeout so a dead server cannot hang the kernel
POLL_INTERVAL_S = 15      # pause between polls of the query result
MAX_POLL_ATTEMPTS = 240   # give up after about an hour; raise this for longer-running queries

# Submit the sequence of the record selected in the SEQRES cell (pdb_record).
residue_chain = str(pdb_record.seq)
code_length = 1
response = requests.post(
    PDBMINE_URL + '/v1/api/query',
    json={
        "residueChain": residue_chain,
        "codeLength": code_length,
        "windowSize": WINDOW_SIZE
    },
    timeout=REQUEST_TIMEOUT_S,
)
# Raises requests.HTTPError carrying the status code, and is not stripped
# under `python -O` the way an assert would be.
response.raise_for_status()
submission = response.json()
print(submission)
query_id = submission.get('queryID')
if not query_id:
    raise RuntimeError(f"PDBMine response did not contain a queryID: {submission}")

# Poll for the result. A non-OK response is treated as "not ready yet" and
# retried, but only a bounded number of times so that a permanently failing
# query cannot loop forever.
for _attempt in range(MAX_POLL_ATTEMPTS):
    response = requests.get(PDBMINE_URL + f'/v1/api/query/{query_id}', timeout=REQUEST_TIMEOUT_S)
    if response.ok:
        matches = response.json()['frames']
        break
    print('waiting')
    time.sleep(POLL_INTERVAL_S)
else:
    raise TimeoutError(
        f"PDBMine query {query_id} returned no result after {MAX_POLL_ATTEMPTS} polls "
        f"(last HTTP status {response.status_code})"
    )

In [44]:
# One-off re-check of the PDBMine query result, without the polling loop.
# The timeout keeps an unreachable server from blocking the kernel.
response = requests.get(PDBMINE_URL + f'/v1/api/query/{query_id}', timeout=60)
if response.ok:
    matches = response.json()['frames']
else:
    # Report the failure explicitly; otherwise `matches` silently keeps its
    # previous value (or stays undefined) and later cells run on stale data.
    print(f'Query {query_id} not available (HTTP {response.status_code}); `matches` was not updated')

ConnectionError: HTTPConnectionPool(host='10.173.98.74', port=8077): Max retries exceeded with url: /v1/api/query/3a81b296-ea76-11ee-b41b-0242ac110002 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x73c7338189a0>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [None]:
from tqdm import tqdm

# Flatten the PDBMine result into one row per residue of every matched window.
# Rows are collected in a plain list and the frame is built once at the end;
# appending with `.loc[len(df)]` gets slower with every row added.
mined_columns = ['res', 'phi', 'psi', 'seq', 'idx_in_seq', 'chain', 'protein_id']
mined_rows = []
for seq_win, v in tqdm(matches.items()):
    # Drops the first four characters of the key, presumably a fixed-width
    # prefix in front of the window's residue letters (TODO confirm against the API).
    seq = seq_win[4:]
    for protein, seq_matches in v.items():
        # Keys look like '<protein id>_<chain>'.
        protein_id, chain = protein.split('_')
        for seq_match in seq_matches:
            # get phi and psi of each residue in window for each match
            for i, match in enumerate(seq_match):
                res, phi, psi = match['residueName'], match['phi'], match['psi']
                mined_rows.append([res, phi, psi, seq, i, chain, protein_id])

phi_psi_mined = pd.DataFrame(mined_rows, columns=mined_columns)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 3x3 grid: one panel per residue type, showing the phi/psi density of the
# PDBMine matches with the residues of the PDB structure overlaid as points.
fig, axes = plt.subplots(3, 3, figsize=(15, 15))

m = 0  # offset into the unique residue types; nine are drawn starting here
residue_keys = phi_psi_pdb.res.unique()[m:m+9]
for i, (ax, key) in enumerate(zip(axes.flatten(), residue_keys)):
    phi_psi_alpha = phi_psi_pdb[phi_psi_pdb.res == key]

    # Matches of this residue type, leaving out the query protein itself.
    same_residue = phi_psi_mined.res == key
    other_protein = phi_psi_mined.protein_id != protein_id_pdb
    phi_psi_dist = phi_psi_mined[same_residue & other_protein]

    sns.kdeplot(data=phi_psi_dist, x='phi', y='psi', ax=ax, fill=True)
    ax.scatter(
        phi_psi_alpha['phi'],
        phi_psi_alpha['psi'],
        label=f'Res in {protein_id_pdb} from PDB',
        color='red',
        marker='o',
        zorder=10,
    )

    ax.set_title(f'Residue {key}')
    # Keep every second tick to reduce clutter.
    ax.set_xticks(ax.get_xticks()[::2])
    ax.set_yticks(ax.get_yticks()[::2])

    # Axis labels only on the bottom row and the left column.
    in_bottom_row = i > 5
    in_left_column = i % 3 == 0
    ax.set_xlabel('phi' if in_bottom_row else '')
    ax.set_ylabel('psi' if in_left_column else '')

    # A single legend, attached to the bottom-centre panel.
    if i == 7:
        ax.legend(loc=(0.25, -0.15))

fig.suptitle(f'Distribution of phi, psi angles for the residues in {protein_id_pdb}')
plt.tight_layout()