In [1]:
## Program to obtain and save UniProt to PDB sequence mappings
## Created by: Joelle Strom
## Adapts code originally created by Chop Yan Lee
## Last updated: 17.05.2024

# Import libraries
import db_utils_sqlalchemy
import numpy as np
import pandas as pd
from tqdm import tqdm
import requests

# Establish connection to MySQL server
engine = db_utils_sqlalchemy.get_connection()

In [2]:
# Read in curated DDI data set
ddi_df = pd.read_csv('/mnt/c/Users/stromjoe/Documents/projects/DDI_IF-Analysis/DDI_dataset.csv')
ddi_df.reset_index(drop=True,inplace=True)

In [6]:
# Fetch residue mappings between UniProt residue numbering and PDB residue IDs
pdb_id = []
chain_id = []
unp_start = []
unp_end = []
auth_start = []
auth_end = []
pdb_start = []
pdb_end = []
for i in tqdm(ddi_df.PDB_ID):
    pdb_id_toget = i
    # Connect to SIFTS mapping API
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id_toget.lower()}"
    response = requests.get(url)
    data = response.json()
    # From API response data, append mapping data for current PDB ID - can contain multiple chains, append each chain separately
    for k, v in data.get(pdb_id_toget.lower()).get('UniProt').items():
        mappings = v.get('mappings')
        for mapping in mappings:
            pdb_id.append(pdb_id_toget)
            chain_id.append(mapping.get('chain_id'))
            unp_start.append(mapping.get('unp_start'))
            unp_end.append(mapping.get('unp_end'))
            auth_start.append(mapping.get('start').get('author_residue_number'))
            auth_end.append(mapping.get('end').get('author_residue_number'))
            pdb_start.append(mapping.get('start').get('residue_number'))
            pdb_end.append(mapping.get('end').get('residue_number'))


unp_pd_mapp_ddi = pd.DataFrame({'pdb_id':pdb_id, 'chain_id':chain_id, 'unp_start':unp_start, 'unp_end':unp_end, 'auth_start':auth_start,
                            'auth_end':auth_end, 'pdb_start':pdb_start, 'pdb_end':pdb_end})

100%|██████████████████████████████████████████████████████████████████████████████████| 80/80 [00:20<00:00,  3.94it/s]


In [4]:
from sqlalchemy import text
# Import curated DMI data set
query = text('SELECT * FROM chopyan_db.AlphaFold_minimal_PRS_DMI_structure_info WHERE for_AF2_benchmark = 1')
dmi_df = pd.read_sql(query, con=engine)

In [7]:
# Repeat mapping fetch process for the DMI dataset
pdb_id = []
chain_id = []
unp_start = []
unp_end = []
auth_start = []
auth_end = []
pdb_start = []
pdb_end = []
for i in tqdm(dmi_df.pdb_id):
    pdb_id_toget = i
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id_toget.lower()}"
    response = requests.get(url)
    data = response.json()
    for k, v in data.get(pdb_id_toget.lower()).get('UniProt').items():
        mappings = v.get('mappings')
        for mapping in mappings:
            pdb_id.append(pdb_id_toget)
            chain_id.append(mapping.get('chain_id'))
            unp_start.append(mapping.get('unp_start'))
            unp_end.append(mapping.get('unp_end'))
            auth_start.append(mapping.get('start').get('author_residue_number'))
            auth_end.append(mapping.get('end').get('author_residue_number'))
            pdb_start.append(mapping.get('start').get('residue_number'))
            pdb_end.append(mapping.get('end').get('residue_number'))


unp_pd_mapp_dmi = pd.DataFrame({'pdb_id':pdb_id, 'chain_id':chain_id, 'unp_start':unp_start, 'unp_end':unp_end, 'auth_start':auth_start,
                            'auth_end':auth_end, 'pdb_start':pdb_start, 'pdb_end':pdb_end})

100%|████████████████████████████████████████████████████████████████████████████████| 136/136 [00:34<00:00,  3.93it/s]


In [8]:
unp_pd_mapp = pd.concat([unp_pd_mapp_ddi, unp_pd_mapp_dmi])
# Compute shifts in residue numbering between author residue numbers and UniProt numbering, and between PDB residue numbers and UniProt numbering
unp_pd_mapp['auth_shift'] = unp_pd_mapp['unp_start'] - unp_pd_mapp['auth_start'] #Author-UniProt shift based on starting residue numbers
unp_pd_mapp['shift'] = unp_pd_mapp['unp_start'] - unp_pd_mapp['pdb_start'] #PDB-UniProt shift based on starting residue numbers
unp_pd_mapp['new_pdb_start'] = unp_pd_mapp['pdb_start'] + unp_pd_mapp['shift'] #Create new 'PDB start' based on the shift
unp_pd_mapp['new_pdb_end'] = unp_pd_mapp['pdb_end'] + unp_pd_mapp['shift'] # Create new 'PDB end' based on the shift
unp_pd_mapp['new_auth_start'] = unp_pd_mapp['auth_start'] + unp_pd_mapp['auth_shift'] # Create new 'Author start' based on shift
unp_pd_mapp['new_auth_end'] = unp_pd_mapp['auth_end'] + unp_pd_mapp['auth_shift'] # Create new 'Author end' based on shift
unp_pd_mapp

Unnamed: 0,pdb_id,chain_id,unp_start,unp_end,auth_start,auth_end,pdb_start,pdb_end,auth_shift,shift,new_pdb_start,new_pdb_end,new_auth_start,new_auth_end
0,6EPF,1,1,238,,,1,238,,0,1,238,,
1,6EPF,2,1,277,,,1,277,,0,1,277,,
2,6EPF,3,1,205,1.0,205.0,1,205,0.0,0,1,205,1.0,205.0
3,6EPF,4,1,201,1.0,,1,201,0.0,0,1,201,1.0,
4,6EPF,5,1,263,,,1,263,,0,1,263,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,2JKR,S,1,142,1.0,142.0,1,142,0.0,0,1,142,1.0,142.0
448,2JKR,M,1,435,1.0,435.0,1,435,0.0,0,1,435,1.0,435.0
449,2JKR,U,1,435,1.0,435.0,1,435,0.0,0,1,435,1.0,435.0
450,2JKR,P,431,441,,,1,11,,430,431,441,,


In [9]:
# Recompute shift based on the mappings between PDB end residue and UniProt end residue to double-check that computing all shifts based on the starting residue mappings is an accurate method
unp_pd_mapp['shift2'] = unp_pd_mapp['unp_end'] - unp_pd_mapp['pdb_end']
# Find cases where shifts based on start residue mapping and shifts based on end residue mapping are NOT equal
np.where(unp_pd_mapp['shift'] != unp_pd_mapp['shift2'])

#6JWP-H, 1W1W-B, 1N4M-A, 4KMD-A, 6FUZ-A: beginning and ending shifts are not equal, must verify
# Manual check within PDB entries confirm the calculated shift - no need to change any data points
# UniProt ending from mapping database is incorrect
# Will proceed with shifts and calculations based on start residue mappings

(array([ 268,  269,  329,  339,  349,  359, 1285, 1286, 1304, 1364, 1365,
        1382], dtype=int64),)

In [10]:
# Write out mapping dataframe to csv
unp_pd_mapp.to_csv('/mnt/c/Users/stromjoe/Documents/projects/DDI_IF-Analysis/UniProt_PDB_Mapping.csv', index=False)