<a href="https://colab.research.google.com/github/MicheleBonus/cpclab_notebooks/blob/main/universal_residue_mapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# @title Run this code cell first
import requests
import pandas as pd
from collections import defaultdict

def fetch_data_from_url(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the decoded JSON payload.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang the notebook cell.

    Returns:
        The parsed JSON body, or ``None`` when the request fails (connection
        error, timeout, or non-2xx status). The failure is reported on
        stdout rather than raised, so callers must handle ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching data: {e}")
        # Best-effort contract: signal failure with an explicit None.
        return None

def get_residue_info_from_entry(pdb_id):
    """Return the PDBe residue-listing JSON for the entry *pdb_id*.

    The result is whatever ``fetch_data_from_url`` returns for the
    residue_listing endpoint of the given entry.
    """
    return fetch_data_from_url(
        f"https://www.ebi.ac.uk/pdbe/api/pdb/entry/residue_listing/{pdb_id}"
    )

def get_uniprot_mappings_from_entry(pdb_id):
    """Return the PDBe UniProt-mappings JSON for the entry *pdb_id*.

    The result is whatever ``fetch_data_from_url`` returns for the
    graph-api ``mappings/uniprot`` endpoint of the given entry.
    """
    return fetch_data_from_url(
        f"https://www.ebi.ac.uk/pdbe/graph-api/mappings/uniprot/{pdb_id}"
    )

def parse_residue_info(residue_info_json):
    """Flatten a residue-listing JSON document into one dict per residue.

    Args:
        residue_info_json: Mapping of PDB ID -> entry data, where each entry
            has ``'molecules'`` -> ``'chains'`` -> ``'residues'`` lists.
            ``None`` or an empty mapping (e.g. after a failed download) is
            accepted and yields an empty list.

    Returns:
        A list of dicts, one per residue, carrying the PDB ID, entity ID,
        both chain identifiers, both residue numbers, the author insertion
        code, and the author residue number concatenated with the insertion
        code.
    """
    residue_info = []

    # A failed fetch hands us None; treat that as "no residues" instead of
    # crashing on ``None.items()``.
    if not residue_info_json:
        return residue_info

    for pdb_id, pdb_info in residue_info_json.items():
        for molecule in pdb_info['molecules']:
            for chain in molecule['chains']:
                for residue in chain['residues']:
                    author_number = residue['author_residue_number']
                    insertion_code = residue.get('author_insertion_code')
                    residue_info.append({
                        'PDB ID': pdb_id,
                        'PDB Entity ID': molecule['entity_id'],
                        'Chain ID (PDB)': chain['struct_asym_id'],
                        'Chain ID (Author)': chain['chain_id'],
                        'Residue Number (PDB)': residue['residue_number'],
                        'Residue Number (Author)': author_number,
                        'Insertion Code (Author)': insertion_code,
                        # A missing/None insertion code contributes nothing
                        # to the combined label (str + None would raise).
                        'Residue Number and Insertion Code (Author)':
                            f"{author_number}{insertion_code or ''}",
                    })

    return residue_info

def parse_uniprot_mappings(uniprot_mappings_json):
    """Index UniProt mapping segments by ``(pdb_id, entity_id, chain_id)``.

    Args:
        uniprot_mappings_json: Mapping of PDB ID -> entry data, where
            ``entry['UniProt']`` maps each UniProt accession to a dict with a
            ``'name'`` and a list of ``'mappings'``. ``None`` or an empty
            mapping (e.g. after a failed download) yields an empty result.

    Returns:
        A ``defaultdict(dict)`` keyed by ``(pdb_id, entity_id, chain_id)``.
        Each value holds the flat fields ``'UniProt ID'``, ``'UniProt Name'``,
        ``'UniProt Start'``, ``'UniProt End'``, ``'PDB Start'`` and
        ``'PDB End'`` of the *last* mapping seen for that key (the historic
        shape of this structure), plus a ``'Segments'`` list containing one
        dict with those same six fields for *every* mapping of that key, in
        input order. The list preserves segments that the flat fields would
        otherwise overwrite when a chain has several mapped ranges or maps
        to more than one accession.
    """
    uniprot_mappings = defaultdict(dict)

    # A failed fetch hands us None; treat that as "no mappings" instead of
    # crashing on ``None.items()``.
    if not uniprot_mappings_json:
        return uniprot_mappings

    for pdb_id, pdb_info in uniprot_mappings_json.items():
        uniprot_data = pdb_info.get('UniProt', {})
        for uniprot_accession, uniprot_entry in uniprot_data.items():
            uniprot_name = uniprot_entry.get('name')

            for mapping in uniprot_entry.get('mappings', []):
                segment = {
                    'UniProt ID': uniprot_accession,
                    'UniProt Name': uniprot_name,
                    'UniProt Start': mapping['unp_start'],
                    'UniProt End': mapping['unp_end'],
                    'PDB Start': mapping['start'].get('residue_number'),
                    'PDB End': mapping['end'].get('residue_number'),
                }

                key = (pdb_id, mapping['entity_id'], mapping['chain_id'])
                entry = uniprot_mappings[key]

                # Record every segment; the flat fields below only ever
                # reflect the most recent one.
                entry.setdefault('Segments', []).append(segment)
                entry.update(segment)

    return uniprot_mappings

def combine_residue_info_and_uniprot_mappings(residue_info, uniprot_mappings):
    """Annotate each residue dict with its UniProt residue number, ID and name.

    Args:
        residue_info: Iterable of residue dicts containing at least
            ``'PDB ID'``, ``'PDB Entity ID'``, ``'Chain ID (Author)'``,
            ``'Residue Number (PDB)'`` and
            ``'Residue Number and Insertion Code (Author)'``.
        uniprot_mappings: Mapping keyed by
            ``(PDB ID, PDB Entity ID, Chain ID (Author))``. Each value is a
            dict with ``'UniProt ID'``, ``'UniProt Name'``,
            ``'UniProt Start'``, ``'PDB Start'`` and ``'PDB End'``. If the
            value also has a non-empty ``'Segments'`` list of dicts with
            those same fields, every segment is tried in order; otherwise
            the value itself is treated as the only segment.

    Returns:
        A new list of dicts. Each is a copy of the input residue with
        ``'Residue Number and Insertion Code (Author)'`` moved after the
        other original keys, followed by ``'Residue Number (UniProt)'``,
        ``'UniProt ID'`` and ``'UniProt Name'``. The three UniProt fields
        are ``None`` when no segment covers the residue.
    """
    label_key = 'Residue Number and Insertion Code (Author)'
    combined_info = []

    for residue in residue_info:
        # Create key for uniprot_mappings lookup
        key = (residue['PDB ID'], residue['PDB Entity ID'], residue['Chain ID (Author)'])

        # UniProt fields stay None unless a covering segment is found
        uniprot_id = None
        uniprot_name = None
        uniprot_residue_number = None

        uniprot_info = uniprot_mappings.get(key)
        if uniprot_info is not None:
            pdb_residue_number = residue['Residue Number (PDB)']
            segments = uniprot_info.get('Segments') or [uniprot_info]

            for segment in segments:
                pdb_start = segment.get('PDB Start')
                pdb_end = segment.get('PDB End')
                # A segment without both bounds cannot be range-checked
                # (None <= int raises TypeError), so skip it.
                if pdb_start is None or pdb_end is None:
                    continue

                if pdb_start <= pdb_residue_number <= pdb_end:
                    # Offset within the segment carries over to UniProt numbering
                    uniprot_residue_number = (
                        segment['UniProt Start'] + (pdb_residue_number - pdb_start)
                    )
                    uniprot_id = segment['UniProt ID']
                    uniprot_name = segment['UniProt Name']
                    break

        # Copy the residue, moving the combined label behind the other
        # original columns so the output column order is stable.
        new_residue = dict(residue)
        new_residue[label_key] = new_residue.pop(label_key)
        new_residue['Residue Number (UniProt)'] = uniprot_residue_number
        new_residue['UniProt ID'] = uniprot_id
        new_residue['UniProt Name'] = uniprot_name

        combined_info.append(new_residue)

    return combined_info

def combine_residue_info_and_uniprot_mappings_to_df(combined_info):
    """Build a DataFrame from *combined_info* with nullable integer columns.

    Every column that pandas infers as ``int64`` or ``float64`` is cast to
    the nullable ``Int64`` dtype, so integer columns containing missing
    values (which pandas would otherwise hold as floats) display as
    integers with ``<NA>`` placeholders.
    """
    frame = pd.DataFrame(combined_info)

    # Pick out all plain int/float columns in one pass, then cast each one.
    numeric_columns = frame.select_dtypes(include=['int64', 'float64']).columns
    for column in numeric_columns:
        frame[column] = frame[column].astype('Int64')

    return frame

def get_combined_residue_info_and_uniprot_mappings(pdb_id):
    """Download, parse and merge residue and UniProt data for *pdb_id*.

    Runs the full pipeline: fetch and parse the residue listing, fetch and
    parse the UniProt mappings, combine the two per residue, and return the
    result as a DataFrame.
    """
    # Residue listing first, then UniProt mappings (same request order as before).
    residues = parse_residue_info(get_residue_info_from_entry(pdb_id))
    mappings_by_chain = parse_uniprot_mappings(get_uniprot_mappings_from_entry(pdb_id))

    merged = combine_residue_info_and_uniprot_mappings(residues, mappings_by_chain)
    return combine_residue_info_and_uniprot_mappings_to_df(merged)


In [11]:
# Example: build the residue/UniProt table for one entry and preview it.
pdb_id = "1A2C"
# The ID is lower-cased before being passed to the lookup.
mappings = get_combined_residue_info_and_uniprot_mappings(pdb_id.lower())
# Show the first ten rows as the cell output.
mappings.head(10)

Unnamed: 0,PDB ID,PDB Entity ID,Chain ID (PDB),Chain ID (Author),Residue Number (PDB),Residue Number (Author),Insertion Code (Author),Residue Number and Insertion Code (Author),Residue Number (UniProt),UniProt ID,UniProt Name
0,1a2c,1,A,L,24,14,B,14B,351,P00734,THRB_HUMAN
1,1a2c,1,A,L,25,14,C,14C,352,P00734,THRB_HUMAN
2,1a2c,1,A,L,26,14,D,14D,353,P00734,THRB_HUMAN
3,1a2c,1,A,L,27,14,E,14E,354,P00734,THRB_HUMAN
4,1a2c,1,A,L,28,14,F,14F,355,P00734,THRB_HUMAN
5,1a2c,1,A,L,29,14,G,14G,356,P00734,THRB_HUMAN
6,1a2c,1,A,L,30,14,H,14H,357,P00734,THRB_HUMAN
7,1a2c,1,A,L,31,14,I,14I,358,P00734,THRB_HUMAN
8,1a2c,1,A,L,32,14,J,14J,359,P00734,THRB_HUMAN
9,1a2c,1,A,L,33,14,K,14K,360,P00734,THRB_HUMAN
