# Annotations

In [1]:
# Install the notebook's third-party dependencies while capturing pip's output,
# so the cell shows a single status line instead of the full installation log.
from IPython.utils import io

with io.capture_output() as captured:
    !pip install requests
    !pip install spyprot
    !pip install pandas

# Print the captured log only when it mentions an error; otherwise report success.
# NOTE(review): only captured.stdout is inspected — confirm that pip failures cannot end up solely in captured.stderr.
print(captured.stdout if ("Error" in captured.stdout) or ("ERROR" in captured.stdout) else "Installation successful")

Installation successful


In [2]:
import pandas as pd
import numpy as np
import requests, spyprot, json
from typing import Optional, List

In [3]:
# Root folder of the per-chain FASTA files, laid out as <pdb_id>/<chain_id>.fasta.
PROTEIN_DIRECTORY = "data/proteins"

# Chain table that every later cell extends with annotation columns.
chains = pd.read_csv(filepath_or_buffer="data/chains.csv")
chains

Unnamed: 0,pdb_id,label,chain_id
0,8P0E,monomer,8P0E:A
1,8PX8,monomer,8PX8:A
2,8B2E,monomer,8B2E:A
3,8HOE,monomer,8HOE:A
4,8TCE,monomer,8TCE:A
...,...,...,...
1458,8G9J,synthetic,8G9J:A
1459,8OYV,synthetic,8OYV:A
1460,8TNO,synthetic,8TNO:A
1461,8FJE,synthetic,8FJE:A


In [4]:
def get_sequence(fasta_path: str) -> str:
    """
    Read the single sequence stored in a two-line FASTA file.

    Parameters:
        fasta_path (str): path to a FASTA file whose first line is the header
                          and whose second line holds the complete sequence;
                          blank lines after the sequence are ignored

    Returns:
        the second line of the file with surrounding whitespace removed

    Raises:
        ValueError: if the file does not consist of exactly two lines
                    (header + sequence) once trailing blank lines are dropped
    """
    with open(fasta_path, "r") as file:
        lines = file.readlines()

    # Trailing empty lines carry no sequence data, so they must not make an
    # otherwise valid file fail the layout check below.
    while len(lines) > 2 and not lines[-1].strip():
        lines.pop()

    # Explicit check instead of `assert`, which is removed under `python -O`.
    if len(lines) != 2:
        raise ValueError(
            f"Expected a header line and one sequence line in {fasta_path!r}, "
            f"found {len(lines)} line(s)"
        )
    return lines[1].strip()


def get_method_of_acquisition(pdb_id: str, timeout: float = 30.0) -> Optional[str]:
    """
    Look up the experimental method of a PDB entry through the RCSB GraphQL API.

    Parameters:
        pdb_id (str): PDB entry identifier, e.g. "8P0E"
        timeout (float): seconds to wait for the server; without a timeout a
                         stalled connection would block the notebook forever

    Returns:
        the `method` of the first `exptl` record of the entry, or None when
        the request does not return status 200 or the response contains no
        such record
    """
    query = f'{{entry(entry_id:"{pdb_id}") {{exptl {{method}}}}}}'
    response = requests.get(f"https://data.rcsb.org/graphql?query={query}", timeout=timeout)
    if response.status_code != 200:
        print(f"The request failed with status code {response.status_code}")
        return None

    # Every level of the payload is treated as possibly null or absent, so an
    # unknown entry or an entry without `exptl` data yields None instead of a
    # TypeError/IndexError.
    entry = (response.json().get("data") or {}).get("entry") or {}
    exptl_records = entry.get("exptl") or []
    return exptl_records[0].get("method") if exptl_records else None
    

# Read every chain's sequence from <PROTEIN_DIRECTORY>/<pdb_id>/<chain_id>.fasta.
chains["sequence"] = [
    get_sequence(f"{PROTEIN_DIRECTORY}/{pdb_id}/{chain_id}.fasta")
    for pdb_id, chain_id in zip(chains["pdb_id"], chains["chain_id"])
]

# Column-wise string length instead of a Python call per row.
chains["sequence_length"] = chains["sequence"].str.len()

# The method is a property of the PDB entry, not of the chain, so query each
# distinct entry once and reuse the answer for all of its chains.
method_by_pdb_id = {
    pdb_id: get_method_of_acquisition(pdb_id)
    for pdb_id in chains["pdb_id"].unique()
}
chains["method_of_acquisition"] = [method_by_pdb_id[pdb_id] for pdb_id in chains["pdb_id"]]

chains

Unnamed: 0,pdb_id,label,chain_id,sequence,sequence_length,method_of_acquisition
0,8P0E,monomer,8P0E:A,MADSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGS...,190,X-RAY DIFFRACTION
1,8PX8,monomer,8PX8:A,GSMGKLSEQLKHCNGILKELLSKKHAAYAWPFYKPVDASALGLHDY...,115,X-RAY DIFFRACTION
2,8B2E,monomer,8B2E:A,LVLPGLDALQTRNALAIIAEAKKENVGPHGCQAAITTGLTESSLRI...,143,X-RAY DIFFRACTION
3,8HOE,monomer,8HOE:A,MGSSHHHHHHAFHDVPSLGQKVGAGSQKDVFHSRQDPRQCICLFRP...,189,X-RAY DIFFRACTION
4,8TCE,monomer,8TCE:A,APTENSTGVQDCYRGDGQSYRGTLSTTITGRTCQSWSSMTPHWHRR...,94,X-RAY DIFFRACTION
...,...,...,...,...,...,...
1458,8G9J,synthetic,8G9J:A,HHHHHHSGSGENLYFQGSGSEEIVEEAETALKALLEEAEKGGKEDA...,223,X-RAY DIFFRACTION
1459,8OYV,synthetic,8OYV:A,MNEEKREELLEEAKRLLEESLKLLKQAYNTPIEIDLPISGGVKAIL...,195,X-RAY DIFFRACTION
1460,8TNO,synthetic,8TNO:A,MGSVEEVKRIMDLARQKISDAMDELNMDATLKQSVDESMKRAEQRA...,282,X-RAY DIFFRACTION
1461,8FJE,synthetic,8FJE:A,SGSPTPLETLPLEELERRALKIYLRRHGSVPEEEIETMPLEELERK...,145,X-RAY DIFFRACTION


In [5]:
def get_uniprot_id(pdb_id: str, timeout: float = 30.0) -> Optional[str]:
    """
    Fetch a UniProt accession mapped to a PDB entry from the PDBe mappings API.

    Parameters:
        pdb_id (str): PDB entry identifier, e.g. "8P0E"
        timeout (float): seconds to wait for the server; without a timeout a
                         stalled connection would block the notebook forever

    Returns:
        the first accession listed under "UniProt" for the entry, or None
        when the request does not return status 200 or the entry has no
        UniProt mapping

    NOTE(review): the lookup is per entry, not per chain — for entries with
    several UniProt mappings the first one is returned regardless of which
    chain the caller is interested in; confirm this is intended.
    """
    sifts_url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"
    sifts_response = requests.get(sifts_url, timeout=timeout)
    if sifts_response.status_code != 200:
        return None

    # The response is keyed by the lower-cased entry id. A missing key or an
    # empty "UniProt" mapping means "no annotation", not an error.
    entry_mappings = sifts_response.json().get(pdb_id.lower()) or {}
    uniprot_mappings = entry_mappings.get("UniProt") or {}
    return next(iter(uniprot_mappings), None)

# The mapping is looked up per PDB entry, so query each distinct entry once
# and reuse the answer for all of its chains. Entries without a mapping keep
# a missing value.
uniprot_by_pdb_id = {
    pdb_id: get_uniprot_id(pdb_id)
    for pdb_id in chains["pdb_id"].unique()
}
chains["UniProt ID"] = [uniprot_by_pdb_id[pdb_id] for pdb_id in chains["pdb_id"]]
chains

Unnamed: 0,pdb_id,label,chain_id,sequence,sequence_length,method_of_acquisition,UniProt ID
0,8P0E,monomer,8P0E:A,MADSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGS...,190,X-RAY DIFFRACTION,G3M8F4
1,8PX8,monomer,8PX8:A,GSMGKLSEQLKHCNGILKELLSKKHAAYAWPFYKPVDASALGLHDY...,115,X-RAY DIFFRACTION,P25440
2,8B2E,monomer,8B2E:A,LVLPGLDALQTRNALAIIAEAKKENVGPHGCQAAITTGLTESSLRI...,143,X-RAY DIFFRACTION,
3,8HOE,monomer,8HOE:A,MGSSHHHHHHAFHDVPSLGQKVGAGSQKDVFHSRQDPRQCICLFRP...,189,X-RAY DIFFRACTION,A0A085GHR3
4,8TCE,monomer,8TCE:A,APTENSTGVQDCYRGDGQSYRGTLSTTITGRTCQSWSSMTPHWHRR...,94,X-RAY DIFFRACTION,P08519
...,...,...,...,...,...,...,...
1458,8G9J,synthetic,8G9J:A,HHHHHHSGSGENLYFQGSGSEEIVEEAETALKALLEEAEKGGKEDA...,223,X-RAY DIFFRACTION,
1459,8OYV,synthetic,8OYV:A,MNEEKREELLEEAKRLLEESLKLLKQAYNTPIEIDLPISGGVKAIL...,195,X-RAY DIFFRACTION,
1460,8TNO,synthetic,8TNO:A,MGSVEEVKRIMDLARQKISDAMDELNMDATLKQSVDESMKRAEQRA...,282,X-RAY DIFFRACTION,
1461,8FJE,synthetic,8FJE:A,SGSPTPLETLPLEELERRALKIYLRRHGSVPEEEIETMPLEELERK...,145,X-RAY DIFFRACTION,


In [6]:
def get_annotations(df: pd.DataFrame, uniprot_ids: List[str], fields: List[str]) -> None:
    """
    Add one annotation column per requested UniProt field to `df`, in place.

    Parameters:
        df (pd.DataFrame): frame with a "UniProt ID" column; it receives a
                           new (or overwritten) column named after every
                           entry of `fields`
        uniprot_ids (List[str]): accessions to query; duplicates are queried
                                 only once
        fields (List[str]): field names passed to spyprot.UniprotSearch

    Returns:
        None — the result is written into `df`

    Assumes that `UniprotSearch(...).get()` returns a mapping from accession
    to a sequence of values ordered like `fields`; the positional lookup
    below relies on that — TODO confirm against the spyprot documentation.
    """
    results = {}
    # dict.fromkeys removes duplicates while keeping the original order.
    for uniprot_id in dict.fromkeys(uniprot_ids):
        results.update(spyprot.UniprotSearch(fields, accessions=uniprot_id).get())

    def lookup(accession, position):
        # Rows without an accession (None or NaN) and accessions the search
        # did not return get a missing value instead of raising KeyError.
        if pd.isna(accession):
            return None
        record = results.get(accession)
        return record[position] if record is not None else None

    for position, field in enumerate(fields):
        df[field] = [lookup(accession, position) for accession in df["UniProt ID"]]
  

# Query only the accessions that are actually present in the chain table and
# write the three annotation columns into `chains`.
fields = ["organism_name", "xref_pfam", "xref_interpro"]
uniprot_ids = chains["UniProt ID"].dropna().tolist()
get_annotations(chains, uniprot_ids, fields)

In [7]:
def split_ids(id_string: Optional[str]) -> Optional[List[str]]:
    """
    Split a semicolon-separated identifier string into a list of identifiers.

    Parameters:
        id_string (Optional[str]): identifiers joined (and possibly
                                   terminated) by ";", e.g. "PF01661;PF05407;";
                                   None and NaN denote a missing value

    Returns:
        list of the non-empty identifiers in their original order, or None
        when the input is missing or contains no identifier at all
    """
    # Missing values may show up as None or as float NaN in a pandas column.
    if id_string is None or (isinstance(id_string, float) and pd.isna(id_string)):
        return None

    # Dropping empty fragments handles the trailing ";" as well as stray
    # leading or doubled separators, which would otherwise yield "" entries.
    identifiers = [identifier for identifier in id_string.split(";") if identifier]
    return identifiers or None
        


# Turn both cross-reference columns from ";"-joined strings into lists.
for xref_column in ("xref_pfam", "xref_interpro"):
    chains[xref_column] = chains[xref_column].apply(split_ids)
chains

Unnamed: 0,pdb_id,label,chain_id,sequence,sequence_length,method_of_acquisition,UniProt ID,organism_name,xref_pfam,xref_interpro
0,8P0E,monomer,8P0E:A,MADSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGS...,190,X-RAY DIFFRACTION,G3M8F4,Rubella virus (RUBV),"[PF01661, PF05407, PF00978, PF12601, PF01443]","[IPR027351, IPR002588, IPR043502, IPR002589, I..."
1,8PX8,monomer,8PX8:A,GSMGKLSEQLKHCNGILKELLSKKHAAYAWPFYKPVDASALGLHDY...,115,X-RAY DIFFRACTION,P25440,Homo sapiens (Human),"[PF17035, PF00439]","[IPR043508, IPR043509, IPR050935, IPR001487, I..."
2,8B2E,monomer,8B2E:A,LVLPGLDALQTRNALAIIAEAKKENVGPHGCQAAITTGLTESSLRI...,143,X-RAY DIFFRACTION,,,,
3,8HOE,monomer,8HOE:A,MGSSHHHHHHAFHDVPSLGQKVGAGSQKDVFHSRQDPRQCICLFRP...,189,X-RAY DIFFRACTION,A0A085GHR3,Ewingella americana (strain ATCC 33852 / DSM 4...,,[IPR054555]
4,8TCE,monomer,8TCE:A,APTENSTGVQDCYRGDGQSYRGTLSTTITGRTCQSWSSMTPHWHRR...,94,X-RAY DIFFRACTION,P08519,Homo sapiens (Human),"[PF00051, PF00089]","[IPR000001, IPR013806, IPR018056, IPR038178, I..."
...,...,...,...,...,...,...,...,...,...,...
1458,8G9J,synthetic,8G9J:A,HHHHHHSGSGENLYFQGSGSEEIVEEAETALKALLEEAEKGGKEDA...,223,X-RAY DIFFRACTION,,,,
1459,8OYV,synthetic,8OYV:A,MNEEKREELLEEAKRLLEESLKLLKQAYNTPIEIDLPISGGVKAIL...,195,X-RAY DIFFRACTION,,,,
1460,8TNO,synthetic,8TNO:A,MGSVEEVKRIMDLARQKISDAMDELNMDATLKQSVDESMKRAEQRA...,282,X-RAY DIFFRACTION,,,,
1461,8FJE,synthetic,8FJE:A,SGSPTPLETLPLEELERRALKIYLRRHGSVPEEEIETMPLEELERK...,145,X-RAY DIFFRACTION,,,,


In [8]:
# Long format: one row per Pfam accession of a chain; the InterPro column is
# left out of this table.
chains_annotation_pfam = chains.drop(columns="xref_interpro").explode("xref_pfam")
chains_annotation_pfam

Unnamed: 0,pdb_id,label,chain_id,sequence,sequence_length,method_of_acquisition,UniProt ID,organism_name,xref_pfam
0,8P0E,monomer,8P0E:A,MADSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGS...,190,X-RAY DIFFRACTION,G3M8F4,Rubella virus (RUBV),PF01661
0,8P0E,monomer,8P0E:A,MADSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGS...,190,X-RAY DIFFRACTION,G3M8F4,Rubella virus (RUBV),PF05407
0,8P0E,monomer,8P0E:A,MADSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGS...,190,X-RAY DIFFRACTION,G3M8F4,Rubella virus (RUBV),PF00978
0,8P0E,monomer,8P0E:A,MADSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGS...,190,X-RAY DIFFRACTION,G3M8F4,Rubella virus (RUBV),PF12601
0,8P0E,monomer,8P0E:A,MADSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGS...,190,X-RAY DIFFRACTION,G3M8F4,Rubella virus (RUBV),PF01443
...,...,...,...,...,...,...,...,...,...
1458,8G9J,synthetic,8G9J:A,HHHHHHSGSGENLYFQGSGSEEIVEEAETALKALLEEAEKGGKEDA...,223,X-RAY DIFFRACTION,,,
1459,8OYV,synthetic,8OYV:A,MNEEKREELLEEAKRLLEESLKLLKQAYNTPIEIDLPISGGVKAIL...,195,X-RAY DIFFRACTION,,,
1460,8TNO,synthetic,8TNO:A,MGSVEEVKRIMDLARQKISDAMDELNMDATLKQSVDESMKRAEQRA...,282,X-RAY DIFFRACTION,,,
1461,8FJE,synthetic,8FJE:A,SGSPTPLETLPLEELERRALKIYLRRHGSVPEEEIETMPLEELERK...,145,X-RAY DIFFRACTION,,,


In [9]:
# Long format: one row per InterPro accession of a chain; the Pfam column is
# left out of this table.
chains_annotation_interpro = chains.drop(columns="xref_pfam").explode("xref_interpro")
chains_annotation_interpro

Unnamed: 0,pdb_id,label,chain_id,sequence,sequence_length,method_of_acquisition,UniProt ID,organism_name,xref_interpro
0,8P0E,monomer,8P0E:A,MADSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGS...,190,X-RAY DIFFRACTION,G3M8F4,Rubella virus (RUBV),IPR027351
0,8P0E,monomer,8P0E:A,MADSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGS...,190,X-RAY DIFFRACTION,G3M8F4,Rubella virus (RUBV),IPR002588
0,8P0E,monomer,8P0E:A,MADSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGS...,190,X-RAY DIFFRACTION,G3M8F4,Rubella virus (RUBV),IPR043502
0,8P0E,monomer,8P0E:A,MADSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGS...,190,X-RAY DIFFRACTION,G3M8F4,Rubella virus (RUBV),IPR002589
0,8P0E,monomer,8P0E:A,MADSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGS...,190,X-RAY DIFFRACTION,G3M8F4,Rubella virus (RUBV),IPR043472
...,...,...,...,...,...,...,...,...,...
1458,8G9J,synthetic,8G9J:A,HHHHHHSGSGENLYFQGSGSEEIVEEAETALKALLEEAEKGGKEDA...,223,X-RAY DIFFRACTION,,,
1459,8OYV,synthetic,8OYV:A,MNEEKREELLEEAKRLLEESLKLLKQAYNTPIEIDLPISGGVKAIL...,195,X-RAY DIFFRACTION,,,
1460,8TNO,synthetic,8TNO:A,MGSVEEVKRIMDLARQKISDAMDELNMDATLKQSVDESMKRAEQRA...,282,X-RAY DIFFRACTION,,,
1461,8FJE,synthetic,8FJE:A,SGSPTPLETLPLEELERRALKIYLRRHGSVPEEEIETMPLEELERK...,145,X-RAY DIFFRACTION,,,


In [10]:
# Write the annotated chain table and both long-format tables to disk,
# without the DataFrame index.
for frame, csv_path in (
    (chains, "data/chains_annotations.csv"),
    (chains_annotation_interpro, "data/chains_annotation_interpro.csv"),
    (chains_annotation_pfam, "data/chains_annotation_pfam.csv"),
):
    frame.to_csv(csv_path, sep=",", index=False)

## Create dataframes for analysis with filtered chains

In [11]:
# Subset of chains used for the analysis; the merges below join on its
# pdb_id, label and chain_id columns.
chains_filtered = pd.read_csv(filepath_or_buffer="data/filtered/chains_filtered.csv")
chains_filtered

Unnamed: 0,pdb_id,label,chain_id
0,8P0E,monomer,8P0E:A
1,8PX8,monomer,8PX8:A
2,8B2E,monomer,8B2E:A
3,8HOE,monomer,8HOE:A
4,8TCE,monomer,8TCE:A
...,...,...,...
1332,8G9J,synthetic,8G9J:A
1333,8OYV,synthetic,8OYV:A
1334,8TNO,synthetic,8TNO:A
1335,8FJE,synthetic,8FJE:A


In [12]:
# Keep only the annotation rows whose chain also appears in chains_filtered.
merge_keys = ["pdb_id", "label", "chain_id"]
annotations_filtered = chains.merge(chains_filtered, how="inner", on=merge_keys)
chains_annotation_interpro_filtered = chains_annotation_interpro.merge(chains_filtered, how="inner", on=merge_keys)
chains_annotation_pfam_filtered = chains_annotation_pfam.merge(chains_filtered, how="inner", on=merge_keys)

In [13]:
# Write the three filtered annotation tables to disk, without the DataFrame index.
for frame, csv_path in (
    (annotations_filtered, "data/filtered/chains_annotations_filtered.csv"),
    (chains_annotation_interpro_filtered, "data/filtered/chains_annotation_interpro_filtered.csv"),
    (chains_annotation_pfam_filtered, "data/filtered/chains_annotation_pfam_filtered.csv"),
):
    frame.to_csv(csv_path, sep=",", index=False)

## Annotate families

In [14]:
def annotate_families(database: str, accession: str, timeout: float = 30.0) -> Optional[str]:
    """
    Build a human-readable label for a family accession from the InterPro API.

    Parameters:
        database (str): member database segment of the API path, e.g. "pfam"
                        or "interpro"
        accession (str): entry accession, e.g. "PF00069"; a missing value
                         (None/NaN) is accepted and yields None
        timeout (float): seconds to wait for the server; without a timeout a
                         stalled connection would block the notebook forever

    Returns:
        "<name> (<type>)" built from the entry metadata, or None when the
        accession is missing or the request does not return status 200
    """
    # Grouping with dropna=False produces a NaN key for chains without any
    # annotation; there is nothing to look up for it, so skip the request.
    if pd.isna(accession):
        return None

    response = requests.get(
        f"https://www.ebi.ac.uk/interpro/api/entry/{database}/{accession}",
        timeout=timeout,
    )
    if response.status_code != 200:
        return None

    metadata = response.json()["metadata"]
    name = metadata["name"]["name"]
    # Named entry_type so the builtin `type` is not shadowed.
    entry_type = metadata["type"]
    return f"{name} ({entry_type})"


# Count the filtered chains per Pfam accession (chains without one form the
# NaN group), most frequent first, then attach a readable name per accession.
pfam_counts = chains_annotation_pfam_filtered.groupby(["xref_pfam"], dropna=False).size()
pfam_families = pfam_counts.reset_index(name="count").sort_values(by="count", ascending=False)
pfam_families["pfam_name"] = pfam_families["xref_pfam"].apply(
    lambda accession: annotate_families("pfam", accession)
)
pfam_families

Unnamed: 0,xref_pfam,count,pfam_name
1127,,284,
35,PF00069,40,Protein kinase domain (domain)
685,PF07714,30,Protein tyrosine and serine/threonine kinase (...
164,PF00503,17,G-protein alpha subunit (domain)
227,PF00680,14,Viral RNA-dependent RNA polymerase (family)
...,...,...,...
560,PF04368,1,Protein of unknown function (DUF507) (family)
561,PF04389,1,Peptidase family M28 (domain)
562,PF04392,1,ABC transporter substrate binding protein (fam...
563,PF04454,1,Encapsulating protein for peroxidase (family)


In [15]:
# Count the filtered chains per InterPro accession (chains without one form
# the NaN group), most frequent first, then attach a readable name per accession.
interpro_counts = chains_annotation_interpro_filtered.groupby(["xref_interpro"], dropna=False).size()
interpro_families = interpro_counts.reset_index(name="count").sort_values(by="count", ascending=False)
interpro_families["interpro_name"] = interpro_families["xref_interpro"].apply(
    lambda accession: annotate_families("interpro", accession)
)
interpro_families

Unnamed: 0,xref_interpro,count,interpro_name
2751,,259,
1622,IPR027417,76,P-loop containing nucleoside triphosphate hydr...
902,IPR011009,72,Protein kinase-like domain superfamily (homolo...
103,IPR000719,71,Protein kinase domain (domain)
1229,IPR017441,56,"Protein kinase, ATP binding site (binding_site)"
...,...,...,...
1380,IPR020479,1,"Homeodomain, metazoa (domain)"
1379,IPR020476,1,NUDIX hydrolase (domain)
322,IPR001938,1,Thaumatin family (family)
328,IPR001989,1,"Radical-activating enzyme, conserved site (con..."


In [16]:
# Write both family count tables to disk, without the DataFrame index.
for frame, csv_path in (
    (pfam_families, "data/filtered/pfam_families.csv"),
    (interpro_families, "data/filtered/interpro_families.csv"),
):
    frame.to_csv(csv_path, sep=",", index=False)