## Extract domain and protein features

Info: This script extracts domains and other features like TM and disordered regions, domain topology and disulfid bonds


## Import functions

In [1]:
# Standard library imports
import gzip
import json
import logging as log
import requests
import sys
import time
import urllib.parse
import urllib.request
import xmltodict

# Third party imports
from bs4 import BeautifulSoup as bs
from IPython.display import display
import numpy as np
import pandas as pd

log_message = "verbose"

if log_message == "verbose":
    log.basicConfig(
        format='%(levelname)s:%(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=log.INFO
    )
elif log_message == "debug":
    log.basicConfig(
        format='%(levelname)s:%(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=log.WARNING
    )
else:
    log.basicConfig(
        format='%(levelname)s:%(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=log.ERROR
    )

logger = log.getLogger(__name__)


# Local application imports
try:
    sys.path.insert(1, '../scripts/')
    from get_domains import extract_single_protein_pfam, extract_pfam_nested, extract_pfam_pdb_mapping, extract_pfam_all_release
    from get_db_features import get_disprot_disordered, get_mobidb_disordered
    from get_human_proteome import extract_uniprot_human_proteome_ids
    from get_uniprot_features import extract_uniprot_human_proteome_info, extract_uniprot_info
    from map_domains_features import extract_single_infos, map_pfam_pdb, get_domain_features_human_proteome
except (ModuleNotFoundError, ImportError) as e:
    logger.error("{} fileure".format(type(e)))
else:
    logger.info("Import succeeded")

INFO:Import succeeded


## User inputs

In [2]:
uniprot_id = 'P10912' # default: ''
write_dir = '' # if empty, no output will be written to file
domainfamily_pdbs = False # provides all pdbs associated to the domain family

## Call running script

In [3]:
run = f"python ../scripts/run.py -h"
!{run}

usage: run.py [-h] [--human_proteome HUMAN_PROTEOME] [--uniprot_id UNIPROT_ID]
              [--reviewed REVIEWED] [--domainfamily_pdbs DOMAINFAMILY_PDBS]
              [--nested NESTED] [--output_dir OUTPUT_DIR]

optional arguments:
  -h, --help            show this help message and exit
  --human_proteome HUMAN_PROTEOME, -hp HUMAN_PROTEOME
                        Calculate domains and features for the human proteome
                        (default: False)
  --uniprot_id UNIPROT_ID, -u UNIPROT_ID
                        Uniprot id for single runs (default: '')
  --reviewed REVIEWED, -r REVIEWED
                        Obtain only reviewed entries from uniprot (options: *,
                        yes [default])
  --domainfamily_pdbs DOMAINFAMILY_PDBS, -dfp DOMAINFAMILY_PDBS
                        Get also all pdbs associated with each single domain
                        family (default: False)
  --nested NESTED, -n NESTED
                        Get information ab

In [4]:
run = f"python ../scripts/run.py\
            --uniprot_id {uniprot_id} \
            --reviewed yes \
            --domainfamily_pdbs true"
!{run}

INFO:({'PF00041': [['fn3', 159, 244], [['Extracellular', 19, 264]], [[['3HHR', 'C', 141, 226, 159, 244], ['1KF9', 'C', 641, 726, 159, 244], ['3HHR', 'B', 141, 226, 159, 244], ['1KF9', 'B', 341, 426, 159, 244], ['1AXI', 'B', 141, 226, 159, 244], ['1KF9', 'E', 1341, 1426, 159, 244], ['2AEW', 'B', 141, 226, 159, 244], ['1HWH', 'B', 141, 226, 159, 244], ['2AEW', 'A', 141, 226, 159, 244], ['1KF9', 'F', 1641, 1726, 159, 244], ['1A22', 'B', 341, 426, 159, 244], ['1HWG', 'B', 141, 226, 159, 244], ['1HWG', 'C', 141, 226, 159, 244]], [['6TPV', 'A', 321, 401, '', ''], ['6TPV', 'A', 415, 500, '', ''], ['5A40', 'E', 5, 75, '', ''], ['2B5I', 'C', 134, 204, '', ''], ['5J12', 'B', 131, 211, '', ''], ['2DM4', 'A', 9, 92, '', ''], ['1TTF', 'A', 4, 84, '', ''], ['3QHT', 'C', 4, 77, '', ''], ['2DLH', 'A', 20, 105, '', ''], ['1F6F', 'B', 105, 195, '', ''], ['4HLJ', 'A', 685, 750, '', ''], ['4HLJ', 'A', 778, 864, '', ''], ['6P67', 'G', 111, 198, '', ''], ['2E7H', 'A', 9, 92, '', ''], ['1HWG', 'B', 152, 219,

## get human proteome
obtain uniprot ids

In [5]:
human_proteome_uniprot_ids = extract_uniprot_human_proteome_ids(reviewed='yes')
logger.info(human_proteome_uniprot_ids)

INFO:['Q9H553', 'Q96QF7', 'Q13023', 'Q8WTP8', 'Q7Z7M1', 'Q9NP61', 'P36404', 'Q6NSI1', 'Q9NQ90', 'Q99873', 'Q9H765', 'P48047', 'O60566', 'P02746', 'Q8ND61', 'Q8N5C1', 'Q6ZP82', 'B7Z1M9', 'Q9BXI9', 'Q5VZK9', 'Q5JW98', 'Q6ZUS6', 'P53634', 'P00746', 'P30260', 'Q86V15', 'Q9HD42', 'P49454', 'Q7L2Z9', 'P47902', 'Q9Y4M8', 'P40259', 'Q96CF2', 'Q9BWV3', 'A8MXV6', 'Q07002', 'Q5T280', 'Q96M91', 'P28907', 'Q96G28', 'Q9NRB3', 'P29016', 'Q8N0X4', 'Q96B33', 'O75128', 'Q9UIV1', 'P02748', 'Q32Q52', 'A6NJI1', 'O14493', 'Q96AJ1', 'P49759', 'O00299', 'Q8WWK9', 'Q9UGL9', 'Q7Z4B0', 'Q9Y534', 'Q8WXD9', 'Q9P1W3', 'P35606', 'Q16526', 'P11509', 'Q96L46', 'Q86Y56', 'P01189', 'P09228', 'Q8WWM9', 'Q9Y5B0', 'P00167', 'Q5DJT8', 'Q8TF08', 'Q08495', 'Q9UJV9', 'Q9BQY9', 'Q92841', 'Q6P3S1', 'Q9UHL0', 'Q05048', 'Q8IXT1', 'P05141', 'P51816', 'Q9NP73', 'P35318', 'Q8N556', 'Q9UKQ2', 'Q08462', 'P33121', 'Q6UXC1', 'Q9Y672', 'O75078', 'P61204', 'Q96Q83', 'Q96PE2', 'Q8NBQ7', 'Q63HQ0', 'O60641', 'Q6NXT1', 'Q15327', 'Q9UHI8', 'Q96

## Domain classification

### pfam domain classification
This calls the most current 'life' version

In [6]:
pfam_dict = extract_single_protein_pfam( uniprot_id, verbose=False )
logger.info(pfam_dict)

INFO:{'PF00041': ['fn3', 159, 244], 'PF09067': ['EpoR_lig-bind', 43, 144], 'PF12772': ['GHBP', 316, 617]}


### mapping pdbs to domains

obtain info from release Pfam33.1 (02/05/2020)

In [7]:
all_pfam_db_release_df = extract_pfam_all_release()
all_pfam_db_release_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,106,340351673,1XAK,PF08779,P59635,A,1,,67,,16,82,2dcf00
1,107,340351673,1YO4,PF08779,P59635,A,1,,84,,16,99,2dcf00
2,108,340372087,6U7K,PF19209,Q91AV1,A,644,,701,,644,701,ff5353
3,109,340372087,6U7K,PF19209,Q91AV1,B,644,,701,,644,701,ff5353
4,110,340372087,6U7K,PF19209,Q91AV1,C,644,,701,,644,701,ff5353
...,...,...,...,...,...,...,...,...,...,...,...,...,...
607898,670735,590052880,4JVD,PF03466,A0A0H2Z7A6,A,122,,293,,122,293,2dcf00
607899,670736,590052880,4JVI,PF03466,A0A0H2Z7A6,A,122,,293,,122,293,2dcf00
607900,670737,590085608,3FD3,PF03466,A9CJQ0,A,112,,305,,94,287,2dcf00
607901,670738,590220247,3ISP,PF03466,P9WMF5,B,93,,290,,93,290,ff5353


### pfam domains mapped to existing pdbs
obtain domains with excisting pdbs on-the-fly/most current version

In [8]:
all_pfam_pdb_df = extract_pfam_pdb_mapping()
all_pfam_pdb_df

Unnamed: 0,PDB_ID,CHAIN_ID,PdbResNumStart,PdbResNumEnd,PFAM_ACC,PFAM_Name,PFAM_desc,eValue
0,5POC,A,40,120,PF00439.25,Bromodomain,Bromodomain,5.3E-26
1,3BLS,A,14,360,PF00144.24,Beta-lactamase,Beta-lactamase,0.0
2,3R2P,A,46,182,PF01442.18,Apolipoprotein,Apolipoprotein A1/A4/E domain,2.8E-45
3,3MQK,A,20,66,PF08068.12,DKCLD,DKCLD (NUC011) domain,7.8E-18
4,3MQK,A,70,121,PF01509.18,TruB_N,TruB family pseudouridylate synthase (N termin...,3.7E-26
...,...,...,...,...,...,...,...,...
112295,2ZUT,A,5,432,PF09508.10,Lact_bio_phlase,Lacto-N-biose phosphorylase N-terminal TIM bar...,0.0
112296,2ZUT,A,440,691,PF17385.2,LBP_M,Lacto-N-biose phosphorylase central domain,0.0
112297,2ZUT,A,697,748,PF17386.2,LBP_C,Lacto-N-biose phosphorylase C-terminal domain,4.2E-20
112298,2K1Z,A,20,91,PF00595.24,PDZ,PDZ domain,2.8E-14


### get nested domains
obtained from release Pfam33.1 (02/05/2020)

In [9]:
nested_pfam_df = extract_pfam_nested(write_dir=write_dir)
nested_pfam_df

Unnamed: 0,pfamA_acc,nested_pfamA_acc,pfamseq_acc,seq_version,seq_start,seq_end
0,PF00001,PF01498,A0A2H2ILN3,1,305,377
1,PF00001,PF13359,W4YLW7,1,294,451
2,PF00006,PF05203,P17255,3,284,737
3,PF00006,PF05204,P17255,3,585,696
4,PF00026,PF03489,B9SVA7,1,323,356
...,...,...,...,...,...,...
160,PF17132,PF00754,A0A243WFR7,1,210,338
161,PF18516,PF18510,M9SJC9,1,1010,1167
162,PF00657,PF01476,A0A226HFV6,1,205,247
163,PF00005,PF00385,O14134,1,820,872


## Get information about features from uniprot
- including domains, disulfid bonds, active and binding sites, TM domains
- for a single or all humand proteomes

In [10]:
#option1: single uniprot id
uniprot_info_df = extract_uniprot_info(uniprot_id, reviewed='yes')
uniprot_info_df

Unnamed: 0,Entry,Entry name,Gene names (primary ),Organism,Sequence,Cross-reference (PDB),Cross-reference (DisProt),Cross-reference (MobiDB),Length,Transmembrane,...,Region,Repeat,Zinc finger,Disulfide bond,Active site,Binding site,Cross-reference (Pfam),Cross-reference (InterPro),Cross-reference (SUPFAM),Cross-reference (PROSITE)
0,P10912,GHR_HUMAN,GHR,Homo sapiens (Human),MDLWQLLLTLALAGSSDAFSGSEATAAILSRAPWSLQSVNPGLKTN...,1A22;1AXI;1HWG;1HWH;1KF9;2AEW;3HHR;5OEK;5OHD;,DP00033;,,638.0,"TRANSMEM 265..288; /note=""Helical""; /evidenc...",...,"REGION 260..262; /note=""Required for ADAM17-m...",,,"DISULFID 56..66; /evidence=""ECO:0000269|PubMe...",,,PF09067;PF00041;PF12772;,IPR003961;IPR036116;IPR025871;IPR015152;IPR013...,SSF49265;,PS50853;PS01352;
1,,,,,,,,,,,...,,,,,,,,,,


In [11]:
#option2: information for complete human proteome
uniprot_proteome_info_df = extract_uniprot_human_proteome_info(reviewed='yes')
uniprot_proteome_info_df

Unnamed: 0,Entry,Entry name,Gene names (primary ),Organism,Sequence,Cross-reference (PDB),Cross-reference (DisProt),Cross-reference (MobiDB),Length,Transmembrane,...,Region,Repeat,Zinc finger,Disulfide bond,Active site,Binding site,Cross-reference (Pfam),Cross-reference (InterPro),Cross-reference (SUPFAM),Cross-reference (PROSITE)
0,Q9H553,ALG2_HUMAN,ALG2,Homo sapiens (Human),MAEEQGRERDSVPKPSVLFLHPDLGVGGAERLVLDAALALQARGCS...,,,,416,"TRANSMEM 85..105; /note=""Helical""; /evidence...",...,,,,,,,PF13439;PF00534;,IPR027054;IPR001296;IPR028098;,,
1,Q96QF7,ACRC_HUMAN,GCNA,Homo sapiens (Human),MDGCKKELPRLQEPEEDEDCYILNVQSSSDDTSGSSVARRAPKRQA...,,,,691,,...,,,,,,,PF10263;,IPR006640;,,
2,Q13023,AKAP6_HUMAN,AKAP6,Homo sapiens (Human),MLTMSVTLSPLRSQDLDPMATDASPMAINMTPTVEQGEGEEAMKDM...,,,,2319,,...,"REGION 2063..2076; /note=""PKA-RII subunit bin...","REPEAT 762..848; /note=""Spectrin 1""; REPEAT 1...",,,,,,IPR018159;,,
3,Q8WTP8,AEN_HUMAN,AEN,Homo sapiens (Human),MVPREAPESAQCLCPSLTIPNAKDVLRKRHKRRSRQHQRFMARKAL...,,,,325,,...,,,,,,,PF00929;,IPR013520;IPR012337;IPR036397;,SSF53098;,
4,Q7Z7M1,AGRD2_HUMAN,ADGRD2,Homo sapiens (Human),MDAPWGAGERWLHGAAVDRSGVSLGPPPTPQVNQGTLGPQVAPVAA...,,,,963,"TRANSMEM 663..683; /note=""Helical; Name=1""; ...",...,,,,"DISULFID 146..212; /evidence=""ECO:0000255|PRO...",,,PF00002;PF01825;PF00354;,IPR013320;IPR017981;IPR000832;IPR000203;IPR001...,SSF49899;,PS50261;PS50221;PS51828;
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20355,Q9Y680,FKBP7_HUMAN,FKBP7,Homo sapiens (Human),MPKTMHFLFRFIVFFYLWGLFTAQRQKKEESTEEVKIEVLHRPENC...,,,,222,,...,,,,,,,PF13202;PF00254;,IPR011992;IPR018247;IPR002048;IPR001179;,SSF47473;,PS00018;PS50222;PS00014;PS50059;
20356,P54803,GALC_HUMAN,GALC,Homo sapiens (Human),MAEWLLSASWQRRAKAMTAAAGSAGRAAVPLLLCALLAPGGAYVLD...,,,,685,,...,,,,"DISULFID 287..394; /evidence=""ECO:0000250""","ACT_SITE 198; /note=""Proton donor/acceptor""; ...","BINDING 109; /note=""Substrate""; /evidence=""E...",PF02057;PF17387;,IPR013785;IPR001286;IPR035394;IPR017853;,SSF51445;,
20357,Q99999,G3ST1_HUMAN,GAL3ST1,Homo sapiens (Human),MLPPQKKPWESMAKGLVLGALFTSFLLLVYSYAVPPLHAGLASTTP...,,,,423,"TRANSMEM 15..35; /note=""Helical; Signal-ancho...",...,,,,,,,PF06990;,IPR009729;IPR027417;,SSF52540;,
20358,Q75VX8,GARE2_HUMAN,GAREM2,Homo sapiens (Human),MEKLAAGLAGLRWSMGAFPLDLIVSRCRLPTLACLGPGEYAEGVSE...,,,,874,,...,"REGION 12..339; /note=""CABIT""",,,,,,PF12736;,IPR025946;IPR013761;,SSF47769;,


## Information about disordered content
- Option1: source=disprot
- Option2: source=mobidb

In [12]:
#option1
disordered_disprot = get_disprot_disordered(uniprot_id)
logger.info(disordered_disprot)

INFO:[[270, 620, 'I']]


In [13]:
#option2
disordered_mobidb = get_mobidb_disordered(uniprot_id)
logger.info(disordered_mobidb)

INFO:[[269, 620, 'D']]


## maped info without pdb_id

In [14]:
uniprot_info_df = extract_uniprot_info(uniprot_id, reviewed='yes')
mapped_protein_info = extract_single_infos(uniprot_id, uniprot_info_df)
mapped_protein_info

{'tm_region': [['tm:Helical', 265, 288]],
 'topol_domain': [['Extracellular', 19, 264], ['Cytoplasmic', 289, 638]],
 'disorder': [['disorder:D', 269, 620]],
 'disulfid': [['disulfid', 56, 66],
  ['disulfid', 101, 112],
  ['disulfid', 126, 140]],
 'func_sites': []}

## Mapping all information together into dictionary
for single protein and human proteome

In [15]:
#option1: single uniprot id
uniprot_info_df = extract_uniprot_info(uniprot_id, reviewed='yes')
all_pfam_db_release_df = extract_pfam_all_release()
if domainfamily_pdbs:
    pdb_pfam_df = extract_pfam_pdb_mapping()
else:
    pdb_pfam_df = []
domain_feature_info = map_pfam_pdb(
    uniprot_id, all_pfam_db_release_df, pdb_pfam_df, uniprot_info_df, write_dir=write_dir)
logger.info(domain_feature_info)

INFO:({'PF00041': [['fn3', 159, 244], [['Extracellular', 19, 264]], [[['3HHR', 'C', 141, 226, 159, 244], ['1KF9', 'C', 641, 726, 159, 244], ['3HHR', 'B', 141, 226, 159, 244], ['1KF9', 'B', 341, 426, 159, 244], ['1AXI', 'B', 141, 226, 159, 244], ['1KF9', 'E', 1341, 1426, 159, 244], ['2AEW', 'B', 141, 226, 159, 244], ['1HWH', 'B', 141, 226, 159, 244], ['2AEW', 'A', 141, 226, 159, 244], ['1KF9', 'F', 1641, 1726, 159, 244], ['1A22', 'B', 341, 426, 159, 244], ['1HWG', 'B', 141, 226, 159, 244], ['1HWG', 'C', 141, 226, 159, 244]], []]], 'PF09067': [['EpoR_lig-bind', 43, 144], [['Extracellular', 19, 264], ['disulfid', 56, 66], ['disulfid', 101, 112], ['disulfid', 126, 140]], [[['3HHR', 'C', 32, 126, 50, 144], ['1KF9', 'C', 533, 626, 51, 144], ['3HHR', 'B', 32, 126, 50, 144], ['2AEW', 'A', 29, 126, 47, 144], ['1KF9', 'B', 233, 326, 51, 144], ['1KF9', 'F', 1533, 1626, 51, 144], ['1A22', 'B', 233, 326, 51, 144], ['1AXI', 'B', 32, 126, 50, 144], ['1HWG', 'B', 32, 126, 50, 144], ['1KF9', 'E', 1233,

In [None]:
#option2: human proteome
human_proteome_uniprot_ids = extract_uniprot_human_proteome_ids(reviewed='yes')
all_pfam_db_release_df = extract_pfam_all_release()
if domainfamily_pdbs:
    pdb_pfam_df = extract_pfam_pdb_mapping()
else:
    pdb_pfam_df = []
uniprot_proteome_info_df = extract_uniprot_human_proteome_info(reviewed='yes')
domain_feature_info_proteome = get_domain_features_human_proteome(
    human_proteome_uniprot_ids, all_pfam_db_release_df, pdb_pfam_df,  
    uniprot_proteome_info_df, write_dir=write_dir)
logger.info(domain_feature_info_proteome)