## Extract domain and protein features

Info: This script extracts domains and other features like TM and disordered regions, domain topology and disulfid bonds


## Import functions

In [1]:
# Standard library imports
import gzip
import json
import logging as log
import requests
import sys
import time
import urllib.parse
import urllib.request

# Third party imports
from IPython.display import display
import numpy as np
import pandas as pd
import xmltodict

log_message = "verbose"

if log_message == "verbose":
    log.basicConfig(
        format='%(levelname)s:%(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=log.INFO
    )
elif log_message == "debug":
    log.basicConfig(
        format='%(levelname)s:%(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=log.WARNING
    )
else:
    log.basicConfig(
        format='%(levelname)s:%(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=log.ERROR
    )

logger = log.getLogger(__name__)


# Local application imports
try:
    sys.path.insert(1, '../scripts/')
    from get_domains import extract_single_protein_pfam, extract_pfam_nested, extract_pfam_pdb_mapping, extract_pfam_all_release
    from get_db_features import get_disprot_disordered, get_mobidb_disordered
    from get_human_proteome import extract_uniprot_human_proteome_ids
    from get_uniprot_features import extract_uniprot_human_proteome_info, extract_uniprot_info
    from map_domains_features import extract_single_infos, map_pfam_pdb, get_domain_features_human_proteome
except (ModuleNotFoundError, ImportError) as e:
    logger.error("{} fileure".format(type(e)))
else:
    logger.info("Import succeeded")

INFO:Import succeeded


## User inputs

In [2]:
uniprot_id = 'P10912' # default: ''
write_dir = '' # if empty, no output will be written to file
domainfamily_pdbs = False # provides all pdbs associated to the domain family

## Call running script

In [3]:
run = f"python ../scripts/run.py -h"
!{run}

usage: run.py [-h] [--human_proteome HUMAN_PROTEOME] [--uniprot_id UNIPROT_ID]
              [--reviewed REVIEWED] [--domainfamily_pdbs DOMAINFAMILY_PDBS]
              [--nested NESTED] [--output_dir OUTPUT_DIR]

optional arguments:
  -h, --help            show this help message and exit
  --human_proteome HUMAN_PROTEOME, -hp HUMAN_PROTEOME
                        Calculate domains and features for the human proteome
                        (default: False)
  --uniprot_id UNIPROT_ID, -u UNIPROT_ID
                        Uniprot id for single runs (default: '')
  --reviewed REVIEWED, -r REVIEWED
                        Obtain only reviewed entries from uniprot (options: *,
                        yes [default])
  --domainfamily_pdbs DOMAINFAMILY_PDBS, -dfp DOMAINFAMILY_PDBS
                        Get also all pdbs associated with each single domain
                        family (default: False)
  --nested NESTED, -n NESTED
                        Get information about nested domains

In [4]:
run = f"python ../scripts/run.py\
            --uniprot_id {uniprot_id} \
            --reviewed yes \
            --domainfamily_pdbs true"
!{run}

INFO:({0: {'PF00041': [['fn3', 159, 244], [['Extracellular', 19, 264]], [[['3HHR', 'C', 141, 226, 159, 244], ['1KF9', 'C', 641, 726, 159, 244], ['3HHR', 'B', 141, 226, 159, 244], ['1KF9', 'B', 341, 426, 159, 244], ['1AXI', 'B', 141, 226, 159, 244], ['1KF9', 'E', 1341, 1426, 159, 244], ['2AEW', 'B', 141, 226, 159, 244], ['1HWH', 'B', 141, 226, 159, 244], ['2AEW', 'A', 141, 226, 159, 244], ['1KF9', 'F', 1641, 1726, 159, 244], ['1A22', 'B', 341, 426, 159, 244], ['1HWG', 'B', 141, 226, 159, 244], ['1HWG', 'C', 141, 226, 159, 244]], [['6TPV', 'A', 321, 401, '', ''], ['6TPV', 'A', 415, 500, '', ''], ['5A40', 'E', 5, 75, '', ''], ['2B5I', 'C', 134, 204, '', ''], ['5J12', 'B', 131, 211, '', ''], ['2DM4', 'A', 9, 92, '', ''], ['1TTF', 'A', 4, 84, '', ''], ['3QHT', 'C', 4, 77, '', ''], ['2DLH', 'A', 20, 105, '', ''], ['1F6F', 'B', 105, 195, '', ''], ['4HLJ', 'A', 685, 750, '', ''], ['4HLJ', 'A', 778, 864, '', ''], ['6P67', 'G', 111, 198, '', ''], ['2E7H', 'A', 9, 92, '', ''], ['1HWG', 'B', 152, 

## get human proteome
obtain uniprot ids

In [5]:
human_proteome_uniprot_ids = extract_uniprot_human_proteome_ids(reviewed='yes')
logger.info(human_proteome_uniprot_ids)

INFO:['Q8N7X0', 'Q5T1N1', 'Q92667', 'Q5VUY0', 'P62736', 'Q9H553', 'P0C7M7', 'P49703', 'Q5TGY3', 'Q75V66', 'Q96JD6', 'P02655', 'P55056', 'Q8WXK4', 'Q9ULK2', 'Q99700', 'O75882', 'Q6ZP68', 'P56385', 'P56945', 'Q9HBH7', 'Q6ZU67', 'Q8TDH9', 'Q9BQP9', 'Q9H3F6', 'P04920', 'Q969J3', 'Q8TDN6', 'O95816', 'A2RUR9', 'O43286', 'Q86Y30', 'Q8WUZ0', 'O60512', 'P42575', 'Q6P9G0', 'P31415', 'P40121', 'Q6ZTR7', 'Q9H425', 'Q5VU69', 'O60911', 'Q8N6G1', 'Q8ND23', 'Q4AC94', 'Q8TAB5', 'Q99616', 'Q5VUE5', 'Q16581', 'P43166', 'Q8IYX3', 'O95319', 'Q9UPV0', 'O15519', 'Q32Q52', 'Q96LP6', 'Q3C1V1', 'P41002', 'P51790', 'P78369', 'Q8NCH0', 'Q9NQW8', 'Q8TBE1', 'P51959', 'Q9HBJ8', 'Q6P4Q7', 'O43741', 'Q13362', 'Q9UH17', 'P14060', 'P22760', 'Q9Y478', 'Q9Y2T4', 'A5X5Y0', 'P33897', 'Q8TDX5', 'Q15758', 'Q9Y2D5', 'P49902', 'Q8NFM4', 'P28332', 'Q5BKT4', 'Q8IWT0', 'Q86TB3', 'Q9UNK9', 'Q969Q4', 'Q8IUR7', 'O14791', 'Q9BVC5', 'P17342', 'Q96GD4', 'Q96SQ7', 'Q92858', 'P13497', 'Q7Z5W3', 'P27482', 'Q8NEL0', 'Q6PII3', 'Q8NHW4', 'Q8N

## Domain classification

### pfam domain classification
This calls the most current 'life' version

In [6]:
pfam_dict = extract_single_protein_pfam( uniprot_id, verbose=False )
logger.info(pfam_dict)

INFO:[{'acc': 'PF00041', 'id': 'fn3', 'start': 159, 'end': 244}, {'acc': 'PF09067', 'id': 'EpoR_lig-bind', 'start': 43, 'end': 144}, {'acc': 'PF12772', 'id': 'GHBP', 'start': 316, 'end': 617}]


### mapping pdbs to domains

obtain info from release Pfam33.1 (02/05/2020)

In [8]:
all_pfam_db_release_df = extract_pfam_all_release()
all_pfam_db_release_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,106,340351673,1XAK,PF08779,P59635,A,1,,67,,16,82,2dcf00
1,107,340351673,1YO4,PF08779,P59635,A,1,,84,,16,99,2dcf00
2,108,340372087,6U7K,PF19209,Q91AV1,A,644,,701,,644,701,ff5353
3,109,340372087,6U7K,PF19209,Q91AV1,B,644,,701,,644,701,ff5353
4,110,340372087,6U7K,PF19209,Q91AV1,C,644,,701,,644,701,ff5353
...,...,...,...,...,...,...,...,...,...,...,...,...,...
607898,670735,590052880,4JVD,PF03466,A0A0H2Z7A6,A,122,,293,,122,293,2dcf00
607899,670736,590052880,4JVI,PF03466,A0A0H2Z7A6,A,122,,293,,122,293,2dcf00
607900,670737,590085608,3FD3,PF03466,A9CJQ0,A,112,,305,,94,287,2dcf00
607901,670738,590220247,3ISP,PF03466,P9WMF5,B,93,,290,,93,290,ff5353


### pfam domains mapped to existing pdbs
obtain domains with excisting pdbs on-the-fly/most current version

In [9]:
all_pfam_pdb_df = extract_pfam_pdb_mapping()
all_pfam_pdb_df

Unnamed: 0,PDB_ID,CHAIN_ID,PdbResNumStart,PdbResNumEnd,PFAM_ACC,PFAM_Name,PFAM_desc,eValue
0,5POC,A,40,120,PF00439.25,Bromodomain,Bromodomain,5.3E-26
1,3BLS,A,14,360,PF00144.24,Beta-lactamase,Beta-lactamase,0.0
2,3R2P,A,46,182,PF01442.18,Apolipoprotein,Apolipoprotein A1/A4/E domain,2.8E-45
3,3MQK,A,20,66,PF08068.12,DKCLD,DKCLD (NUC011) domain,7.8E-18
4,3MQK,A,70,121,PF01509.18,TruB_N,TruB family pseudouridylate synthase (N termin...,3.7E-26
...,...,...,...,...,...,...,...,...
377958,6WUA,m,3,108,PF00416.22,Ribosomal_S13,Ribosomal protein S13/S18,3.3E-33
377959,6WUA,n,13,60,PF00253.21,Ribosomal_S14,Ribosomal protein S14p/S29e,1.1E-24
377960,6WUA,s,4,80,PF00203.21,Ribosomal_S19,Ribosomal protein S19,6.4E-36
377961,5RWQ,A,411,703,PF03372.23,Exo_endo_phos,Endonuclease/Exonuclease/phosphatase family,1.9E-10


### get nested domains
obtained from release Pfam33.1 (02/05/2020)

In [10]:
nested_pfam_df = extract_pfam_nested(write_dir=write_dir)
nested_pfam_df

Unnamed: 0,pfamA_acc,nested_pfamA_acc,pfamseq_acc,seq_version,seq_start,seq_end
0,PF00001,PF01498,A0A2H2ILN3,1,305,377
1,PF00001,PF13359,W4YLW7,1,294,451
2,PF00006,PF05203,P17255,3,284,737
3,PF00006,PF05204,P17255,3,585,696
4,PF00026,PF03489,B9SVA7,1,323,356
...,...,...,...,...,...,...
160,PF17132,PF00754,A0A243WFR7,1,210,338
161,PF18516,PF18510,M9SJC9,1,1010,1167
162,PF00657,PF01476,A0A226HFV6,1,205,247
163,PF00005,PF00385,O14134,1,820,872


## Get information about features from uniprot
- including domains, disulfid bonds, active and binding sites, TM domains
- for a single or all humand proteomes

In [11]:
#option1: single uniprot id
uniprot_info_df = extract_uniprot_info(uniprot_id, reviewed='yes')
uniprot_info_df

Unnamed: 0,Entry,Entry name,Gene names (primary ),Organism,Sequence,Cross-reference (PDB),Cross-reference (DisProt),Cross-reference (MobiDB),Length,Transmembrane,...,Biotechnological use,Disruption phenotype,Involvement in disease,Pharmaceutical use,Toxic dose,Natural variant,Glycosylation,Initiator methionine,Lipidation,Modified residue
0,P10912,GHR_HUMAN,GHR,Homo sapiens (Human),MDLWQLLLTLALAGSSDAFSGSEATAAILSRAPWSLQSVNPGLKTN...,1A22;1AXI;1HWG;1HWH;1KF9;2AEW;3HHR;5OEK;5OHD;,DP00033;,,638.0,"TRANSMEM 265..288; /note=""Helical""; /evidenc...",...,,,DISEASE: Laron syndrome (LARS) [MIM:262500]: A...,,,"VARIANT 56; /note=""C -> S (in LARS)""; /evide...","CARBOHYD 46; /note=""N-linked (GlcNAc...) aspa...",,,"MOD_RES 341; /note=""Phosphoserine""; /evidenc..."
1,,,,,,,,,,,...,,,,,,,,,,


In [12]:
#option2: information for complete human proteome
uniprot_proteome_info_df = extract_uniprot_human_proteome_info(reviewed='yes')
uniprot_proteome_info_df

Unnamed: 0,Entry,Entry name,Gene names (primary ),Organism,Sequence,Cross-reference (PDB),Cross-reference (DisProt),Cross-reference (MobiDB),Length,Transmembrane,...,Region,Repeat,Zinc finger,Disulfide bond,Active site,Binding site,Cross-reference (Pfam),Cross-reference (InterPro),Cross-reference (SUPFAM),Cross-reference (PROSITE)
0,Q8N7X0,ADGB_HUMAN,ADGB,Homo sapiens (Human),MASKQTKKKEVHRINSAHGSDKSKDFYPFGSNVQSGSTEQKKGKFP...,,,,1667,,...,,,,,,,PF00648;,IPR012292;IPR000048;IPR038765;IPR001300;,SSF54001;,PS50203;PS50096;
1,Q5T1N1,AKND1_HUMAN,AKNAD1,Homo sapiens (Human),MDEADFSEHTTYKQEDLPYDGDLSQIKIGNDYSFTSKKDGLEVLNQ...,,,,836,,...,,,,,,,PF12443;,IPR022150;,,
2,Q92667,AKAP1_HUMAN,AKAP1,Homo sapiens (Human),MAIQFRSLFPLALPGMLALLGWWWFFSRKKGHVSSHDEQQVEAGAV...,,,,903,,...,"REGION 344..357; /note=""PKA-RII subunit bindi...",,,,,,PF00013;PF00567;,IPR033104;IPR004087;IPR004088;IPR036612;IPR035...,SSF54791;,PS50084;PS50304;
3,Q5VUY0,ADCL3_HUMAN,AADACL3,Homo sapiens (Human),MWDLALIFLAAACVFSLGVTLWVICSHFFTVHIPAAVGHPVKLRVL...,,,,407,,...,,,,,"ACT_SITE 193; /evidence=""ECO:0000250|UniProtK...",,PF07859;,IPR029058;IPR013094;IPR017157;,SSF53474;,
4,P62736,ACTA_HUMAN,ACTA2,Homo sapiens (Human),MCEEEDSTALVCDNGSGLCKAGFAGDDAPRAVFPSIVGRPRHQGVM...,,,,377,,...,,,,,,,PF00022;,IPR004000;IPR020902;IPR004001;IPR043129;,SSF53067;,PS00406;PS00432;PS01132;
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20366,Q9UK55,ZPI_HUMAN,SERPINA10,Homo sapiens (Human),MKVVPSLLLSVLLAQVWLVPGLAPSPQSPETPAPQNQTSRVVQAPK...,3F1S;3H5C;4AFX;4AJU;,,,444,,...,"REGION 136..153; /note=""Heparin-binding""",,,,,,PF00079;,IPR023796;IPR000215;IPR036186;IPR042178;IPR042...,SSF56574;,
20367,Q96MX3,ZNF48_HUMAN,ZNF48,Homo sapiens (Human),MERAVEPWGPDLHRPEEREPQRGARTGLGSENVISQPNEFEHTPQE...,,,,618,,...,,,"ZN_FING 112..134; /note=""C2H2-type 1""; /evid...",,,,PF00096;,IPR036236;IPR013087;,SSF57667;,PS00028;PS50157;
20368,A6NGD5,ZSA5C_HUMAN,ZSCAN5C,Homo sapiens (Human),MAANCTSSWSLGESCNSPGSEPPQSMPSPATQLGNHDSDPETCHVN...,,,,496,,...,,,"ZN_FING 356..378; /note=""C2H2-type 1""; /evid...",,,,PF02023;PF00096;,IPR003309;IPR038269;IPR036236;IPR013087;,SSF57667;,PS50804;PS00028;PS50157;
20369,Q9Y4E5,ZN451_HUMAN,ZNF451,Homo sapiens (Human),MGDPGSEIIESVPPAGPEASESTTDENEDDIQFVSEGPLRPVLEYI...,5D2M;,,,1061,,...,"REGION 1..344; /note=""Important for interacti...",,"ZN_FING 169..195; /note=""C2H2-type 1""; /evid...",,,,PF18479;,IPR041192;IPR013087;,,PS00028;PS50157;


## Information about disordered content
- Option1: source=disprot
- Option2: source=mobidb

In [13]:
#option1
disordered_disprot = get_disprot_disordered(uniprot_id)
logger.info(disordered_disprot)

INFO:[[270, 620, 'I']]


In [14]:
#option2
disordered_mobidb = get_mobidb_disordered(uniprot_id)
logger.info(disordered_mobidb)

INFO:[[269, 620, 'D']]


## maped info without pdb_id

In [15]:
uniprot_info_df = extract_uniprot_info(uniprot_id, reviewed='yes')
mapped_protein_info = extract_single_infos(uniprot_id, uniprot_info_df)
mapped_protein_info

{'tm_region': [['tm:Helical', 265, 288]],
 'topol_domain': [['Extracellular', 19, 264], ['Cytoplasmic', 289, 638]],
 'disorder': [['disorder:D', 269, 620]],
 'disulfid': [['disulfid', 56, 66],
  ['disulfid', 101, 112],
  ['disulfid', 126, 140]],
 'func_sites': []}

## Mapping all information together into dictionary
for single protein and human proteome

In [16]:
#option1: single uniprot id
uniprot_info_df = extract_uniprot_info(uniprot_id, reviewed='yes')
all_pfam_db_release_df = extract_pfam_all_release()
if domainfamily_pdbs:
    pdb_pfam_df = extract_pfam_pdb_mapping()
else:
    pdb_pfam_df = []
domain_feature_info = map_pfam_pdb(
    uniprot_id, all_pfam_db_release_df, pdb_pfam_df, uniprot_info_df, write_dir=write_dir)
logger.info(domain_feature_info)

INFO:({0: {'PF00041': [['fn3', 159, 244], [['Extracellular', 19, 264]], [[['3HHR', 'C', 141, 226, 159, 244], ['1KF9', 'C', 641, 726, 159, 244], ['3HHR', 'B', 141, 226, 159, 244], ['1KF9', 'B', 341, 426, 159, 244], ['1AXI', 'B', 141, 226, 159, 244], ['1KF9', 'E', 1341, 1426, 159, 244], ['2AEW', 'B', 141, 226, 159, 244], ['1HWH', 'B', 141, 226, 159, 244], ['2AEW', 'A', 141, 226, 159, 244], ['1KF9', 'F', 1641, 1726, 159, 244], ['1A22', 'B', 341, 426, 159, 244], ['1HWG', 'B', 141, 226, 159, 244], ['1HWG', 'C', 141, 226, 159, 244]], []]]}, 1: {'PF09067': [['EpoR_lig-bind', 43, 144], [['Extracellular', 19, 264], ['disulfid', 56, 66], ['disulfid', 101, 112], ['disulfid', 126, 140]], [[['3HHR', 'C', 32, 126, 50, 144], ['1KF9', 'C', 533, 626, 51, 144], ['3HHR', 'B', 32, 126, 50, 144], ['2AEW', 'A', 29, 126, 47, 144], ['1KF9', 'B', 233, 326, 51, 144], ['1KF9', 'F', 1533, 1626, 51, 144], ['1A22', 'B', 233, 326, 51, 144], ['1AXI', 'B', 32, 126, 50, 144], ['1HWG', 'B', 32, 126, 50, 144], ['1KF9', '

In [None]:
#option2: human proteome
human_proteome_uniprot_ids = extract_uniprot_human_proteome_ids(reviewed='yes')
all_pfam_db_release_df = extract_pfam_all_release()
if domainfamily_pdbs:
    pdb_pfam_df = extract_pfam_pdb_mapping()
else:
    pdb_pfam_df = []
uniprot_proteome_info_df = extract_uniprot_human_proteome_info(reviewed='yes')
domain_feature_info_proteome = get_domain_features_human_proteome(
    human_proteome_uniprot_ids, all_pfam_db_release_df, pdb_pfam_df,  
    uniprot_proteome_info_df, write_dir=write_dir)
logger.info(domain_feature_info_proteome)