In [22]:
import requests, sys
from lxml import etree
import pandas as pd
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from biopandas.pdb import PandasPdb

In [23]:
def build_xml(request_url="https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&reviewed=true&isoform=0",
              file_path='output.xml', timeout=60, verbose=True):
    """Download protein entries as XML from the EBI Proteins API and save them to disk.

    Parameters
    ----------
    request_url : str
        Full query URL. The default requests the first 100 reviewed entries
        (offset=0, size=100, reviewed=true, isoform=0).
    file_path : str
        Where the response body is written (UTF-8 text).
    timeout : float
        Seconds to wait for the server before requests gives up; without a
        timeout a stalled connection would block the kernel indefinitely.
    verbose : bool
        When True (the default, matching the historical behaviour) the whole
        response body is also printed.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    r = requests.get(request_url, headers={"Accept": "application/xml"}, timeout=timeout)

    # raise_for_status() raises on any 4xx/5xx answer and is a no-op otherwise,
    # so no separate `r.ok` check (or sys.exit) is needed.
    r.raise_for_status()

    responseBody = r.text

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(responseBody)

    if verbose:
        print(responseBody)

In [30]:
def get_atomic_structure(pdb_id):
    """Fetch a PDB entry and return its ATOM records as a list of tuples.

    Each tuple is (element_symbol, x_coord, y_coord, z_coord), read from the
    'ATOM' table of the fetched entry.
    """
    atom_table = PandasPdb().fetch_pdb(pdb_id).df['ATOM']
    per_atom_columns = (
        atom_table.element_symbol,
        atom_table.x_coord,
        atom_table.y_coord,
        atom_table.z_coord,
    )
    # zip(*columns) walks the four columns in lockstep, one tuple per atom row.
    return list(zip(*per_atom_columns))

def get_prot_analysis(sequence):
    """Compute sequence-level descriptors with ProteinAnalysis.

    Returns a 5-tuple in this order: molecular weight, isoelectric point,
    instability index, aromaticity, GRAVY.
    """
    protein = ProteinAnalysis(sequence)
    descriptors = (
        protein.molecular_weight(),
        protein.isoelectric_point(),
        protein.instability_index(),
        protein.aromaticity(),
        protein.gravy(),
    )
    return descriptors

def read_xml(filepath = "output.xml"):
    """Parse a saved UniProt XML file into one row per distinct PDB cross-reference.

    For every <entry> the accession, name and sequence are read, and one row is
    produced for each dbReference of type 'PDB' found anywhere below the entry.
    Duplicate rows are dropped (the original row labels are kept, so the index
    may contain gaps). Each remaining row is then enriched with:

    * coords -- list of (element, x, y, z) tuples from get_atomic_structure(pdb)
    * mw, pI, II, aromaticity, gravy -- values from get_prot_analysis(sequence)

    Parameters
    ----------
    filepath : str
        Path of the XML file to read.

    Returns
    -------
    pandas.DataFrame
        Columns: pdb, accession, name, sequence, coords, mw, pI, II,
        aromaticity, gravy.
    """
    # Parse from bytes: lxml rejects str input that carries an XML encoding
    # declaration, so there is no point decoding the file first.
    with open(filepath, 'rb') as file:
        root = etree.fromstring(file.read())

    namespaces = {'uniprot': "http://uniprot.org/uniprot"}

    # Collect plain tuples and build the frame once instead of growing it
    # row by row.
    records = []
    for entry in root.findall('uniprot:entry', namespaces):
        sequence = entry.find('uniprot:sequence', namespaces).text
        accession = entry.find('uniprot:accession', namespaces).text
        name = entry.find('uniprot:name', namespaces).text
        for pdb_ref in entry.findall(".//uniprot:dbReference[@type='PDB']", namespaces):
            records.append((pdb_ref.get('id'), accession, name, sequence))

    df = pd.DataFrame(records, columns=['pdb', 'accession', 'name', 'sequence']).drop_duplicates()

    # Assign whole columns rather than writing into rows yielded by iterrows():
    # those rows can be copies, in which case the writes never reach the frame.
    # dtype=object keeps each per-structure list as a single cell value.
    df['coords'] = pd.Series(
        [get_atomic_structure(pdb_id) for pdb_id in df['pdb']],
        index=df.index,
        dtype=object,
    )

    # Several PDB rows share one sequence, so analyse each distinct sequence once.
    analysis_columns = ['mw', 'pI', 'II', 'aromaticity', 'gravy']
    analysis_by_sequence = {seq: get_prot_analysis(seq) for seq in df['sequence'].unique()}
    analysis = pd.DataFrame(
        [analysis_by_sequence[seq] for seq in df['sequence']],
        columns=analysis_columns,
        index=df.index,
    )

    return df.join(analysis)


In [31]:
# Download the UniProt XML to output.xml (build_xml also prints the response body),
# then parse it into one row per distinct PDB cross-reference.
build_xml()
df = read_xml()
# Bare last expression so the notebook renders the resulting DataFrame.
df

<?xml version='1.0' encoding='UTF-8'?><uniprot xmlns="http://uniprot.org/uniprot" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><entry xmlns="http://uniprot.org/uniprot" dataset="Swiss-Prot" created="2020-02-26" modified="2024-01-24" version="24"><accession>A0A009IHW8</accession><name>ABTIR_ACIB9</name><protein><recommendedName><fullName evidence="6">2' cyclic ADP-D-ribose synthase AbTIR</fullName><shortName evidence="6">2'cADPR synthase AbTIR</shortName><ecNumber evidence="4">3.2.2.-</ecNumber></recommendedName><alternativeName><fullName evidence="6">NAD(+) hydrolase AbTIR</fullName><ecNumber evidence="3">3.2.2.6</ecNumber></alternativeName><alternativeName><fullName evidence="5">TIR domain-containing protein in A.baumannii</fullName><shortName evidence="5">AbTIR</shortName></alternativeName></protein><gene><name evidence="8" type="ORF">J512_3302</name></gene><organism><name type="s

Unnamed: 0,pdb,accession,name,sequence,coords,mw,pI,II,aromaticity,gravy
0,7UWG,A0A009IHW8,ABTIR_ACIB9,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,"[(N, -23.326, -13.053, 44.949), (C, -22.948, -...",30921.9061,6.990139,44.952045,0.052045,-0.667286
1,7UXU,A0A009IHW8,ABTIR_ACIB9,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,"[(N, 101.794, 110.252, 96.567), (C, 100.72, 11...",30921.9061,6.990139,44.952045,0.052045,-0.667286
2,8G83,A0A009IHW8,ABTIR_ACIB9,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,"[(N, 22.774, 18.021, 14.509), (C, 24.106, 18.5...",30921.9061,6.990139,44.952045,0.052045,-0.667286
8,7S4N,A0A023FDY8,EV974_AMBCJ,MKVLLCIAASCLMLLALNVSAENTQQEEQDYDYGTDTCPFPVLANK...,"[(N, -5.908, -3.083, 17.415), (C, -6.559, -2.2...",13277.1305,5.440083,44.787288,0.067797,-0.557627
9,7S58,A0A023FDY8,EV974_AMBCJ,MKVLLCIAASCLMLLALNVSAENTQQEEQDYDYGTDTCPFPVLANK...,"[(N, 8.441, -28.691, -37.934), (C, 9.581, -28....",13277.1305,5.440083,44.787288,0.067797,-0.557627
10,7S59,A0A023FDY8,EV974_AMBCJ,MKVLLCIAASCLMLLALNVSAENTQQEEQDYDYGTDTCPFPVLANK...,"[(N, 23.086, -7.86, 7.817), (C, 22.47, -9.146,...",13277.1305,5.440083,44.787288,0.067797,-0.557627
11,7SO0,A0A023FDY8,EV974_AMBCJ,MKVLLCIAASCLMLLALNVSAENTQQEEQDYDYGTDTCPFPVLANK...,"[(N, 16.821, -34.822, -20.879), (C, 16.071, -3...",13277.1305,5.440083,44.787288,0.067797,-0.557627
14,4K1Y,A0A023GPI8,LECA_CANBL,ADTIVAVELDTYPNTDIGDPSYPHIGIDIKSVRSKKTAKWNMQNGK...,"[(N, -1.926, 8.923, 14.54), (C, -0.571, 9.144,...",25571.1119,5.625264,30.408903,0.092827,-0.208861
15,4K1Z,A0A023GPI8,LECA_CANBL,ADTIVAVELDTYPNTDIGDPSYPHIGIDIKSVRSKKTAKWNMQNGK...,"[(N, 14.429, -4.808, -1.212), (C, 15.57, -5.08...",25571.1119,5.625264,30.408903,0.092827,-0.208861
16,4K20,A0A023GPI8,LECA_CANBL,ADTIVAVELDTYPNTDIGDPSYPHIGIDIKSVRSKKTAKWNMQNGK...,"[(N, -2.766, -34.844, 5.165), (C, -1.742, -33....",25571.1119,5.625264,30.408903,0.092827,-0.208861


In [26]:
# Display the assembled table.
# NOTE(review): the saved output under this cell (protein names and PDB ids) does not
# look like a DataFrame display -- presumably left over from an earlier version of the
# cell; re-run to refresh it.
df

2' cyclic ADP-D-ribose synthase AbTIR
L-2-hydroxyglutarate dehydrogenase
Short-chain dehydrogenase fogD
Transcriptional regulator fogI
Prenyltransferase fogH
Highly reducing polyketide synthase fogA
Short-chain dehydrogenase fogG
Cytochrome P450 monooxygenase fogE
FAD-linked oxidoreductase fogF
Echinulin prenyltransferase 1
Echinulin prenyltransferase 2
Nonribosomal peptide synthetase echPS
Echilunin cytochrome P450 monooxygenase
Evasin P1142
Evasin P546
Evasin P974
Evasin P1126
Evasin P983
Evasin P991
Evasin P985
Evasin P1180
Evasin P1183
Lectin alpha chain
Immunity protein CdiI
Bi-functional coumaroyl CoA and feruloyl CoA ortho-hydroxylase Diox1
Bi-functional coumaroyl CoA and feruloyl CoA ortho-hydroxylase Diox2
Short-chain dehydrogenase/reductase prx4
FAD-dependent monooxygenase prx3
Short-chain dehydrogenase/reductase prx1
Glucan endo-1,3-beta-D-glucosidase 1
MSDIN-like toxin proprotein 4
MSDIN-like toxin proprotein 1
MSDIN-like toxin proprotein 4
Beta-amanitin proprotein
Alpha-am

7UWG
7UXU
8G83
7S4N
7S58
7S59
7SO0
4K1Y
4K1Z
4K20
4K21
4NTQ
4K35
4K3A
5XBZ
5XC2


In [15]:
# Display the table returned by read_xml() again.
df

Unnamed: 0,pdb,accession,name,sequence,coords,mw,pI,II,aromaticity,gravy
0,7UWG,A0A009IHW8,ABTIR_ACIB9,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,"[(N, -23.326, -13.053, 44.949), (C, -22.948, -...",30921.9061,6.990139,44.952045,0.052045,-0.667286
1,7UXU,A0A009IHW8,ABTIR_ACIB9,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,"[(N, 101.794, 110.252, 96.567), (C, 100.72, 11...",30921.9061,6.990139,44.952045,0.052045,-0.667286
2,8G83,A0A009IHW8,ABTIR_ACIB9,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,"[(N, 22.774, 18.021, 14.509), (C, 24.106, 18.5...",30921.9061,6.990139,44.952045,0.052045,-0.667286
8,7S4N,A0A023FDY8,EV974_AMBCJ,MKVLLCIAASCLMLLALNVSAENTQQEEQDYDYGTDTCPFPVLANK...,"[(N, -5.908, -3.083, 17.415), (C, -6.559, -2.2...",13277.1305,5.440083,44.787288,0.067797,-0.557627
9,7S58,A0A023FDY8,EV974_AMBCJ,MKVLLCIAASCLMLLALNVSAENTQQEEQDYDYGTDTCPFPVLANK...,"[(N, 8.441, -28.691, -37.934), (C, 9.581, -28....",13277.1305,5.440083,44.787288,0.067797,-0.557627
10,7S59,A0A023FDY8,EV974_AMBCJ,MKVLLCIAASCLMLLALNVSAENTQQEEQDYDYGTDTCPFPVLANK...,"[(N, 23.086, -7.86, 7.817), (C, 22.47, -9.146,...",13277.1305,5.440083,44.787288,0.067797,-0.557627
11,7SO0,A0A023FDY8,EV974_AMBCJ,MKVLLCIAASCLMLLALNVSAENTQQEEQDYDYGTDTCPFPVLANK...,"[(N, 16.821, -34.822, -20.879), (C, 16.071, -3...",13277.1305,5.440083,44.787288,0.067797,-0.557627
14,4K1Y,A0A023GPI8,LECA_CANBL,ADTIVAVELDTYPNTDIGDPSYPHIGIDIKSVRSKKTAKWNMQNGK...,"[(N, -1.926, 8.923, 14.54), (C, -0.571, 9.144,...",25571.1119,5.625264,30.408903,0.092827,-0.208861
15,4K1Z,A0A023GPI8,LECA_CANBL,ADTIVAVELDTYPNTDIGDPSYPHIGIDIKSVRSKKTAKWNMQNGK...,"[(N, 14.429, -4.808, -1.212), (C, 15.57, -5.08...",25571.1119,5.625264,30.408903,0.092827,-0.208861
16,4K20,A0A023GPI8,LECA_CANBL,ADTIVAVELDTYPNTDIGDPSYPHIGIDIKSVRSKKTAKWNMQNGK...,"[(N, -2.766, -34.844, 5.165), (C, -1.742, -33....",25571.1119,5.625264,30.408903,0.092827,-0.208861


In [17]:
# Keep only the structure id, the atom coordinates and the GRAVY score.
dataset = df.loc[:, ['pdb', 'coords', 'gravy']]
dataset

Unnamed: 0,pdb,coords,gravy
0,7UWG,"[(N, -23.326, -13.053, 44.949), (C, -22.948, -...",-0.667286
1,7UXU,"[(N, 101.794, 110.252, 96.567), (C, 100.72, 11...",-0.667286
2,8G83,"[(N, 22.774, 18.021, 14.509), (C, 24.106, 18.5...",-0.667286
8,7S4N,"[(N, -5.908, -3.083, 17.415), (C, -6.559, -2.2...",-0.557627
9,7S58,"[(N, 8.441, -28.691, -37.934), (C, 9.581, -28....",-0.557627
10,7S59,"[(N, 23.086, -7.86, 7.817), (C, 22.47, -9.146,...",-0.557627
11,7SO0,"[(N, 16.821, -34.822, -20.879), (C, 16.071, -3...",-0.557627
14,4K1Y,"[(N, -1.926, 8.923, 14.54), (C, -0.571, 9.144,...",-0.208861
15,4K1Z,"[(N, 14.429, -4.808, -1.212), (C, 15.57, -5.08...",-0.208861
16,4K20,"[(N, -2.766, -34.844, 5.165), (C, -1.742, -33....",-0.208861


The cells below are exploratory: they experiment with pulling REMARK records (e.g. resolution, temperature, pH) out of a PDB entry.

In [212]:
# Fetch a single entry (7UXU) to experiment with its non-coordinate records.
ppdb = PandasPdb().fetch_pdb('7UXU')

In [156]:
# List the record tables available for the fetched entry.
ppdb.df.keys()

dict_keys(['ATOM', 'HETATM', 'ANISOU', 'OTHERS'])

In [213]:
# From the 'OTHERS' table keep only the rows whose record name contains REMARK.
remarks = ppdb.df['OTHERS'].loc[lambda records: records['record_name'].str.contains('REMARK')]
remarks

Unnamed: 0,record_name,entry,line_idx
30,REMARK,2,30
31,REMARK,2 RESOLUTION. 2.74 ANGSTROMS.,31
32,REMARK,3,32
33,REMARK,3 REFINEMENT.,33
34,REMARK,"3 SOFTWARE PACKAGES : ISOLDE, CRYOSP...",34
...,...,...,...
171,REMARK,500 REMARK: NULL,171
172,REMARK,900,172
173,REMARK,900 RELATED ENTRIES,173
174,REMARK,900 RELATED ID: EMD-26862 RELATED DB: EMDB,174


In [217]:
# Narrow the REMARK rows down to those whose text mentions TEMPERATURE.
temp = remarks.loc[remarks['entry'].str.contains(r'TEMPERATURE')]
temp

Unnamed: 0,record_name,entry,line_idx
82,REMARK,245 TEMPERATURE (KELVIN) : NULL,82


In [200]:
# Extract the pH from the first REMARK line matching "200 ... PH", taking the text
# after the last ':'.
# Not every entry has such a line (NOTE(review): 7UXU looks like an EM entry whose
# conditions sit under REMARK 245 -- confirm), and the value may be the literal NULL.
# In both cases ph is set to None instead of raising IndexError / ValueError.
ph_lines = remarks.loc[remarks['entry'].str.contains(r'200\s*PH'), 'entry']
ph_text = ph_lines.iloc[0].split(':')[-1].strip() if len(ph_lines) else ''
try:
    ph = float(ph_text)
except ValueError:
    ph = None