In [4]:
import requests, sys
from lxml import etree
import pandas as pd
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from biopandas.pdb import PandasPdb
import torch
import numpy as np
from periodictable import elements
from torch_geometric.data import Data
import os

In [5]:
def build_xml():
    requestURL = "https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&reviewed=true&isoform=0"

    r = requests.get(requestURL, headers={ "Accept" : "application/xml"})

    if not r.ok:
        r.raise_for_status()
        sys.exit()

    responseBody = r.text
    
    file_path = 'output.xml'
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(responseBody)
    
    print(responseBody)

In [6]:
def get_atomic_structure(pdb_id):
    ppdb = PandasPdb().fetch_pdb(pdb_id)
    atom = ppdb.df['ATOM']
    structure = list(zip(atom.element_symbol,atom.x_coord,atom.y_coord,atom.z_coord))
    return structure

def get_prot_analysis(sequence):
    analysis = ProteinAnalysis(sequence)
    mw = analysis.molecular_weight()
    pI = analysis.isoelectric_point()
    instability = analysis.instability_index()
    aromaticity = analysis.aromaticity()
    gravy = analysis.gravy()
    return mw,pI, instability,aromaticity,gravy

def read_xml(filepath = "output.xml"):
    with open(filepath, 'r', encoding='utf-8') as file:
        xml_data_str = file.read()
        
    xml_data_bytes = xml_data_str.encode('utf-8')
    root = etree.fromstring(xml_data_bytes)
    
    namespaces = {
        'uniprot': "http://uniprot.org/uniprot",
        'xsi': "http://www.w3.org/2001/XMLSchema-instance"
    }
    
    # for protein_name in root.findall('.//uniprot:protein/uniprot:recommendedName/uniprot:fullName', namespaces):
    #     print(protein_name.text)
    
    df = pd.DataFrame(columns=['pdb','accession','name','sequence','coords','mw','pI','II','aromaticity','gravy'])
    
    for entry in root.findall('uniprot:entry', namespaces):
        sequence = entry.find('uniprot:sequence', namespaces).text
        # print(sequence)
        accession = entry.find('uniprot:accession', namespaces).text
        # print(f"Accession: {accession}")
        name = entry.find('uniprot:name', namespaces).text
        # print(f"Name: {name}")
        pdb_ids = entry.findall(".//uniprot:dbReference[@type='PDB']", namespaces)
        for pdb_id in pdb_ids:
            # print(f"PDB ID: {pdb_id.get('id')}")
            df.loc[len(df.index)] = [pdb_id.get('id'),accession,name,sequence,None,None,None,None,None,None]
    df = df.drop_duplicates()
    for _,row in df.iterrows():
        pdb_id = row.pdb
        row['coords'] = get_atomic_structure(row.pdb)
        row['mw'],row['pI'],row['II'],row['aromaticity'],row['gravy'] = get_prot_analysis(row.sequence)
    return df


In [7]:
df = read_xml()

In [8]:
output = open("PyMol/pdb_ids.txt", "w")
for id in list(df.pdb):
    output.write(f"{id}\n")

output.close()

In [9]:
os.system('pymol PyMol/PyMOL_Analysis.py')

QStandardPaths: wrong permissions on runtime directory /run/user/1000/, 0755 instead of 0700


 PyMOL(TM) 3.0.0 - Incentive Product
 Copyright (C) Schrodinger, LLC
 
 This Executable Build integrates and extends Open-Source PyMOL.
No License File - For Evaluation Only (30 days remaining)
 Detected OpenGL version 4.1. Shaders available.
 ShaderPrg-Error: vertex shader compilation failed.; name='bezier'
 ShaderPrg-Error-InfoLog:
0:1(10): error: GLSL 4.50 is not supported. Supported versions are: 1.10, 1.20, 1.30, 1.40, 1.50, 3.30, 4.00, 4.10, 1.00 ES, and 3.00 ES

 Detected GLSL version 4.10.
 OpenGL graphics engine:
  GL_VENDOR:   Microsoft Corporation
  GL_RENDERER: D3D12 (Intel(R) UHD Graphics 770)
  GL_VERSION:  4.1 (Compatibility Profile) Mesa 23.2.1-1ubuntu3.1~22.04.2
 ShaderPrg-Error: vertex shader compilation failed.; name='bezier'
 ShaderPrg-Error-InfoLog:
0:1(10): error: GLSL 4.50 is not supported. Supported versions are: 1.10, 1.20, 1.30, 1.40, 1.50, 3.30, 4.00, 4.10, 1.00 ES, and 3.00 ES


 Detected 24 CPU cores.  Enabled multithreaded rendering.
 Setting: fetch_path s

0

In [11]:
df2 = pd.read_table("PyMol/Results_34.txt")

In [13]:
df2

Unnamed: 0,PDB,No. a.a.,Glycine,S.S.,Long SS,Charge,SASA,No. pos.,No. Surf. pos.,Pos. area,No. neg.,No. Surf. neg.,Neg. area,No. hyd.,No. Surf. hyd.,hyd. area,Alpha,Beta,Salt bridges,H-bonds
0,7UWG,137,6,0.0,N,-8,7009.034668,14,14,1418.057495,22,22,1683.536377,58,37,1938.264038,57.037037,21.481481,16,162
1,7UXU,137,6,0.0,N,-8,7905.958008,14,14,1673.709229,22,22,1841.111938,58,39,2104.51709,64.179104,14.179104,6,155
2,8G83,184,20,0.0,N,-7,6722.083496,17,15,1187.827515,24,22,1519.462646,65,33,1643.040405,49.275362,21.014493,10,151
3,7S4N,85,8,4.0,4,1,5121.149902,14,14,1203.411743,13,11,1010.917664,24,21,1318.673706,7.317073,51.219512,3,67
4,7S58,85,8,4.0,4,1,5371.293457,14,14,1286.778076,13,12,1050.52124,24,22,1457.38208,9.52381,47.619048,7,74
5,7S59,85,8,4.0,4,1,5425.188965,14,14,1391.244507,13,12,1189.390381,24,21,1214.505981,9.638554,48.192771,1,2
6,7SO0,85,8,4.0,4,1,5613.602539,14,14,1432.863281,13,12,1140.178955,24,22,1375.787476,8.333333,48.809524,6,72
7,4K1Y,237,18,0.0,N,-5,10457.401367,19,18,1847.884521,24,21,1298.425659,91,61,2348.869385,4.219409,46.413502,9,242
8,4K1Z,237,18,0.0,N,-5,10515.277344,19,18,1790.241821,24,21,1384.893311,91,63,2333.527344,7.594937,45.56962,9,242
9,4K20,237,18,0.0,N,-5,10497.344727,19,18,1756.681274,24,21,1359.937256,91,63,2419.152832,6.329114,46.413502,11,243


In [17]:
df_merge = df.merge(df2,how='left',left_on="pdb",right_on="PDB")

In [15]:
def build_dataset(df):
    df = df.copy().reset_index()
    element_translation = {el.symbol.lower(): el.number for el in elements}
    
    data_list=list()
    for j, row in df.iterrows():
        name = row['pdb']
        y = torch.Tensor([row['gravy'], row['mw'], row['pI'], row['II'], row['aromaticity'],row['Alpha']])
        
        n_atoms = len(row['coords'])
        z = torch.empty((n_atoms)); pos = torch.empty((n_atoms, 3)) 
        for i, x in enumerate(row['coords']):
            z[i] = torch.Tensor([element_translation[x[0].lower()]])
            pos[i] = torch.Tensor([x[1], x[2], x[3]])
        
        data=Data(
            z=z,
            pos=pos,
            y=y,
            name=name,
            idx=j
        )
        data_list.append(data)
    
    return data_list


In [31]:
build_xml()
df = read_xml()


<?xml version='1.0' encoding='UTF-8'?><uniprot xmlns="http://uniprot.org/uniprot" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><entry xmlns="http://uniprot.org/uniprot" dataset="Swiss-Prot" created="2020-02-26" modified="2024-01-24" version="24"><accession>A0A009IHW8</accession><name>ABTIR_ACIB9</name><protein><recommendedName><fullName evidence="6">2' cyclic ADP-D-ribose synthase AbTIR</fullName><shortName evidence="6">2'cADPR synthase AbTIR</shortName><ecNumber evidence="4">3.2.2.-</ecNumber></recommendedName><alternativeName><fullName evidence="6">NAD(+) hydrolase AbTIR</fullName><ecNumber evidence="3">3.2.2.6</ecNumber></alternativeName><alternativeName><fullName evidence="5">TIR domain-containing protein in A.baumannii</fullName><shortName evidence="5">AbTIR</shortName></alternativeName></protein><gene><name evidence="8" type="ORF">J512_3302</name></gene><organism><name type="s

In [18]:

data = build_dataset(df_merge)

In [22]:
data[0]

Data(y=[6], pos=[4372, 3], z=[4372], name='7UWG', idx=0)

Below is just workshopping pulling remarks from PDB

In [6]:
ppdb = PandasPdb().fetch_pdb('7UXU')

In [7]:
ppdb.df.keys()

dict_keys(['ATOM', 'HETATM', 'ANISOU', 'OTHERS'])

In [8]:
remarks = ppdb.df['OTHERS'][ppdb.df['OTHERS']['record_name'].str.contains('REMARK')]
remarks

Unnamed: 0,record_name,entry,line_idx
30,REMARK,2,30
31,REMARK,2 RESOLUTION. 2.74 ANGSTROMS.,31
32,REMARK,3,32
33,REMARK,3 REFINEMENT.,33
34,REMARK,"3 SOFTWARE PACKAGES : ISOLDE, CRYOSP...",34
...,...,...,...
171,REMARK,500 REMARK: NULL,171
172,REMARK,900,172
173,REMARK,900 RELATED ENTRIES,173
174,REMARK,900 RELATED ID: EMD-26862 RELATED DB: EMDB,174


In [9]:
temp = remarks[remarks.entry.str.contains(r'TEMPERATURE')]
temp

Unnamed: 0,record_name,entry,line_idx
82,REMARK,245 TEMPERATURE (KELVIN) : NULL,82


In [10]:
ph = float((remarks[remarks.entry.str.contains(r'200\s*PH')]).entry.iloc[0].split(':')[-1].strip())

IndexError: single positional indexer is out-of-bounds