In [1]:
from pyprot.protein import Protein

First, let's load up some proteins.

In [10]:
# Two ways to obtain a structure: Protein.fetch retrieves it from the PDB by id,
# while the Protein constructor accepts a path to a local .pdb file directly.
proteins = [Protein.fetch("1H3E"), Protein("1I7L.pdb")]

['./1H3E.pdb']




In [12]:
# Collect everything get_chains() yields for the second protein into a list for display.
[chain for chain in proteins[1].pdb.get_chains()]

[<Chain id=A>, <Chain id=B>]

We automatically have a dataframe of the atoms for each protein:

In [13]:
# Preview the first five atom rows of the first protein's dataframe.
proteins[0].df.head(n=5)

Unnamed: 0,bfactor,chain,coord,disordered_flag,element,full_id,res_full_id,mass,resname,occupancy,x,y,z
0,67.65,A,"[119.906, 61.902, 77.413]",0,N,"(1H3E, 0, A, ( , 6, ), (N, ))","(1H3E, 0, A, ( , 6, ))",14.0067,HIS,1.0,119.905998,61.902,77.413002
1,67.68,A,"[119.998, 60.401, 77.41]",0,C,"(1H3E, 0, A, ( , 6, ), (CA, ))","(1H3E, 0, A, ( , 6, ))",12.0107,HIS,1.0,119.998001,60.401001,77.410004
2,67.12,A,"[119.982, 59.836, 75.971]",0,C,"(1H3E, 0, A, ( , 6, ), (CB, ))","(1H3E, 0, A, ( , 6, ))",12.0107,HIS,1.0,119.982002,59.835999,75.971001
3,67.18,A,"[119.58, 60.793, 74.893]",0,C,"(1H3E, 0, A, ( , 6, ), (CG, ))","(1H3E, 0, A, ( , 6, ))",12.0107,HIS,1.0,119.580002,60.792999,74.892998
4,66.87,A,"[120.374, 61.868, 74.517]",0,N,"(1H3E, 0, A, ( , 6, ), (ND1, ))","(1H3E, 0, A, ( , 6, ))",14.0067,HIS,1.0,120.374001,61.868,74.516998


We might want to add some features, for example a target distance: the distance from each atom to the nearest ATP atom.

In [15]:
import numpy as np
for protein in proteins:
    # Keep only atoms that have coordinates. The explicit .copy() makes `atoms` an
    # independent frame, so adding the "distance" column below never writes into a
    # slice of the original dataframe (pandas' SettingWithCopyWarning).
    atoms = protein.df[protein.df.coord.notnull()].copy()

    ATP_coords = atoms.loc[atoms.resname == "ATP", "coord"].to_list()
    print("Found {} ATP atoms".format(len(ATP_coords)))
    if not ATP_coords:
        # Nothing to measure against. NOTE(review): this is presumably also what
        # happens when the cell is re-run, because the atom-name filter at the end
        # of the loop has already removed the ATP rows from protein.df -- confirm.
        raise ValueError(
            "No ATP atoms found, cannot compute ligand distances "
            "(if this cell was already run, reload the proteins first)."
        )

    # Stack the ATP coordinates into one (n_atp, n_dims) array so that each atom is
    # compared against every ATP atom in a single vectorised norm call.
    atp_matrix = np.vstack(ATP_coords)
    atoms["distance"] = atoms.coord.apply(
        lambda atom: np.linalg.norm(atp_matrix - atom, axis=1).min()
    )

    # Keep only the atoms whose name (first entry of the last full_id element)
    # is "CA", and renumber the rows from zero.
    is_alpha_carbon = atoms.full_id.map(lambda full_id: full_id[4][0] == "CA")
    protein.df = atoms[is_alpha_carbon].reset_index(drop=True)

Found 31 ATP atoms
Found 62 ATP atoms


Next, keep only the chains that have at least one remaining atom within 6.0 distance units of an ATP atom:

In [27]:
for protein in proteins:
    # Rows whose "distance" value is at most 6.0 (same units as the coordinates).
    near_ligand = protein.df["distance"] <= 6.0
    # Distinct chain ids that own at least one such row.
    chains_with_ligand = protein.df.loc[near_ligand, "chain"].unique()
    protein.select_chains(chains_with_ligand)

Make a graph by Delaunay triangulation:

In [29]:
from pyprot.structure import Perseus
import pyprot.graph_models as graph_models
# Pick the protein to work on explicitly (1I7L, the second entry) rather than
# relying on whatever value a loop variable in an earlier cell happened to leave behind.
protein = proteins[1]

# Build the structure from the rows whose atom name is "CA".
structure = protein.generate_structure(lambda row: row["full_id"][4][0] == "CA")

# NOTE(review): execute_persistent_hom presumably populates
# structure.persistent_hom_params, which is read below -- confirm.
perseus = Perseus()
perseus.execute_persistent_hom(protein)

# Generate the graph, using the "b3_step" persistent-homology parameter as the step.
structure_model = graph_models.StructureGraphGenerator()
protein.generate_graph(structure_model,
    {"step": structure.persistent_hom_params["b3_step"]})

<networkx.classes.graph.Graph at 0x7f395170f710>

In [30]:
# Depth features: store each node's computed depth as a "depth" node attribute.
depths, _ = structure.calculate_depth(protein.graph)
for node_idx, depth in depths.items():
    protein.graph.nodes[node_idx]["depth"] = depth

# Remaining features, taken from columns of the atom dataframe.
# NOTE(review): the dataframe preview earlier in this notebook shows no "score" or
# "color" columns; presumably add_features skips columns that are absent -- confirm.
feature_columns = [
    "bfactor",
    "score",
    "color",
    "color_confidence_interval_high",
    "color_confidence_interval_low",
    "score_confidence_interval_high",
    "score_confidence_interval_low",
    "resname",
    "coord",
    "distance",
]
structure_model.add_features(protein.df, columns=feature_columns)

<networkx.classes.graph.Graph at 0x7f395170f710>

We can turn a graph into a dataframe by propagating features along neighbors:

In [35]:
# First diffuse the graph, then flatten the diffused graph into a dataframe and
# show its first rows.
diffused_graph = graph_models.GraphModel.get_diffused_graph(protein.graph)
graph_models.GraphModel.graph_to_dataframe(diffused_graph).head()

Unnamed: 0,depth_1,bfactor_1,distance_1,full_id,depth,bfactor,resname,coord,distance
0,4.438341,30.047143,34.550079,"(1I7L, 0, A, ( , 113, ), (CA, ))",0.0,39.58,LYS,"[22.906, 96.881, 33.044]",38.68087
1,6.73004,29.493333,32.634772,"(1I7L, 0, A, ( , 114, ), (CA, ))",3.857951,33.99,ALA,"[23.368, 93.506, 31.233]",35.246246
2,7.020148,27.827647,31.86915,"(1I7L, 0, A, ( , 115, ), (CA, ))",7.336214,29.89,LYS,"[25.696, 90.591, 30.5]",32.552814
3,11.103894,24.0575,28.630311,"(1I7L, 0, A, ( , 116, ), (CA, ))",10.299733,23.45,VAL,"[25.603, 89.305, 26.891]",29.821009
4,10.563309,21.645263,26.083789,"(1I7L, 0, A, ( , 117, ), (CA, ))",13.947106,19.2,LEU,"[25.132, 85.563, 26.245]",26.119415


And we can also make folds using sequence similarity: