Graphein Experimentation

1. Download a PDB set of TCR structures
2. generate graphs
3. graph feature annotation using esm2 (graph level)
4. node feature annotation using ?? (residue level)
5. make PyG dataloader

First of all, a sample protein will be used

In [50]:
!pip install graphein -q

In [51]:
!pip install "graphein[extras]" -q

In [33]:
from graphein.protein.utils import download_alphafold_structure

# Download the PDB file for an exmaple protein (UniProt: P0DTU3) with the aligned score
protein_path = download_alphafold_structure("P0DTU3", out_dir = "./alphafold_structures", aligned_score=True)


In [34]:
from graphein.protein.config import ProteinGraphConfig

config = ProteinGraphConfig()
config.dict()

{'granularity': 'CA',
 'keep_hets': [],
 'insertions': True,
 'alt_locs': 'max_occupancy',
 'pdb_dir': None,
 'verbose': False,
 'exclude_waters': True,
 'deprotonate': False,
 'protein_df_processing_functions': None,
 'edge_construction_functions': [<function graphein.protein.edges.distance.add_peptide_bonds(G: 'nx.Graph') -> 'nx.Graph'>],
 'node_metadata_functions': [<function graphein.protein.features.nodes.amino_acid.meiler_embedding(n: str, d: Dict[str, Any], return_array: bool = False) -> Union[pandas.core.series.Series, numpy.ndarray]>],
 'edge_metadata_functions': None,
 'graph_metadata_functions': None,
 'get_contacts_config': None,
 'dssp_config': None}

In [35]:
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.graphs import construct_graph

# Load the default config
c = ProteinGraphConfig(granularity='CA')

# Construct the graph!
g = construct_graph(uniprot_id="P0DTU3")

Output()

In [39]:
from graphein.protein.edges.distance import add_aromatic_interactions, add_cation_pi_interactions, add_hydrophobic_interactions, add_ionic_interactions
from graphein.protein.visualisation import plotly_protein_structure_graph

config = ProteinGraphConfig(edge_construction_functions=[add_aromatic_interactions,
                                                         add_cation_pi_interactions,
                                                         add_hydrophobic_interactions,
                                                         add_ionic_interactions])

g = construct_graph(uniprot_id="A2NZL3", config=config)

plotly_protein_structure_graph(g, colour_edges_by="kind", colour_nodes_by="residue_name", label_node_ids=False, node_size_multiplier=2, node_size_min=5)

Output()

In [37]:
from graphein.protein.visualisation import plotly_protein_structure_graph

plotly_protein_structure_graph(g, node_size_multiplier=0.5, colour_nodes_by="residue_name")

In [38]:
#For local PDB files
g = construct_graph(config=config, pdb_path="./example.pdb")

TypeError: construct_graph() got an unexpected keyword argument 'pdb_path'

In [40]:
#Distance thresholds for node edges
#long_interaction_threshold is specified as min number of residues between nodes that have an edge connecting them (as less interested in residues that are close together in sequence)
from functools import partial
from graphein.protein.edges.distance import add_distance_threshold

new_edge_funcs = {"edge_construction_functions": [partial(add_distance_threshold, long_interaction_threshold=5, threshold=10.)]}
config = ProteinGraphConfig(**new_edge_funcs)

g = construct_graph(config=config, uniprot_id="A2NZL3")
p = plotly_protein_structure_graph(
    g,
    colour_edges_by="kind",
    colour_nodes_by="seq_position",
    label_node_ids=False,
    plot_title="Protein graph created by thresholding distance between nodes. \n Nodes must be <10A apart and at least 5 positions apart \n Nodes coloured by sequence position.",
    node_size_multiplier=1
    )
p.show()

Output()

In [41]:
#k-NN based edges
from functools import partial
from graphein.protein.edges.distance import add_k_nn_edges

new_edge_funcs = {"edge_construction_functions": [partial(add_k_nn_edges, k=3, long_interaction_threshold=0)]}
config = ProteinGraphConfig(**new_edge_funcs)

g = construct_graph(config=config, uniprot_id="A2NZL3")
p = plotly_protein_structure_graph(
    g,
    colour_edges_by="kind",
    colour_nodes_by="seq_position",
    label_node_ids=False,
    plot_title="Protein graph created from K-NN of each node. Nodes coloured by sequence position",
    node_size_multiplier=1
    )
p.show()

Output()

In [43]:
#graph level feature annotation
from graphein.protein.features.sequence.embeddings import esm_sequence_embedding, biovec_sequence_embedding

new_graph_annotation_funcs = {"graph_metadata_functions": [esm_sequence_embedding]}
config = ProteinGraphConfig(**new_graph_annotation_funcs)

g = construct_graph(config=config, uniprot_id="A2NZL3")
print("ESM:", g.graph["esm_embedding_A"])
#print("biovec:", g.graph["biovec_embedding_A"])

Output()

ESM: [-0.00782469  0.24393201 -0.17262627 ...  0.11817875 -0.13062464
 -0.05121422]


In [47]:
from graphein.protein.features.nodes.amino_acid import meiler_embedding

config = ProteinGraphConfig(**{"node_metadata_functions": [meiler_embedding]})
g = construct_graph(config=config, pdb_code="3eiy")

for n, d in g.nodes(data=True):
    print(d["meiler_embedding"])

Output()

KeyError: 'meiler_embedding_A'

In [48]:
from graphein.ml import ProteinGraphListDataset, GraphFormatConvertor
import graphein.protein as gp

# Construct graphs
graphs = gp.construct_graphs_mp(
    pdb_code_it=["3eiy", "4hhb", "1lds", "2ll6"],
    return_dict=False
    )

# do some transformation
graphs = [gp.extract_subgraph_from_chains(g, ["A"]) for g in graphs]

# Convert to PyG Data format
convertor = GraphFormatConvertor(src_format="nx", dst_format="pyg")
graphs = [convertor(g) for g in graphs]

# Create dataset
ds = ProteinGraphListDataset(root=".", data_list=graphs, name="list_test")

  0%|          | 0/4 [00:00<?, ?it/s]

Processing...
Done!


In [None]:
https://github.com/a-r-j/graphein/tree/master/datasets/ppisp