In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('data/PSCDB/structural_rearrangement_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,level_0,index,PSCID,Protein Name,Free form,Bound form,Ligands,Classification(?),motion_type,Free PDB,Free Chains,Bound PDB,Bound Chains
0,0,0,0,CD.1,HYPOTHETICAL OXIDOREDUCTASE YIAK,1nxu_AB,1s20_AB,"2xNAD,2xTLA",200004,coupled_domain_motion,1nxu,AB,1s20,AB
1,1,1,1,CD.2,ADENYLATE KINASE,4ake_A,2eck_A,"ADP,AMP",200003,coupled_domain_motion,4ake,A,2eck,A
2,2,2,2,CD.3,GLUCOKINASE,1q18_AB,1sz2_AB,2xBGC,200003,coupled_domain_motion,1q18,AB,1sz2,AB
3,3,3,3,CD.4,LACTOFERRIN,1lfh_A,1lfi_A,"2xCU,2xNAG",110103,coupled_domain_motion,1lfh,A,1lfi,A
4,4,4,4,CD.5,ELONGATION FACTOR 2,1n0v_D,1n0u_A,SO1,110002,coupled_domain_motion,1n0v,D,1n0u,A


In [5]:
for i, j in enumerate(df.motion_type.unique()):
    print(i, j)

0 coupled_domain_motion
1 independent_domain_motion
2 coupled_local_motion
3 independent_local_motion
4 burying_ligand_motion
5 no_significant_motion
6 other_motion


In [6]:
labels_map = {key: value for value, key in enumerate(df.motion_type.unique())} 
labels_map

{'coupled_domain_motion': 0,
 'independent_domain_motion': 1,
 'coupled_local_motion': 2,
 'independent_local_motion': 3,
 'burying_ligand_motion': 4,
 'no_significant_motion': 5,
 'other_motion': 6}

In [12]:
from sklearn.model_selection import train_test_split
from graphein.construct_graphs import ProteinGraph

# Split datasets
x_train, x_test, y_train, y_test = train_test_split(df, labels, test_size=0.15)

# Initialise Graph Constructor
pg = ProteinGraph(granularity='CA', insertions=False, keep_hets=True,
                  node_featuriser='meiler', get_contacts_path='/Users/arianjamasb/github/getcontacts',
                  pdb_dir='../../examples/pdbs/',
                  contacts_dir='../../examples/contacts/',
                  exclude_waters=True, covalent_bonds=False, include_ss=True)

# Build Graphs
train_graphs = [pg.dgl_graph_from_pdb_code(pdb_code=x_train['Free PDB'].iloc[i],
                                         chain_selection=list(x_train['Free Chains'].iloc[i])) for i in tqdm(range(len(x_train)))]

test_graphs = [pg.dgl_graph_from_pdb_code(pdb_code=x_test['Free PDB'].iloc[i],
                                         chain_selection=list(x_test['Free Chains'].iloc[i])) for i in tqdm(range(len(x_test)))]

ModuleNotFoundError: No module named 'graphein.construct_graphs'

https://github.com/a-r-j/graphein/blob/master/datasets/pscdb/process_data.ipynb 

In [13]:
import pandas as pd
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.graphs import construct_graph
from torch_geometric.data import Data
import torch

# Load PSCDB metadata
df = pd.read_csv("data/PSCDB/structural_rearrangement_data.csv")

# Define graph configuration (customize features as needed)
config = ProteinGraphConfig(
    edge_construction_functions=["add_distance_threshold"],  # 5Å cutoff
    node_metadata_functions=["amino_acid_one_hot", "secondary_structure"],
    graph_format="pyg",  # Output PyG Data objects directly
)

labels = pd.get_dummies(df.motion_type).values.tolist()
labels = [torch.Tensor(i) for i in labels]

# Convert PDB entries to PyG graphs with labels
graphs = []
labels_map = {key: value for value, key in enumerate(df.motion_type.unique())} 

for pdb_id, label in zip(df["Free PDB"], df["motion_type"]):
    try:
        # Fetch and construct graph (Graphein downloads PDB files automatically)
        pyg_data = construct_graph(config=config, pdb_code=pdb_id)
        
        # Add label to the PyG Data object
        pyg_data.y = torch.tensor([labels_map[label]], dtype=torch.long)
        
        graphs.append(pyg_data)
    except Exception as e:
        print(f"Skipping {pdb_id}: {e}")
    break

Skipping 1nxu: 'str' object is not callablePlease ensure that provided node metadata functions match the f(n: str, d: Dict) function signature, where n is the node ID and d is the node data dictionary 


In [15]:
graphs

[]