# Explore

In [None]:
import pandas as pd

from deepchem.feat import MolGraphConvFeaturizer, GraphData
from rdkit import Chem

In [2]:
# Location of train.csv in the CPP1708 GitHub repository.
TRAIN_CSV_URL = (
    "https://raw.githubusercontent.com/attilaimre99/CPP1708/refs/heads/main/train.csv"
)

# Load the table into a DataFrame and preview its first rows.
data = pd.read_csv(TRAIN_CSV_URL)
data.head()

Unnamed: 0,id,name,source,sequence,smiles,label,len,average_wt,is_cyclic
0,0,MLCPP2_non_CPP_4095,MLCPP2,NVQLGPSLTEKL,CC(C)C[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCC(...,0,12,1298.505,False
1,1,Dobchevetal_CPP_3291,Dobchevetal,RLIKTLKTLLQKRKTL,CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@@H...,1,16,1953.499,False
2,2,C2Pred_CPP_145,C2Pred,WKCRRQCFRVLHHWN,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@H](CCCNC(=N)...,1,15,2069.466,False
3,3,MLCPP2_non_CPP_4717,MLCPP2,YHLSKEDAEVINATKKRGNKVI,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@H](CCCCN...,0,22,2513.887,False
4,4,CellPPDindependent_CPP_1272,CellPPDindependent,RKKTFKEVANAVKISA,CC[C@H](C)[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@@H]...,1,16,1790.146,False


In [3]:
# Graph featurizer configured with both the edge-feature and chirality options on.
featurizer = MolGraphConvFeaturizer(
    use_chirality=True,
    use_edges=True,
)

In [4]:
# Use the first row of the table as a worked example.
smiles = str(data.loc[0, "smiles"])
name = str(data.loc[0, "name"])

# MolFromSmiles reports a parse failure by returning None rather than raising,
# so fail here with a clear message instead of deep inside the featurizer.
mol = Chem.MolFromSmiles(smiles)
if mol is None:
    raise ValueError(f"RDKit could not parse the SMILES of {name!r}: {smiles!r}")

In [5]:
# featurize() returns an indexable result; the single molecule's graph is its first element.
featurized_batch = featurizer.featurize(mol)
featurized = featurized_batch[0]

# Rebuild a GraphData from just the node features, edge index and edge features.
graph_fields = {
    field: getattr(featurized, field)
    for field in ("node_features", "edge_index", "edge_features")
}
f = GraphData(**graph_fields)

# Convert to a PyG graph object and tag it with the sample's name.
pyg_data = f.to_pyg_graph()
pyg_data.name = name
pyg_data

Data(x=[91, 32], edge_index=[2, 182], edge_attr=[182, 11], name='MLCPP2_non_CPP_4095')

In [7]:
# Display the node feature array held by the GraphData object `f`.
f.node_features

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

# Use GraphCPP's implementation for the dataset

Use the RDKit molecular fingerprint generators defined in `fp_generators.py`.

In [None]:
%%capture
from torch_geometric.loader import DataLoader
from graphcpp.dataset import CPPDataset
from lightning import LightningDataModule
from config import *

In [2]:
class GraphCPPDataModule(LightningDataModule):
    """Lightning data module exposing the train/val/test splits of ``CPPDataset``.

    All three splits are built (and shuffled once) eagerly in ``__init__``, so
    the ``*_dataloader`` methods can be called right after construction.

    Args:
        folder: Directory passed to ``CPPDataset`` as ``root``.
        fp_type: Fingerprint type forwarded to ``CPPDataset``.
        batch_size: Batch size handed to every ``DataLoader``. ``None`` (the
            default) falls back to the module-level ``BATCH_SIZE``
            (presumably provided by ``from config import *`` -- confirm).
        **kwargs: Accepted for call-site compatibility and otherwise ignored.
    """

    def __init__(
        self, folder="data", fp_type="topological", *, batch_size=None, **kwargs
    ):
        super().__init__()
        # Resolve the fallback at construction time so the configured default
        # is still honoured when no explicit batch size is given.
        self.batch_size = BATCH_SIZE if batch_size is None else batch_size
        self.train_split = self._load_split(folder, "train", fp_type)
        self.val_split = self._load_split(folder, "val", fp_type)
        self.test_split = self._load_split(folder, "test", fp_type)

    @staticmethod
    def _load_split(folder, split, fp_type):
        """Build the ``CPPDataset`` for one named split and return it shuffled."""
        return CPPDataset(root=folder, _split=split, fp_type=fp_type).shuffle()

    def train_dataloader(self):
        """Return the training loader; batches are reshuffled by the loader."""
        return DataLoader(self.train_split, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader (no loader-level shuffling)."""
        return DataLoader(self.val_split, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return the test loader (no loader-level shuffling)."""
        return DataLoader(self.test_split, batch_size=self.batch_size)

In [3]:
# Build the data module with its default arguments.
module = GraphCPPDataModule()
# Keep a handle on the training loader so its batches can be inspected.
train_loader = module.train_dataloader()

In [13]:
# Examine first batch
# Pull only the first (index, batch) pair; an empty loader leaves nothing to print.
first_item = next(enumerate(train_loader), None)
if first_item is not None:
    i, batch = first_item
    print(f"Batch {i}:")
    print(f"  Batch type: {type(batch)}")
    print(f"  Batch keys: {batch.keys() if hasattr(batch, 'keys') else 'No keys'}")

    # Each attribute is reported only when the batch actually carries it.
    if hasattr(batch, "x"):
        print(f"  Node features shape: {batch.x.shape}")
    if hasattr(batch, "y"):
        print(f"  Labels shape: {batch.y.shape}")
    if hasattr(batch, "edge_index"):
        print(f"  Edge index: {batch.edge_index.shape}")
    if hasattr(batch, "edge_attr"):
        print(f"  Edge attr: {batch.edge_attr.shape}")

Batch 0:
  Batch type: <class 'abc.DataBatch'>
  Batch keys: ['x', 'edge_index', 'edge_attr', 'smiles', 'ptr', 'name', 'fp', 'y', 'batch']
  Node features shape: torch.Size([87686, 32])
  Labels shape: torch.Size([512])
  Edge index: torch.Size([2, 178724])
  Edge attr: torch.Size([178724, 11])
