# Graph Generation from Custom One-Hot Encodings
Includes:
- mol_weight and logp: scalar molecular properties stored in the graph Data.

- fingerprint: a 2048-dimensional bit vector (float tensor) stored as data.fingerprint

In [1]:
import pandas as pd
import torch
import os
from rdkit import Chem
from rdkit.Chem import rdchem
from tqdm import tqdm
from torch_geometric.data import Data

In [4]:
import torch
import os
import pandas as pd
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, AllChem
from torch_geometric.data import Data

# Define atom features
def atom_features(atom):
    """Encode one atom as a 6-element float tensor.

    The entries are, in order: atomic number, formal charge, aromatic flag
    (0 or 1), numeric value of the hybridization enum, degree, and total
    hydrogen count. Values are stored as raw numbers, not one-hot vectors.

    Args:
        atom: Atom-like object exposing the RDKit accessor methods used below.

    Returns:
        A 1-D ``torch.float`` tensor with 6 entries.
    """
    descriptor = (
        atom.GetAtomicNum(),
        atom.GetFormalCharge(),
        1 if atom.GetIsAromatic() else 0,
        # `.real` turns the hybridization enum into its plain numeric value.
        atom.GetHybridization().real,
        atom.GetDegree(),
        atom.GetTotalNumHs(),
    )
    return torch.tensor(descriptor, dtype=torch.float)

# Define bond features
def bond_features(bond):
    """Encode one bond as a 3-element float tensor.

    The entries are, in order: bond order, conjugation flag (0 or 1), and
    ring-membership flag (0 or 1).

    The bond order is kept as the float reported by ``GetBondTypeAsDouble()``
    (single=1.0, aromatic=1.5, double=2.0, triple=3.0). Truncating it to an
    int would collapse aromatic bonds (1.5) onto 1 and make them
    indistinguishable from single bonds in this column.

    Args:
        bond: Bond-like object exposing the RDKit accessor methods used below.

    Returns:
        A 1-D ``torch.float`` tensor with 3 entries.
    """
    return torch.tensor([
        float(bond.GetBondTypeAsDouble()),
        float(bond.GetIsConjugated()),
        float(bond.IsInRing()),
    ], dtype=torch.float)

# Convert SMILES to PyG graph with optional hybrid features
def smiles_to_graph(smiles, mol_id):
    """Build a PyG ``Data`` graph from a SMILES string.

    Args:
        smiles: SMILES string of the molecule.
        mol_id: Identifier stored unchanged on the graph as ``mol_id``.

    Returns:
        A ``Data`` object carrying ``x`` (one row per atom), ``edge_index``
        and ``edge_attr`` (two directed entries per bond), ``smiles``,
        ``mol_id``, ``mol_weight`` and ``logp`` (1-element float tensors) and
        ``fingerprint`` (2048-element float tensor). Returns ``None`` when
        ``smiles`` is not a string, cannot be parsed, or yields no atoms.
    """
    # Missing CSV cells reach us as NaN floats; RDKit's parser only accepts
    # strings, so reject anything else up front instead of raising.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    # An empty SMILES parses to a molecule with zero atoms, which cannot be
    # stacked into a node-feature matrix.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # Node features
    x = torch.stack([atom_features(atom) for atom in mol.GetAtoms()])

    # Edge indices and features; every bond is emitted in both directions so
    # the graph is effectively undirected.
    pairs = []
    pair_feats = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        f = bond_features(bond)
        pairs += [[i, j], [j, i]]
        pair_feats += [f, f]

    if pairs:
        edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()
        edge_attr = torch.stack(pair_feats)
    else:
        # Bondless molecules (single atoms, monoatomic ions): torch.stack
        # rejects an empty list, so build correctly shaped empty tensors.
        # The width 3 matches the three values produced by bond_features.
        edge_index = torch.empty((2, 0), dtype=torch.long)
        edge_attr = torch.empty((0, 3), dtype=torch.float)

    # Molecular-level features
    mol_weight = Descriptors.MolWt(mol)
    logp = Crippen.MolLogP(mol)

    # Morgan fingerprint (ECFP4, radius=2, 2048 bits)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    fingerprint = torch.tensor(list(fp), dtype=torch.float)

    return Data(
        x=x,
        edge_index=edge_index,
        edge_attr=edge_attr,
        smiles=smiles,
        mol_id=mol_id,
        mol_weight=torch.tensor([mol_weight], dtype=torch.float),
        logp=torch.tensor([logp], dtype=torch.float),
        fingerprint=fingerprint,
    )

# Main function
def generate_graphs(smiles_csv="../data/step2_kinase_inhibitors_smiles.csv", output_dir="data/graphs/"):
    """Convert every molecule listed in a CSV into a saved graph file.

    Reads ``smiles_csv``, converts each row's ``canonical_smiles`` with
    ``smiles_to_graph`` and writes the result to
    ``<output_dir>/<molecule_chembl_id>.pt`` via ``torch.save``. Rows whose
    SMILES is missing or cannot be converted are skipped and counted.

    Args:
        smiles_csv: Path to a CSV with ``canonical_smiles`` and
            ``molecule_chembl_id`` columns.
        output_dir: Directory for the ``.pt`` files; created if absent.
    """
    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(smiles_csv)
    saved = 0
    skipped = 0

    # Iterate the two needed columns directly; this avoids building a Series
    # per row as iterrows() does.
    rows = zip(df["canonical_smiles"], df["molecule_chembl_id"])
    for smiles, mol_id in tqdm(rows, total=len(df), desc="Creating graphs"):
        # Empty CSV cells are read as NaN (a float), which the converter
        # cannot parse, so only hand over real strings.
        data = smiles_to_graph(smiles, mol_id) if isinstance(smiles, str) else None

        # Compare against None explicitly: failure is signalled by None, and
        # the truthiness of a graph object is not something to rely on.
        if data is None:
            skipped += 1
            continue

        torch.save(data, os.path.join(output_dir, f"{mol_id}.pt"))
        saved += 1

    print(f"✓ Saved {saved} molecular graphs to {output_dir}")
    if skipped:
        print(f"Skipped {skipped} rows with missing or unparsable SMILES")

# Run it: generate all graphs with the default CSV path and output directory
# whenever this code executes as the main module.
if __name__ == "__main__":
    generate_graphs()


Creating graphs: 100%|██████████| 10584/10584 [02:20<00:00, 75.08it/s]

✓ Saved 10584 molecular graphs to data/graphs/





In [7]:
import torch

# One previously saved graph, used as a quick sanity check.
example_graph_path = "../data/graphs/CHEMBL472.pt"

# weights_only=False unpickles the complete stored object rather than only
# tensors; this executes pickle code, so use it on trusted files only.
graph = torch.load(example_graph_path, weights_only=False)

# Report the node-feature matrix dimensions (rows = nodes, columns = features).
node_matrix = graph.x
print("Node feature matrix shape:", node_matrix.shape)
print("Number of features per node:", node_matrix.size(1))


Node feature matrix shape: torch.Size([33, 6])
Number of features per node: 6
