# Embedding Workflow
---

In [None]:
import sys

from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
import numpy as np
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler

# The search path must be extended BEFORE importing from `preprocess`:
# the import below is resolved through "../src" (relative to the current
# working directory), so doing it first fails on a fresh kernel.
sys.path.append("../src")

from preprocess.molecule_graph import collect_continuous_atom_features, mol_to_graph  # noqa: E402

In [2]:
# Example molecule, written as a SMILES string.
smiles = "COC(=O)C[C@](O)(CCCC(C)(C)O)C(=O)O[C@H]1[C@H]2c3cc4OCOc4cc3CCN3CCC[C@]23C=C1OC"
mol = Chem.MolFromSmiles(smiles)

# Morgan fingerprint generator configured with radius 2 and size 2048.
generator = rdFingerprintGenerator.GetMorganGenerator(fpSize=2048, radius=2)

# Fingerprint of the example molecule.
fingerprint = generator.GetFingerprint(mol)

In [3]:
# Copy the fingerprint into a NumPy array; `arr` is filled in place by
# ConvertToNumpyArray rather than being returned.
arr = np.zeros(shape=(1,), dtype=int)
Chem.DataStructs.ConvertToNumpyArray(fingerprint, arr)

# Show which distinct values occur in the converted fingerprint.
print(np.unique(arr))

[0 1]


In [None]:
smiles_list = ["CCO", "C1CCCCC1", "c1ccccc1"]

# Fit the scaler on the continuous atom features gathered from every
# molecule in the list, so all graphs share one normalisation.
continuous_atom_features = collect_continuous_atom_features(smiles_list)
scaler = StandardScaler().fit(continuous_atom_features)

# Convert each SMILES string to a graph, skipping any molecule for which
# mol_to_graph returns None.
graph_list = []
for smiles in smiles_list:
    graph = mol_to_graph(smiles, scaler)
    if graph is None:
        continue
    graph_list.append(graph)

print(graph_list)

[Data(x=[9, 15], edge_index=[2, 16], edge_attr=[16, 13]), Data(x=[18, 15], edge_index=[2, 36], edge_attr=[36, 13]), Data(x=[12, 15], edge_index=[2, 24], edge_attr=[24, 13])]
