## FingerPrints and descriptors

In [133]:
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, rdMolDescriptors,QED, GraphDescriptors
import pandas as pd

def generate_ecfp4(smiles):
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
    return fp.ToBitString()

def get_molecular_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return {
        'MolWt': Descriptors.MolWt(mol),
        'HeavyAtomCount': Descriptors.HeavyAtomCount(mol),
        'NumRotatableBonds': rdMolDescriptors.CalcNumRotatableBonds(mol),
        'TPSA': Descriptors.TPSA(mol),
        'LabuteASA': rdMolDescriptors.CalcLabuteASA(mol),
        'MolLogP': rdMolDescriptors.CalcCrippenDescriptors(mol)[0],
        'MolMR': rdMolDescriptors.CalcCrippenDescriptors(mol)[1],
        'FractionCSP3': rdMolDescriptors.CalcFractionCSP3(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
        'RingCount': Descriptors.RingCount(mol),
        'QED': QED.qed(mol),
        'BalabanJ': GraphDescriptors.BalabanJ(mol),
        'BertzCT': GraphDescriptors.BertzCT(mol),
        'Ipc': GraphDescriptors.Ipc(mol),
        'FormalCharge': Chem.GetFormalCharge(mol),  # Molecular formal charge
        'RadicalElectrons': Descriptors.NumRadicalElectrons(mol)  # Total radical electrons
    }


In [85]:
df = pd.read_csv('preprocessed.csv')

In [135]:
# Usage:
df['ECFP4'] = df['smiles'].apply(generate_ecfp4)



In [111]:
descriptor_df = df['smiles'].apply(get_molecular_descriptors).apply(pd.Series)

In [113]:
descriptor_df['drugbank-id'] = df['drugbank-id']

In [115]:
descriptor_df['logS'] = df['logs']

In [137]:
descriptor_df['ECFP4'] = df['ECFP4']

In [139]:
descriptor_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1664 entries, 0 to 1663
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   MolWt              1664 non-null   float64
 1   HeavyAtomCount     1664 non-null   float64
 2   NumRotatableBonds  1664 non-null   float64
 3   TPSA               1664 non-null   float64
 4   LabuteASA          1664 non-null   float64
 5   MolLogP            1664 non-null   float64
 6   MolMR              1664 non-null   float64
 7   FractionCSP3       1664 non-null   float64
 8   NumHDonors         1664 non-null   float64
 9   NumHAcceptors      1664 non-null   float64
 10  RingCount          1664 non-null   float64
 11  QED                1664 non-null   float64
 12  BalabanJ           1664 non-null   float64
 13  BertzCT            1664 non-null   float64
 14  Ipc                1664 non-null   float64
 15  FormalCharge       1664 non-null   float64
 16  RadicalElectrons   1664 

In [141]:
descriptor_df.to_csv("descriptors.csv", index=False)

In [131]:
df['ECFP4'].head()

0    <rdkit.DataStructs.cDataStructs.ExplicitBitVec...
1    <rdkit.DataStructs.cDataStructs.ExplicitBitVec...
2    <rdkit.DataStructs.cDataStructs.ExplicitBitVec...
3    <rdkit.DataStructs.cDataStructs.ExplicitBitVec...
4    <rdkit.DataStructs.cDataStructs.ExplicitBitVec...
Name: ECFP4, dtype: object

## Graph representation

In [43]:
from rdkit import Chem
from deepchem.feat import MolGraphConvFeaturizer
import numpy as np

class EnhancedMolGraphConvFeaturizer(MolGraphConvFeaturizer):
    def __init__(self, use_edges=True, use_chirality=True, use_partial_charge=False):
        super().__init__(use_edges, use_chirality, use_partial_charge)
        
    def _construct_atom_feature(self, atom):
        """Enhanced atom features matching DGL-LifeSci's AttentiveFP"""
        features = [
            self._atom_type(atom),          # 9 features
            self._formal_charge(atom),      # From paper [1]
            self._hybridization(atom),      # SP/SP2/SP3
            self._hydrogen_bonding(atom),   # Donor/acceptor
            self._aromatic(atom),           # Aromaticity
            self._degree(atom),             # 0-5
            self._num_hydrogens(atom),      # 0-4 
            self._chirality(atom),          # R/S
            self._radical_electrons(atom)   # From paper [1]
        ]
        return np.concatenate(features)

    def _construct_bond_feature(self, bond):
        """Enhanced bond features matching FormulationBCS requirements"""
        features = [
            self._bond_type(bond),          # Single/double/triple/aromatic
            self._same_ring(bond),          # Shared ring membership
            self._conjugated(bond),         # Conjugation
            self._stereo(bond)              # Stereo configuration
        ]
        return np.concatenate(features)

    # New feature extractors
    def _formal_charge(self, atom):
        return np.array([atom.GetFormalCharge()])
        
    def _radical_electrons(self, atom):
        return np.array([atom.GetNumRadicalElectrons()])

In [47]:
import pandas as pd
from rdkit import Chem
from deepchem.feat import MolGraphConvFeaturizer
import numpy as np
import pickle
import os


def process_dataset(csv_path, output_dir="graph_data"):
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Load dataset
    df = pd.read_csv(csv_path)
    print(f"Loaded dataset with {len(df)} entries")
    
    # Initialize featurizer
    featurizer = EnhancedMolGraphConvFeaturizer(use_edges=True)
    
    # Process molecules
    valid_graphs = []
    valid_indices = []
    
    for idx, row in df.iterrows():
        try:
            # Featurize molecule
            graph = featurizer.featurize(row['SMILES'])[0]
            
            # Validate featurization
            if graph is not None and graph.node_features is not None:
                valid_graphs.append({
                    'smiles': row['smiles'],
                    'node_features': graph.node_features,
                    'edge_features': graph.edge_features,
                    'edge_index': graph.edge_index,
                    'logS': row['logs'],  # Keep existing properties
                    'logP': row['logp_1'],
                    'logPapp': row['logp_2']
                })
                valid_indices.append(idx)
                
        except Exception as e:
            print(f"Error processing index {idx} ({row['smiles']}): {str(e)}")
    
    # Save processed data
    with open(os.path.join(output_dir, 'graph_data.pkl'), 'wb') as f:
        pickle.dump(valid_graphs, f)
    
    # Save valid indices for reference
    pd.Series(valid_indices).to_csv(os.path.join(output_dir, 'valid_indices.csv'), index=False)
    
    print(f"Successfully processed {len(valid_graphs)}/{len(df)} molecules")
    print(f"Graph data saved to {output_dir}/graph_data.pkl")
    print(f"Validation indices saved to {output_dir}/valid_indices.csv")


process_dataset("preprocessed.csv")


Loaded dataset with 1664 entries


Failed to featurize datapoint 0, [Cl-].[K+]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity


Error processing index 367 ([Cl-].[K+]): 'numpy.ndarray' object has no attribute 'node_features'


Failed to featurize datapoint 0, [Ca+2].[Cl-].[Cl-]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity


Error processing index 620 ([Cl-].[Cl-].[Ca++]): 'numpy.ndarray' object has no attribute 'node_features'


Failed to featurize datapoint 0, [Li+]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, [Al]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity


Error processing index 710 ([Li+]): 'numpy.ndarray' object has no attribute 'node_features'
Error processing index 715 ([Al]): 'numpy.ndarray' object has no attribute 'node_features'


Failed to featurize datapoint 0, [Mn+2]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, [Cl-].[NH4+]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity


Error processing index 1154 ([Mn++]): 'numpy.ndarray' object has no attribute 'node_features'
Error processing index 1156 ([NH4+].[Cl-]): 'numpy.ndarray' object has no attribute 'node_features'


Failed to featurize datapoint 0, [Cu]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, O. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, [He]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity


Error processing index 1332 ([Cu]): 'numpy.ndarray' object has no attribute 'node_features'
Error processing index 1336 (O): 'numpy.ndarray' object has no attribute 'node_features'
Error processing index 1339 ([He]): 'numpy.ndarray' object has no attribute 'node_features'


Failed to featurize datapoint 0, [C]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, [F-].[Na+]. Appending empty array


Error processing index 1375 ([C]): 'numpy.ndarray' object has no attribute 'node_features'


Exception message: zero-size array to reduction operation maximum which has no identity


Error processing index 1388 ([F-].[Na+]): 'numpy.ndarray' object has no attribute 'node_features'


Failed to featurize datapoint 0, [Cl-].[Cl-].[Sn+2]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, [I-].[Na+]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, [Cu+2].[O-2]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, [Se]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, [Cr]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, [Na+].[OH-]. Appending empty array


Error processing index 1410 ([Cl-].[Cl-].[Sn++]): 'numpy.ndarray' object has no attribute 'node_features'
Error processing index 1425 ([Na+].[I-]): 'numpy.ndarray' object has no attribute 'node_features'
Error processing index 1431 ([O--].[Cu++]): 'numpy.ndarray' object has no attribute 'node_features'
Error processing index 1432 ([Se]): 'numpy.ndarray' object has no attribute 'node_features'
Error processing index 1433 ([Cr]): 'numpy.ndarray' object has no attribute 'node_features'


Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, [K+].[OH-]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity


Error processing index 1437 ([OH-].[Na+]): 'numpy.ndarray' object has no attribute 'node_features'
Error processing index 1438 ([OH-].[K+]): 'numpy.ndarray' object has no attribute 'node_features'


Failed to featurize datapoint 0, O.[Al+3].[Al+3].[Cl-].[OH-].[OH-].[OH-].[OH-].[OH-]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, [Ne]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity


Error processing index 1478 (O.[OH-].[OH-].[OH-].[OH-].[OH-].[Al+3].[Al+3].[Cl-]): 'numpy.ndarray' object has no attribute 'node_features'
Error processing index 1485 ([Ne]): 'numpy.ndarray' object has no attribute 'node_features'


Failed to featurize datapoint 0, [Ag]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, [Mg+2].[O-2].[O-2].[O-2].[Si+4]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity


Error processing index 1560 ([Ag]): 'numpy.ndarray' object has no attribute 'node_features'
Error processing index 1573 ([O--].[O--].[O--].[Mg++].[Si+4]): 'numpy.ndarray' object has no attribute 'node_features'


Failed to featurize datapoint 0, [Fe+3]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity


Error processing index 1601 ([Fe+3]): 'numpy.ndarray' object has no attribute 'node_features'


Failed to featurize datapoint 0, [KH]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 0, [MgH2]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity


Error processing index 1616 ([KH]): 'numpy.ndarray' object has no attribute 'node_features'
Error processing index 1619 ([MgH2]): 'numpy.ndarray' object has no attribute 'node_features'
Successfully processed 1639/1664 molecules
Graph data saved to graph_data/graph_data.pkl
Validation indices saved to graph_data/valid_indices.csv


In [59]:
descriptor_df.head()

Unnamed: 0,MolWt,TPSA,MolLogP,NumHAcceptors,NumHDonors,drugbank-id
0,1269.433,495.89,-3.1057,16.0,17.0,DB00014
1,1811.253,519.89,4.8676,16.0,20.0,DB00027
2,2140.492,889.48,-9.7336,32.0,27.0,DB00067
3,1202.635,278.8,3.269,12.0,5.0,DB00091
4,1355.388,477.85,2.72222,21.0,9.0,DB00115
