This notebook generates extra RDKit chemical features
and checks the PAINS descriptors.

In [7]:
# Install the third-party packages this notebook relies on: RDKit,
# scikit-learn, and descriptastorus (installed directly from its GitHub repo).
!pip install rdkit-pypi
!pip install scikit-learn
!pip install git+https://github.com/bp-kelley/descriptastorus

Collecting git+https://github.com/bp-kelley/descriptastorus
  Cloning https://github.com/bp-kelley/descriptastorus to /tmp/pip-req-build-7zv_9brp
  Running command git clone -q https://github.com/bp-kelley/descriptastorus /tmp/pip-req-build-7zv_9brp


In [8]:
from collections import defaultdict

In [9]:
import pandas as pd

In [10]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from descriptastorus.descriptors import rdDescriptors, rdNormalizedDescriptors
from tqdm.notebook import tqdm
import numpy as np

In [11]:
def rdkit_2d_normalized_features_generator(mol):
    """
    Generates RDKit 2D normalized features for a molecule.

    :param mol: A molecule (i.e., either a SMILES string or an RDKit molecule).
        RDKit molecules are first converted to isomeric SMILES, because the
        descriptastorus generator is fed a SMILES string.
    :return: The result of ``RDKit2DNormalized().process(smiles)``, unchanged.
        NOTE(review): callers in this notebook skip element 0 when building
        the ``desc_*`` columns -- presumably it is the success flag that
        descriptastorus prepends to the feature values; confirm.
    """
    # isinstance (instead of an exact type comparison) also treats str
    # subclasses such as numpy.str_ as SMILES; previously those were handed
    # to Chem.MolToSmiles, which only accepts RDKit molecules.
    smiles = mol if isinstance(mol, str) else Chem.MolToSmiles(mol, isomericSmiles=True)
    generator = rdNormalizedDescriptors.RDKit2DNormalized()
    return generator.process(smiles)

In [37]:
# Defaults exposed in the signature of morgan_counts_features_generator.
MORGAN_RADIUS = 3
MORGAN_NUM_BITS = 1024


def morgan_counts_features_generator(mol,
                                     radius: int = MORGAN_RADIUS,
                                     num_bits: int = MORGAN_NUM_BITS):
    """
    Generates a counts-based chiral Morgan fingerprint for a molecule.

    :param mol: A molecule (i.e., either a SMILES string or an RDKit molecule).
        RDKit molecules are first converted to isomeric SMILES.
    :param radius: Morgan fingerprint radius. NOTE(review): accepted for
        interface compatibility but not forwarded -- the descriptastorus
        generator below is built with its own defaults.
    :param num_bits: Number of bits in the Morgan fingerprint. NOTE(review):
        likewise not forwarded to the generator.
    :return: The result of ``ChiralMorganCounts().process(smiles)``, unchanged.
        NOTE(review): presumably element 0 is the descriptastorus success flag
        rather than a fingerprint count; confirm.
    """
    # isinstance (instead of an exact type comparison) also treats str
    # subclasses such as numpy.str_ as SMILES rather than as RDKit molecules.
    smiles = mol if isinstance(mol, str) else Chem.MolToSmiles(mol, isomericSmiles=True)
    generator = rdDescriptors.ChiralMorganCounts()
    return generator.process(smiles)

In [25]:
# Read the cleaned training table and drop its 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv call -- confirm).
data_train = (
    pd.read_csv('../data/clean_train_no_FP.csv')
    .drop(labels=['Unnamed: 0'], axis='columns')
)

In [26]:
# Display the training table as the cell output.
data_train

Unnamed: 0,Unnamed: 0.1,Smiles,Active,clean,rot,heavy
0,258,Nn1cnc2cc3ccccc3cc2c1=O,False,Nn1cnc2cc3ccccc3cc2c1=O,0,16
1,316,O=C(O)C(O)C(O)C(=O)O.c1cnc2cc3c(cc2n1)C1CNCC3C1,False,c1cnc2cc3c(cc2n1)C1CNCC3C1,0,16
2,2775,O=C1CC2OCC=C3CN4CCC56c7ccccc7N1C5C2C3CC46,False,O=C1CC2OCC=C3CN4CCC56c7ccccc7N1C5C2C3CC46,0,25
3,310,[O-]n1ccccc1=S.[O-]n1ccccc1=S.[Zn+2],True,On1ccccc1=S,0,8
4,5408,Nc1nc[nH]n1,False,Nc1nc[nH]n1,0,6
...,...,...,...,...,...,...
5552,3340,CSCC[C@H](NC(=O)[C@H](CC(C)C)NC(=O)CNC(=O)[C@H...,False,CSCCC(NC(=O)C(CC(C)C)NC(=O)CNC(=O)C(Cc1ccccc1)...,42,95
5553,4731,CC(C)CC(NC(=O)C(C)NC(=O)CNC(=O)C(NC=O)C(C)C)C(...,False,CC(C)CC(NC(=O)C(C)NC(=O)CNC(=O)C(NC=O)C(C)C)C(...,53,136
5554,578,CC(C)C[C@H](NC(=O)[C@H](CCCCN)NC(=O)[C@H](CCCN...,False,CC(C)CC(NC(=O)C(CCCCN)NC(=O)C(CCCN=C(N)N)NC(=O...,62,128
5555,3465,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...,False,CC[C@H](C)C(NC(=O)C(CCC(=O)O)NC(=O)C(CCC(=O)O)...,66,155


In [27]:
# Read the cleaned test table and drop its 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv call -- confirm).
data_test = (
    pd.read_csv('../data/clean_test_no_FP.csv')
    .drop(labels=['Unnamed: 0'], axis='columns')
)

In [28]:
# Display the test table as the cell output.
data_test

Unnamed: 0,Unnamed: 0.1,Smiles,clean,rot,heavy
0,973,C[C@@H]1O[C@@]2(CS1)CN1CCC2CC1,C[C@@H]1O[C@@]2(CS1)CN1CCC2CC1,0,13
1,1112,Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@H](Cl)[C@H](Cl)[...,Cl[C@H]1[C@H](Cl)[C@@H](Cl)[C@H](Cl)[C@H](Cl)[...,0,12
2,1059,C=C1C(=O)O[C@H]2[C@H]1CC/C(C)=C\CC[C@@]1(C)O[C...,C=C1C(=O)O[C@H]2[C@H]1CC/C(C)=C\CC[C@@]1(C)O[C...,0,18
3,1052,O[C@H]1CO[C@@H]2[C@H](O)CO[C@H]12,O[C@H]1CO[C@@H]2[C@H](O)CO[C@H]12,0,10
4,1034,Oc1ccc(Cl)c2c1CCC2,Oc1ccc(Cl)c2c1CCC2,0,11
...,...,...,...,...,...
1609,1316,CC(C)C[C@H](NC(=O)[C@@H](Cc1c[nH]c2ccccc12)NC(...,CC(C)CC(NC(=O)C(Cc1c[nH]c2ccccc12)NC(=O)C(Cc1c...,34,95
1610,1265,CC[C@H](C(=O)N1CCCC[C@H]1C(=O)O[C@H](CCc1ccc(O...,CCC(C(=O)N1CCCCC1C(=O)O[C@H](CCc1ccc(OC)c(OC)c...,37,102
1611,498,CC(=O)N[C@H](Cc1ccc2ccccc2c1)C(=O)N[C@H](Cc1cc...,CC(=O)NC(Cc1ccc2ccccc2c1)C(=O)NC(Cc1ccc(Cl)cc1...,41,117
1612,1459,CCCCCCCCCCCCCCCC(=O)N[C@@H](CCC(O)NCCCC[C@H](N...,CCCCCCCCCCCCCCCC(=O)NC(CCC(O)NCCCCC(NC(=O)C(C)...,131,266


In [29]:
# One normalized RDKit 2D feature row per cleaned test SMILES, in frame order.
all_desc_test = [
  rdkit_2d_normalized_features_generator(Chem.MolFromSmiles(smiles))
  for smiles in tqdm(data_test.clean)
]

  0%|          | 0/1614 [00:00<?, ?it/s]

In [30]:
# One normalized RDKit 2D feature row per cleaned training SMILES, in frame order.
all_desc_train = [
  rdkit_2d_normalized_features_generator(Chem.MolFromSmiles(smiles))
  for smiles in tqdm(data_train.clean)
]

  0%|          | 0/5557 [00:00<?, ?it/s]

In [38]:
# One Morgan-count fingerprint row per cleaned test SMILES, in frame order.
fp_test = [
  morgan_counts_features_generator(Chem.MolFromSmiles(smiles))
  for smiles in tqdm(data_test.clean)
]

  0%|          | 0/1614 [00:00<?, ?it/s]

In [39]:
# One Morgan-count fingerprint row per cleaned training SMILES, in frame order.
fp_train = [
  morgan_counts_features_generator(Chem.MolFromSmiles(smiles))
  for smiles in tqdm(data_train.clean)
]

  0%|          | 0/5557 [00:00<?, ?it/s]

In [31]:
# Number of values in the first training row produced by the RDKit 2D generator.
len(all_desc_train[0])

201

In [40]:
# Pivot the per-molecule feature rows into one list per output column.
descs_test = defaultdict(list)
for row in all_desc_test:
  # Element 0 is skipped (presumably the descriptastorus success flag --
  # confirm). The upper bound follows the row length instead of the former
  # hard-coded 201, so the cell keeps working if the descriptor count changes.
  for idx in range(1, len(row)):
    descs_test[f'desc_{idx}'].append(row[idx])
for row in fp_test:
  # NOTE(review): unlike the descriptor rows above, element 0 is kept here and
  # becomes the 'fp_0' column. If it is the success flag rather than a count,
  # it is a constant non-feature column; left as-is to preserve the CSV schema.
  for idx, value in enumerate(row):
    descs_test[f'fp_{idx}'].append(value)
descs_test['Smiles'] = data_test.Smiles

In [41]:
# Assemble the test feature table from the column-wise lists.
test_data = pd.DataFrame(descs_test)

In [42]:
# Pivot the per-molecule feature rows into one list per output column.
descs_train = defaultdict(list)
for row in all_desc_train:
  # Element 0 is skipped (presumably the descriptastorus success flag --
  # confirm). The upper bound follows the row length instead of the former
  # hard-coded 201, so the cell keeps working if the descriptor count changes.
  for idx in range(1, len(row)):
    descs_train[f'desc_{idx}'].append(row[idx])
for row in fp_train:
  # NOTE(review): unlike the descriptor rows above, element 0 is kept here and
  # becomes the 'fp_0' column. If it is the success flag rather than a count,
  # it is a constant non-feature column; left as-is to preserve the CSV schema.
  for idx, value in enumerate(row):
    descs_train[f'fp_{idx}'].append(value)
# Carry the original SMILES and the activity label alongside the features.
descs_train['Smiles'] = data_train.Smiles
descs_train['Active'] = data_train.Active
train_data = pd.DataFrame(data=descs_train)

In [44]:
# Write both feature tables to disk. No index argument is given, so to_csv
# also writes the DataFrame index as the first column.
test_data.to_csv(path_or_buf='../data/test_rdkit_2d_normalized_count_fp_clean.csv')
train_data.to_csv(path_or_buf='../data/train_rdkit_2d_normalized_count_fp_clean.csv')

Below is the code for preprocessing the PAINS results. It was not used in the final solution.

In [102]:
# Load the table of PAINS/BRENK filter results.
pains_data = pd.read_csv(filepath_or_buffer='../data/new_data_after_pains.csv')

In [103]:
# Display the PAINS results table as the cell output.
pains_data

Unnamed: 0,Active,canonical_smiles,core,BRENK,PAINS_A,PAINS_B,PAINS_C
0,False,COc1ccc2[nH]cc(CCN)c2c1,c1ccc2[nH]ccc2c1,,,,
1,False,CCCN1CCC[C@H](c2cccc(O)c2)C1.Cl,c1ccc([C@H]2CCCNC2)cc1,,,,
2,False,O=C(NO)c1cnc(N2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2...,O=S(=O)(c1ccc2ccccc2c1)N1CCN(c2ncccn2)CC1,"['hydroxamic_acid', 'Oxygen-nitrogen_single_bo...",,,
3,False,Nc1cccc(CNC(=O)c2ccc(Oc3ccc(OCc4cccc(F)c4)cc3)...,O=C(NCc1ccccc1)c1ccc(Oc2ccc(OCc3ccccc3)cc2)nc1,['aniline'],,,
4,False,Fc1ccccc1CNCc1ccc(-c2ccnc3[nH]ccc23)cc1,c1ccc(CNCc2ccc(-c3ccnc4[nH]ccc34)cc2)cc1,,,,
...,...,...,...,...,...,...,...
5552,False,O=C(Oc1ccc([N+](=O)[O-])cc1)N1CCC(C(O)(c2ccc3c...,O=C(Oc1ccccc1)N1CCC(C(c2ccc3c(c2)OCO3)c2ccc3c(...,"['nitro_group', 'Oxygen-nitrogen_single_bond']",,,
5553,False,Nc1nonc1/C(=N/O)Nc1ccc(F)c(Br)c1,N=C(Nc1ccccc1)c1cnon1,"['imine_1', 'imine_2', 'oxime_1', 'Oxygen-nitr...",,,
5554,False,Oc1cccc2cccnc12,c1ccc2ncccc2c1,,,,
5555,False,OC(c1ccc(-c2ccc(CN3CCN(Cc4ccncc4)CC3)cc2)c(F)c...,c1ccc(-c2ccc(CN3CCN(Cc4ccncc4)CC3)cc2)cc1,,,,


In [88]:
# Join the PAINS table to the training frame on the original SMILES string,
# then keep only the cleaned SMILES (renamed to 'Smiles') and the label.
good_pains_data = (
  pains_data
  .merge(
    data_train[['Smiles', 'clean']],
    left_on='canonical_smiles',
    right_on='Smiles',
  )
  [['clean', 'Active']]
  .rename(columns={'clean': 'Smiles'})
)

In [89]:
# Save the merged PAINS training table as comma-separated text without the index.
good_pains_data.to_csv('../data/good_pains_train.csv', index=False)