# **Installing RDKit**

In [None]:
# Environment setup: download the Miniconda3 py37_4.8.2 installer, make it executable,
# run it with /usr/local as the install prefix, then install RDKit from the `rdkit`
# conda channel. (The notebook later imports google.colab, so this presumably targets
# a Colab runtime — confirm before running elsewhere.)
! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh
# NOTE(review): -b/-f presumably mean batch (non-interactive) and force; -p is the prefix — confirm with the installer's help.
! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local
# NOTE(review): the RDKit version is not pinned, so re-runs may resolve a different build.
! conda install -c rdkit rdkit -y
import sys
# Add the Miniconda prefix's Python 3.7 site-packages directory to the import path of
# the running kernel so the freshly installed packages can be imported.
sys.path.append('/usr/local/lib/python3.7/site-packages/')

--2021-04-13 13:53:49--  https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.131.3, 104.16.130.3, 2606:4700::6810:8203, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.131.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85055499 (81M) [application/x-sh]
Saving to: ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh’


2021-04-13 13:53:50 (140 MB/s) - ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh’ saved [85055499/85055499]

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ | done
Solving environment: - \ done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - _libgcc_mutex==0.1=main
    - asn1crypto==1.3.0=py37_0
    - ca-certificates==2020.1.1=0
    - certifi==2019.11.28=py37_0
    - cffi==1.14.0=py37h2e261b9_0
    - chardet==3.0.4=py37_1003
    - conda-package-handling==1.6.0=py37h7b6447c_0


# **Mount Google Drive (where the dataset is stored)**

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from a path under
# this mount point in a later cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Read in the dataset**

In [None]:
import pandas as pd
# Read the cleaned dataset from the mounted Drive folder into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/dataset_cleaned.csv')
# Bare last expression so the notebook renders the frame.
data

Unnamed: 0,Compound Id,SMILES,Solubility
0,10100,CCC(=O)OC(CC1=CC=CC=C1)(C2=CC=CC=C2)C(C)CN(C)C,3.320000
1,10113978,CC1=C(C=C(C=C1)NC2=NC=CC(=N2)N(C)C3=CC4=NN(C(...,3.300000
2,10133,CC(CCC(=O)O)C1CCC2C1(CCC3C2C(CC4C3(CCC(C4)O)C...,0.089900
3,10182969,COC1=CC=C(C=C1)N2C3=C(CCN(C3=O)C4=CC=C(C=C4)N...,0.040000
4,10220503,CCC(=O)NCC1CC1C2=C3CCOC3=CC=C2,1.100000
...,...,...,...
1225,9916,CN=C1CN(C(=C2C=C(C=CC2=N1)Cl)C3=CC=CC=C3)O.Cl,66.000000
1226,9926791,CC1CCN(CC1N(C)C2=NC=NC3=C2C=CN3)C(=O)CC#N,155.100000
1227,9949848,CC1=CC2C(CCC3(C2CCC3(C)C(=O)C)C)C4(C1=CC(=O)C...,0.000005
1228,996,C1=CC=C(C=C1)O,77.000000


In [None]:
# Inspect the SMILES column (bracket access works for any column name).
data["SMILES"]

0          CCC(=O)OC(CC1=CC=CC=C1)(C2=CC=CC=C2)C(C)CN(C)C
1        CC1=C(C=C(C=C1)NC2=NC=CC(=N2)N(C)C3=CC4=NN(C(...
2        CC(CCC(=O)O)C1CCC2C1(CCC3C2C(CC4C3(CCC(C4)O)C...
3        COC1=CC=C(C=C1)N2C3=C(CCN(C3=O)C4=CC=C(C=C4)N...
4                          CCC(=O)NCC1CC1C2=C3CCOC3=CC=C2
                              ...                        
1225        CN=C1CN(C(=C2C=C(C=CC2=N1)Cl)C3=CC=CC=C3)O.Cl
1226            CC1CCN(CC1N(C)C2=NC=NC3=C2C=CN3)C(=O)CC#N
1227     CC1=CC2C(CCC3(C2CCC3(C)C(=O)C)C)C4(C1=CC(=O)C...
1228                                       C1=CC=C(C=C1)O
1229                CC1=CC(=C(C=C1)SC2=CC=CC=C2N3CCNCC3)C
Name: SMILES, Length: 1230, dtype: object

In [None]:
# Inspect the Solubility column, which is used as the regression target later on.
data["Solubility"]

0         3.320000
1         3.300000
2         0.089900
3         0.040000
4         1.100000
           ...    
1225     66.000000
1226    155.100000
1227      0.000005
1228     77.000000
1229      7.800000
Name: Solubility, Length: 1230, dtype: float64

# **Convert the SMILES strings to RDKit molecule objects**

In [None]:
from rdkit import Chem

In [None]:
# Parse every SMILES string into an RDKit molecule, keeping the dataset's row order.
mol_list = [Chem.MolFromSmiles(smiles_string) for smiles_string in data.SMILES]

In [None]:
# Sanity check: number of parsed entries (should equal the number of dataset rows).
len(mol_list)

1230

# **Calculate molecular descriptors**


1.   cLogP (Octanol-water partition coefficient)
2.   MW (Molecular weight)
3.   RB (Number of rotatable bonds)
4.   AP (Aromatic proportion = number of aromatic atoms / total number of heavy atoms)




In [None]:
import numpy as np
from rdkit.Chem import Descriptors


In [None]:
def generate(smiles, verbose=False):
    """Compute basic RDKit descriptors for a collection of SMILES strings.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per molecule.
    verbose : bool, optional
        Currently unused; kept so existing callers that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        One row per input SMILES, in input order, with float columns
        ``MolLogP``, ``MolWt`` and ``NumRotatableBonds``. Empty input yields
        an empty frame that still has those three columns.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` returns ``None`` for one of the inputs
        (i.e. the SMILES string could not be parsed).
    """
    columnNames = ["MolLogP", "MolWt", "NumRotatableBonds"]

    # Collect plain Python rows and build the array once at the end; stacking
    # inside the loop re-copies the whole array on every iteration.
    rows = []
    for position, elem in enumerate(smiles):
        mol = Chem.MolFromSmiles(elem)
        if mol is None:
            # Fail early with the offending entry instead of an opaque error
            # from the descriptor functions further down.
            raise ValueError(
                "Could not parse SMILES at position %d: %r" % (position, elem)
            )
        rows.append([
            Descriptors.MolLogP(mol),
            Descriptors.MolWt(mol),
            Descriptors.NumRotatableBonds(mol),
        ])

    # Force a 2-D float array so that zero or one input molecules still produce
    # a (n_rows, 3) table rather than a 1-D array that pandas rejects.
    baseData = np.array(rows, dtype=float).reshape(-1, len(columnNames))
    descriptors = pd.DataFrame(data=baseData, columns=columnNames)

    return descriptors

In [None]:
# Build the descriptor table (MolLogP, MolWt, NumRotatableBonds) for every SMILES in the dataset.
df = generate(data.SMILES)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,MolLogP,MolWt,NumRotatableBonds
0,4.27550,339.479,8.0
1,3.13904,437.529,5.0
2,4.47790,392.580,4.0
3,2.69960,459.506,5.0
4,2.25120,245.322,4.0
...,...,...,...
1225,2.27100,336.222,1.0
1226,1.54478,312.377,3.0
1227,5.27970,340.507,1.0
1228,1.39220,94.113,0.0


# **Number of Aromatic Atoms**

In [None]:
def AromaticAtoms(m):
  """Return the number of atoms in molecule ``m`` that are flagged aromatic.

  ``m`` must provide ``GetNumAtoms()`` and ``GetAtomWithIdx(i)``; each atom must
  provide ``GetIsAromatic()`` returning a bool.
  """
  atom_indices = range(m.GetNumAtoms())
  # Each aromatic atom contributes 1; non-aromatic atoms contribute nothing.
  return sum(1 for idx in atom_indices if m.GetAtomWithIdx(idx).GetIsAromatic())

In [None]:
# Aromatic-atom count for every parsed molecule, in the same order as mol_list.
desc_AromaticAtoms = [AromaticAtoms(element) for element in mol_list]
# Preview only the first entries; echoing the whole list floods the notebook output.
desc_AromaticAtoms[:10]

[12,
 21,
 0,
 17,
 6,
 18,
 11,
 6,
 0,
 24,
 6,
 15,
 6,
 12,
 0,
 10,
 0,
 15,
 0,
 18,
 6,
 0,
 15,
 12,
 10,
 6,
 12,
 0,
 0,
 0,
 11,
 21,
 0,
 12,
 12,
 9,
 0,
 15,
 6,
 11,
 12,
 6,
 6,
 25,
 6,
 0,
 6,
 6,
 9,
 12,
 5,
 17,
 12,
 0,
 15,
 6,
 24,
 9,
 18,
 5,
 16,
 12,
 0,
 6,
 0,
 0,
 16,
 12,
 0,
 0,
 12,
 15,
 5,
 18,
 9,
 9,
 12,
 9,
 9,
 6,
 16,
 10,
 12,
 9,
 9,
 15,
 9,
 15,
 9,
 12,
 6,
 9,
 17,
 26,
 21,
 23,
 6,
 10,
 10,
 21,
 0,
 0,
 0,
 12,
 12,
 16,
 12,
 0,
 18,
 6,
 10,
 30,
 18,
 6,
 0,
 29,
 0,
 0,
 10,
 6,
 6,
 6,
 18,
 12,
 18,
 15,
 10,
 12,
 10,
 16,
 22,
 6,
 0,
 10,
 6,
 16,
 6,
 6,
 12,
 21,
 0,
 12,
 0,
 24,
 18,
 12,
 12,
 0,
 12,
 6,
 0,
 6,
 6,
 16,
 12,
 0,
 12,
 15,
 6,
 0,
 6,
 11,
 0,
 9,
 0,
 9,
 10,
 10,
 0,
 0,
 18,
 6,
 15,
 6,
 5,
 6,
 0,
 10,
 12,
 0,
 9,
 0,
 9,
 6,
 6,
 0,
 15,
 6,
 27,
 10,
 14,
 17,
 6,
 0,
 12,
 9,
 11,
 6,
 0,
 23,
 6,
 6,
 6,
 9,
 15,
 6,
 12,
 14,
 6,
 18,
 18,
 6,
 12,
 0,
 9,
 0,
 12,
 15,
 0,
 15,
 11,
 6,
 0,


# **Number of Heavy Atoms**

In [None]:
# Heavy-atom count for every parsed molecule, in the same order as mol_list.
desc_HeavyAtomCount = [Descriptors.HeavyAtomCount(element) for element in mol_list]
# Preview only the first entries; echoing the whole list floods the notebook output.
desc_HeavyAtomCount[:10]

[25,
 31,
 28,
 34,
 18,
 32,
 37,
 9,
 20,
 39,
 16,
 43,
 12,
 30,
 25,
 24,
 13,
 45,
 26,
 54,
 11,
 11,
 29,
 22,
 17,
 28,
 28,
 7,
 23,
 23,
 18,
 31,
 23,
 23,
 25,
 17,
 30,
 31,
 23,
 26,
 32,
 19,
 18,
 31,
 13,
 28,
 32,
 8,
 20,
 31,
 14,
 22,
 23,
 10,
 16,
 23,
 42,
 23,
 24,
 20,
 31,
 22,
 20,
 19,
 51,
 23,
 33,
 28,
 23,
 14,
 39,
 59,
 7,
 34,
 20,
 16,
 33,
 20,
 19,
 17,
 32,
 59,
 23,
 18,
 23,
 33,
 18,
 34,
 10,
 34,
 16,
 23,
 37,
 34,
 40,
 33,
 10,
 59,
 59,
 46,
 9,
 14,
 10,
 29,
 24,
 35,
 58,
 73,
 51,
 22,
 26,
 101,
 30,
 33,
 9,
 38,
 14,
 31,
 29,
 35,
 21,
 16,
 28,
 25,
 29,
 30,
 22,
 20,
 17,
 26,
 33,
 17,
 28,
 51,
 23,
 29,
 16,
 15,
 20,
 34,
 34,
 27,
 42,
 82,
 25,
 19,
 21,
 16,
 21,
 9,
 7,
 21,
 20,
 29,
 19,
 56,
 22,
 33,
 19,
 14,
 16,
 30,
 32,
 10,
 23,
 19,
 11,
 16,
 40,
 65,
 28,
 24,
 29,
 11,
 13,
 22,
 5,
 14,
 21,
 57,
 19,
 11,
 18,
 26,
 17,
 14,
 39,
 19,
 40,
 28,
 22,
 22,
 15,
 11,
 38,
 35,
 30,
 20,
 12,
 28,
 17,
 14

# **Computing the Aromatic Proportion (AP) descriptor**

In [None]:
# Aromatic proportion = aromatic atoms / heavy atoms, per molecule, in mol_list order.
desc_AromaticProportion = [AromaticAtoms(element)/Descriptors.HeavyAtomCount(element) for element in mol_list]
# Preview only the first entries; echoing the whole list floods the notebook output.
desc_AromaticProportion[:10]

[0.48,
 0.6774193548387096,
 0.0,
 0.5,
 0.3333333333333333,
 0.5625,
 0.2972972972972973,
 0.6666666666666666,
 0.0,
 0.6153846153846154,
 0.375,
 0.3488372093023256,
 0.5,
 0.4,
 0.0,
 0.4166666666666667,
 0.0,
 0.3333333333333333,
 0.0,
 0.3333333333333333,
 0.5454545454545454,
 0.0,
 0.5172413793103449,
 0.5454545454545454,
 0.5882352941176471,
 0.21428571428571427,
 0.42857142857142855,
 0.0,
 0.0,
 0.0,
 0.6111111111111112,
 0.6774193548387096,
 0.0,
 0.5217391304347826,
 0.48,
 0.5294117647058824,
 0.0,
 0.4838709677419355,
 0.2608695652173913,
 0.4230769230769231,
 0.375,
 0.3157894736842105,
 0.3333333333333333,
 0.8064516129032258,
 0.46153846153846156,
 0.0,
 0.1875,
 0.75,
 0.45,
 0.3870967741935484,
 0.35714285714285715,
 0.7727272727272727,
 0.5217391304347826,
 0.0,
 0.9375,
 0.2608695652173913,
 0.5714285714285714,
 0.391304347826087,
 0.75,
 0.25,
 0.5161290322580645,
 0.5454545454545454,
 0.0,
 0.3157894736842105,
 0.0,
 0.0,
 0.48484848484848486,
 0.42857142857142855

In [None]:
# Wrap the per-molecule aromatic proportions in a single-column DataFrame so they can be
# concatenated column-wise with the other descriptors.
df_desc_AromaticProportion = pd.DataFrame(desc_AromaticProportion, columns=['AromaticProportion'])
# Bare last expression so the notebook renders the frame.
df_desc_AromaticProportion

Unnamed: 0,AromaticProportion
0,0.480000
1,0.677419
2,0.000000
3,0.500000
4,0.333333
...,...
1225,0.545455
1226,0.391304
1227,0.000000
1228,0.857143


# **X matrix (combining all computed descriptors into one dataframe)**

In [None]:
# Re-display the descriptor table (MolLogP, MolWt, NumRotatableBonds) before combining.
df

Unnamed: 0,MolLogP,MolWt,NumRotatableBonds
0,4.27550,339.479,8.0
1,3.13904,437.529,5.0
2,4.47790,392.580,4.0
3,2.69960,459.506,5.0
4,2.25120,245.322,4.0
...,...,...,...
1225,2.27100,336.222,1.0
1226,1.54478,312.377,3.0
1227,5.27970,340.507,1.0
1228,1.39220,94.113,0.0


In [None]:
# Re-display the aromatic-proportion column before combining.
df_desc_AromaticProportion

Unnamed: 0,AromaticProportion
0,0.480000
1,0.677419
2,0.000000
3,0.500000
4,0.333333
...,...
1225,0.545455
1226,0.391304
1227,0.000000
1228,0.857143


In [None]:
# Feature matrix: the three RDKit descriptors plus the aromatic proportion, joined
# side by side on the shared row index.
X = pd.concat(
    objs=[df, df_desc_AromaticProportion],
    axis="columns",
)
X

Unnamed: 0,MolLogP,MolWt,NumRotatableBonds,AromaticProportion
0,4.27550,339.479,8.0,0.480000
1,3.13904,437.529,5.0,0.677419
2,4.47790,392.580,4.0,0.000000
3,2.69960,459.506,5.0,0.500000
4,2.25120,245.322,4.0,0.333333
...,...,...,...,...
1225,2.27100,336.222,1.0,0.545455
1226,1.54478,312.377,3.0,0.391304
1227,5.27970,340.507,1.0,0.000000
1228,1.39220,94.113,0.0,0.857143


# **Y matrix**

In [None]:
# Show the first rows of the raw dataset to locate the target column.
data.head()

Unnamed: 0,Compound Id,SMILES,Solubility
0,10100,CCC(=O)OC(CC1=CC=CC=C1)(C2=CC=CC=C2)C(C)CN(C)C,3.32
1,10113978,CC1=C(C=C(C=C1)NC2=NC=CC(=N2)N(C)C3=CC4=NN(C(...,3.3
2,10133,CC(CCC(=O)O)C1CCC2C1(CCC3C2C(CC4C3(CCC(C4)O)C...,0.0899
3,10182969,COC1=CC=C(C=C1)N2C3=C(CCN(C3=O)C4=CC=C(C=C4)N...,0.04
4,10220503,CCC(=O)NCC1CC1C2=C3CCOC3=CC=C2,1.1


# **Assigning the third column (index 2) to the Y matrix**

In [None]:
# Target vector: the Solubility column (the third column of the dataset), selected by
# name so that a change in the CSV's column order cannot silently pick the wrong column.
Y = data["Solubility"]
Y

0         3.320000
1         3.300000
2         0.089900
3         0.040000
4         1.100000
           ...    
1225     66.000000
1226    155.100000
1227      0.000005
1228     77.000000
1229      7.800000
Name: Solubility, Length: 1230, dtype: float64

# **Data splitting**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# reproducible across kernel restarts (Restart & Run All gives the same partitions).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)