In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from IPython.core.display import display, HTML
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from rdkit import Chem

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score

In [4]:
def force_show_all(df):
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', None):
        display(HTML(df.to_html()))

In [8]:
bindingDB = pd.read_csv('SHP2_training_1875.csv')
chembl = pd.read_csv('..\Raw_data\ChemBl\shp2_chembl.csv')

In [9]:
bindingDB

Unnamed: 0,SMILES,IC50(microM)
0,Cc1nc(cc(=O)n1-c1cccc(Cl)c1Cl)N1CCC2(Cc3ccccc3...,0.0008
1,N[C@@H]1CCCC11CCN(CC1)c1cnc(Sc2cccc(NC(=O)c3c(...,0.001
2,N[C@@H]1c2ccccc2CC11CCN(CC1)c1nc(N)c(C2=C(Cl)C...,0.001
3,N[C@@H]1C[C@H](O)CC11CCN(CC1)c1ncc(Sc2cccc(Cl)...,0.0012
4,C[C@@H]1OCC2(CCN(CC2)c2ncc(Sc3cccc(Cl)c3Cl)c3n...,0.0015
...,...,...
1870,NC1=C(N2CCC(CN)CC2)N=CC(C3=CC=CC(Cl)=C3Cl)=C1,1.64
1871,NC1=C(N2CCC(C)(N)CC2)N=CC(C3=CC=CC(Cl)=C3Cl)=C1,1.58
1872,NC1=C(NC2=CC(N)=CC=C2)N=CC(C3=CC=CN=C3)=C1,9.47
1873,NCC(CC1)CCN1C2=C(CO)C=C(C3=CC=CC(Cl)=C3Cl)C=N2,1.36


In [10]:
chembl

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,standard_value
0,CHEMBL328907,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...,inactive,100000.0
1,CHEMBL2092743,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...,inactive,100000.0
2,CHEMBL438997,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(OP(=O)(O)O)...,inactive,1540.0
3,CHEMBL263010,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(OC(C(=O)O)C...,inactive,22000.0
4,CHEMBL420456,Nc1nc(N)c2nc(CN3CCN(Cc4ccc(-c5ccccc5)cc4)CC3)n...,inactive,5000.0
...,...,...,...,...
1140,CHEMBL4778849,CC(C)(C)c1ccc(Oc2ccc(NC(=O)C3=CO[C@@H](O[C@@H]...,inactive,100000.0
1141,CHEMBL4786441,O=C(O)C1=CO[C@@H](O[C@@H]2O[C@H](COCc3ccccc3)[...,inactive,3320.0
1142,CHEMBL4788294,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,inactive,1240.0
1143,CHEMBL4800140,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,inactive,3970.0


In [11]:
bindingDB = bindingDB[['SMILES','IC50(microM)']]
chembl = chembl[['canonical_smiles','standard_value']]

In [12]:
chembl['standard_value'] = chembl['standard_value'].apply(lambda x: x/1000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [13]:
chembl

Unnamed: 0,canonical_smiles,standard_value
0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...,100.00
1,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...,100.00
2,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(OP(=O)(O)O)...,1.54
3,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(OC(C(=O)O)C...,22.00
4,Nc1nc(N)c2nc(CN3CCN(Cc4ccc(-c5ccccc5)cc4)CC3)n...,5.00
...,...,...
1140,CC(C)(C)c1ccc(Oc2ccc(NC(=O)C3=CO[C@@H](O[C@@H]...,100.00
1141,O=C(O)C1=CO[C@@H](O[C@@H]2O[C@H](COCc3ccccc3)[...,3.32
1142,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,1.24
1143,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,3.97


In [14]:
chembl.columns = ['SMILES','IC50(microM)']

In [15]:
chembl

Unnamed: 0,SMILES,IC50(microM)
0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...,100.00
1,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...,100.00
2,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(OP(=O)(O)O)...,1.54
3,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(OC(C(=O)O)C...,22.00
4,Nc1nc(N)c2nc(CN3CCN(Cc4ccc(-c5ccccc5)cc4)CC3)n...,5.00
...,...,...
1140,CC(C)(C)c1ccc(Oc2ccc(NC(=O)C3=CO[C@@H](O[C@@H]...,100.00
1141,O=C(O)C1=CO[C@@H](O[C@@H]2O[C@H](COCc3ccccc3)[...,3.32
1142,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,1.24
1143,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,3.97


In [16]:
force_show_all(chembl)

Unnamed: 0,SMILES,IC50(microM)
0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@@H](Cc1ccc(OP(=O)(O)O)cc1)NC(C)=O)C(=O)N[C@@H](CCC(=O)O)C(=O)O,100.0
1,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)C(Cc1ccc(OP(=O)(O)O)cc1)c1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)O,100.0
2,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(OP(=O)(O)O)cc1)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(C)C)NC(C)=O)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C(N)=O)C(C)C,1.54
3,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccc(OC(C(=O)O)C(=O)O)cc1)NC(=O)[C@H](CC(N)=O)NC(=O)[C@H](CC(C)C)NC(C)=O)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CC(C)C)C(=O)N[C@H](C(N)=O)C(C)C,22.0
4,Nc1nc(N)c2nc(CN3CCN(Cc4ccc(-c5ccccc5)cc4)CC3)nnc2n1,5.0
5,Nc1nc(N)c2nc(CN3CCN(Cc4ccccc4)CC3)nnc2n1,9.0
6,CCN(CC)Cc1nnc2nc(N)nc(N)c2n1,14.0
7,O=P(O)(O)C(F)(F)c1ccc(COc2ccc(OCc3ccc(C(F)(F)P(=O)(O)O)cc3)cc2)cc1,16.0
8,N.N.N.N.O=P(O)(O)C(F)(F)c1ccc(COCCOCc2ccc(C(F)(F)P(=O)(O)O)cc2)cc1,152.0
9,O=P(O)(O)C(F)(F)c1ccc(CCCCc2ccc(C(F)(F)P(=O)(O)O)cc2)cc1,91.0


In [17]:
bindingDB['database'] = 'BindingDB'
chembl['database'] = 'Chembl'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
force_show_all(bindingDB)

Unnamed: 0,SMILES,IC50(microM),database
0,Cc1nc(cc(=O)n1-c1cccc(Cl)c1Cl)N1CCC2(Cc3ccccc3[C@@H]2N)CC1,0.0008,BindingDB
1,N[C@@H]1CCCC11CCN(CC1)c1cnc(Sc2cccc(NC(=O)c3c(O)nc4CCCCn4c3=O)c2Cl)c(N)n1,0.001,BindingDB
2,N[C@@H]1c2ccccc2CC11CCN(CC1)c1nc(N)c(C2=C(Cl)C(Cl)=CCC2)c(n1)C(N)=O,0.001,BindingDB
3,N[C@@H]1C[C@H](O)CC11CCN(CC1)c1ncc(Sc2cccc(Cl)c2Cl)c2nccn12,0.0012,BindingDB
4,C[C@@H]1OCC2(CCN(CC2)c2ncc(Sc3cccc(Cl)c3Cl)c3nccn23)[C@@H]1N,0.0015,BindingDB
5,N[C@@H]1c2cccnc2CC11CCN(CC1)c1ncc(Sc2cccc(Cl)c2Cl)c2nccn12,0.0017,BindingDB
6,N[C@@H]1c2ccccc2CC11CCN(CC1)c1ncc(Sc2ccnc3ccsc23)c2nccn12,0.0017,BindingDB
7,CSc1ccccc1Sc1cnc(N2CCC3(Cc4ccccc4[C@H]3N)CC2)n2ccnc12,0.0018,BindingDB
8,C[C@@H]1OCC2(CCN(CC2)c2ncc(Sc3ccnc4N5CC6(CC6)CC5COc34)nc2CO)[C@@H]1N,0.002,BindingDB
9,N[C@@H]1c2ccccc2CC11CCN(CC1)c1ncc(Sc2cccc(Cl)c2Cl)c2nccn12,0.0021000000000000003,BindingDB


In [19]:
whole_data = pd.concat([bindingDB,chembl])
whole_data

Unnamed: 0,SMILES,IC50(microM),database
0,Cc1nc(cc(=O)n1-c1cccc(Cl)c1Cl)N1CCC2(Cc3ccccc3...,0.0008,BindingDB
1,N[C@@H]1CCCC11CCN(CC1)c1cnc(Sc2cccc(NC(=O)c3c(...,0.001,BindingDB
2,N[C@@H]1c2ccccc2CC11CCN(CC1)c1nc(N)c(C2=C(Cl)C...,0.001,BindingDB
3,N[C@@H]1C[C@H](O)CC11CCN(CC1)c1ncc(Sc2cccc(Cl)...,0.0012,BindingDB
4,C[C@@H]1OCC2(CCN(CC2)c2ncc(Sc3cccc(Cl)c3Cl)c3n...,0.0015,BindingDB
...,...,...,...
1140,CC(C)(C)c1ccc(Oc2ccc(NC(=O)C3=CO[C@@H](O[C@@H]...,100.0,Chembl
1141,O=C(O)C1=CO[C@@H](O[C@@H]2O[C@H](COCc3ccccc3)[...,3.32,Chembl
1142,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,1.24,Chembl
1143,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,3.97,Chembl


In [20]:
whole_data.iloc[:, 0].duplicated(keep=False).sum()

472

In [21]:
duplicates = whole_data.loc[whole_data.iloc[:, 0].duplicated(keep=False), :]
duplicates.shape

(472, 3)

In [22]:
force_show_all(duplicates)

Unnamed: 0,SMILES,IC50(microM),database
20,N[C@H]1C[C@@H]2CC[C@H]([C@H]1F)N2c1nc2[nH]nc(-c3ccc4ncsc4c3Cl)c2nc1CO,0.003,BindingDB
245,COC(=O)c1cc(O)c(=O)c2c(O)c(O)ccc2c1,0.161,BindingDB
321,CCc1ccc(OCC(=O)Nc2ccc(-c3nc4ccccc4o3)c(O)c2)cc1,0.368,BindingDB
434,COc1ccccc1NS(=O)(=O)c1cc(NC(=O)c2sccc2C)ccc1N1CCCC1,0.665,BindingDB
468,NC(=O)c1cc(O)c(=O)c2c(O)c(O)c(F)cc2c1,0.949,BindingDB
473,CCOC(=O)c1cc(O)c(=O)c2c(O)c(O)ccc2c1,0.985,BindingDB
533,NN=C1C(Cl)=C(Cl)C(Cl)=C1Cl,1.45,BindingDB
656,COc1ccc(NC(=O)c2cc(=O)c3ccccc3o2)cc1Cl,2.82,BindingDB
672,NS(=O)(=O)c1ccc(NC(=S)N2CCC(Cc3ccccc3)CC2)cc1,3.13,BindingDB
682,O=C1Nc2ccc3sc4ccccc4c3c2C1=O,3.25,BindingDB


In [23]:
whole_data.drop_duplicates(subset ="SMILES" ,keep='last',inplace=True)
whole_data.shape

(2753, 3)

In [24]:
whole_data.iloc[:, 0].duplicated(keep=False).sum()

0

In [25]:
whole_data['database'].value_counts()

BindingDB    1822
Chembl        931
Name: database, dtype: int64

In [26]:
whole_data

Unnamed: 0,SMILES,IC50(microM),database
0,Cc1nc(cc(=O)n1-c1cccc(Cl)c1Cl)N1CCC2(Cc3ccccc3...,0.0008,BindingDB
1,N[C@@H]1CCCC11CCN(CC1)c1cnc(Sc2cccc(NC(=O)c3c(...,0.001,BindingDB
2,N[C@@H]1c2ccccc2CC11CCN(CC1)c1nc(N)c(C2=C(Cl)C...,0.001,BindingDB
3,N[C@@H]1C[C@H](O)CC11CCN(CC1)c1ncc(Sc2cccc(Cl)...,0.0012,BindingDB
4,C[C@@H]1OCC2(CCN(CC2)c2ncc(Sc3cccc(Cl)c3Cl)c3n...,0.0015,BindingDB
...,...,...,...
1140,CC(C)(C)c1ccc(Oc2ccc(NC(=O)C3=CO[C@@H](O[C@@H]...,100.0,Chembl
1141,O=C(O)C1=CO[C@@H](O[C@@H]2O[C@H](COCc3ccccc3)[...,3.32,Chembl
1142,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,1.24,Chembl
1143,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,3.97,Chembl


In [27]:
whole_data = whole_data.dropna()

In [28]:
whole_data

Unnamed: 0,SMILES,IC50(microM),database
0,Cc1nc(cc(=O)n1-c1cccc(Cl)c1Cl)N1CCC2(Cc3ccccc3...,0.0008,BindingDB
1,N[C@@H]1CCCC11CCN(CC1)c1cnc(Sc2cccc(NC(=O)c3c(...,0.001,BindingDB
2,N[C@@H]1c2ccccc2CC11CCN(CC1)c1nc(N)c(C2=C(Cl)C...,0.001,BindingDB
3,N[C@@H]1C[C@H](O)CC11CCN(CC1)c1ncc(Sc2cccc(Cl)...,0.0012,BindingDB
4,C[C@@H]1OCC2(CCN(CC2)c2ncc(Sc3cccc(Cl)c3Cl)c3n...,0.0015,BindingDB
...,...,...,...
1140,CC(C)(C)c1ccc(Oc2ccc(NC(=O)C3=CO[C@@H](O[C@@H]...,100.0,Chembl
1141,O=C(O)C1=CO[C@@H](O[C@@H]2O[C@H](COCc3ccccc3)[...,3.32,Chembl
1142,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,1.24,Chembl
1143,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,3.97,Chembl


In [29]:
whole_data = whole_data.reset_index()
whole_data = whole_data.drop(['index'], axis=1)
whole_data

Unnamed: 0,SMILES,IC50(microM),database
0,Cc1nc(cc(=O)n1-c1cccc(Cl)c1Cl)N1CCC2(Cc3ccccc3...,0.0008,BindingDB
1,N[C@@H]1CCCC11CCN(CC1)c1cnc(Sc2cccc(NC(=O)c3c(...,0.001,BindingDB
2,N[C@@H]1c2ccccc2CC11CCN(CC1)c1nc(N)c(C2=C(Cl)C...,0.001,BindingDB
3,N[C@@H]1C[C@H](O)CC11CCN(CC1)c1ncc(Sc2cccc(Cl)...,0.0012,BindingDB
4,C[C@@H]1OCC2(CCN(CC2)c2ncc(Sc3cccc(Cl)c3Cl)c3n...,0.0015,BindingDB
...,...,...,...
2747,CC(C)(C)c1ccc(Oc2ccc(NC(=O)C3=CO[C@@H](O[C@@H]...,100.0,Chembl
2748,O=C(O)C1=CO[C@@H](O[C@@H]2O[C@H](COCc3ccccc3)[...,3.32,Chembl
2749,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,1.24,Chembl
2750,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,3.97,Chembl


In [30]:
def activity(s):
    if s[0] == '<':
        if float(s[1:]) <= 1:
            return 1
        else:
            return np.nan
    elif s[0] == '>':
        if float(s[1:]) >= 1:
            return 0
        else:
            return np.nan
    else:
        if float(s) > 1:
            return 0
        else:
            return 1

In [31]:
whole_data['IC50(microM)'] = whole_data['IC50(microM)'].apply(str)

In [32]:
whole_data['TARGET'] = whole_data['IC50(microM)'].apply(activity)
whole_data

Unnamed: 0,SMILES,IC50(microM),database,TARGET
0,Cc1nc(cc(=O)n1-c1cccc(Cl)c1Cl)N1CCC2(Cc3ccccc3...,0.0008,BindingDB,1.0
1,N[C@@H]1CCCC11CCN(CC1)c1cnc(Sc2cccc(NC(=O)c3c(...,0.001,BindingDB,1.0
2,N[C@@H]1c2ccccc2CC11CCN(CC1)c1nc(N)c(C2=C(Cl)C...,0.001,BindingDB,1.0
3,N[C@@H]1C[C@H](O)CC11CCN(CC1)c1ncc(Sc2cccc(Cl)...,0.0012,BindingDB,1.0
4,C[C@@H]1OCC2(CCN(CC2)c2ncc(Sc3cccc(Cl)c3Cl)c3n...,0.0015,BindingDB,1.0
...,...,...,...,...
2747,CC(C)(C)c1ccc(Oc2ccc(NC(=O)C3=CO[C@@H](O[C@@H]...,100.0,Chembl,0.0
2748,O=C(O)C1=CO[C@@H](O[C@@H]2O[C@H](COCc3ccccc3)[...,3.32,Chembl,0.0
2749,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,1.24,Chembl,0.0
2750,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,3.97,Chembl,0.0


In [33]:
whole_data['TARGET'].value_counts()

0.0    1781
1.0     969
Name: TARGET, dtype: int64

In [34]:
def check_smiles_validity(df):
    smiles = list(df['SMILES'].values)
    invalid_smile_indices = []
    invalid_chemistry_indices = []

    for idx, smile in enumerate(tqdm(smiles)):
        m = Chem.MolFromSmiles(smile, sanitize=False)
        if m is None:
            print(idx, smile, "invalid smile")
            invalid_smile_indices.append(idx)
        else:
            try:
                Chem.SanitizeMol(m)
            except:
                print(idx, smile, "invalid chemistry")
                invalid_chemistry_indices.append(idx)
  
    return invalid_smile_indices, invalid_chemistry_indices

In [35]:
invalid_smiles, invalid_chemistry = check_smiles_validity(whole_data)

  0%|          | 0/2752 [00:00<?, ?it/s]

In [36]:
invalid_smiles

[]

In [37]:
invalid_chemistry

[]

In [44]:
class RDKit_2D:
    def __init__(self, smiles):
        self.mols = [Chem.MolFromSmiles(i) for i in smiles]
        self.smiles = smiles
        
        
    def compute_2Drdkit(self, name):
        rdkit_2d_desc = []
        calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
        header = calc.GetDescriptorNames()
        for i in tqdm(range(len(self.mols))):
            try:
                ds = calc.CalcDescriptors(self.mols[i])
                rdkit_2d_desc.append(ds)
            except:
                print(self.smiles[i])
        df = pd.DataFrame(rdkit_2d_desc,columns=header)
        df.insert(loc=0, column='SMILES', value=self.smiles)
        df.to_csv(name+'_RDKit_2D.csv', index=False)

In [45]:
def main():
    filename = "shp2_2752"         
    smiles = [i for i in whole_data['SMILES'].values]  
    ## Compute RDKit_2D Fingerprints and export a csv file.
    RDKit_descriptor = RDKit_2D(smiles)        # create your RDKit_2D object and provide smiles
    RDKit_descriptor.compute_2Drdkit(filename) # compute RDKit_2D and provide the name of your desired output file. you can use the same name as the input file because the RDKit_2D class will ensure to add "_RDKit_2D.csv" as part of the output file.

if __name__ == '__main__':
    main()

  0%|          | 0/2752 [00:00<?, ?it/s]

In [46]:
data = pd.read_csv('shp2_2752_RDKit_2D.csv')
data

Unnamed: 0,SMILES,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,Cc1nc(cc(=O)n1-c1cccc(Cl)c1Cl)N1CCC2(Cc3ccccc3...,12.992564,-0.169513,12.992564,0.068072,0.609173,455.389,431.197,454.132717,160,...,0,0,0,0,0,0,0,0,0,0
1,N[C@@H]1CCCC11CCN(CC1)c1cnc(Sc2cccc(NC(=O)c3c(...,13.061568,-0.781987,13.061568,0.246174,0.340043,597.145,563.881,596.208486,216,...,0,0,0,0,0,0,0,0,0,0
2,N[C@@H]1c2ccccc2CC11CCN(CC1)c1nc(N)c(C2=C(Cl)C...,12.340744,-0.676170,12.340744,0.017202,0.606557,485.419,459.211,484.154515,172,...,0,0,0,0,0,0,0,0,0,0
3,N[C@@H]1C[C@H](O)CC11CCN(CC1)c1ncc(Sc2cccc(Cl)...,10.063495,-0.264210,10.063495,0.058620,0.603375,464.422,441.238,463.100037,158,...,0,0,0,0,0,0,0,0,0,0
4,C[C@@H]1OCC2(CCN(CC2)c2ncc(Sc3cccc(Cl)c3Cl)c3n...,6.464779,0.086398,6.464779,0.086398,0.614971,464.422,441.238,463.100037,158,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2747,CC(C)(C)c1ccc(Oc2ccc(NC(=O)C3=CO[C@@H](O[C@@H]...,13.333207,-1.619179,13.333207,0.045316,0.248676,597.661,558.349,597.257396,232,...,0,0,0,0,0,0,0,0,0,0
2748,O=C(O)C1=CO[C@@H](O[C@@H]2O[C@H](COCc3ccccc3)[...,12.516733,-1.051943,12.516733,0.156681,0.076526,824.967,772.551,824.356048,316,...,0,0,0,0,0,0,0,0,0,0
2749,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,13.462705,-3.710843,13.462705,0.073907,0.112068,679.791,642.495,679.223988,252,...,0,0,0,0,0,0,0,0,0,0
2750,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,13.659530,-3.730454,13.659530,0.066112,0.099496,793.132,757.852,791.095528,264,...,0,0,0,0,0,0,0,0,0,0


In [47]:
data['IC50(microM)'] = whole_data['IC50(microM)']
data.shape

(2752, 210)

In [48]:
data = data.dropna()
data

Unnamed: 0,SMILES,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,IC50(microM)
0,Cc1nc(cc(=O)n1-c1cccc(Cl)c1Cl)N1CCC2(Cc3ccccc3...,12.992564,-0.169513,12.992564,0.068072,0.609173,455.389,431.197,454.132717,160,...,0,0,0,0,0,0,0,0,0,0.0008
1,N[C@@H]1CCCC11CCN(CC1)c1cnc(Sc2cccc(NC(=O)c3c(...,13.061568,-0.781987,13.061568,0.246174,0.340043,597.145,563.881,596.208486,216,...,0,0,0,0,0,0,0,0,0,0.001
2,N[C@@H]1c2ccccc2CC11CCN(CC1)c1nc(N)c(C2=C(Cl)C...,12.340744,-0.676170,12.340744,0.017202,0.606557,485.419,459.211,484.154515,172,...,0,0,0,0,0,0,0,0,0,0.001
3,N[C@@H]1C[C@H](O)CC11CCN(CC1)c1ncc(Sc2cccc(Cl)...,10.063495,-0.264210,10.063495,0.058620,0.603375,464.422,441.238,463.100037,158,...,0,0,0,0,0,0,0,0,0,0.0012
4,C[C@@H]1OCC2(CCN(CC2)c2ncc(Sc3cccc(Cl)c3Cl)c3n...,6.464779,0.086398,6.464779,0.086398,0.614971,464.422,441.238,463.100037,158,...,0,0,0,0,0,0,0,0,0,0.0015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2747,CC(C)(C)c1ccc(Oc2ccc(NC(=O)C3=CO[C@@H](O[C@@H]...,13.333207,-1.619179,13.333207,0.045316,0.248676,597.661,558.349,597.257396,232,...,0,0,0,0,0,0,0,0,0,100.0
2748,O=C(O)C1=CO[C@@H](O[C@@H]2O[C@H](COCc3ccccc3)[...,12.516733,-1.051943,12.516733,0.156681,0.076526,824.967,772.551,824.356048,316,...,0,0,0,0,0,0,0,0,0,3.32
2749,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,13.462705,-3.710843,13.462705,0.073907,0.112068,679.791,642.495,679.223988,252,...,0,0,0,0,0,0,0,0,0,1.24
2750,CCOS(=O)(=O)/C=C/c1ccc(OCC2=CC[C@@H]3C(C(=O)Nc...,13.659530,-3.730454,13.659530,0.066112,0.099496,793.132,757.852,791.095528,264,...,0,0,0,0,0,0,0,0,0,3.97


In [49]:
data.to_csv('SHP2_train_descriptors_2741.csv',index=False)