# Description

This notebook concatenates all sources of raw SMILES data and prepares the data so that the `cleanup_smiles.py` script can parse the original data (used for training the RNN).

In [1]:
import pandas as pd

In [3]:
# Load the MOSES dataset, keep only rows that have a SMILES value, and add a
# lower-case 'smiles' column plus the string length of each entry.
moses_smiles = pd.read_csv('./datasets/moses_dataset_v1.csv', sep=',')
print(moses_smiles.shape)  # shape as loaded
moses_smiles = moses_smiles[moses_smiles['SMILES'].notna()]
print(moses_smiles.shape)  # shape after dropping rows with a missing SMILES
# .assign() returns a new frame, so the derived columns are never written into
# a filtered slice of the loaded frame (avoids SettingWithCopyWarning).
# 'length' is computed from the freshly added 'smiles' column.
moses_smiles = moses_smiles.assign(
    smiles=lambda df: df['SMILES'],
    length=lambda df: df['smiles'].str.len(),
)
moses_smiles.head()

(1936962, 2)
(1936962, 2)


Unnamed: 0,SMILES,SPLIT,smiles,length
0,CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1,train,CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1,38
1,CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1,train,CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1,36
2,CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1,test,CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1,38
3,Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO,train,Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO,35
4,Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C,train,Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C,32


In [4]:
# Load the ChEMBL 26 molecule export (semicolon-separated), keep only rows that
# have a Smiles value, and add a lower-case 'smiles' column plus the string
# length of each entry.
chembl_smiles = pd.read_csv('./datasets/CHEMBL26-chembl_molecule.csv', sep=';')
print(chembl_smiles.shape)  # shape as loaded
chembl_smiles = chembl_smiles[chembl_smiles['Smiles'].notna()]
print(chembl_smiles.shape)  # shape after dropping rows with a missing Smiles
# .assign() returns a new frame, so the derived columns are never written into
# a filtered slice of the loaded frame (avoids SettingWithCopyWarning).
# 'length' is computed from the freshly added 'smiles' column.
chembl_smiles = chembl_smiles.assign(
    smiles=lambda df: df['Smiles'],
    length=lambda df: df['smiles'].str.len(),
)
chembl_smiles.head()

(1919023, 31)
(1913602, 31)


Unnamed: 0,ChEMBL ID,Name,Synonyms,Type,Max Phase,Molecular Weight,Targets,Bioactivities,AlogP,PSA,...,Heavy Atoms,HBA Lipinski,HBD Lipinski,#RO5 Violations (Lipinski),Molecular Weight (Monoisotopic),Molecular Species,Molecular Formula,Smiles,smiles,length
369,CHEMBL288742,,,Small molecule,0,468.43,2,2.0,2.96,137.04,...,33.0,9.0,4.0,0.0,468.1621,ACID,C21H23F3N4O5,O=C(O)C(CCCCn1cnc2c1NC=NCC2O)(Cc1ccccc1C(F)(F)...,O=C(O)C(CCCCn1cnc2c1NC=NCC2O)(Cc1ccccc1C(F)(F)...,54
372,CHEMBL293576,,,Small molecule,0,434.54,1,1.0,2.56,136.76,...,32.0,8.0,6.0,1.0,434.243,BASE,C24H30N6O2,Cc1ccc(-c2cc(CN(Cc3ccccc3)C(CCCN=C(N)N)C(N)=O)...,Cc1ccc(-c2cc(CN(Cc3ccccc3)C(CCCN=C(N)N)C(N)=O)...,53
373,CHEMBL159008,,,Small molecule,0,269.3,1,2.0,3.27,55.49,...,20.0,4.0,1.0,0.0,269.1052,ACID,C16H15NO3,COCc1noc(O)c1Cc1ccc2ccccc2c1,COCc1noc(O)c1Cc1ccc2ccccc2c1,28
374,CHEMBL157177,,,Small molecule,0,278.36,1,1.0,3.82,34.89,...,21.0,3.0,0.0,0.0,278.1419,NEUTRAL,C18H18N2O,Cc1ccc2c(c1)c(=O)nc(C(C)C)n2-c1ccccc1,Cc1ccc2c(c1)c(=O)nc(C(C)C)n2-c1ccccc1,37
375,CHEMBL53561,,,Small molecule,0,353.47,0,,1.83,74.49,...,26.0,6.0,3.0,0.0,353.2216,NEUTRAL,C20H27N5O,Cc1cc(N)cc(C)c1C(=O)NCCN1CCN(c2ccccn2)CC1,Cc1cc(N)cc(C)c1C(=O)NCCN1CCN(c2ccccn2)CC1,41


In [5]:
# Read the SMILES of the original dataset into a single 'smiles' column
# (column name supplied here, so no header row is read from the file) and
# record the string length of every entry.
old_smiles = pd.read_csv('./datasets/dataset.smi', names=["smiles"]).assign(
    length=lambda frame: frame["smiles"].str.len()
)

In [6]:
# Reduce the MOSES frame to its 'smiles' column (a Series) and drop duplicates.
# Only exact string duplicates are removed; no canonicalization is performed in
# this cell, so the entries still need a canonicalizer pass before the final merge.
moses_smiles = moses_smiles.loc[:, 'smiles']
print(moses_smiles.shape)
moses_smiles = moses_smiles.drop_duplicates(keep='first')
print(moses_smiles.shape)

(1936962,)
(1936962,)


In [7]:
# Reduce the ChEMBL frame to its 'smiles' column (a Series) and drop duplicates.
# Only exact string duplicates are removed; no canonicalization is performed in
# this cell, so the entries still need a canonicalizer pass before the final merge.
chembl_smiles = chembl_smiles.loc[:, 'smiles']
print(chembl_smiles.shape)
chembl_smiles = chembl_smiles.drop_duplicates(keep='first')
print(chembl_smiles.shape)

(1913602,)
(1913492,)


In [8]:
# Reduce the original-dataset frame to its 'smiles' column (a Series) and drop
# exact string duplicates, printing the size before and after.
old_smiles = old_smiles.loc[:, 'smiles']
print(old_smiles.shape)
old_smiles = old_smiles.drop_duplicates(keep='first')
print(old_smiles.shape)

(556134,)
(556134,)


In [9]:
# Merge the three sources into one Series, de-duplicating after each merge so
# the printed shapes show how many new entries each source contributes.
# pd.concat is used because Series.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; like append, it keeps the original index labels.
smiles = pd.concat([moses_smiles, chembl_smiles])
print(smiles.shape)  # MOSES + ChEMBL, before de-duplication
smiles = smiles.drop_duplicates()
print(smiles.shape)  # MOSES + ChEMBL, unique strings only
smiles = pd.concat([smiles, old_smiles])
print(smiles.shape)  # with the original dataset added
smiles = smiles.drop_duplicates()
print(smiles.shape)  # final number of unique SMILES strings

(3850454,)
(3781835,)
(4337969,)
(4314433,)


In [10]:
# Preview the first five merged entries.
smiles.head(n=5)

0    CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1
1      CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1
2    CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1
3       Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO
4          Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C
Name: smiles, dtype: object

In [11]:
# Write one SMILES per line, without a header row or index column.
# mode='w' keeps this cell safe to re-run: in append mode every re-run would add
# another full copy of the data to the existing file.
# NOTE: this overwrites the file, including any entries added to it by hand.
smiles.to_csv('./datasets/all_smiles.smi', header=False, index=False, sep='\t', mode='w')

In [12]:
# Read the written file back to check its size and first rows.
# The HIV inhibitor SMILES are meant to be appended to the list by hand; removing
# duplicates afterwards shows whether the data already contains any of them.
all_smiles_test = pd.read_csv('./datasets/all_smiles.smi', header=None, sep='\t')
print(all_smiles_test.shape)
all_smiles_test.head(n=5)

(4314433, 1)


Unnamed: 0,0
0,CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1
1,CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1
2,CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1
3,Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO
4,Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C


In [13]:
# These lists can now be combined and the data-cleaning Python script run on the result

HIV inhibitor drug SMILES found, to be added manually to the data above. The number after each drug name is the length of its SMILES string.

https://pubchem.ncbi.nlm.nih.gov/

Tipranavir - 85
CCCC1(CC(=C(C(=O)O1)C(CC)C2=CC(=CC=C2)NS(=O)(=O)C3=NC=C(C=C3)C(F)(F)F)O)CCC4=CC=CC=C4
Darunavir - 71
CC(C)CN(CC(C(CC1=CC=CC=C1)NC(=O)OC2COC3C2CCO3)O)S(=O)(=O)C4=CC=C(C=C4)N
Amprenavir - 66
CC(C)CN(CC(C(CC1=CC=CC=C1)NC(=O)OC2CCOC2)O)S(=O)(=O)C3=CC=C(C=C3)N
Lopinavir - 84
CC1=C(C(=CC=C1)C)OCC(=O)NC(CC2=CC=CC=C2)C(CC(CC3=CC=CC=C3)NC(=O)C(C(C)C)N4CCCNC4=O)O
Atazanavir - 98
CC(C)(C)C(C(=O)NC(CC1=CC=CC=C1)C(CN(CC2=CC=C(C=C2)C3=CC=CC=N3)NC(=O)C(C(C)(C)C)NC(=O)OC)O)NC(=O)OC
Saquinavir - 89
CC(C)(C)NC(=O)C1CC2CCCCC2CN1CC(C(CC3=CC=CC=C3)NC(=O)C(CC(=O)N)NC(=O)C4=NC5=CC=CC=C5C=C4)O
Indinavir - 82
CC(C)(C)NC(=O)C1CN(CCN1CC(CC(CC2=CC=CC=C2)C(=O)NC3C(CC4=CC=CC=C34)O)O)CC5=CN=CC=C5
Ritonavir - 93
CC(C)C1=NC(=CS1)CN(C)C(=O)NC(C(C)C)C(=O)NC(CC2=CC=CC=C2)CC(C(CC3=CC=CC=C3)NC(=O)OCC4=CN=CS4)O
Nelfinavir - 70
CC1=C(C=CC=C1O)C(=O)NC(CSC2=CC=CC=C2)C(CN3CC4CCCCC4CC3C(=O)NC(C)(C)C)O
Efavirenz - 45
C1CC1C#CC2(C3=C(C=CC(=C3)Cl)NC(=O)O2)C(F)(F)F

GS-8374 - 89
CCOP(=O)(COC1=CC=C(C=C1)CC(C(CN(CC(C)C)S(=O)(=O)C2=CC=C(C=C2)OC)O)NC(=O)OC3COC4C3CCO4)OCC
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3957959/

SPI-256 - ???
http://www.natap.org/2006/CROI/CROI_04.htm

See this for more:
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5228633/
TMC310911 - 96
CC(C)CN(CC(C(CC1=CC=CC=C1)NC(=O)OC2COC3C2CCO3)O)S(=O)(=O)C4=CC5=C(C=C4)N=C(S5)NC6CCN(CC6)C7CCCC7
CTP-518 - 98 (note: this SMILES string is identical to the Atazanavir entry above)
CC(C)(C)C(C(=O)NC(CC1=CC=CC=C1)C(CN(CC2=CC=C(C=C2)C3=CC=CC=N3)NC(=O)C(C(C)(C)C)NC(=O)OC)O)NC(=O)OC
PPL-100 - 83
CC(C)CN(C(CCCCNC(=O)C(C(C1=CC=CC=C1)C2=CC=CC=C2)NC(=O)OC)CO)S(=O)(=O)C3=CC=C(C=C3)N