In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose values are all zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the columns that contain at least one
        value that does not compare equal to 0.
    """
    # Gather the offending column names first, then drop them in a single call.
    zero_only = [name for name in data.columns if (data[name] == 0).all()]
    return data.drop(columns=zero_only)

In [3]:
# Read the raw table from disk.
data = pd.read_csv("source/smiles_e1.csv")

# Keep only the SMILES strings, as a one-column frame that later cells extend.
df = pd.DataFrame({'SMILES': data['SMILES']})

print(df)

                                                  SMILES
0                          O=C1NC(=O)c2cc(-c3cncs3)ccc21
1                                CNCc1ccc(S(C)(=O)=O)cc1
2             c1ccc2c(c1)sc1c(-c3c4ncsc4cc4ncsc34)cccc12
3      C1=Nc2c(cc3ncsc3c2-c2cccc3c2sc2ccccc23)[SH]1c1...
4      O=P(c1ccccc1)(c1ccccc1)c1cccc(-c2ccc3c(c2)oc2c...
...                                                  ...
29618      O=S(=O)(Cn1c2ccc(Cl)cc2c2cc(Cl)ccc21)c1ccccc1
29619  N#CC(C#N)=C1C(=Cc2ccc(-c3cccc(S(=O)(=O)Cn4c5cc...
29620  Clc1ccc2c(c1)c1cc(Cl)ccc1n2Cn1ccc(-c2cc3c(cn2)...
29621  Clc1ccc2c(c1)c1cc(Cl)cc(-c3c4ncsc4cc4ncsc34)c1...
29622    COc1cc(OCc2ccc(N(C)C)cc2)cc(C2=CC(=O)C=CC2=O)c1

[29623 rows x 1 columns]


In [4]:
def add_molecule_column_to_frame(df, smiles_col):
    """Add an ``'ROMol'`` column produced by ``Chem.MolFromSmiles``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the SMILES strings. It is modified in place.
    smiles_col : str
        Name of the column in ``df`` whose values are passed to
        ``Chem.MolFromSmiles``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``'ROMol'`` column.
    """
    to_mol = Chem.MolFromSmiles
    smiles = df[smiles_col]
    df['ROMol'] = smiles.apply(to_mol)
    return df

df = add_molecule_column_to_frame(df, 'SMILES')

# Evaluate every descriptor listed in Descriptors.descList for each molecule.
# Rows whose 'ROMol' entry is None get NaN instead of calling the descriptor.
# The columns are collected first and assembled in one step: inserting a
# couple of hundred columns one at a time fragments the frame, a pattern for
# which recent pandas versions emit a PerformanceWarning.
descriptor_columns = {
    name: df['ROMol'].apply(lambda mol: func(mol) if mol is not None else np.nan)
    for name, func in Descriptors.descList
}

# Descriptor-only table (no SMILES / ROMol columns).
RDKit = pd.DataFrame(descriptor_columns, index=df.index)

# Rebuild df from its two base columns so re-running this cell does not
# append a second copy of the descriptor columns.
df = pd.concat([df[['SMILES', 'ROMol']], RDKit], axis=1)

print(df.shape)
print(RDKit.shape)

RDKit.head()

  import sys


(29623, 210)
(29623, 208)


Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,11.453571,-0.32744,11.453571,0.323283,0.758305,230.248,224.2,230.014998,78,0,...,0,0,0,0,0,1,0,0,0,0
1,11.086415,-3.054934,11.086415,0.36792,0.783254,199.275,186.171,199.0667,72,0,...,0,0,1,0,0,0,0,0,0,0
2,4.687531,1.064689,4.687531,1.064689,0.311366,374.515,364.435,374.000611,118,0,...,0,0,0,0,0,2,0,1,0,0
3,4.955411,-0.666979,4.955411,0.666979,0.267444,459.646,446.542,458.999231,144,0,...,0,0,0,0,0,2,0,1,0,0
4,14.712886,-3.018804,14.712886,0.821392,0.277172,444.47,423.302,444.127917,158,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Write the descriptor table without the index column. The output directory
# is created first so the notebook runs end to end on a fresh checkout.
output_path = Path('result/RDKit_random.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
RDKit.to_csv(output_path, index=False)