In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose every value equals 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the columns that contain at least one
        value that does not compare equal to 0 (NaN counts as non-zero).
    """
    # Collect the labels first, then drop them in a single non-mutating call.
    all_zero_labels = [
        label for label in data.columns if (data[label] == 0).all()
    ]
    return data.drop(columns=all_zero_labels)

In [3]:
# Read the full CSV, then keep only its SMILES column as a one-column frame.
data = pd.read_csv("source/Human_made_smiles_2.csv")
smiles_series = data['SMILES']
df = smiles_series.to_frame(name='SMILES')

# Plain-text preview of the SMILES frame.
print(df)

                                                  SMILES
0                                CN(C)c1ccc(-c2ccon2)cc1
1                                CN(C)c1ccc(-c2ccno2)cc1
2                                CN(C)c1ccc(-c2cnoc2)cc1
3                                CN(C)c1ccc(-c2ncco2)cc1
4                                CN(C)c1ccc(-c2cocn2)cc1
...                                                  ...
25185  Fc1ccc2c(c1)c1cc(F)ccc1n2-c1ccc2c(c1)sc1cc(-c3...
25186  N#Cc1nc2c3cc(-c4ccc5c(c4)sc4cc(-n6c7ccc(F)cc7c...
25187  N#Cc1nc2c3ccc(-c4ccc5c(c4)sc4cc(-n6c7ccc(F)cc7...
25188  Fc1ccc2c(c1)c1cc(F)ccc1n2-c1ccc2c(c1)sc1cc(-c3...
25189  Fc1ccc2c(c1)c1cc(F)ccc1n2-c1ccc2c(c1)sc1cc(-c3...

[25190 rows x 1 columns]


In [4]:
def add_molecule_column_to_frame(df, smiles_col):
    """Add an ``'ROMol'`` column holding the parsed molecule for each SMILES.

    Each value of ``df[smiles_col]`` is passed through ``Chem.MolFromSmiles``
    and the results are stored under the ``'ROMol'`` label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the SMILES strings. It is modified in place.
    smiles_col : str
        Label of the column that holds the SMILES strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, now with ``'ROMol'``.
    """
    parsed_molecules = df[smiles_col].apply(Chem.MolFromSmiles)
    df['ROMol'] = parsed_molecules
    return df

df = add_molecule_column_to_frame(df, 'SMILES')

# Evaluate every descriptor in Descriptors.descList on the 'ROMol' column.
# Rows whose molecule is None get NaN instead of being passed to the
# descriptor function.
descriptor_columns = {
    name: df['ROMol'].apply(lambda mol: func(mol) if mol is not None else np.nan)
    for name, func in Descriptors.descList
}
descriptors = pd.DataFrame(descriptor_columns, index=df.index)

# Attach all descriptor columns in one concat. Inserting them one at a time
# with df[name] = ... fragments the frame and is what pandas flags with a
# PerformanceWarning. Existing descriptor columns are dropped first so that
# re-running this cell replaces them instead of duplicating them.
df = pd.concat(
    [df.drop(columns=list(descriptor_columns), errors='ignore'), descriptors],
    axis=1,
)

# Descriptor-only view: everything except the raw SMILES and the Mol objects.
RDKit = df.drop(columns=['SMILES', 'ROMol'])
print(df.shape)
print(RDKit.shape)

RDKit.head()

  import sys


(25190, 211)
(25190, 209)


Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,4.786177,4.786177,0.870654,0.870654,0.724322,188.23,176.134,188.094963,72,0,...,0,0,0,0,0,0,0,0,0,0
1,5.056284,5.056284,0.801209,0.801209,0.724322,188.23,176.134,188.094963,72,0,...,0,0,0,0,0,0,0,0,0,0
2,4.786177,4.786177,1.009543,1.009543,0.724322,188.23,176.134,188.094963,72,0,...,0,0,0,0,0,0,0,0,0,0
3,5.195172,5.195172,0.66232,0.66232,0.724322,188.23,176.134,188.094963,72,0,...,0,0,0,0,0,0,0,0,0,0
4,4.925065,4.925065,0.870654,0.870654,0.724322,188.23,176.134,188.094963,72,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
output_path = Path('result/RDKit_human.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the file.
RDKit.to_csv(output_path, index=False)