In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from pandas.errors import PerformanceWarning
from rdkit import Chem
from rdkit.Chem import Descriptors
warnings.simplefilter(action='ignore', category=PerformanceWarning)

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose every entry equals 0.

    A column is removed only when all of its values compare equal to zero;
    a single non-zero (or missing) entry is enough to keep it. The input
    frame is never modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        New frame holding the surviving columns in their original order.
    """
    # One vectorised comparison replaces the per-column drop loop, which
    # rebuilt the frame once for every removed column.
    all_zero = (data == 0).all(axis=0).to_numpy(dtype=bool)
    # Select by positional boolean mask rather than by label so frames with
    # duplicated column names are handled instead of raising ValueError.
    return data.loc[:, ~all_zero].copy()

In [3]:
# Read the source table, then keep just its SMILES column as a standalone
# one-column frame that later cells extend.
data = pd.read_csv("source/smiles_e01.csv")
df = data[['SMILES']].copy()

print(df)

                                                  SMILES
0                        CN(C)c1ccc(-c2ccc(N(C)C)cc2)cc1
1        CN(C)c1ccc(-c2ccc(N(C)C)cc2-c2ccc(N(C)C)cc2)cc1
2                   CN(C)c1ccc(-c2ccc(C=C(C#N)C#N)s2)cc1
3      CN(C)c1ccc(-c2cc(N(C)C)ccc2-c2ccc(C=C(C#N)C#N)...
4      CN(C)c1ccc(-c2cc(N(C)C)ccc2-c2cc(N(C)C)ccc2-c2...
...                                                  ...
26085  CN(Cc1ccc2c(c1)sc1ncccc12)c1ccc(-c2c(F)c(F)c(F...
26086  CN(C)c1ccc(-c2c(F)c(F)c(F)c(F)c2F)c(-c2ccc(N(C...
26087  CN(C)c1ccc(-c2c(-c3c(F)c(F)c(F)c(F)c3F)ccc(N(C...
26088  CN(C)c1cc(-c2c(F)c(F)c(F)c(F)c2F)cc(N(c2ccccc2...
26089  CN(C)c1ccc(-c2c(-c3c(F)c(F)c(F)c(F)c3F)ccc(N(C...

[26090 rows x 1 columns]


In [4]:
def add_molecule_column_to_frame(df, smiles_col):
    """Attach a 'ROMol' column produced by ``Chem.MolFromSmiles``.

    Each value of ``df[smiles_col]`` is passed to ``Chem.MolFromSmiles`` and
    the results are stored in a new (or overwritten) 'ROMol' column.

    Note: ``df`` is modified in place; the same object is returned.
    """
    parsed_mols = df[smiles_col].map(Chem.MolFromSmiles)
    df['ROMol'] = parsed_mols
    return df

df = add_molecule_column_to_frame(df, 'SMILES')

# Evaluate every RDKit descriptor on the parsed molecules. Rows whose SMILES
# could not be parsed (ROMol is None) get NaN for all descriptors.
# The columns are collected first and attached with a single concat:
# inserting a couple of hundred columns one at a time fragments the frame
# and is what triggers pandas' PerformanceWarning.
descriptor_columns = {
    name: df['ROMol'].apply(lambda mol: func(mol) if mol is not None else np.nan)
    for name, func in Descriptors.descList
}
# Dropping any descriptor columns already present keeps the cell idempotent
# when it is re-run on a frame that was featurised before.
df = pd.concat(
    [
        df.drop(columns=list(descriptor_columns), errors='ignore'),
        pd.DataFrame(descriptor_columns, index=df.index),
    ],
    axis=1,
)

# Descriptor-only view: everything except the raw SMILES and the Mol objects.
RDKit = df.drop(columns=['SMILES', 'ROMol'])
print(df.shape)
print(RDKit.shape)

RDKit.head()

(26090, 211)
(26090, 209)


Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,2.166667,2.166667,1.225613,1.225613,0.810055,240.35,220.19,240.162649,94,0,...,0,0,0,0,0,0,0,0,0,0
1,2.277636,2.277636,1.204026,1.204026,0.618355,359.517,330.285,359.236148,140,0,...,0,0,0,0,0,0,0,0,0,0
2,8.752886,8.752886,0.126255,0.126255,0.80027,279.368,266.264,279.083018,98,0,...,0,0,0,0,0,0,0,1,0,0
3,9.003611,9.003611,0.111901,0.111901,0.521615,398.535,376.359,398.156518,144,0,...,0,0,0,0,0,0,0,1,0,0
4,9.187178,9.187178,0.101214,0.101214,0.239467,517.702,486.454,517.230017,190,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# Write the descriptor table to disk without the index column. The output
# directory is created first: to_csv raises OSError when the parent folder
# is missing, which would discard the expensive descriptor computation on a
# fresh checkout.
output_path = Path('result/RDKit_ai2.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
RDKit.to_csv(output_path, index=False)