In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
import warnings

In [2]:
def remove_all_zero_col(data):
    """Return a copy of ``data`` without the columns whose every value equals 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding, in their original order, only the columns that
        contain at least one value that does not compare equal to 0.

    Notes
    -----
    NaN does not compare equal to 0, so a column containing NaN is kept.
    Columns are selected by position rather than by label, so frames with
    repeated column labels are filtered correctly instead of raising.
    """
    # One vectorised comparison over the whole frame replaces the per-column
    # Python loop with an in-place drop on every hit.
    all_zero = (data == 0).all(axis=0).to_numpy(dtype=bool)
    # .copy() guarantees the caller gets an independent frame, as before.
    return data.iloc[:, ~all_zero].copy()

In [3]:
# Read the input table, then split it into two frames that share the same
# row index: the identifying columns (everything except SMILES) and a
# one-column frame holding only the SMILES strings.
data = pd.read_csv("source/SMILES_real.csv")
name_No = data.drop('SMILES', axis=1)
df = data[['SMILES']].copy()

print(name_No)
print(df)

                      Name      ID
0                   4CzIPN    OPS1
1                4tBuCzIPN    OPS2
2                 4MeCzIPN    OPS3
3                 4BrCzIPN    OPS4
4                4OMeCzIPN    OPS5
..                     ...     ...
95                OMe_TPP+   OPS96
96                    Acr+   OPS97
97                 Acr+_Ph   OPS98
98                Acr+_Mes   OPS99
99  3_6_tBu_10_ph_Acr+_Mes  OPS100

[100 rows x 2 columns]
                                               SMILES
0   N#CC1=C(N2C(C=CC=C3)=C3C4=C2C=CC=C4)C(N5C(C=CC...
1   N#CC1=C(N2C(C=CC(C(C)(C)C)=C3)=C3C4=C2C=CC(C(C...
2   N#CC1=C(N2C(C=CC(C)=C3)=C3C4=C2C=CC(C)=C4)C(N5...
3   BrC1=CC(C2=C3C=CC(Br)=C2)=C(C=C1)N3C(C(C#N)=C(...
4   N#CC1=C(N2C(C=CC(OC)=C3)=C3C4=C2C=CC(OC)=C4)C(...
..                                                ...
95  COC(C=C1)=CC=C1C2=CC(C3=CC=C(C=C3)OC)=CC(C4=CC...
96                  C[N+]1=C(C=CC=C2)C2=CC3=CC=CC=C31
97     C[N+]1=C(C=CC=C2)C2=C(C3=CC=CC=C3)C4=CC=CC=C41
98  C[N+]1=C(C=C

In [4]:
def add_molecule_column_to_frame(df, smiles_col):
    """Attach an ``ROMol`` column built from the strings in ``smiles_col``.

    Each value of ``df[smiles_col]`` is passed to ``Chem.MolFromSmiles`` and
    the results are stored under the ``'ROMol'`` key of ``df``. The frame is
    modified in place and the same object is returned.
    """
    molecules = df[smiles_col].map(Chem.MolFromSmiles)
    df['ROMol'] = molecules
    return df

df = add_molecule_column_to_frame(df, 'SMILES')

# Evaluate every entry of Descriptors.descList on every molecule first and
# attach the results in one step. Inserting the columns one at a time inside
# a loop fragments the frame and makes pandas emit a PerformanceWarning.
# Rows whose ROMol is None get NaN for every descriptor.
descriptor_columns = {
    name: df['ROMol'].apply(lambda mol: func(mol) if mol is not None else np.nan)
    for name, func in Descriptors.descList
}
descriptor_frame = pd.DataFrame(descriptor_columns, index=df.index)

# Drop descriptor columns left over from a previous run of this cell so that
# re-running replaces them instead of duplicating them.
df = pd.concat(
    [df.drop(columns=list(descriptor_columns), errors='ignore'), descriptor_frame],
    axis=1,
)

# Descriptor-only view: everything except the SMILES text and the molecule objects.
df_s = df.drop(columns=['SMILES', 'ROMol'])
print(df.shape)
print(df_s.shape)

df_s.head()

(100, 210)
(100, 208)


  import sys


Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,12.116548,0.389761,12.116548,0.389761,0.178204,788.914,756.658,788.268845,286,0,...,0,0,0,0,0,0,0,0,0,0
1,13.185042,-0.171343,13.185042,0.16869,0.1762,1237.778,1141.01,1236.769647,478,0,...,0,0,0,0,0,0,0,0,0,0
2,12.426856,0.399888,12.426856,0.399888,0.176567,901.13,852.746,900.394046,334,0,...,0,0,0,0,0,0,0,0,0,0
3,12.374043,0.319495,12.374043,0.319495,0.176198,1420.082,1395.89,1411.552942,334,0,...,0,0,0,0,0,0,0,0,0,0
4,12.556354,0.210464,12.556354,0.210464,0.116076,1029.122,980.738,1028.353362,382,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Put the identifying columns next to the descriptor columns, keeping only
# the row labels present in both frames, then show and save the result.
RDKit = pd.concat(
    [name_No, df_s],
    join='inner',
    axis=1,
)
print(RDKit)
# index=False: the row index is not written to the CSV file.
RDKit.to_csv('result/RDKit_real.csv', index=False)

                      Name      ID  MaxEStateIndex  MinEStateIndex  \
0                   4CzIPN    OPS1       12.116548        0.389761   
1                4tBuCzIPN    OPS2       13.185042       -0.171343   
2                 4MeCzIPN    OPS3       12.426856        0.399888   
3                 4BrCzIPN    OPS4       12.374043        0.319495   
4                4OMeCzIPN    OPS5       12.556354        0.210464   
..                     ...     ...             ...             ...   
95                OMe_TPP+   OPS96        6.297255        0.774805   
96                    Acr+   OPS97        2.240741        1.274259   
97                 Acr+_Ph   OPS98        2.274907        1.259308   
98                Acr+_Mes   OPS99        2.298634        1.269051   
99  3_6_tBu_10_ph_Acr+_Mes  OPS100        2.483634        0.056328   

    MaxAbsEStateIndex  MinAbsEStateIndex       qed     MolWt  HeavyAtomMolWt  \
0           12.116548           0.389761  0.178204   788.914         756.658   