In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
from rdkit.Chem import Descriptors
from mordred import Calculator, descriptors

In [3]:
# Load the QM9 sample table; later cells read its 'smiles' column.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which usually
# means the CSV was written together with its index — confirm whether
# index_col=0 is wanted here so that column is not carried into the output file.
df = pd.read_csv('qm9_sample.csv')

In [4]:
# Show the table as loaded from the CSV.
df

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,r2,zpve,u0,u298,h298,g298,cv
0,gdb_45864,O=C1OC=NCC11CN1,2.65458,1.85295,1.23017,1.4059,66.11,-0.2693,-0.0363,0.2331,969.9823,0.114529,-453.960419,-453.953215,-453.952271,-453.992333,27.142
1,gdb_11104,C1C(CN1C=O)C=O,4.94875,1.15582,1.13915,3.8620,64.32,-0.2496,-0.0414,0.2082,1057.8115,0.118148,-399.811727,-399.803839,-399.802895,-399.845464,26.627
2,gdb_47450,C1CC1N2CCOC2=O,3.08235,1.38222,1.01814,4.6197,72.18,-0.2461,0.0510,0.2972,1163.2778,0.149356,-439.118302,-439.110229,-439.109284,-439.151634,29.710
3,gdb_88940,OC1CC1NC(=N)C=O,3.95367,0.82949,0.72068,4.6363,74.62,-0.2233,-0.0636,0.1597,1515.0734,0.134254,-455.127740,-455.118323,-455.117379,-455.163150,33.649
4,gdb_40813,C1NC1C1CCOC=N1,4.14610,1.04198,0.87868,2.6945,76.88,-0.2405,0.0257,0.2662,1311.9151,0.161918,-419.189278,-419.181294,-419.180350,-419.222393,30.168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,gdb_20951,CCOc1cnno1,7.81782,1.16858,1.02975,4.2179,61.95,-0.2426,-0.0227,0.2199,1093.1478,0.105657,-415.833772,-415.826059,-415.825114,-415.866566,26.660
19996,gdb_11254,CC1OC(C=C1)C#C,5.06532,1.39767,1.22187,1.6287,72.66,-0.2464,0.0068,0.2532,1006.3004,0.129072,-346.571037,-346.563208,-346.562263,-346.603376,29.171
19997,gdb_34418,N#CC12COC1CCC2,2.32431,1.67523,1.25940,3.1914,75.22,-0.2686,0.0191,0.2876,1048.5046,0.149827,-401.959779,-401.952068,-401.951124,-401.992288,29.412
19998,gdb_46145,O=C1CCC(CO1)C#C,4.58871,0.99416,0.84701,3.9391,74.90,-0.2643,0.0061,0.2704,1303.3578,0.136070,-421.836918,-421.828562,-421.827618,-421.870228,31.180


# RDKit descriptors

In [5]:
# Evaluate every (name, function) pair listed in Descriptors._descList on each
# SMILES string of df and append the results as new columns.
descriptor_names = [name for name, _ in Descriptors._descList]

descriptors_rdkit = []
for smile in df['smiles']:
    mol = Chem.MolFromSmiles(smile)
    if mol is not None:
        descriptor_values = [d(mol) for _, d in Descriptors._descList]
        descriptors_rdkit.append(descriptor_values)
    else:
        # Unparseable SMILES: keep the row, with every descriptor missing.
        descriptors_rdkit.append([None] * len(descriptor_names))

# pd.concat(axis=1) pairs rows by index label, not by position. Reusing df's
# index guarantees each descriptor row lands next to the molecule it was
# computed from, whatever index df carries.
df_rdkit = pd.DataFrame(descriptors_rdkit, columns=descriptor_names, index=df.index)
df_rdkit = pd.concat([df, df_rdkit], axis=1)

In [6]:
# Show the original columns together with the appended RDKit descriptor columns.
df_rdkit

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,gdb_45864,O=C1OC=NCC11CN1,2.65458,1.85295,1.23017,1.4059,66.11,-0.2693,-0.0363,0.2331,...,0,0,0,0,0,0,0,0,0,0
1,gdb_11104,C1C(CN1C=O)C=O,4.94875,1.15582,1.13915,3.8620,64.32,-0.2496,-0.0414,0.2082,...,0,0,0,0,0,0,0,0,0,0
2,gdb_47450,C1CC1N2CCOC2=O,3.08235,1.38222,1.01814,4.6197,72.18,-0.2461,0.0510,0.2972,...,0,0,0,0,0,0,0,0,0,0
3,gdb_88940,OC1CC1NC(=N)C=O,3.95367,0.82949,0.72068,4.6363,74.62,-0.2233,-0.0636,0.1597,...,0,0,0,0,0,0,0,0,0,0
4,gdb_40813,C1NC1C1CCOC=N1,4.14610,1.04198,0.87868,2.6945,76.88,-0.2405,0.0257,0.2662,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,gdb_20951,CCOc1cnno1,7.81782,1.16858,1.02975,4.2179,61.95,-0.2426,-0.0227,0.2199,...,0,0,0,0,0,0,0,0,0,0
19996,gdb_11254,CC1OC(C=C1)C#C,5.06532,1.39767,1.22187,1.6287,72.66,-0.2464,0.0068,0.2532,...,0,0,0,1,0,0,0,0,0,0
19997,gdb_34418,N#CC12COC1CCC2,2.32431,1.67523,1.25940,3.1914,75.22,-0.2686,0.0191,0.2876,...,0,0,0,0,0,0,0,0,0,0
19998,gdb_46145,O=C1CCC(CO1)C#C,4.58871,0.99416,0.84701,3.9391,74.90,-0.2643,0.0061,0.2704,...,0,0,0,1,0,0,0,0,0,0


# Mordred descriptors

In [7]:
# Compute Mordred descriptors (3D descriptors skipped via ignore_3D=True) for
# every molecule and append them to the RDKit-augmented table.
calc = Calculator(descriptors, ignore_3D=True)
mols = [Chem.MolFromSmiles(smi) for smi in df_rdkit.smiles]

# Chem.MolFromSmiles returns None for SMILES it cannot parse. The RDKit cell
# keeps such rows with missing values; do the same here instead of handing
# None to Mordred: compute on the valid molecules only, then align the result
# back onto df_rdkit's index.
valid_mask = [mol is not None for mol in mols]
valid_mols = [mol for mol in mols if mol is not None]

df_mordred = calc.pandas(valid_mols)
df_mordred.index = df_rdkit.index[valid_mask]
if len(valid_mols) != len(mols):
    # Re-insert the skipped molecules as all-missing rows.
    df_mordred = df_mordred.reindex(df_rdkit.index)

# NOTE(review): RDKit and Mordred appear to share some descriptor names
# (e.g. BalabanJ, BertzCT); check df.columns.duplicated() before selecting
# columns by name — TODO confirm.
df = pd.concat([df_rdkit, df_mordred], axis=1)

100%|██████████| 20000/20000 [05:41<00:00, 58.52it/s]


In [8]:
# Show the combined table: original columns, RDKit descriptors, Mordred descriptors.
df

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,gdb_45864,O=C1OC=NCC11CN1,2.65458,1.85295,1.23017,1.4059,66.11,-0.2693,-0.0363,0.2331,...,9.475854,59.263520,126.042927,8.402862,79,11,50.0,61.0,2.673611,1.958333
1,gdb_11104,C1C(CN1C=O)C=O,4.94875,1.15582,1.13915,3.8620,64.32,-0.2496,-0.0414,0.2082,...,8.979291,37.044791,113.047678,7.536512,70,6,36.0,40.0,3.222222,2.000000
2,gdb_47450,C1CC1N2CCOC2=O,3.08235,1.38222,1.01814,4.6197,72.18,-0.2461,0.0510,0.2972,...,9.064274,58.079349,127.063329,7.059074,83,9,48.0,57.0,2.583333,1.972222
3,gdb_88940,OC1CC1NC(=N)C=O,3.95367,0.82949,0.72068,4.6363,74.62,-0.2233,-0.0636,0.1597,...,8.834774,57.095844,128.058577,7.532857,98,7,42.0,47.0,4.083333,2.111111
4,gdb_40813,C1NC1C1CCOC=N1,4.14610,1.04198,0.87868,2.6945,76.88,-0.2405,0.0257,0.2662,...,8.954286,56.778649,126.079313,6.635753,87,9,46.0,53.0,1.972222,2.027778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,gdb_20951,CCOc1cnno1,7.81782,1.16858,1.02975,4.2179,61.95,-0.2426,-0.0227,0.2199,...,8.124743,47.992435,114.042927,8.145923,67,5,34.0,36.0,2.611111,2.000000
19996,gdb_11254,CC1OC(C=C1)C#C,5.06532,1.39767,1.22187,1.6287,72.66,-0.2464,0.0068,0.2532,...,8.365672,49.065586,108.057515,6.753595,63,6,36.0,39.0,3.222222,1.916667
19997,gdb_34418,N#CC12COC1CCC2,2.32431,1.67523,1.25940,3.1914,75.22,-0.2686,0.0191,0.2876,...,9.713597,54.600998,123.068414,6.837134,78,10,50.0,62.0,2.673611,2.041667
19998,gdb_46145,O=C1CCC(CO1)C#C,4.58871,0.99416,0.84701,3.9391,74.90,-0.2643,0.0061,0.2704,...,8.590258,37.289972,124.052429,7.297202,90,9,40.0,43.0,3.472222,2.166667


In [None]:
# Persist the combined table; index=False keeps the row index out of the file.
df.to_csv('data_with_descriptors.csv', index=False)

## PubChem descriptors (pubchempy)

In [83]:
import pubchempy as pcp
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [None]:
# Properties requested from PubChem for each molecule. This list also fixes the
# keys (and their order) of every dict returned by calculate_descriptors.
PUBCHEM_PROPERTIES = ['MolecularWeight', 'XLogP', 'ExactMass', 'TPSA', 'Complexity']


def calculate_descriptors(smiles):
    """Look up PubChem properties for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string passed to ``pcp.get_properties`` with the ``'smiles'``
        namespace.

    Returns
    -------
    dict
        One entry per name in ``PUBCHEM_PROPERTIES``. A value is ``None`` when
        PubChem returned no record, returned a record with a missing or zero
        ``CID``, or omitted that particular property from the record.

    Notes
    -----
    Exceptions raised by ``pcp.get_properties`` itself are not caught here and
    propagate to the caller.
    """
    records = pcp.get_properties(PUBCHEM_PROPERTIES, smiles, 'smiles')

    # An empty result list would make records[0] raise IndexError.
    record = records[0] if records else {}

    # A missing or zero CID is treated as "compound not found".
    if not record.get('CID'):
        return {key: None for key in PUBCHEM_PROPERTIES}

    # Always emit the same keys so every row has an identical layout, even when
    # PubChem leaves a property out of the record; 'CID' is dropped.
    return {key: record.get(key) for key in PUBCHEM_PROPERTIES}


def apply_calculate_descriptors(smiles):
    """Return the result of ``calculate_descriptors(smiles)`` as a ``pd.Series``.

    The Series is indexed by the keys of the dict that
    ``calculate_descriptors`` returns for the given SMILES string.
    """
    return pd.Series(calculate_descriptors(smiles))

# Make the progress_* variants of the pandas apply methods available.
tqdm.pandas()

# One call to apply_calculate_descriptors (which returns a pd.Series) per
# SMILES string, with a progress bar.
smiles_column = df['smiles']
new_columns = smiles_column.progress_apply(apply_calculate_descriptors)

# Append the PubChem properties as extra columns of df.
pubchem_frame = pd.DataFrame(new_columns)
df = pd.concat([df, pubchem_frame], axis='columns')