In [1]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np

from mordred import Calculator, descriptors

In [2]:
dataset = pd.read_csv("Orbital_Energies_input_data.csv")
dataset.shape

(2904, 2)

In [3]:
dataset.head

<bound method NDFrame.head of                                             SMILES   Energygap
0                              Cc1ccc(cc1)C(F)(F)F  197.749421
1                                      OC(=O)CCCCl  247.493942
2     CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O  164.712327
3                                Nc1ccc(Cl)c(Cl)c1  169.027707
4                            C[C@@H](CCO)CCC=C(C)C  209.569808
...                                            ...         ...
2899    C(CP(c1ccccc1)c1ccccc1)P(c1ccccc1)c1ccccc1  168.649319
2900                               Brc1cccc2sccc12  162.928319
2901                CCO[C@H]1C=Cc2ccccc2N1C(=O)OCC  165.098245
2902                                 s1ccc2ccccc12  167.958431
2903                                 Cc1cccc(C)c1O  188.369417

[2904 rows x 2 columns]>

# 1. Generate canonical SMILES

In [4]:
def canonical_smiles(smiles):
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    smiles = [Chem.MolToSmiles(mol) for mol in mols]
    return smiles

In [5]:
a = Chem.MolFromSmiles('Cc1ccc(cc1)C(F)(F)F')
smiles_a = Chem.MolToSmiles(a)
smiles_a

'Cc1ccc(C(F)(F)F)cc1'

In [6]:
cannon_smiles = canonical_smiles(dataset['SMILES'])
len(cannon_smiles)

2904

In [7]:
dataset['SMILES'] = cannon_smiles
dataset

Unnamed: 0,SMILES,Energygap
0,Cc1ccc(C(F)(F)F)cc1,197.749421
1,O=C(O)CCCCl,247.493942
2,CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O,164.712327
3,Nc1ccc(Cl)c(Cl)c1,169.027707
4,CC(C)=CCC[C@@H](C)CCO,209.569808
...,...,...
2899,c1ccc(P(CCP(c2ccccc2)c2ccccc2)c2ccccc2)cc1,168.649319
2900,Brc1cccc2sccc12,162.928319
2901,CCOC(=O)N1c2ccccc2C=C[C@@H]1OCC,165.098245
2902,c1ccc2sccc2c1,167.958431


In [8]:
#Remove duplicates
dup_smiles = dataset[dataset['SMILES'].duplicated()]['SMILES'].values
len(dup_smiles)

31

In [9]:
#Create a list for duplicate smiles
dataset[dataset['SMILES'].isin(dup_smiles)].sort_values(by=['SMILES'])

Unnamed: 0,SMILES,Energygap
218,C1=CCCCCCC1,214.097913
2125,C1=CCCCCCC1,214.097913
1901,C1CC[C@H]2CCCC[C@@H]2C1,251.14479
552,C1CC[C@H]2CCCC[C@@H]2C1,251.14479
554,C=CN1CCCC1=O,190.8349
1808,C=CN1CCCC1=O,190.835527
2203,C=Cc1ccccc1,168.467969
1354,C=Cc1ccccc1,168.475499
2341,C=Cc1ccccc1,168.490559
1265,C=Cc1ccncc1,174.83342


In [10]:
dataset_new = dataset.drop_duplicates(subset=['SMILES'])
len(dataset_new)

2873

In [11]:
dataset_new

Unnamed: 0,SMILES,Energygap
0,Cc1ccc(C(F)(F)F)cc1,197.749421
1,O=C(O)CCCCl,247.493942
2,CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O,164.712327
3,Nc1ccc(Cl)c(Cl)c1,169.027707
4,CC(C)=CCC[C@@H](C)CCO,209.569808
...,...,...
2899,c1ccc(P(CCP(c2ccccc2)c2ccccc2)c2ccccc2)cc1,168.649319
2900,Brc1cccc2sccc12,162.928319
2901,CCOC(=O)N1c2ccccc2C=C[C@@H]1OCC,165.098245
2902,c1ccc2sccc2c1,167.958431


# 2. Calculate descriptors using RDKits

In [12]:
calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
# Return all the RDkit descriptors
molecule_des = calc.GetDescriptorNames()
len(molecule_des)
molecule_des

('MaxEStateIndex',
 'MinEStateIndex',
 'MaxAbsEStateIndex',
 'MinAbsEStateIndex',
 'qed',
 'MolWt',
 'HeavyAtomMolWt',
 'ExactMolWt',
 'NumValenceElectrons',
 'NumRadicalElectrons',
 'MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'FpDensityMorgan1',
 'FpDensityMorgan2',
 'FpDensityMorgan3',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW',
 'BalabanJ',
 'BertzCT',
 'Chi0',
 'Chi0n',
 'Chi0v',
 'Chi1',
 'Chi1n',
 'Chi1v',
 'Chi2n',
 'Chi2v',
 'Chi3n',
 'Chi3v',
 'Chi4n',
 'Chi4v',
 'HallKierAlpha',
 'Ipc',
 'Kappa1',
 'Kappa2',
 'Kappa3',
 'LabuteASA',
 'PEOE_VSA1',
 'PEOE_VSA10',
 'PEOE_VSA11',
 'PEOE_VSA12',
 'PEOE_VSA13',
 'PEOE_VSA14',
 'PEOE_VSA2',
 'PEOE_VSA3',
 'PEOE_VSA4',
 'PEOE_VSA5',
 'PEOE_VSA6',
 'PEOE_VSA7',
 'PEOE_VSA8',
 'PEOE_VSA9',
 'SMR_VSA1',
 'SMR_VSA10',
 'SMR_VSA2',
 'SMR_VSA3',
 'SMR_VSA4',
 'SMR_VSA5',
 'SMR_VSA6',
 'SMR_VSA7',
 'SMR_

## a. General molecular descriptors-about 200 molecular descriptors

In [13]:
# def RDkit_descriptors(smiles):
#     mols = [Chem.MolFromSmiles(i) for i in smiles] 
#     calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
#     desc_names = calc.GetDescriptorNames()
    
#     Mol_descriptors =[]
#     for mol in mols:
#         # add hydrogens to molecules
#         mol=Chem.AddHs(mol)
#         # Calculate all 200 descriptors for each molecule
#         descriptors = calc.CalcDescriptors(mol)
#         Mol_descriptors.append(descriptors)
#     return Mol_descriptors,desc_names 

# # Function call
# Mol_descriptors,desc_names = RDkit_descriptors(dataset_new['SMILES'])

In [14]:
# df_with_200_descriptors = pd.DataFrame(Mol_descriptors,columns=desc_names)
# df_with_200_descriptors

### b. Fingerprints

In [15]:
def morgan_fpts(data):
    Morgan_fpts = []
    for i in data:
        mol = Chem.MolFromSmiles(i)
        ftps = AllChem.GetMorganFingerprintAsBitVect(mol,2,1024)
        mfpts = np.array(ftps)
        Morgan_fpts.append(mfpts)
    return np.array(Morgan_fpts)

Morgan_fpts = morgan_fpts(dataset_new['SMILES'])
Morgan_fpts.shape

(2873, 1024)

In [16]:
Morgan_fingerprints = pd.DataFrame(Morgan_fpts,columns=['Col_{}'.format(i) for i in range(Morgan_fpts.shape[1])])
Morgan_fingerprints

Unnamed: 0,Col_0,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,...,Col_1014,Col_1015,Col_1016,Col_1017,Col_1018,Col_1019,Col_1020,Col_1021,Col_1022,Col_1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2868,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2869,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2870,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Calculate descreptors using Mordred-1826 descriptors

In [17]:
def All_Mordred_descriptors(data):
    calc = Calculator(descriptors, ignore_3D=True)
    mols = [Chem.MolFromSmiles(smi) for smi in data]
    
    # pandas df
    df = calc.pandas(mols)
    return df

In [None]:
mordred_descriptors = All_Mordred_descriptors(dataset_new['SMILES'])

In [22]:
df_with_md = pd.DataFrame(mordred_descriptors)

In [26]:
row0 = df_with_md.loc[0]