In [2]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np

from mordred import Calculator, Descriptor

In [3]:
dataset = pd.read_csv("Orbital_Energies_input_data.csv")
dataset.shape

(2904, 2)

In [4]:
dataset.head

<bound method NDFrame.head of                                             SMILES   Energygap
0                              Cc1ccc(cc1)C(F)(F)F  197.749421
1                                      OC(=O)CCCCl  247.493942
2     CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O  164.712327
3                                Nc1ccc(Cl)c(Cl)c1  169.027707
4                            C[C@@H](CCO)CCC=C(C)C  209.569808
...                                            ...         ...
2899    C(CP(c1ccccc1)c1ccccc1)P(c1ccccc1)c1ccccc1  168.649319
2900                               Brc1cccc2sccc12  162.928319
2901                CCO[C@H]1C=Cc2ccccc2N1C(=O)OCC  165.098245
2902                                 s1ccc2ccccc12  167.958431
2903                                 Cc1cccc(C)c1O  188.369417

[2904 rows x 2 columns]>

# 1. Generate canonical SMILES

In [16]:
def canonical_smiles(smiles):
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    smiles = [Chem.MolToSmiles(mol) for mol in mols]
    return smiles

In [12]:
a = Chem.MolFromSmiles('Cc1ccc(cc1)C(F)(F)F')
smiles_a = Chem.MolToSmiles(a)
smiles_a

'Cc1ccc(C(F)(F)F)cc1'

In [18]:
cannon_smiles = canonical_smiles(dataset['SMILES'])
len(cannon_smiles)

2904

In [19]:
dataset['SMILES'] = cannon_smiles
dataset

Unnamed: 0,SMILES,Energygap
0,Cc1ccc(C(F)(F)F)cc1,197.749421
1,O=C(O)CCCCl,247.493942
2,CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O,164.712327
3,Nc1ccc(Cl)c(Cl)c1,169.027707
4,CC(C)=CCC[C@@H](C)CCO,209.569808
...,...,...
2899,c1ccc(P(CCP(c2ccccc2)c2ccccc2)c2ccccc2)cc1,168.649319
2900,Brc1cccc2sccc12,162.928319
2901,CCOC(=O)N1c2ccccc2C=C[C@@H]1OCC,165.098245
2902,c1ccc2sccc2c1,167.958431


In [21]:
#Remove duplicates
dup_smiles = dataset[dataset['SMILES'].duplicated()]['SMILES'].values
len(dup_smiles)

31

In [24]:
#Create a list for duplicate smiles
dataset[dataset['SMILES'].isin(dup_smiles)].sort_values(by=['SMILES'])

Unnamed: 0,SMILES,Energygap
218,C1=CCCCCCC1,214.097913
2125,C1=CCCCCCC1,214.097913
1901,C1CC[C@H]2CCCC[C@@H]2C1,251.14479
552,C1CC[C@H]2CCCC[C@@H]2C1,251.14479
554,C=CN1CCCC1=O,190.8349
1808,C=CN1CCCC1=O,190.835527
2203,C=Cc1ccccc1,168.467969
1354,C=Cc1ccccc1,168.475499
2341,C=Cc1ccccc1,168.490559
1265,C=Cc1ccncc1,174.83342


In [25]:
dataset_new = dataset.drop_duplicates(subset=['SMILES'])
len(dataset_new)

2873

In [26]:
dataset_new

Unnamed: 0,SMILES,Energygap
0,Cc1ccc(C(F)(F)F)cc1,197.749421
1,O=C(O)CCCCl,247.493942
2,CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O,164.712327
3,Nc1ccc(Cl)c(Cl)c1,169.027707
4,CC(C)=CCC[C@@H](C)CCO,209.569808
...,...,...
2899,c1ccc(P(CCP(c2ccccc2)c2ccccc2)c2ccccc2)cc1,168.649319
2900,Brc1cccc2sccc12,162.928319
2901,CCOC(=O)N1c2ccccc2C=C[C@@H]1OCC,165.098245
2902,c1ccc2sccc2c1,167.958431


# 2. Calculate descriptors using RDKits

In [None]:
calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
# Return all the RDkit descriptors
molecule_des = calc.GetDescriptorNames()
len(molecule_des)
molecule_des

## a. General molecular descriptors-about 200 molecular descriptors

In [45]:
def All_Mordred_descriptors(data):
    calc = Calculator(descriptors, ignore_3D=True)
    mols = [Chem.MolFromSmiles(smi) for smi in data]
    # pandas df
    df = calc.pandas(mols)
    return df

In [44]:
mordred_descriptors = All_Mordred_descriptors(dataset_new['SMILES'])

NameError: name 'descriptors' is not defined