In [3]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns

import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, rdmolops, AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors


from scipy.stats import pearsonr


# sklearn ML models
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn import svm

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import  LGBMRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import KFold

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity='all'


In [5]:
# Read the backbone-corrected dataset CSV (relative path into the preprocessing
# folder) into a DataFrame and preview its first rows.
df = pd.read_csv('../01-database-preprocessing-1203dp-to-1115dp/raw/atom_number_wH_sort_1115-backbone-correction-newSMILES.csv')
df.head()

Unnamed: 0,Nickname,bandgap(eV),c_smiles,newSMILES,Ref.No
0,P3HT,1.93,CCCCCCc1cc(C)sc1C,Cc1cc(C)c(s1)-c1cc(C)c(s1)-c1cc(C)c(s1)-c1cc(C...,S10
1,P3HST,1.82,CCCCCCSc1cc(C)sc1C,CSc1cc(sc1C)-c1sc(cc1SC)-c1sc(cc1SC)-c1sc(cc1S...,S123
2,POPT,1.76,CCCCCCCCc1ccc(-c2cc(C)sc2C)cc1,Cc1cc(c(s1)-c1cc(c(s1)-c1cc(c(s1)-c1cc(c(s1)-c...,S126
3,PT-C1,1.92,CCCCC(CC)COC(=O)c1cc(C)sc1-c1ccc(C)s1,COC(=O)c1cc(C)sc1-c1ccc(s1)-c1cc(C(=O)OC)c(s1)...,S122
4,PT-C2,1.89,CCCCC(CC)COC(=O)c1cc(C)sc1-c1ccc(-c2ccc(C)s2)s1,COC(=O)c1cc(C)sc1-c1ccc(s1)-c1ccc(s1)-c1cc(C(=...,S122


# RDKit-209 descriptors generation

### a. 208 molecular descriptors in RDKit

In [6]:
# Take the 'c_smiles' column of df as an array of SMILES strings and show
# how many entries it holds.
smiles_list = df['c_smiles'].values
len(smiles_list)

1115

In [7]:
# Validate every input SMILES and collect its canonical form.
# NOTE: an entry that fails is only reported and skipped, so c_smiles can end
# up shorter than smiles_list and no longer row-aligned with df; compare the
# length displayed below against len(smiles_list).
c_smiles = []
for ds in smiles_list:
    try:
        cs = Chem.CanonSmiles(ds)
    except Exception:
        # Catch Exception instead of using a bare except so KeyboardInterrupt
        # and SystemExit still propagate and the loop stays interruptible.
        print('Invalid SMILES:', ds)
    else:
        c_smiles.append(cs)
len(c_smiles)

1115

In [8]:
def RDKit208_MD(c_smiles):
    """Build a table of RDKit molecular descriptors for a sequence of SMILES.

    The descriptor names are read from ``Descriptors._descList``, so the
    number of columns is whatever that list contains in the installed RDKit
    (presumably 208 for the version used here, as the function name suggests
    -- TODO confirm).

    Parameters
    ----------
    c_smiles : iterable of str
        SMILES strings; each one is parsed with ``Chem.MolFromSmiles``.

    Returns
    -------
    pandas.DataFrame
        One row per input SMILES, in input order, with one column per
        descriptor name.
    """
    descriptor_names = [entry[0] for entry in Descriptors._descList]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

    descriptor_rows = []
    for smi in c_smiles:
        molecule = Chem.MolFromSmiles(smi)
        descriptor_rows.append(calculator.CalcDescriptors(molecule))

    return pd.DataFrame(descriptor_rows, columns=descriptor_names)

In [9]:
# Compute the RDKit descriptor table for the canonical SMILES collected above
# (one row per entry of c_smiles).
rdkit208_md = RDKit208_MD(c_smiles)

In [10]:
# Display the descriptor table for a visual sanity check.
rdkit208_md

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,2.353121,2.353121,1.287870,1.287870,0.608740,196.359,176.199,196.128572,74,0,...,0,0,0,0,0,0,0,1,2,0
1,2.323896,2.323896,1.288704,1.288704,0.485488,228.426,208.266,228.100643,80,0,...,1,0,0,0,0,0,0,1,3,0
2,2.313310,2.313310,1.230210,1.230210,0.463323,300.511,272.287,300.191172,114,0,...,0,0,0,0,0,0,0,1,4,0
3,12.542211,12.542211,0.175600,-0.175600,0.510163,350.549,324.341,350.137422,126,0,...,0,0,0,0,0,0,0,2,0,0
4,12.817589,12.817589,0.184059,-0.184059,0.318494,432.676,404.452,432.125143,150,0,...,0,0,0,0,0,0,0,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,5.448759,5.448759,0.613257,0.613257,0.035659,2093.634,1894.050,2091.310291,788,0,...,0,0,0,0,0,0,0,7,52,0
1111,6.964899,6.964899,0.506884,-0.861916,0.035381,2097.349,1897.765,2095.457524,820,0,...,0,0,0,0,0,0,0,2,48,0
1112,7.004105,7.004105,0.497358,-0.874785,0.035381,2261.603,2057.987,2259.432966,868,0,...,0,0,0,0,0,0,0,4,48,0
1113,17.056452,17.056452,0.015571,0.015571,0.035580,2289.808,2066.032,2287.561538,886,0,...,0,0,0,0,0,0,0,6,56,0


In [11]:
# Save the descriptor table to a CSV file in the working directory; the
# DataFrame index is not written.
rdkit208_md.to_csv('monomer-1115dp-RDKit209.csv', index = False)