In [1]:
import pandas as pd
import numpy as np
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, Descriptors3D,PandasTools,Draw
from mordred import Calculator, descriptors


In [2]:
# Load the cation data set from the first worksheet and preview it.
data = pd.read_excel(io='数据集汇总.xlsx', sheet_name='Sheet1')
data

Unnamed: 0,阳离子smile,是否形成（Y能N否）,y_true
0,CCCCCC([NH3+])C,Y,1
1,[NH3+]CCSSCC[NH3+],Y,1
2,BrC1=CC=CC=C1CC[NH3+],Y,1
3,ClC1=CC=CC=C1CC[NH3+],Y,1
4,[NH3+]CCOCC[NH3+],Y,1
...,...,...,...
100,Cc1ccc([NH3+])cc1F,N,0
101,[NH3+]c1ccc(C(F)(F)F)cc1,N,0
102,[NH3+]c1cc(C(F)(F)F)cc(C(F)(F)F)c1,N,0
103,C[NH2+]c1ccccc1F,N,0


In [4]:
# Parse every SMILES string into an RDKit molecule, stored in the 'ROMol' column.
PandasTools.AddMoleculeColumnToFrame(
    frame=data,
    smilesCol='阳离子smile',
    molCol='ROMol',
)

In [5]:
# Compute every RDKit descriptor in Descriptors.descList -- a sequence of
# (descriptor name, descriptor function) pairs -- for each molecule in data.ROMol.
# Note: the descriptors describe the protonated cation given by the SMILES,
# not the neutral amine.
unparsed = data.ROMol.isna()
if unparsed.any():
    # Fail early and name the offending rows instead of letting a descriptor
    # function raise an opaque error when it receives a missing molecule.
    raise ValueError(
        'ROMol is missing for rows: ' + ', '.join(map(str, data.index[unparsed]))
    )

# Build all descriptor columns first and attach them with a single concat:
# inserting ~200 columns one at a time fragments the frame and makes pandas
# emit a PerformanceWarning.
descriptor_table = pd.DataFrame(
    {name: data.ROMol.map(func) for name, func in Descriptors.descList},
    index=data.index,
)
# Drop same-named columns first so that re-running this cell replaces the
# descriptor columns instead of duplicating them.
data = pd.concat(
    [data.drop(columns=descriptor_table.columns, errors='ignore'), descriptor_table],
    axis=1,
)
print(data.shape)

(105, 212)


  


In [6]:
# Export the descriptor table. The 'ROMol' column holds RDKit Mol objects, which
# are not a native Excel cell type, so it is left out of the spreadsheet; the
# SMILES column is still exported and identifies each cation.
# errors='ignore' keeps the export working if the column was never added.
data.drop(columns='ROMol', errors='ignore').to_excel('whether2D_Rdkit.xlsx')

In [7]:
# Compute the Mordred descriptors WITHOUT the 3D ones.
# ignore_3D=True is the setting that excludes 3D descriptors. The previous
# ignore_3D=False computed them, so this cell produced the same descriptor set
# as the cell that writes 'whether2D_mordred_include3Ddescriptor.xlsx'.
calc = Calculator(descriptors, ignore_3D=True)
cation_table = pd.read_excel('数据集汇总.xlsx', sheet_name='Sheet1')

mols = []
for smi in cation_table['阳离子smile']:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles returns None for an unparsable SMILES string.
        raise ValueError(f'Cannot parse SMILES: {smi!r}')
    # Explicit hydrogens are added as in the original workflow.
    mols.append(Chem.AddHs(mol))

# No conformer embedding or force-field optimisation here: with the 3D
# descriptors excluded, atomic coordinates are not needed.
df = calc.pandas(mols)
df.to_excel('whether2D_mordred.xlsx')

In [13]:
# Compute RDKit 3D shape descriptors for every cation.
EMBED_SEED = 42  # fixed seed so the embedded geometries are reproducible between runs

data_3D = pd.read_excel('数据集汇总.xlsx', sheet_name='Sheet1')

mols = []
for smi in data_3D['阳离子smile']:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles returns None for an unparsable SMILES string.
        raise ValueError(f'Cannot parse SMILES: {smi!r}')
    # Explicit hydrogens are added before generating 3D coordinates.
    mol = Chem.AddHs(mol)
    # EmbedMolecule returns -1 when no conformer could be generated; stop here
    # with a clear message instead of failing later inside the optimiser.
    if AllChem.EmbedMolecule(mol, randomSeed=EMBED_SEED) == -1:
        raise ValueError(f'3D embedding failed for SMILES: {smi!r}')
    AllChem.MMFFOptimizeMolecule(mol)  # refine the geometry with the MMFF94 force field
    mols.append(mol)

# Output column name -> RDKit Descriptors3D function, in export column order.
shape_descriptors = {
    'Asphericity': Descriptors3D.Asphericity,
    'Eccentricity': Descriptors3D.Eccentricity,
    'InertialShapeFactor': Descriptors3D.InertialShapeFactor,
    'RadiusOfGyration': Descriptors3D.RadiusOfGyration,
    'SpherocityIndex': Descriptors3D.SpherocityIndex,
}
for column, descriptor_fn in shape_descriptors.items():
    data_3D[column] = [descriptor_fn(mol) for mol in mols]

data_3D.to_excel('whether2D_Rdkit_3Ddescriptor.xlsx')



In [14]:
# Compute all Mordred descriptors, including the 3D ones (ignore_3D=False).
EMBED_SEED = 42  # fixed seed so the embedded geometries are reproducible between runs

calc = Calculator(descriptors, ignore_3D=False)
cation_table = pd.read_excel('数据集汇总.xlsx', sheet_name='Sheet1')

mols = []
for smi in cation_table['阳离子smile']:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles returns None for an unparsable SMILES string.
        raise ValueError(f'Cannot parse SMILES: {smi!r}')
    # Explicit hydrogens are added before generating 3D coordinates.
    mol = Chem.AddHs(mol)
    # EmbedMolecule returns -1 when no conformer could be generated; stop here
    # with a clear message instead of failing later inside the optimiser.
    if AllChem.EmbedMolecule(mol, randomSeed=EMBED_SEED) == -1:
        raise ValueError(f'3D embedding failed for SMILES: {smi!r}')
    AllChem.MMFFOptimizeMolecule(mol)  # refine the geometry with the MMFF94 force field
    mols.append(mol)

# `df` is kept as the name of the descriptor table because a later cell reads it.
df = calc.pandas(mols)
df.to_excel('whether2D_mordred_include3Ddescriptor.xlsx')

In [17]:
# Show the (rows, columns) shape of `df`; in the visible cells `df` was last
# assigned from calc.pandas(mols), i.e. the Mordred descriptor table.
df.shape

(105, 1826)