In [None]:
%matplotlib inline
import os
import dgl.function as fn
from dgl import DGLGraph
from collections import namedtuple
import networkx as nx
import torch
import torch.nn.functional as F
from torch.nn.functional import cosine_similarity
import torch.optim as optim
import torch.nn as nn
from torch.nn import Linear
import numpy as np
import pandas as pd
from torch_geometric.data import Data, Batch
from torch_geometric.nn.conv import GCNConv
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem, Draw, DataStructs, RDConfig
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
from rdkit.Chem.Descriptors import rdMolDescriptors
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier
from e3fp.fingerprint.generate import fp, fprints_dict_from_mol
from e3fp.conformer.generate import generate_conformers
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import seaborn as sns
import hashlib
# Set the ipython_useSVG flag on RDKit's IPythonConsole helper
# (presumably makes molecules render as SVG in notebook output — confirm).
IPythonConsole.ipython_useSVG=True

In [None]:
# Path of the SDF file to read
sdf_file = './dataset/BindingDB_PubChem_3D.sdf'
# Empty pandas DataFrame carrying the column names used for the extracted data
df = pd.DataFrame(columns=['mol_name', 'smiles', 'mol_weight', 'IC50 (nM)'])
# Number of molecules (SDF records) to read
chunk_size = 10000
# Molecules collected by the reading loop in the next cell
mols = []
# RDKit supplier that iterates over the records of the SDF file
suppl = Chem.SDMolSupplier(sdf_file)

In [None]:
# Read molecules from the SDF file with RDKit's SDMolSupplier.
# Rows are gathered in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and re-creating the frame on
# every iteration is quadratic in the number of molecules.
records = []
for i, mol in enumerate(suppl):
    # Stop after the first `chunk_size` SDF records (unparsable ones included).
    if i >= chunk_size:
        break
    # The supplier yields None for records it cannot parse; skip those.
    if mol is None:
        continue
    mol = Chem.AddHs(mol)
    # Generate a 3D conformer.
    # NOTE(review): no random seed is passed and the return value is not
    # checked, so the embedding is presumably not reproducible and a failed
    # embedding goes unnoticed — confirm this is acceptable.
    AllChem.EmbedMolecule(mol)
    mols.append(mol)
    # Other, non-standard SDF properties can be listed with mol.GetPropNames().
    # Extract the molecule information for one table row.
    records.append({
        'mol_name': mol.GetProp('_Name'),
        'smiles': Chem.MolToSmiles(mol),
        'mol_weight': Chem.rdMolDescriptors.CalcExactMolWt(mol),
        'IC50 (nM)': mol.GetProp('IC50 (nM)'),
    })

df = pd.DataFrame(records, columns=['mol_name', 'smiles', 'mol_weight', 'IC50 (nM)'])
# Display the result
df

In [None]:
df.info()
# Data cleaning: make the IC50 column numeric; unparsable entries become NaN.
df['IC50 (nM)'] = pd.to_numeric(df['IC50 (nM)'], errors='coerce')
df['mol_name'] = df['mol_name'].astype(str)
# Drop rows whose name contains an ASCII letter.
# The explicit .copy() makes the filtered frame independent of the original, so
# the column assignment below does not write to a slice
# (which would raise pandas' SettingWithCopyWarning).
df = df[~df['mol_name'].str.contains('[a-zA-Z]', regex=True)].copy()
# Convert the remaining names to a numeric type
df['mol_name'] = pd.to_numeric(df['mol_name'], errors='coerce')

# Drop rows with missing values and renumber the index
df = df.dropna().reset_index(drop=True)
df

In [None]:
# Load the dataset from CSV, replacing the DataFrame built above.
# NOTE(review): this is the same path that a later cell writes with df.to_csv
# after the log10 / min-max scaling, so a file produced by this notebook already
# holds scaled values; running the scaling cell on it again would apply the
# transform twice — confirm which version of the data this file should contain.
df = pd.read_csv('./dataset/BindingDB_PubChem_3D_dataset.csv')
df

In [None]:
# Scale df: log10-transform both columns, then min-max normalise them to [0, 1].
scaled_cols = ['IC50 (nM)', 'mol_weight']

# log10 is only defined for strictly positive values. A single 0 would become
# -inf, make the column minimum -inf and turn the *whole* normalised column into
# NaN, so rows with non-positive (or missing) values are removed first.
df = df[(df[scaled_cols] > 0).all(axis=1)].reset_index(drop=True)

for col in scaled_cols:
    log_values = np.log10(df[col].astype(float))
    value_range = log_values.max() - log_values.min()
    if value_range > 0:
        # Min-max normalisation; the result is never below 0.
        df[col] = (log_values - log_values.min()) / value_range
    else:
        # Constant (or empty) column: avoid 0/0 and map every value to 0.
        df[col] = 0.0


In [None]:
# Visualise the dataset: pairwise scatter plots / distributions of the two columns
sns.set(style='whitegrid', context='notebook')
cols = ['IC50 (nM)', 'mol_weight']
sns.pairplot(df[cols], height=2.5)
plt.show()


In [None]:
# Save df as CSV so it can be reused later without rebuilding it
df.to_csv('./dataset/BindingDB_PubChem_3D_dataset.csv', index=False)


In [None]:
# Display the current DataFrame
df

In [None]:
# Summary statistics of the DataFrame columns
df.describe()

In [None]:
# Draw the molecules in a grid (5 per row), using each molecule's _Name as its legend.
# NOTE(review): `mols` may hold up to chunk_size molecules; how many are actually
# rendered presumably depends on MolsToGridImage's own limit — confirm.
Draw.MolsToGridImage(mols, molsPerRow=5, subImgSize=(500, 500), legends=[mol.GetProp('_Name') for mol in mols])

In [None]:
mols3D = []
j = 0
# SMILES of the cleaned dataset, in row order. The SDF file is re-read and each
# molecule is kept only when it matches the next not-yet-matched row.
expected_smiles = df['smiles'].tolist()
suppl = Chem.SDMolSupplier(sdf_file)
# Read the molecules with RDKit's SDMolSupplier
for mol in suppl:
    # Stop as soon as every row of df has been matched. The previous condition
    # additionally required the record index to be a multiple of chunk_size, so
    # df.iloc[j] raised IndexError once j ran past the last row.
    if j >= len(expected_smiles):
        break
    # Skip records the supplier could not parse.
    if mol is None:
        continue
    mol = Chem.AddHs(mol)
    # Keep the molecule only if its SMILES equals that of row j
    if expected_smiles[j] == Chem.MolToSmiles(mol):
        mols3D.append(mol)
        j += 1

len(mols3D)

In [None]:
# Column store for the pairwise-similarity table that a later cell fills in
df3D = {column: [] for column in ("MOLI", "MOLJ", "E3FPTC", "i", "j")}

# One fingerprint dictionary per molecule
fpdicts = [fprints_dict_from_mol(molecule) for molecule in mols3D]

# Take the e3fp fingerprint stored under key 5, first entry.
# If a molecule has several conformers the function produces several
# fingerprints; only the first one is used here.
fps = [fprint_dict[5][0] for fprint_dict in fpdicts]

# Fold each e3fp fingerprint and convert it to an RDKit fingerprint
binfp = [fingerprint.fold().to_rdkit() for fingerprint in fps]

In [None]:
# Convert fps to a DataFrame (a single column named 0)
df_fps = pd.DataFrame(fps)
# Persist it for later use, because generating the fingerprints takes a long time.
# The file name is the one the next cell reads back ("E3FP"); it was previously
# misspelled "E3TP", so the saved file was never the one being loaded.
df_fps.to_csv('./dataset/BindingDB_PubChem_3D_E3FP_features_arrstr.csv', index=False)

In [None]:
# Load the stored fingerprints; after the CSV round trip each cell of column '0' is text
df_fps = pd.read_csv('./dataset/BindingDB_PubChem_3D_E3FP_features_arrstr.csv')
df_fps

In [None]:
# One empty, independently growable list per stored fingerprint
n_fingerprints = len(df_fps['0'])
fp_array = [[] for _ in range(n_fingerprints)]
n_fingerprints


In [None]:

def _parse_index_tokens(fp_repr):
    """Return the entries of the first bracketed list found in a fingerprint string.

    The stored values look like ``Fingerprint(indices=array([1691953, 23151693, ...``.
    The text between the last '[' that precedes the first ']' and that ']' is
    split on commas; whitespace and the characters '(', ')' and '=' are removed
    from every entry.

    Args:
        fp_repr: textual representation of one fingerprint.

    Returns:
        List of the non-empty entries, kept as strings. Empty if the text
        contains no closing bracket.
    """
    text = str(fp_repr)
    close_pos = text.find(']')
    if close_pos == -1:
        return []
    # rfind returns -1 when there is no '[', which makes the slice start at 0.
    open_pos = text.rfind('[', 0, close_pos)
    tokens = []
    for raw_token in text[open_pos + 1:close_pos].split(','):
        token = ''.join(ch for ch in raw_token if ch not in '()=' and not ch.isspace())
        if token:
            tokens.append(token)
    return tokens


# Parse every stored fingerprint string into its list of entries.
# The list is rebuilt from scratch, so re-running the cell does not append
# duplicates. The per-character debug printing was removed, and the loop variable
# no longer reuses the name `fp`, which is also imported from e3fp.
fp_array = [_parse_index_tokens(fp_repr) for fp_repr in df_fps['0']]
        

In [None]:
# Display the parsed fingerprint entries
fp_array

In [None]:
# Build the feature matrix — step 1: length of the longest per-fingerprint list
# (0 when there are no fingerprints at all)
max_len = max((len(entries) for entries in fp_array), default=0)
max_len

In [None]:
# Allocate the feature matrix from the maximum length; unused slots stay 0
fp_matrix = np.zeros((len(fp_array), max_len))
fp_matrix.shape

In [None]:
# Copy every parsed value into its slot of the feature matrix
for row_idx, entries in enumerate(fp_array):
    for col_idx, entry in enumerate(entries):
        fp_matrix[row_idx][col_idx] = entry
fp_matrix

In [None]:
# Wrap the feature matrix in a DataFrame, then clean it:
# drop rows containing missing values and renumber the index
df_fp_matrix = (
    pd.DataFrame(fp_matrix)
    .dropna()
    .reset_index(drop=True)
)



In [None]:
# Element-wise log10 of the feature matrix.
# NOTE(review): the matrix was zero-initialised, so every slot still holding 0
# becomes -inf here (log10(0)); the min-max step below then yields NaN for the
# affected columns. Presumably the later "drop NaN columns" cell relies on this —
# confirm it is intentional.
df_fp_matrix = np.log10(df_fp_matrix)
# Column-wise min-max normalisation
df_fp_matrix = (df_fp_matrix - df_fp_matrix.min()) / (df_fp_matrix.max() - df_fp_matrix.min())
df_fp_matrix

In [None]:
# Remove every column that contains NaN, then renumber the row index
df_fp_matrix = df_fp_matrix.dropna(axis=1).reset_index(drop=True)


In [None]:
# Standardise each column: subtract the column mean and divide by the column standard deviation
df_fp_matrix = (df_fp_matrix - df_fp_matrix.mean()) / df_fp_matrix.std()
df_fp_matrix

In [None]:
# There are too many zero values, so keep only the leading, more complete columns (the first 40)
df_fp_matrix = df_fp_matrix.iloc[:, :40]
df_fp_matrix

In [None]:
# Save df_fp_matrix (the processed fingerprint feature table) as CSV
df_fp_matrix.to_csv('./dataset/BindingDB_PubChem_3D_E3FP_features_dataset.csv', index=False)

In [None]:
# Build every unordered molecule pair (j < i) with its E3FP Tanimoto similarity.
# The names are read from mols3D, the list the fingerprints in binfp were
# computed from; indexing the unfiltered `mols` list would attach the wrong
# names whenever the two lists differ.
mol_names = [mol.GetProp("_Name") for mol in mols3D]

# Fresh column lists on every run, so re-executing the cell works even after
# df3D has been rebound to a DataFrame.
pair_columns = {"MOLI": [], "MOLJ": [], "E3FPTC": [], "i": [], "j": []}
for i in range(len(binfp)):
    for j in range(i):
        pair_columns["MOLI"].append(mol_names[i])
        pair_columns["MOLJ"].append(mol_names[j])
        pair_columns["E3FPTC"].append(DataStructs.TanimotoSimilarity(binfp[i], binfp[j]))
        pair_columns["i"].append(i)
        pair_columns["j"].append(j)

df3D = pd.DataFrame(pair_columns)
df3D

In [None]:
# Save df3D as CSV
df3D.to_csv('./dataset/BindingDB_PubChem_3D_E3FPTC_dataset.csv', index=False)