In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from Bio import SeqIO
import networkx as nx
import numpy as np
import pandas as pd

In [None]:
# Step 1: parse a drug's SMILES string and extract its pharmacophore-style features
def extract_molecular_features(smiles):
    """Return the Morgan fingerprint of a molecule as a NumPy array.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.

    Returns
    -------
    numpy.ndarray
        ``np.array`` conversion of the radius-2 Morgan fingerprint bit vector
        requested with ``nBits=1024``.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` returns ``None`` for ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Report the offending input instead of letting the fingerprint call
        # fail on a None molecule.
        raise ValueError(f"Invalid SMILES: {smiles!r}")
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    # The computed features must be returned; previously the function ended
    # in `pass` and therefore always produced None.
    return np.array(fingerprint)

# Step 2: generate the drug's 2D topology as an atom connectivity matrix
def generate_molecule_graph(smiles):
    """Return the atom adjacency matrix of the molecule described by ``smiles``.

    Only the matrix is returned; the parsed RDKit molecule object is not.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.

    Returns
    -------
    The value produced by ``Chem.GetAdjacencyMatrix`` for the parsed molecule.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` returns ``None`` for ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Surface the bad input explicitly rather than passing None on to RDKit.
        raise ValueError(f"Invalid SMILES: {smiles!r}")
    return Chem.GetAdjacencyMatrix(mol)

# Step 3: parse the protein FASTA file
def parse_protein_fasta(fasta_file):
    """Read a FASTA file into a ``{record id: sequence string}`` dictionary.

    Parameters
    ----------
    fasta_file
        Passed unchanged to ``SeqIO.parse`` together with the ``"fasta"`` format.

    Returns
    -------
    dict
        Maps each record's ``id`` to ``str(record.seq)``. When an id occurs
        more than once, the last record with that id is the one kept.
    """
    return {
        entry.id: str(entry.seq)
        for entry in SeqIO.parse(fasta_file, "fasta")
    }

# Step 4: build the protein graph
def build_protein_graph(sequence):
    """Placeholder for protein-graph construction.

    No graph is built yet: ``sequence`` is ignored and ``None`` is returned.
    """
    # TODO: implement the protein graph construction.
    return None

# Step 5: build the drug graph
def build_drug_graph(molecule, atom_connectivity):
    """Build an undirected NetworkX graph from a molecule's atoms and bonds.

    Parameters
    ----------
    molecule
        Object exposing ``GetAtoms()`` and ``GetBonds()``; each atom provides
        ``GetIdx()`` / ``GetAtomicNum()`` and each bond provides
        ``GetBeginAtomIdx()`` / ``GetEndAtomIdx()``.
    atom_connectivity
        Accepted for the caller's convenience but not used by this function;
        edges are taken from the molecule's bonds.

    Returns
    -------
    networkx.Graph
        One node per atom, keyed by atom index and carrying an
        ``atomic_number`` attribute, and one edge per bond.
    """
    graph = nx.Graph()
    graph.add_nodes_from(
        (atom.GetIdx(), {"atomic_number": atom.GetAtomicNum()})
        for atom in molecule.GetAtoms()
    )
    graph.add_edges_from(
        (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
        for bond in molecule.GetBonds()
    )
    return graph



In [None]:
smiles = "CC(=O)C(O)=O"  # Example SMILES
# generate_molecule_graph returns only the adjacency matrix, so its result
# cannot be unpacked into (molecule, atom_connectivity). Parse the RDKit
# molecule separately; build_drug_graph needs it to iterate atoms and bonds.
molecule = Chem.MolFromSmiles(smiles)
atom_connectivity = generate_molecule_graph(smiles)
molecular_features = extract_molecular_features(smiles)

fasta_file = "R_05_PSSM_input_removeIllegal_ShortSequences.fasta"  # Example FASTA file
protein_sequences = parse_protein_fasta(fasta_file)
# NOTE(review): build_protein_graph declares a single `sequence` parameter but
# receives the whole {id: sequence} dict here — confirm the intended input
# once that function is implemented.
protein_graph = build_protein_graph(protein_sequences)

drug_graph = build_drug_graph(molecule, atom_connectivity)


In [None]:
smiles = "CC(=O)C(O)=O"  # Example SMILES
# Last expression of the cell: the notebook displays whatever the function
# returns (nothing is shown when the return value is None).
extract_molecular_features(smiles)


In [None]:
# Display the value returned for the example molecule; relies on `smiles`
# having been assigned by an earlier cell.
generate_molecule_graph(smiles)

In [None]:
# Load every record of the example FASTA file into an {id: sequence} dict.
fasta_file = "R_05_PSSM_input_removeIllegal_ShortSequences.fasta"  # Example FASTA file
protein_sequences = parse_protein_fasta(fasta_file)

In [None]:
# NOTE(review): the whole {id: sequence} dict is passed, while the function's
# parameter is named `sequence` (singular) — confirm the intended input.
protein_graph = build_protein_graph(protein_sequences)

In [None]:
# Relies on `molecule` and `atom_connectivity` having been assigned by an
# earlier cell; this cell does not create them itself.
drug_graph = build_drug_graph(molecule, atom_connectivity)

In [None]:
# 1. Extract pharmacophore-style features (Morgan fingerprints)
def extract_features(smiles_list, activity_list):
    """Turn paired SMILES strings and activity values into NumPy arrays.

    Entries whose SMILES cannot be used are reported with
    ``print("Invalid SMILES:", ...)`` and dropped together with their
    activity value, so the two returned arrays stay aligned.

    Parameters
    ----------
    smiles_list : iterable
        SMILES entries. Non-string entries (for example a float NaN coming
        from an empty CSV cell) are treated as invalid.
    activity_list : iterable
        Activity value for each SMILES entry, in the same order. Pairing is
        done with ``zip``, so extra items in the longer input are ignored.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, activities)``: one radius-2 Morgan fingerprint
        (requested with ``nBits=1024``) per retained molecule, and the
        matching activity values.
    """
    features = []
    activities = []
    for smiles, activity in zip(smiles_list, activity_list):
        # Only hand real strings to RDKit: a non-string entry would raise
        # inside MolFromSmiles and abort the whole loop instead of being
        # skipped like any other unusable entry.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            print("Invalid SMILES:", smiles)
            continue
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
        features.append(np.array(fingerprint))
        activities.append(activity)
    return np.array(features), np.array(activities)

In [None]:
# 2. Prepare the dataset
# Assumes a dataset containing SMILES strings and their corresponding activity values
# Load the data

file_path = 'drugbank_smiles.csv'  # change to your own file path
data = pd.read_csv(file_path)

smiles_data = data['smiles'].tolist()  # list of SMILES strings
# NOTE(review): 'logP ALOGPS' looks like a continuous logP value rather than a
# 0/1 class label — confirm, since it is used as the training target.
activity_data = data['logP ALOGPS'].tolist()  # corresponding activity values


In [None]:
# Convert the SMILES strings into feature vectors (X) and aligned activity values (y)
X, y = extract_features(smiles_data, activity_data)

In [None]:

# Split the dataset into training and test sets (20% held out, fixed random_state)
# NOTE(review): train_test_split is not imported in the visible cells —
# presumably sklearn.model_selection.train_test_split; confirm an import exists.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# 3. 设计CNN模型
model = Sequential()
model.add(Conv1D(128, kernel_size=5, activation='relu', input_shape=(1024, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# 4. 模型训练
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train.reshape(-1, 1024, 1), y_train, epochs=10, batch_size=32, validation_split=0.1)


In [None]:

# 5. 模型评估
loss, accuracy = model.evaluate(X_test.reshape(-1, 1024, 1), y_test)
print("Test Accuracy:", accuracy)