In [10]:
!pip install lightgbm numpy pandas rdkit scikit-learn #在终端用pip(不用写感叹号)下载lightgbm库、numpy库、pandas库、rdkit库、scikit-learn库

Collecting rdkit
  Downloading rdkit-2023.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.5/30.5 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.1


In [26]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import fbeta_score
import numpy as np
import matplotlib.pyplot as plt
import os

# 读取数据
DATA_PATH="data"
data = pd.read_csv(f"{DATA_PATH}/mol_train.csv")
features = data['SMILES']
labels = data['TARGET']

# 定义计算1D-QSAR特征的函数
def calculate_1dqsar_repr(smiles):
    mol = Chem.MolFromSmiles(smiles)
    mol_weight = Descriptors.MolWt(mol)  # 计算分子的分子量
    log_p = Descriptors.MolLogP(mol)  # 计算分子的LogP值
    num_h_donors = Descriptors.NumHDonors(mol)  # 计算分子中的氢键供体数量
    num_h_acceptors = Descriptors.NumHAcceptors(mol)  # 计算分子中的氢键受体数量
    tpsa = Descriptors.TPSA(mol)  # 计算分子的表面积极性
    num_rotatable_bonds = Descriptors.NumRotatableBonds(mol)  # 计算分子中的可旋转键数量
    num_aromatic_rings = Descriptors.NumAromaticRings(mol)  # 计算分子中的芳香环数量
    num_aliphatic_rings = Descriptors.NumAliphaticRings(mol)  # 计算分子中的脂环数量
    num_saturated_rings = Descriptors.NumSaturatedRings(mol)  # 计算分子中的饱和环数量
    num_heteroatoms = Descriptors.NumHeteroatoms(mol)  # 计算分子中的杂原子数量
    num_valence_electrons = Descriptors.NumValenceElectrons(mol)  # 计算分子中的价电子数量
    num_radical_electrons = Descriptors.NumRadicalElectrons(mol)  # 计算分子中的自由基电子数量
    num_polar_hydrogens = Descriptors.NumHAcceptors(mol)  # 计算极性氢原子数量
    # 计算N原子的数量
    count_n = 0
    for atom in mol.GetAtoms():
        atomic_num = atom.GetAtomicNum()
        if atomic_num == 7:  # 7代表氮原子的原子序数
            count_n += 1

    # 返回特征及其简写名称
    feature_names = ['mw', 'log_p', 'nhd', 'nha', 'tpsa', 'nrb',
                     'nar', 'nalr', 'nsr', 'nh',
                     'nve', 'nre', 'count_n', 'nph']
    features_repr = [mol_weight, log_p, num_h_donors, num_h_acceptors, tpsa, num_rotatable_bonds,
                     num_aromatic_rings, num_aliphatic_rings, num_saturated_rings, num_heteroatoms,
                     num_valence_electrons, num_radical_electrons, count_n, num_polar_hydrogens]
    return feature_names, features_repr
# 初始化特征列表
all_features = []

# 计算1D-QSAR特征
for smiles in features:
    feature_names, features_repr = calculate_1dqsar_repr(smiles)  # 调用计算函数获取特征及其名称
    all_features.append(features_repr)  # 将特征添加到特征列表

# 将特征数据和标签数据拆分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(all_features, labels, test_size=0.2, random_state=42)

# 创建随机森林分类器
clf = RandomForestClassifier()

# 在训练集上拟合模型
clf.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = clf.predict(X_test)

# 提取特征重要性
importances = clf.feature_importances_

# 可视化特征重要性
# X_train = np.array(X_train)
# indices = np.argsort(importances)[::-1]  # 特征重要性排序的索引

# plt.figure()
# plt.title("Feature Importance")
# plt.bar(range(X_train.shape[1]), importances[indices], align="center")
# plt.xticks(range(X_train.shape[1]), [feature_names[i] for i in indices])
# plt.xlabel("Feature")
# plt.ylabel("Importance")
# plt.show()

# 计算f2-score
f2_score = fbeta_score(y_test, y_pred, beta=2)

print("F2-score: ", f2_score)

F2-score:  0.8222222222222222
