In [1]:
# Импорт библиотек
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('df_cleaned_rdkit_morgan.csv')

print("Минимум IC50_nM:", df['IC50_nM'].min())
print("Количество значений <= 0:", (df['IC50_nM'] <= 0).sum())
print("Пропущенных значений в IC50_nM:", df['IC50_nM'].isna().sum())

Минимум IC50_nM: 0.0
Количество значений <= 0: 7
Пропущенных значений в IC50_nM: 0


In [3]:
# Загрузка датасета с Morgan-фингерпринтами
# Удаляем строки, где IC50_nM <= 0, чтобы избежать log(0) и отрицательных значений
df_filtered = df[df['IC50_nM'] > 0].copy()

X = df_filtered.drop(columns=['canonical_smiles', 'IC50_nM'])
y = -np.log10(df_filtered['IC50_nM'])  # pIC50

# Тренировочное и тестовое разбиение
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабируем признаки
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
print("Минимум IC50_nM:", y.min())
print("Количество значений <= 0:", (y <= 0).sum())
print("Пропущенных значений в IC50_nM:", y.isna().sum())

Минимум IC50_nM: -8.287400000005613
Количество значений <= 0: 5064
Пропущенных значений в IC50_nM: 0


In [5]:
# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_scaled, y_train)
rf_pred = rf.predict(X_test_scaled)


In [6]:
# Gradient Boosting
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train_scaled, y_train)
gb_pred = gb.predict(X_test_scaled)


In [7]:
# MLP
mlp = MLPRegressor(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42)
mlp.fit(X_train_scaled, y_train)
mlp_pred = mlp.predict(X_test_scaled)


In [14]:
# CNN-модель
class SimpleCNN(nn.Module):
    def __init__(self, input_len):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 32, kernel_size=5)
        self.pool = nn.MaxPool1d(2)
        self.fc1 = nn.Linear(32 * ((input_len - 5 + 1) // 2), 64)
        self.out = nn.Linear(64, 1)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        return self.out(x)

# Перевод в Tensor
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).unsqueeze(1)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

model = SimpleCNN(X_train_tensor.shape[2])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

for epoch in range(10):
    model.train()
    for xb, yb in train_loader:
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}: loss = {loss.item():.4f}")

# Предсказания
model.eval()
with torch.no_grad():
    cnn_pred = model(X_test_tensor).squeeze().numpy()


Epoch 1: loss = 2.2679
Epoch 2: loss = 2.7684
Epoch 3: loss = 1.2250
Epoch 4: loss = 1.9655
Epoch 5: loss = 2.2804
Epoch 6: loss = 1.6605
Epoch 7: loss = 5.7813
Epoch 8: loss = 2.4446
Epoch 9: loss = 0.6288
Epoch 10: loss = 1.1752
Epoch 11: loss = 2.5598
Epoch 12: loss = 0.8933
Epoch 13: loss = 2.3354
Epoch 14: loss = 1.3820
Epoch 15: loss = 1.4640
Epoch 16: loss = 1.3031
Epoch 17: loss = 1.4693
Epoch 18: loss = 2.2011
Epoch 19: loss = 3.2204
Epoch 20: loss = 1.6472
Epoch 21: loss = 2.4351
Epoch 22: loss = 1.2130
Epoch 23: loss = 1.6982
Epoch 24: loss = 3.2655
Epoch 25: loss = 1.2857
Epoch 26: loss = 1.3967
Epoch 27: loss = 1.2955
Epoch 28: loss = 1.8496
Epoch 29: loss = 2.2138
Epoch 30: loss = 1.5902
Epoch 31: loss = 1.7803
Epoch 32: loss = 2.3555
Epoch 33: loss = 2.3375
Epoch 34: loss = 2.8584
Epoch 35: loss = 3.1264
Epoch 36: loss = 3.4093
Epoch 37: loss = 1.8482
Epoch 38: loss = 1.5588
Epoch 39: loss = 2.1392


KeyboardInterrupt: 

In [11]:
# Функция оценки модели
def evaluate(y_true, y_pred):
    return {
        'RMSE': root_mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred)
    }

results = {
    'RandomForest': evaluate(y_test, rf_pred),
    'GradientBoosting': evaluate(y_test, gb_pred),
    'MLP': evaluate(y_test, mlp_pred),
    'CNN': evaluate(y_test, cnn_pred)
}
pd.DataFrame(results).T

Unnamed: 0,RMSE,MAE,R2
RandomForest,0.87536,0.652285,0.578797
GradientBoosting,1.035938,0.830903,0.41009
MLP,0.960549,0.70117,0.492826
CNN,0.987978,0.738633,0.463447
