In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("cox2_with_rdkit_descriptors.csv")

df = df[df['IC50_nM'] > 0].copy()
df['pIC50'] = -np.log10(df['IC50_nM'] * 1e-9)

In [2]:
from sklearn.preprocessing import StandardScaler

X = df.drop(columns=['Smiles', 'IC50_nM', 'pIC50'])
y = df['pIC50']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [3]:
from sklearn.decomposition import PCA

pca = PCA(n_components=50)
X_pca = pca.fit_transform(X_scaled)

In [4]:
import random
import torch

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [5]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

results = {}

for name, model in models.items():
    r2 = cross_val_score(model, X_scaled, y, cv=kf, scoring='r2')
    mae = cross_val_score(model, X_scaled, y, cv=kf, scoring='neg_mean_absolute_error')
    rmse = cross_val_score(model, X_scaled, y, cv=kf, scoring='neg_root_mean_squared_error')

    results[name] = {
        'R2 mean': r2.mean(),
        'R2 std': r2.std(),
        'MAE mean': -mae.mean(),
        'RMSE mean': -rmse.mean()
    }

In [16]:
results_df = pd.DataFrame(results).T
results_df[['R2 mean', 'MAE mean', 'RMSE mean']]

Unnamed: 0,R2 mean,MAE mean,RMSE mean
Random Forest,0.597103,0.706625,0.941067
XGBoost,0.546488,0.763365,0.998446


In [6]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)

dataset = TensorDataset(X_tensor, y_tensor)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.net(x)

model = MLP(X_tensor.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

for epoch in range(30):
    for xb, yb in loader:
        pred = model(xb)
        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

print("Final MSE:", loss.item())


Final MSE: 0.45987680554389954


In [5]:
fps_df = pd.read_csv("cox2_with_morgan_fp.csv")

fps_df = fps_df[fps_df['IC50_nM'] > 0].copy()
fps_df['pIC50'] = -np.log10(df['IC50_nM'] * 1e-9)

X_fps = fps_df[[col for col in fps_df.columns if col.startswith('FP_')]].values
y = fps_df['pIC50'].values

In [7]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

class CNN1D(nn.Module):
    def __init__(self, input_length=2048):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=7, padding=3),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(32, 64, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool1d(2)
        )
        self.fc = nn.Sequential(
            nn.Linear((input_length // 4) * 64, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
    def forward(self, x):
        x = self.conv(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

In [8]:
# Проверка и установка устройства
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")
model = CNN1D().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()
X_fps_tensor = torch.tensor(X_fps, dtype=torch.float32).unsqueeze(1).to(device)
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1).to(device)
dataset = TensorDataset(X_fps_tensor, y_tensor)
loader = DataLoader(dataset, batch_size=64, shuffle=True)


Using device: mps


In [9]:
for epoch in range(10):
    model.train()
    total_loss = 0
    for xb, yb in loader:
        pred = model(xb)
        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    avg_loss = total_loss / len(dataset)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

Epoch 1, Loss: 1.6709
Epoch 2, Loss: 1.1954
Epoch 3, Loss: 1.0287
Epoch 4, Loss: 0.9030
Epoch 5, Loss: 0.7911
Epoch 6, Loss: 0.7003
Epoch 7, Loss: 0.6291
Epoch 8, Loss: 0.5699
Epoch 9, Loss: 0.5096
Epoch 10, Loss: 0.4666


In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

model.eval()
all_preds = []
all_true = []

with torch.no_grad():
    for xb, yb in loader:  
        pred = model(xb)
        all_preds.append(pred.cpu())
        all_true.append(yb.cpu())

preds = torch.cat(all_preds).numpy().flatten()
true_vals = torch.cat(all_true).numpy().flatten()

mae = mean_absolute_error(true_vals, preds)
rmse = mean_squared_error(true_vals, preds, squared=False)
r2 = r2_score(true_vals, preds)

print(f"CNN metrics:")
print(f"MAE  = {mae:.4f}")
print(f"RMSE = {rmse:.4f}")
print(f"R²   = {r2:.4f}")

CNN metrics:
MAE  = 0.4586
RMSE = 0.6178
R²   = 0.8264




Выводы:
CNN справляется лучше всех — особенно хорошо прогнозирует низкие значения активности (видно по MAE).
Модели дерева (RF, XGB) явно переобучаются или не извлекают глубокую структуру из Morgan фингерпринтов.
MLP — неплохой компромисс, но CNN лучше ловит локальные паттерны в бинарных векторах (а это и есть идея фингерпринтов!).
Стоит сохранить CNN как финальную модель и, возможно, протестировать её на hold-out выборке или внешнем наборе.