In [2]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.6


In [156]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint
from rdkit.Chem.Crippen import MolLogP
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [157]:
df = pd.read_csv(r"/content/dataset_v1.csv", usecols = ['SMILES'])
#не тянет весь датасет
df = df.sample(n = 13000, random_state = 9)
train_smiles, test_smiles = train_test_split(df['SMILES'], test_size=0.2, random_state=42)
print(df.head())
#list_of_smiles = df['SMILES']


                                           SMILES
49574             O=c1ccc2cc(O)c(OCc3ccccc3)cc2o1
1530849           CC1Cc2nc(N)nn2CC1c1ccc(O)c(F)c1
1007434  CC(CN1CCOCC1)NC(=O)Cn1cc(C(F)(F)F)ccc1=O
1863635      O=S(=O)(NCc1cc2ccccn2n1)c1c(F)cccc1F
771230      Cc1ccc(OCC(=O)OCc2cccc(C(N)=O)c2)cc1C


In [158]:
def mols(smiles_or_mol):
    if isinstance(smiles_or_mol, str):
        if len(smiles_or_mol) == 0:
            return None
        mol = Chem.MolFromSmiles(smiles_or_mol)

        if mol is None:
            return None
        try:
            Chem.SanitizeMol(mol) # Исправляет неккоректную структуру молекулы
        except ValueError:
            return None
        fingerprint = GetMACCSKeysFingerprint(mol)
        logp = Chem.Crippen.MolLogP(mol)
        return fingerprint, logp
    return smiles_or_mol

In [159]:
# Создаю список Mol объектов из SMILES и получаю MACCS ключей и значений logP для каждой молекулы
fingers_train = []
logp_list_train = []
for smiles in train_smiles:
    fingerprint, logp = mols(smiles)
    if fingerprint is not None and logp is not None:
        fingers_train.append(fingerprint)
        logp_list_train.append(logp)
print(fingers_train[:5])
print(logp_list_train[:5])

[<rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x7f23c8533d10>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x7f23be8aef10>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x7f23be8af370>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x7f23c67c7c30>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x7f23c680ee30>]
[3.4470800000000015, 1.4331999999999998, 3.0237000000000007, 3.5244200000000028, 2.4758200000000006]


In [160]:
X_train = np.array(fingers_train)
y_train = np.array(logp_list_train)

In [161]:
# Создаю список Mol объектов из SMILES и получаю MACCS ключей и значений logP для каждой молекулы
fingers_test = []
logp_list_test = []
for smiles in test_smiles:
    fingerprint, logp = mols(smiles)
    if fingerprint is not None and logp is not None:
        fingers_test.append(fingerprint)
        logp_list_test.append(logp)
print(fingers_test[:5])
print(logp_list_test[:5])

[<rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x7f23be703bc0>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x7f23be703c30>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x7f23be703ca0>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x7f23be703d10>, <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x7f23be703d80>]
[1.729, 1.81908, 2.4681000000000006, 3.327700000000002, 1.9595999999999998]


In [162]:
# Преобразую данных в массивы numpy
X_test = np.array(fingers_test)
y_test = np.array(logp_list_test)
X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))
print(y_test.shape)
print(y_train.shape)

(2600,)
(10400,)


In [163]:
print(X_test.shape)
print(X_train.shape)

(2600, 168)
(10400, 168)


In [164]:
# Веса
weights = np.random.rand(X_train.shape[1], 1)
#weights = weights.T
# Гиперпараметры
learning_rate = 0.001
max_iter = 100
print(weights.shape)

(168, 1)


In [165]:
def line(x, w, b):
  y_predict = np.dot(x, w) + b
  return y_predict.flatten()

def loss(y_true, y_predict):
  l = 0

  for i in range(len(y_true)):
    l += (y_true[i] - y_predict[i])**2
  return l / len(y_true)

In [166]:
def derivative_w_analytic(w, b, X, y):
    dldw = 0
    dldw = 2 * np.dot(X.T, np.dot(X, w) + b - y) / len(y)
    return dldw.mean(axis=1, keepdims=True)


def derivative_b_analytic(w, b, X, y):

    dldb = 2 * (np.dot(X, w) + b - y)
    return dldb.mean(axis=1)

In [167]:
# print(derivative_w_numeric(0.6, 2, delta=0.0001))
# print(derivative_w_numeric(0.23, 2, delta=0.0001))

print(derivative_w_analytic(weights, 1, X_test, y_test))
print(derivative_b_analytic(weights, 1, X_test, y_test))

print(derivative_w_analytic(weights, 1, X_train, y_train))
print(derivative_b_analytic(weights, 1, X_train, y_train))

print(derivative_w_analytic(weights, 1, X_test, y_test).shape)
print(derivative_b_analytic(weights, 1, X_test, y_test).shape)
print(derivative_w_analytic(weights, 1, X_train, y_train).shape)
print(derivative_b_analytic(weights, 1, X_train, y_train).shape)
print(weights.shape)

[[4.31562349e+01]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [1.50637119e-01]
 [0.00000000e+00]
 [0.00000000e+00]
 [4.93801651e-01]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [3.64915567e-01]
 [0.00000000e+00]
 [1.34741539e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [2.44488573e+00]
 [1.01742826e+00]
 [0.00000000e+00]
 [1.70915360e+00]
 [4.94696736e-02]
 [0.00000000e+00]
 [5.89035980e-01]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [4.52781509e+00]
 [4.85869570e+00]
 [5.61851231e-01]
 [0.00000000e+00]
 [7.29127798e+00]
 [6.58405148e+00]
 [1.06643826e+01]
 [3.54797366e-02]
 [3.54797366e-02]
 [3.02934293e+00]
 [7.85890798e+00]
 [3.40931288e+00]
 [0.00000000e+00]
 [2.49771220e-01]
 [1.15253183e+00]
 [7.11552574e+00]
 [3.54797366e-02]
 [0.00000000e+00]
 [4.55742621e-01]
 [4.83376039e+00]
 [1.29038184e+01]
 [2.92592374e+00]
 [3.635480

In [168]:
# def get_grad(y_predicted, y):
#     error = (y_predicted - y)
#     gradient = 2*np.dot(error.T, X)/len(y)
#     return gradient
from tqdm import tqdm

def gd_algorithm(w_start, b_start, X_train, y_train, learning_rate=learning_rate, max_iter=100):
  w, b = w_start, b_start
  ll = []
  X = X_train
  y = y_train
  for iter in tqdm(range(max_iter)):
    y_predicted = line(X, w, b)
    l = loss(y, y_predicted)
    ll.append(l)

    w = w - learning_rate * derivative_w_analytic(w, b, X, y)
    b = b - learning_rate * derivative_b_analytic(w, b, X, y)
  return w, b, ll



In [169]:
w_opt, b_opt, ll = gd_algorithm(weights, 1, X_train, y_train, learning_rate=0.01, max_iter=100)
print("Оптимальные веса:", w_opt)
print("Оптимальные байесы:", b_opt)

100%|██████████| 100/100 [06:11<00:00,  3.72s/it]

Оптимальные веса: [[-3.42555594e-01]
 [ 3.05642475e-01]
 [ 6.46878311e-01]
 [ 3.02196386e-01]
 [ 9.14253175e-01]
 [ 4.32783200e-01]
 [ 8.49828555e-01]
 [ 5.35017864e-02]
 [ 9.73909551e-01]
 [ 3.35602746e-01]
 [ 7.78762163e-01]
 [ 4.55440822e-01]
 [ 6.59872168e-01]
 [ 3.07174552e-01]
 [ 7.27662913e-01]
 [ 1.70616422e-01]
 [ 9.62395800e-01]
 [ 2.74470980e-01]
 [ 4.71664069e-01]
 [ 9.88815216e-01]
 [ 3.73474241e-01]
 [ 5.52989896e-01]
 [ 1.89951431e-01]
 [ 6.86337128e-02]
 [ 9.30738763e-01]
 [ 9.12660873e-01]
 [ 8.40277946e-01]
 [ 6.00411153e-01]
 [ 5.37422981e-03]
 [ 7.56202986e-01]
 [ 8.57346664e-01]
 [ 6.96503120e-02]
 [ 1.61486459e-01]
 [ 5.23198882e-01]
 [ 1.98921753e-01]
 [ 4.15821537e-01]
 [ 8.57478971e-01]
 [ 6.64492575e-01]
 [ 5.88198484e-01]
 [ 2.07883958e-01]
 [ 1.95237105e-01]
 [ 6.78211632e-01]
 [ 1.74188489e-01]
 [ 1.70916956e-01]
 [ 1.87478239e-01]
 [ 5.86571368e-01]
 [ 1.73615798e-01]
 [ 9.37970938e-02]
 [ 2.75849842e-01]
 [ 8.73502035e-02]
 [ 8.91197813e-01]
 [ 4.51200484




In [170]:
print(X_train.shape)
print(y_train.shape)
print(w_opt.shape)
print(b_opt.shape)
b_opt = b_opt.reshape(-1, 1)
print(b_opt.shape)

(10400, 168)
(10400,)
(168, 1)
(10400,)
(10400, 1)


In [181]:
import sklearn

# Функция для оценки модели
def productive(X, y, w, b):
  y_pred = line(X, w, b)
  mse = sklearn.metrics.mean_squared_error(y, y_pred)
  r2 = sklearn.metrics.r2_score(y, y_pred)
  return mse, r2
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
# Оценка модели на обучающем наборе
train_mse, train_r2 = productive(X_train, y_train, w_opt, b_opt)
print("GD_train MSE:", train_mse)
print("GD_train R²:", train_r2)

# Оценка модели на тестовом наборе
# test_mse, test_r2 = productive(X_test, y_test, w_opt, b_opt)
# print("Test MSE:", test_mse)
# print("Test R²:", test_r2)


(10400, 168) (10400,)
(2600, 168) (2600,)
GD_train MSE: 3.2720046594399417
GD_train R²: -2.875599330864896
