In [1]:
from pathlib import Path

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from catboost import CatBoostRegressor
import numpy as np
from joblib import dump, load

In [2]:
# Build the working table from two CSV sources, keeping only the SMILES
# string and the PDSC value from each of them.
neftekod_df = pd.read_csv('./data/quantitive_neftekod25_data.csv')
# De-duplicate the first source on SMILES, keeping the first occurrence.
neftekod_df = neftekod_df.drop_duplicates('SMILES', keep='first')
export_df = pd.read_csv('./data/export.csv')

kept_columns = ['SMILES', 'PDSC']
df = pd.concat([neftekod_df[kept_columns], export_df[kept_columns]])

# Keep strictly positive PDSC only; NaN fails the comparison and is dropped too.
df = df.loc[df['PDSC'] > 0]

In [3]:
# Preview the first five rows of the combined SMILES / PDSC table.
df.head()

Unnamed: 0,SMILES,PDSC
0,C1=CC=C(C=C1)NC2=CC=CC=C2,1.6
8,CC(C)(C)CC(C)(C)C1=CC=CC=C1NC2=CC=CC3=CC=CC=C32,10.93
13,C1(=CC=CC=C1N(C2=CC=CC=C2CCCCCCCCC)[H])CCCCCCCCC,3.985
18,C1=CC=C(C=C1)NC2=CC=CC3=CC=CC=C32,10.86
23,CC1=C(C(=CC=C1)O)C,6.855


In [4]:
def get_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors.descList`` for one molecule.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    dict
        Maps each descriptor name to its computed value. When ``smiles`` is
        not a string (e.g. a NaN cell read from a CSV) or cannot be parsed
        into a molecule, every descriptor is NaN instead, so one bad record
        does not abort a whole batch; such rows can then be removed with
        ``DataFrame.dropna()``.
    """
    # MolFromSmiles signals a parse failure by returning None rather than raising.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return {descriptor_name: float('nan') for descriptor_name, _ in Descriptors.descList}

    return {
        descriptor_name: descriptor_func(mol)
        for descriptor_name, descriptor_func in Descriptors.descList
    }

# One descriptor dict per molecule, in the row order of df.
descriptors_list = [get_descriptors(smi) for smi in df['SMILES']]
descriptors_df = pd.DataFrame(descriptors_list)

# Put both frames on a fresh positional index so they line up side by side,
# then discard every row that has a missing value in any column.
targets_part = df.reset_index(drop=True)
descriptors_part = descriptors_df.reset_index(drop=True)
df = pd.concat([targets_part, descriptors_part], axis=1).dropna()

# Columns with at most one distinct value carry no information; remove them.
constant_columns = df.columns[df.nunique() <= 1]
df = df.drop(columns=constant_columns)

In [5]:
# Correlate every descriptor column with PDSC and display the ten with the
# largest absolute correlation.
descriptor_columns = df.drop(columns=['SMILES', 'PDSC'])
pdsc_correlation = descriptor_columns.corrwith(df['PDSC'])
pdsc_correlation.sort_values(ascending=False, key=abs).head(10)

EState_VSA4      0.649159
VSA_EState8      0.535111
EState_VSA3      0.512979
SPS              0.497038
BCUT2D_LOGPHI    0.492175
BCUT2D_MWLOW    -0.487608
BCUT2D_CHGHI     0.484440
BertzCT          0.477339
Chi2n            0.468391
BCUT2D_CHGLO    -0.457150
dtype: float64

In [6]:
# Cut PDSC into (up to) five quantile bins, encoded as integer codes.
# The codes are used later as the stratification labels for cross-validation.
pdsc_bin_codes = pd.qcut(df['PDSC'], q=5, labels=False, duplicates='drop')
df['bins'] = pdsc_bin_codes

In [7]:
# Feature matrix: every remaining descriptor column; target: PDSC.
X = df.drop(columns=['SMILES', 'PDSC', 'bins'])
y = df['PDSC']

# Baseline regressors, each evaluated behind a StandardScaler.
# Estimators that use randomness get a fixed seed so the printed scores are
# reproducible from one run to the next.
models = [
    ('Linear Regression', ElasticNet(alpha=1.6)),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('SVR', SVR()),
    ('KNN', KNeighborsRegressor()),
    ('XGBoost', xgb.XGBRegressor(random_state=42))
]

# Stratify the folds on the PDSC quantile bins so every fold covers the whole
# target range. The splits are materialised once and shared by all models,
# which makes it explicit that they are scored on identical folds.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
cv_splits = list(skf.split(X, df['bins']))
results = []

for name, model in models:
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    cv_scores = cross_val_score(pipeline, X, y, cv=cv_splits, scoring='r2')
    results.append((name, np.mean(cv_scores), np.std(cv_scores)))


# Report the models from best to worst mean R2.
results.sort(key=lambda x: x[1], reverse=True)
for res in results:
    print(f"{res[0]}: R2 = {res[1]:.3f} (±{res[2]:.3f})")

Linear Regression: R2 = 0.447 (±0.239)
SVR: R2 = 0.094 (±0.053)
KNN: R2 = 0.077 (±0.316)
Random Forest: R2 = -0.021 (±0.322)
XGBoost: R2 = -0.059 (±0.150)


In [8]:
# CatBoost with default hyper-parameters, scored on stratified folds built
# from the PDSC bins.
cat_model = CatBoostRegressor(verbose=0, random_state=42)
# NOTE(review): rcat_model is configured identically to cat_model and is not
# referenced by any other visible cell - confirm whether it is still needed.
rcat_model = CatBoostRegressor(verbose=0, random_state=42)

catboost_folds = skf.split(X, df['bins'])
cv_scores = cross_val_score(cat_model, X, y, scoring='r2', cv=catboost_folds)
print(f"CatBoost: R2 = {np.mean(cv_scores):.3f} (±{np.std(cv_scores):.3f})")

CatBoost: R2 = -0.018 (±0.127)


In [9]:
# Grid search over CatBoost tree depth, learning rate and L2 leaf
# regularisation with a fixed number of iterations.
# GridSearchCV is imported in the notebook's import cell.
params = {
    'depth': [1, 2, 3, 4],
    'learning_rate': [0.01, 0.1, 0.2, 0.005],
    'iterations': [400],
    'l2_leaf_reg': [0.05, 0.1, 0.3]
}

grid = GridSearchCV(
    # Fixed seed, matching the other CatBoost models in this notebook, so the
    # selected parameters and best score are reproducible.
    estimator=CatBoostRegressor(verbose=0, random_state=42),
    param_grid=params,
    cv=skf.split(X, df['bins']),
    scoring='r2'
)

grid.fit(X, y)
print(f"Best CatBoost: {grid.best_score_:.3f}")
print(f"Best params: {grid.best_params_}")

Best CatBoost: 0.036
Best params: {'depth': 1, 'iterations': 400, 'l2_leaf_reg': 0.3, 'learning_rate': 0.005}


In [10]:
# Tune ElasticNet (behind a StandardScaler) over alpha and l1_ratio using
# 3-fold cross-validation stratified on the PDSC bins.
stratified_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

scaling_step = ('scaler', StandardScaler())
elastic_step = ('elastic', ElasticNet(max_iter=50000))
pipeline = Pipeline([scaling_step, elastic_step])

# Ten log-spaced alphas between 1e-3 and 1e1, crossed with five L1/L2 mixes.
alpha_grid = np.logspace(-3, 1, 10)
l1_ratio_grid = [0.1, 0.3, 0.5, 0.7, 0.9]
param_grid = {
    'elastic__alpha': alpha_grid,
    'elastic__l1_ratio': l1_ratio_grid
}

search_folds = stratified_cv.split(X, df['bins'])
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=search_folds,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X, y)

print(f"Лучший R2: {grid_search.best_score_:.3f}")
print(f"Лучшие параметры: {grid_search.best_params_}")

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Лучший R2: 0.426
Лучшие параметры: {'elastic__alpha': 1.2915496650148828, 'elastic__l1_ratio': 0.7}


In [11]:
# Fit a scaled ElasticNet on the full data with fixed hyper-parameters and
# rank the features by the magnitude of their coefficients.
best_params = {'alpha': 1.3, 'l1_ratio': 0.7}

final_elastic = ElasticNet(**best_params, max_iter=50000)
model = Pipeline([
    ('scaler', StandardScaler()),
    ('elastic', final_elastic)
])
model.fit(X, y)

# Features are standardised before the fit, so the coefficient magnitudes
# are on a common scale.
coefficients = model.named_steps['elastic'].coef_
importance = pd.DataFrame({
    'Feature': X.columns.tolist(),
    'Coefficient': coefficients,
    'Abs_Effect': np.abs(coefficients)
})
importance = importance.sort_values('Abs_Effect', ascending=False)
importance.head(15)

Unnamed: 0,Feature,Coefficient,Abs_Effect
80,EState_VSA4,1.689876,1.689876
79,EState_VSA3,1.142696,1.142696
90,VSA_EState4,0.965614,0.965614
82,EState_VSA6,-0.787312,0.787312
71,SlogP_VSA4,0.511104,0.511104
27,BertzCT,0.477107,0.477107
18,BCUT2D_MWLOW,-0.178677,0.178677
98,NHOHCount,0.172273,0.172273
21,BCUT2D_LOGPHI,0.156446,0.156446
40,HallKierAlpha,-0.101685,0.101685


In [12]:
# Refit the ElasticNet pipeline on only the ten features with the largest
# absolute coefficients, then persist it together with its feature list.
top_features = importance.head(10)['Feature'].tolist()
X_top = X[top_features]
model_top = Pipeline([
    ('scaler', StandardScaler()),
    ('elastic', ElasticNet(**best_params, max_iter=50000))
])
model_top.fit(X_top, y)

# Store the feature names next to the model so that whoever loads it can
# select the same columns in the same order.
model_data = {
    'model': model_top,
    'features': top_features
}

# joblib.dump does not create missing parent directories; make sure the
# target folder exists so the cell also works on a fresh checkout.
Path('./models').mkdir(parents=True, exist_ok=True)
dump(model_data, './models/model_iter_0')

['./models/model_iter_0']