In [None]:
!pip install datamol
!pip install molfeat
import pandas as pd
import datamol as dm
import numpy as np

from molfeat.calc import RDKitDescriptors2D, FPCalculator, MordredDescriptors
from molfeat.trans import MoleculeTransformer

In [None]:
!pip install rdkit

import pandas as pd
import rdkit
from rdkit import Chem
from collections import Counter



In [None]:
# Mount Google Drive at /content/drive (Colab-only); later cells read CSV files from
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the train and test tables with datamol, treating "Drug" as the SMILES column and
# the first CSV column as the index.
# Later cells read a "mol" column from these frames — presumably created by dm.read_csv
# from the SMILES column; confirm against the datamol documentation.
train_data = dm.read_csv("/content/train_admet.csv", smiles_column="Drug", index_col=0)
test_data = dm.read_csv("/content/test_data.csv", smiles_column="Drug", index_col=0)



In [None]:
# Precomputed training feature table read from the mounted Drive.
# NOTE(review): the displayed frame contains "Unnamed: 0.1" and "Unnamed: 0" columns, which
# looks like saved index columns being read back as data — confirm whether index_col=0 is wanted.
train_features_by_others = pd.read_csv('/content/drive/MyDrive/X_train_features_2.csv')


In [None]:
# Precomputed test feature table read from the mounted Drive (counterpart of the
# X_train_features_2.csv table, per the file name).
test_features_by_others = pd.read_csv('/content/drive/MyDrive/X_test_features_2.csv')

In [None]:
# Display the loaded training feature table (the notebook shows a truncated preview).
train_features_by_others

Unnamed: 0.1,Unnamed: 0,[Na+],[cH-],N,[Na],[CH2-],C,Cl,[NH2+],[o+],...,N-C(SINGLE),N-C(AROMATIC),C-H(SINGLE),N-S(SINGLE),P-O(SINGLE),O-S(SINGLE),C-O(AROMATIC),O-S(DOUBLE),N-C(TRIPLE),C-N(AROMATIC)
0,0,0,0,0,0,0,4,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,2
2,2,0,0,3,0,0,4,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,3,0,0,2,0,0,4,0,0,0,...,2,0,0,0,0,0,0,0,0,0
4,4,0,0,2,0,0,12,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7934,7934,0,0,0,0,0,0,2,0,0,...,0,2,0,0,0,0,0,0,0,2
7935,7935,0,0,2,0,0,10,0,0,0,...,0,4,0,0,0,0,0,0,0,4
7936,7936,0,0,1,0,0,4,0,0,1,...,2,1,0,0,0,0,1,0,0,0
7937,7937,0,0,0,0,0,7,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [None]:
def get_atom_chars(smi):
    """Return one SMILES token per atom of the molecule described by ``smi``.

    The SMILES is parsed without sanitization. Each atom is then copied into its
    own empty ``RWMol`` and written back out as a SMILES string.

    Args:
        smi: SMILES string of the molecule.

    Returns:
        list[str]: per-atom SMILES strings, in the molecule's atom order.

    Raises:
        ValueError: if RDKit cannot parse ``smi``.
    """
    mol = Chem.MolFromSmiles(smi, sanitize=False)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail with the
        # offending input rather than an opaque AttributeError on mol.GetAtoms().
        raise ValueError(f"Could not parse SMILES: {smi!r}")

    atoms_chars = []
    for a in mol.GetAtoms():
        single_atom = Chem.RWMol()
        single_atom.AddAtom(a)
        atoms_chars.append(Chem.MolToSmiles(single_atom))
    return atoms_chars

In [None]:
# Collect every distinct per-atom SMILES token that occurs in either dataset.
unique_atoms = set()
for dataset in (train_data, test_data):
    for smi in dataset['Drug']:
        unique_atoms.update(get_atom_chars(smi))

In [None]:
# Freeze the token vocabulary as a list in sorted order. Iteration order of a set of
# strings can differ between interpreter runs, which would shuffle the feature columns
# built from this list and make results harder to reproduce.
unique_atoms = sorted(unique_atoms)

In [None]:
# Parse every "Drug" SMILES into a sanitized RDKit molecule, keeping each table's row order.
mols_train = [
    Chem.MolFromSmiles(train_data.iloc[idx]['Drug'], sanitize=True)
    for idx in range(len(train_data))
]
mols_test = [
    Chem.MolFromSmiles(test_data.iloc[idx]['Drug'], sanitize=True)
    for idx in range(len(test_data))
]

In [None]:
from collections import Counter


def _atom_count_rows(data):
    """Build one record per row of ``data``.

    Each record maps every token in ``unique_atoms`` to how often it occurs in the
    row's "Drug" SMILES, and additionally carries the row's 'property' value.

    Args:
        data: table with a "Drug" (SMILES) column and a 'property' column.

    Returns:
        list[dict]: one dict per row, in row order.
    """
    rows = []
    for smi, prop in zip(data['Drug'], data['property']):
        seen = Counter(get_atom_chars(smi))
        # Emit every vocabulary token (0 when absent) so all records share the same
        # keys in the same order.
        record = {atom: seen[atom] for atom in unique_atoms}
        record['property'] = prop
        rows.append(record)
    return rows


# Despite the "fracs" name these are raw per-atom counts, not fractions.
atoms_fracs_train = _atom_count_rows(train_data)
atoms_fracs_test = _atom_count_rows(test_data)

In [None]:
# Turn the per-molecule count records into tables: one row per record, one column per key.
atoms_fracs_train, atoms_fracs_test = (
    pd.DataFrame(records) for records in (atoms_fracs_train, atoms_fracs_test)
)


In [None]:
# from rdkit.Chem import AllChem

# MFP_train_bits = [AllChem.GetMorganFingerprintAsBitVect(i, 10, 512).ToList() for i in mols_train]
# MFP_test_bits = [AllChem.GetMorganFingerprintAsBitVect(i, 10, 512).ToList() for i in mols_test]

In [None]:
# atoms_fracs_train[[f'{i}_' for i in range(512)]] = MFP_train_bits
# atoms_fracs_test[[f'{i}_' for i in range(512)]] = MFP_test_bits

In [None]:
from molfeat.trans.concat import FeatConcat
from molfeat.trans.fp import FPVecTransformer

# Named `rdkit_desc2d` rather than `rdkit` so the `rdkit` module imported in an earlier
# cell is not rebound to a transformer object.
rdkit_desc2d = FPVecTransformer("desc2D", n_jobs=8, dtype=np.float32, replace_nan=True)
maccs = FPVecTransformer("maccs", dtype=np.float32)
ecfp4 = FPVecTransformer("ecfp:4", dtype=np.float32)

# Combine the three transformers with FeatConcat, in this order: MACCS, ECFP4, desc2D.
featurizer = FeatConcat([maccs, ecfp4, rdkit_desc2d], dtype=np.float32)

# Featurize the "mol" column of both tables with RDKit logging silenced.
with dm.without_rdkit_log():
    feats = featurizer(train_data["mol"])
    feats_test = featurizer(test_data["mol"])



In [None]:
feats = pd.DataFrame(feats)
# Name the columns "0_", "1_", ... from the actual feature width instead of a hard-coded
# 2383, so a different featurizer output size does not raise a length-mismatch error.
feats.columns = [f"{i}_" for i in range(feats.shape[1])]

In [None]:
feats_test = pd.DataFrame(feats_test)
# Name the columns "0_", "1_", ... from the actual feature width instead of a hard-coded
# 2383, so a different featurizer output size does not raise a length-mismatch error.
feats_test.columns = [f"{i}_" for i in range(feats_test.shape[1])]

In [None]:
# Training feature matrix: atom-count table and molfeat feature table joined column-wise
# (pd.concat with axis=1 aligns rows on the index).
X = pd.concat([atoms_fracs_train, feats], axis=1)

In [None]:
# Test feature matrix: atom-count table and molfeat feature table joined column-wise
# (pd.concat with axis=1 aligns rows on the index).
X_test = pd.concat([atoms_fracs_test, feats_test], axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 75/25 hold-out split on the label column train_data.Y, seeded for repeatability.
# NOTE(review): later cells add columns to X and rename its columns. X_train / X_val
# produced here are separate frames, so this cell has to be re-run after those cells for
# the splits to carry the final feature set.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    list(train_data.Y),
    test_size=0.25,
    stratify=list(train_data.Y),
    random_state=42,
)

In [None]:
# Display the combined training feature matrix (the notebook shows a truncated preview).
X

Unnamed: 0,[Na+],[cH-],N,[Na],[CH2-],C,Cl,[NH2+],[o+],[NH-],...,d_200,d_201,d_202,d_203,d_204,d_205,d_206,d_207,d_208,d_209
0,0,0,0,0,0,4,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,3,0,0,4,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,2,0,0,4,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,0,2,0,0,12,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7934,0,0,0,0,0,0,2,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7935,0,0,2,0,0,10,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7936,0,0,1,0,0,4,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7937,0,0,0,0,0,7,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

# Names of every descriptor listed in RDKit's descriptor table.
nms = [x[0] for x in Descriptors._descList]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(nms)

# One tuple of descriptor values per molecule, in the order of `nms`.
descs_train = [calc.CalcDescriptors(x) for x in mols_train]
descs_test = [calc.CalcDescriptors(x) for x in mols_test]

# Size the "d_<i>" columns from the descriptor list itself. The number of entries in
# Descriptors._descList depends on the installed RDKit release, so a hard-coded 210
# fails with a shape mismatch on other versions.
desc_cols = [f'd_{i}' for i in range(len(nms))]

# NOTE(review): X_train / X_val created by the earlier train_test_split cell are separate
# frames and do not receive these columns.
X[desc_cols] = descs_train
X_test[desc_cols] = descs_test

In [None]:
# Load a precomputed descriptor table for the training set from Drive (Mordred descriptors,
# per the file name).
# NOTE(review): this read emits a warning in the recorded output and the frame is not used
# in the cells visible here — confirm whether it is still needed.
Mordred_descr_train = pd.read_csv('/content/drive/MyDrive/Mordred_descr_train.csv')

  Mordred_descr_train = pd.read_csv('/content/drive/MyDrive/Mordred_descr_train.csv')


In [None]:
# Load a precomputed descriptor table for the test set from Drive (Mordred descriptors,
# per the file name).
# NOTE(review): this read emits a warning in the recorded output and the frame is not used
# in the cells visible here — confirm whether it is still needed.
Mordred_descr_test = pd.read_csv('/content/drive/MyDrive/Mordred_descr_test.csv')

  Mordred_descr_test = pd.read_csv('/content/drive/MyDrive/Mordred_descr_test.csv')


In [None]:
from catboost import CatBoostClassifier, Pool

# CatBoost member of the voting ensemble; 'property' is passed as a categorical feature.
clf = CatBoostClassifier(
    # objective and reporting metric
    loss_function='Logloss',
    eval_metric='AUC',
    # boosting schedule and tree size
    iterations=1000,
    learning_rate=0.11,
    depth=7,
    # categorical-feature handling
    cat_features=['property'],
    simple_ctr='Counter',
    combinations_ctr='Counter',
    # class weighting and seed
    auto_class_weights='SqrtBalanced',
    random_state=42,
)


In [None]:
# XGBClassifier was used without ever being imported, so this cell raised NameError on a
# fresh kernel.
from xgboost import XGBClassifier

# XGBoost member of the voting ensemble (binary logistic objective, AUC metric).
# NOTE(review): enable_categorical=True expects categorical columns to have pandas
# 'category' dtype — confirm the 'property' column is cast accordingly before fitting.
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    learning_rate=0.11,
    random_state=42,
    n_estimators=1000,
    max_depth=7,
    enable_categorical=True
)

In [None]:
# LGBMClassifier was used without ever being imported, so this cell raised NameError on a
# fresh kernel.
from lightgbm import LGBMClassifier

# LightGBM member of the voting ensemble (binary objective, AUC metric).
lgbm = LGBMClassifier(
    objective='binary',
    metric='auc',
    learning_rate=0.11,
    random_state=42,
    n_estimators=1000,
    is_unbalance=True  # Automatically handle class imbalance
)

In [None]:
# Gaussian naive Bayes with default settings. It is not among the estimators passed to
# the VotingClassifier defined further down.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [None]:
# Random forest with balanced class weights and a fixed seed. It is not among the
# estimators passed to the VotingClassifier defined further down.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(class_weight = "balanced", random_state=42)

In [None]:
# Logistic regression with balanced class weights and a fixed seed. It is not among the
# estimators passed to the VotingClassifier defined further down.
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=42, class_weight='balanced')

In [None]:
# scikit-learn gradient boosting (1000 estimators, learning rate 0.1, fixed seed). It is
# not among the estimators passed to the VotingClassifier defined further down.
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1, random_state=42)

In [None]:
# Experiment log, written as "setting - score" (presumably the validation AUC printed by
# the scoring cell; TODO confirm).
# NOTE(review): "depth = 6" is logged twice with different scores — one entry may be a
# typo for another depth.
# depth = 6 - 0.9209
# depth = 6 - 0.9205
# lr = 0.09 for all models - 0.9214
# lr = 0.08 for all models - 0.9211
# lr = 0.11 - 0.9221

# Soft-voting ensemble over the CatBoost, XGBoost and LightGBM classifiers.
from sklearn.ensemble import VotingClassifier
ensemble_model = VotingClassifier(estimators=[
    ('clf', clf),
    ('xgb', xgb),
    ('lgbm', lgbm),
    ],
    voting='soft'  # Use 'hard' for majority voting or 'soft' for probability averaging
)

In [None]:
import re

from sklearn.model_selection import train_test_split

# XGBoost rejects feature names containing '[', ']' or '<' (see the ValueError raised by
# the fit cell), and the atom-token columns such as "[Na+]" contain brackets.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)


def _sanitize_columns(frame):
    """Return ``frame``'s column labels as strings with '[', ']' and '<' replaced by '_'."""
    return [regex.sub("_", str(col)) for col in frame.columns]


# The original cell read the columns of an undefined `X_tr` (NameError) and renamed only X,
# leaving X_test with the offending names.
X.columns = _sanitize_columns(X)
X_test.columns = _sanitize_columns(X_test)

# X_train / X_val were split off before X received its final columns and names, so they
# are rebuilt here from the finished X. The arguments match the earlier split, so the
# same rows land in each part.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    list(train_data.Y),
    test_size=0.25,
    random_state=42,
    stratify=list(train_data.Y),
)

In [None]:
# Train the ensemble model on the training split.
# NOTE(review): the recorded output of this cell ends in
# "ValueError: feature_names must be string, and may not contain [, ] or <", so the
# column names of X_train must be sanitized before this cell runs.
ensemble_model.fit(X_train, y_train)

0:	total: 217ms	remaining: 3m 36s
1:	total: 352ms	remaining: 2m 55s
2:	total: 488ms	remaining: 2m 42s
3:	total: 623ms	remaining: 2m 35s
4:	total: 765ms	remaining: 2m 32s
5:	total: 901ms	remaining: 2m 29s
6:	total: 1.03s	remaining: 2m 26s
7:	total: 1.19s	remaining: 2m 27s
8:	total: 1.33s	remaining: 2m 26s
9:	total: 1.46s	remaining: 2m 24s
10:	total: 1.59s	remaining: 2m 23s
11:	total: 1.78s	remaining: 2m 26s
12:	total: 2s	remaining: 2m 32s
13:	total: 2.26s	remaining: 2m 38s
14:	total: 2.52s	remaining: 2m 45s
15:	total: 2.77s	remaining: 2m 50s
16:	total: 3.04s	remaining: 2m 55s
17:	total: 3.29s	remaining: 2m 59s
18:	total: 3.52s	remaining: 3m 1s
19:	total: 3.78s	remaining: 3m 5s
20:	total: 4.04s	remaining: 3m 8s
21:	total: 4.29s	remaining: 3m 10s
22:	total: 4.53s	remaining: 3m 12s
23:	total: 4.78s	remaining: 3m 14s
24:	total: 5s	remaining: 3m 15s
25:	total: 5.24s	remaining: 3m 16s
26:	total: 5.48s	remaining: 3m 17s
27:	total: 5.73s	remaining: 3m 18s
28:	total: 6.1s	remaining: 3m 24s
29:	t

ValueError: feature_names must be string, and may not contain [, ] or <

In [None]:
# roc_auc_score was used without ever being imported, so this cell raised NameError on a
# fresh kernel.
from sklearn.metrics import roc_auc_score

# Predicted class probabilities for the test set (one column per class).
y_pred = ensemble_model.predict_proba(X_test)

# ROC AUC on the held-out validation split, computed from column 1 of predict_proba
# (the second class in the model's class order).
score = roc_auc_score(y_val, ensemble_model.predict_proba(X_val)[:, 1])
print(f'Model score: {score:.4f}')

Model score: 0.9218


In [None]:
# NOTE(review): `sample` is not defined in any cell visible here — presumably a
# sample-submission DataFrame loaded elsewhere; confirm it exists before this cell runs
# and that its row order matches test_data.
# Store column 1 of the predicted test probabilities as the 'Y' column.
sample['Y'] =  y_pred[:, 1]

In [None]:
# Write the table to ensemble_boosting.csv in the working directory, without the index column.
sample.to_csv('ensemble_boosting.csv', index=False)