# Strong ML baselines
## Fingerprints + XGBoost

In [1]:
import matplotlib.pylab as plt # Для визуализации
import numpy as np # Для работы с массивами
import pandas as pd # Для работы с таблицами

In [None]:
# XGBoost -- classifiers and regressors for tabular data.
# NOTE(review): rdkit is imported further down but is not installed in this
# cell -- presumably it is provided by the environment; confirm.
%pip install xgboost
%pip install scikit-learn

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem # Нужен для построения вектора фингерпринтов

from sklearn.model_selection import train_test_split # Нужно для создание тренировочной и тестовой выборки
from xgboost import XGBRegressor 

from sklearn.metrics import mean_squared_error, r2_score # Метрики для регрессий

In [3]:
# Load the data: read the first 10,000 rows of the summary table and keep only
# the rows whose 'CONFORMER id' is 0.
# The explicit .copy() turns the filtered result into an independent frame, so
# the columns added to df_full in later cells are written to a real object
# instead of a slice of the original (avoids pandas' SettingWithCopyWarning).
df_full = pd.read_csv("summary.csv", index_col=0, nrows=10000)
df_full = df_full[df_full['CONFORMER id'] == 0].copy()

In [4]:
# Parse every SMILES string with RDKit and store the resulting molecule
# objects in a new "mol" column.
df_full["mol"] = df_full["SMILES"].map(Chem.MolFromSmiles)

In [5]:
# Compute Morgan fingerprints (radius 2, 512 bits) for every molecule.
# NOTE(review): recent RDKit releases appear to deprecate
# GetMorganFingerprintAsBitVect in favour of rdFingerprintGenerator --
# confirm against the installed version before migrating.
def _morgan_bits(mol):
    """Return the 512-bit, radius-2 Morgan fingerprint of `mol` as a NumPy array."""
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=512)
    return np.array(bit_vector)


df_full["fingerprint"] = df_full["mol"].apply(_morgan_bits)

In [6]:
# Preview the fingerprint column (one bit array per row).
df_full["fingerprint"]

0       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
6       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
24      [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, ...
25      [1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
26      [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
                              ...                        
9990    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
9991    [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9992    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9997    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9998    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: fingerprint, Length: 2569, dtype: object

In [7]:
# Preview the column that is used as the regression target below.
df_full['DFT FORMATION ENERGY']

0      -6.395356
6      -7.744498
24     -7.038037
25     -5.436545
26     -5.492884
          ...   
9990   -5.992071
9991   -6.909303
9992   -6.893859
9997   -5.297589
9998   -6.285701
Name: DFT FORMATION ENERGY, Length: 2569, dtype: float64

In [8]:
# Feature matrix: stack the per-row fingerprint arrays into one 2-D array.
X_full = np.vstack(df_full["fingerprint"].tolist())
# Target vector: the DFT formation energy column as a NumPy array.
Y_full = df_full["DFT FORMATION ENERGY"].to_numpy()

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    Y_full,
    random_state=42,
    test_size=0.2,
)

In [10]:
# Model setup: XGBoost regressor with library-default hyperparameters; only
# the seed and the parallelism setting are specified.
xgb = XGBRegressor(n_jobs=-1, random_state=42)

# Training on the fingerprint features.
xgb.fit(X_train, y_train)

In [11]:
# Score the fitted model on the held-out split.
y_pred = xgb.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

RMSE: 0.5997
R²: 0.5940


In [12]:
# MAE computed by hand -- the primary metric for this task.
np.abs(y_test - y_pred).mean()

0.45719786038175597

## Bag-of-atoms

In [13]:
from collections import Counter
from rdkit import Chem
import pandas as pd

# Unique atom types across dataset
def get_atom_types(smiles_list):
    """Return the sorted list of distinct atom symbols found in `smiles_list`.

    Each SMILES string is parsed with `Chem.MolFromSmiles`; strings that fail
    to parse (the parser returns None) are skipped.
    """
    molecules = (Chem.MolFromSmiles(smiles) for smiles in smiles_list)
    symbols = {
        atom.GetSymbol()
        for mol in molecules
        if mol is not None
        for atom in mol.GetAtoms()
    }
    return sorted(symbols)

def bag_of_atoms(smiles, atom_types):
    """Count the atoms of one molecule by symbol.

    Args:
        smiles: SMILES string of the molecule.
        atom_types: ordered sequence of atom symbols; it fixes the order and
            length of the returned vector.

    Returns:
        A list with one integer per entry of `atom_types`: how many atoms with
        that symbol the parsed molecule contains. If the SMILES string cannot
        be parsed, a vector of zeros is returned.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Same fallback as bag_of_bonds: an unparsable SMILES yields an
        # all-zero vector instead of an AttributeError on None.
        return [0] * len(atom_types)
    counts = Counter(atom.GetSymbol() for atom in mol.GetAtoms())
    # Counter returns 0 for symbols that do not occur in this molecule.
    return [counts[symbol] for symbol in atom_types]

# Vocabulary of atom symbols observed in the dataset.
atom_types = get_atom_types(df_full["SMILES"])

# One count vector per row, stored as a list in the "BoA" column.
df_full["BoA"] = df_full["SMILES"].apply(
    lambda smiles: bag_of_atoms(smiles, atom_types)
)

# Expand the lists into a feature table with one column per atom symbol.
X_boa = pd.DataFrame(
    df_full["BoA"].tolist(),
    columns=[f"atom_{symbol}" for symbol in atom_types],
)

In [14]:
# Same 80/20 split and seed as for the fingerprint model, now on the
# bag-of-atoms features.
X_train, X_test, y_train, y_test = train_test_split(
    X_boa.to_numpy(),
    Y_full,
    random_state=42,
    test_size=0.2,
)

# Model setup
xgb = XGBRegressor(n_jobs=-1, random_state=42)

# Training, prediction and MAE on the held-out split
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
np.abs(y_test - y_pred).mean()

0.23757571956123993

## Bag-of-bonds

In [15]:
def get_bond_types(smiles_list):
    """Return the sorted list of distinct bond labels found in `smiles_list`.

    A bond label is the two atom symbols of the bond's end points, sorted
    alphabetically and joined with "-". SMILES strings that fail to parse
    (the parser returns None) are skipped.
    """
    molecules = (Chem.MolFromSmiles(smiles) for smiles in smiles_list)
    labels = {
        "-".join(
            sorted((bond.GetBeginAtom().GetSymbol(), bond.GetEndAtom().GetSymbol()))
        )
        for mol in molecules
        if mol is not None
        for bond in mol.GetBonds()
    }
    return sorted(labels)

def bag_of_bonds(smiles, bond_types):
    """Count the bonds of one molecule by bond label.

    A bond label is the two end-point atom symbols, sorted alphabetically and
    joined with "-". The result has one integer per entry of `bond_types`, in
    the same order. An unparsable SMILES string yields a vector of zeros.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [0] * len(bond_types)

    counts = Counter()
    for bond in mol.GetBonds():
        end_symbols = [bond.GetBeginAtom().GetSymbol(), bond.GetEndAtom().GetSymbol()]
        end_symbols.sort()
        counts["-".join(end_symbols)] += 1

    return [counts.get(label, 0) for label in bond_types]

# Vocabulary of bond labels observed in the dataset.
bond_types = get_bond_types(df_full["SMILES"])

# One count vector per row, stored as a list in the "BoB" column.
df_full["BoB"] = df_full["SMILES"].apply(
    lambda smiles: bag_of_bonds(smiles, bond_types)
)

# Expand the lists into a feature table with one column per bond label.
X_bob = pd.DataFrame(
    df_full["BoB"].tolist(),
    columns=[f"bond_{label}" for label in bond_types],
)

In [16]:
# Same 80/20 split and seed as for the other baselines, now on the
# bag-of-bonds features.
X_train, X_test, y_train, y_test = train_test_split(
    X_bob.to_numpy(),
    Y_full,
    random_state=42,
    test_size=0.2,
)

# Model setup
xgb = XGBRegressor(n_jobs=-1, random_state=42)

# Training, prediction and MAE on the held-out split
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
np.abs(y_test - y_pred).mean()

0.29880258387160097

In [17]:
# In nabla2DFT, the error of the best model is 0.0008.
# That is only about 0.2-0.3% of the MAE we observe for the baselines above (0.24-0.46).