# Предварительная настройка

In [70]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from rdkit.Chem import rdFingerprintGenerator as rfg
from sklearn.inspection import permutation_importance
from rdkit.Chem.MolStandardize import rdMolStandardize as std
from rdkit import RDLogger
from sklearn.base import clone
from tqdm.auto import tqdm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV

import sys
sys.path.append(r"C:\Users\Alexey\Documents\programming\VSCode\mymodules")

from feature_importances_table import feature_imp_table

In [19]:
# Show frames in full: all rows, all columns, untruncated cell contents;
# a display width of 0 is meant to let pandas pick the width itself.
for _opt, _val in (
    ("display.max_rows", None),
    ("display.max_columns", None),
    ("display.max_colwidth", None),
    ("display.width", 0),
):
    pd.set_option(_opt, _val)

# Turn off RDKit's info, warning and debug log channels.
for _channel in ('rdApp.info', 'rdApp.warning', 'rdApp.debug'):
    RDLogger.DisableLog(_channel)

# Подготовка данных

In [4]:
# Load the train and test tables from the local data/ folder.
# (Previously df_test was bound to the pandas module itself, not to a frame.)
df_test = pd.read_csv('data/test.csv')
df_train = pd.read_csv('data/train.csv')

In [5]:
# Target is the 'Tm' column; predictors are everything except 'id' and 'Tm'.
y = df_train['Tm']
X = df_train.drop(columns=['id', 'Tm'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Обработчик SMILES фичи

In [6]:
# Standardizer objects are created once on first use and then reused, instead
# of being rebuilt for every molecule.
_SMILES_STANDARDIZERS = {}


def _get_standardizers():
    """Return the shared (fragment chooser, tautomer enumerator) pair."""
    if not _SMILES_STANDARDIZERS:
        _SMILES_STANDARDIZERS["chooser"] = std.LargestFragmentChooser()
        _SMILES_STANDARDIZERS["tautomers"] = std.TautomerEnumerator()
    return _SMILES_STANDARDIZERS["chooser"], _SMILES_STANDARDIZERS["tautomers"]


def preprocess_smiles(s: str):
    """Parse a SMILES string and return a standardized RDKit molecule.

    The molecule is parsed with sanitization, cleaned up, reduced to its
    largest fragment (dropping salts), converted to the canonical tautomer
    and sanitized again.

    Args:
        s: SMILES string to parse.

    Returns:
        The standardized molecule, or ``None`` when ``s`` is not a string,
        cannot be parsed, or fails standardization.
    """
    if not isinstance(s, str):
        return None
    mol = Chem.MolFromSmiles(s, sanitize=True)
    if mol is None:
        return None
    chooser, tautomers = _get_standardizers()
    try:
        # standard cleanup
        mol = std.Cleanup(mol)
        # keep the main fragment (no salts)
        mol = chooser.choose(mol)
        # canonicalize the tautomer
        mol = tautomers.Canonicalize(mol)
        Chem.SanitizeMol(mol)
    except (ValueError, RuntimeError):
        # Treat a molecule that cannot be standardized like an unparseable
        # one, so a single bad row does not abort featurization of the column.
        return None
    return mol

In [7]:
class SmilesFeaturizer(BaseEstimator, TransformerMixin):
    """Turn a column of SMILES strings into a numeric feature matrix.

    Each row holds ``nBits`` Morgan fingerprint bits, optionally followed by
    the physico-chemical descriptors listed in ``_PHYSCHEM``.

    Args:
        nBits: Fingerprint length (number of bit columns).
        radius: Morgan fingerprint radius.
        add_physchem: Whether to append the descriptor columns.
    """

    # (name suffix, descriptor function) pairs. This is the single source for
    # both the descriptor values and their column names, so the two cannot
    # drift apart.
    _PHYSCHEM = (
        ("MolWt", Descriptors.MolWt),
        ("MolLogP", Descriptors.MolLogP),
        ("NumHDonors", Descriptors.NumHDonors),
        ("NumHAcceptors", Descriptors.NumHAcceptors),
        ("TPSA", Descriptors.TPSA),
    )

    def __init__(self, nBits=4096, radius=2, add_physchem=True):
        self.nBits = int(nBits)
        self.radius = int(radius)
        self.add_physchem = bool(add_physchem)
        self._gen = None

    def _make_generator(self):
        """Build the Morgan fingerprint generator for the current settings."""
        return rfg.GetMorganGenerator(
            radius=self.radius,
            fpSize=self.nBits,
            includeChirality=False,
            useBondTypes=True,
        )

    def _n_features(self):
        """Return the width of one output row."""
        extra = len(self._PHYSCHEM) if self.add_physchem else 0
        return self.nBits + extra

    def fit(self, X, y=None):
        """Create the fingerprint generator; nothing is learned from ``X``."""
        self._gen = self._make_generator()
        return self

    def _fp_bits(self, mol):
        """Return the fingerprint of ``mol`` as a uint8 array of length nBits."""
        if self._gen is None:
            # Allow transform() without a prior fit(): the generator depends
            # only on the constructor parameters.
            self._gen = self._make_generator()
        fp = self._gen.GetFingerprint(mol)
        bits = np.zeros((self.nBits,), dtype=np.uint8)
        DataStructs.ConvertToNumpyArray(fp, bits)
        return bits

    def _one(self, s: str):
        """Featurize a single SMILES string into one float row."""
        mol = preprocess_smiles(s)
        feat = np.zeros(self._n_features(), dtype=float)
        if mol is None:
            # Unusable SMILES: fingerprint bits stay zero, descriptors become
            # NaN so that a downstream imputer can fill them.
            if self.add_physchem:
                feat[self.nBits:] = np.nan
            return feat
        feat[:self.nBits] = self._fp_bits(mol)
        if self.add_physchem:
            feat[self.nBits:] = [func(mol) for _, func in self._PHYSCHEM]
        return feat

    def transform(self, X):
        """Return an array with one feature row per SMILES in ``X``.

        ``X`` is flattened and its entries are converted to ``str`` first.
        """
        col = np.ravel(X).astype(str)
        if col.size == 0:
            # np.vstack rejects an empty list; return a correctly shaped
            # empty matrix instead.
            return np.empty((0, self._n_features()), dtype=float)
        return np.vstack([self._one(s) for s in col])

    def get_feature_names_out(self, input_features=None):
        """Return the output column names as an object array."""
        names = [f"smiles_ecfp_r{self.radius}_{i}" for i in range(self.nBits)]
        if self.add_physchem:
            names.extend(f"smiles_{label}" for label, _ in self._PHYSCHEM)
        return np.array(names, dtype=object)

In [8]:
target = "Tm"
smiles_col = "SMILES"
# Every column whose name starts with "Group " is treated as a numeric feature.
num_cols = [name for name in df_train.columns if name.startswith("Group ")]

# The imputers handle NaNs, including those coming from broken SMILES.
smiles_branch = Pipeline(steps=[
    ("feat", SmilesFeaturizer(nBits=4096, radius=1, add_physchem=True)),
    ("imp", SimpleImputer(strategy="median")),
])
numeric_branch = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
])

pre = ColumnTransformer(
    transformers=[
        ("smiles", smiles_branch, [smiles_col]),
        ("num", numeric_branch, num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

# Обучение

## Основные функции

In [9]:
def Export(root_path_data = "data"):
    """Read the train, test and sample-submission CSV files from one folder.

    Args:
        root_path_data: Folder that contains ``train.csv``, ``test.csv`` and
            ``sample_submission.csv``.

    Returns:
        A tuple ``(df_train, df_test, df_ex_sub)`` of DataFrames, in that order.
    """
    file_names = ("train.csv", "test.csv", "sample_submission.csv")
    df_train, df_test, df_ex_sub = (
        pd.read_csv(f"{root_path_data}/{file_name}") for file_name in file_names
    )
    return df_train, df_test, df_ex_sub

In [10]:
def create_pipline(model, title):
    """Wrap an estimator in a pipeline that applies the preprocessing first.

    Args:
        model: Estimator placed as the final step.
        title: Name of the final step (used as its key in ``named_steps``).

    Returns:
        A ``Pipeline`` with steps ``"prep"`` and ``title``.
    """
    # Each pipeline gets its own copy of the module-level preprocessor.
    # Sharing one instance would let fitting a later pipeline overwrite the
    # preprocessor state that an earlier, already fitted pipeline relies on.
    return Pipeline([
        ("prep", clone(pre)),
        (f"{title}", model),
    ])

In [11]:
def study_model(model, X_train, y_train, X_valid, y_valid):
    """Fit ``model`` on the training split and print its validation MAE.

    Args:
        model: Object with ``fit`` and ``predict`` methods; fitted in place.
        X_train, y_train: Training features and target.
        X_valid, y_valid: Validation features and target.

    Returns:
        The same ``model`` object, now fitted.
    """
    model.fit(X_train, y_train)
    validation_error = mean_absolute_error(y_valid, model.predict(X_valid))
    print(f"MAE: {validation_error:.4f}")
    return model

## Начало работы, инициализация XGB и RF

In [20]:
# Load all three tables from the default "data" folder; this rebinds
# df_train and df_test assigned earlier in the notebook.
df_train, df_test, df_ex_sub = Export()

In [21]:
# Keyword arguments for RandomForestRegressor.
param_rf = dict(
    n_estimators=1000,
    random_state=42,
    n_jobs=-1,
    max_depth=10,
    max_features=0.3,
    criterion='absolute_error',
    min_samples_leaf=2,
)

In [22]:
# Build the random-forest pipeline and fit it; study_model prints the
# validation MAE and returns the fitted pipeline.
model_rf_ = RandomForestRegressor(**param_rf)
model_rf_pipline = create_pipline(model_rf_, "rf")
model_rf_after_fit = study_model(model_rf_pipline, X_train, y_train, X_valid, y_valid)

MAE: 33.9030


In [23]:
# Keyword arguments for XGBRegressor.
param_xgb = dict(
    n_estimators=1200,
    n_jobs=-1,
    random_state=42,
    max_depth=14,
    subsample=0.8,
    reg_lambda=1.5,
    reg_alpha=0.5,
    learning_rate=0.03,
    colsample_bytree=0.7,
    colsample_bylevel=0.5,
)

In [24]:
# Build the XGBoost pipeline and fit it; study_model prints the validation
# MAE and returns the fitted pipeline.
model_xgb_ = XGBRegressor(**param_xgb)
model_xgb_pipline = create_pipline(model_xgb_, "xgb")
model_xgb_after_fit = study_model(model_xgb_pipline, X_train, y_train, X_valid, y_valid)

MAE: 28.2560


In [25]:
# Pull the final estimators out of the two fitted pipelines.
model_rf = model_rf_after_fit.named_steps['rf']
model_xgb = model_xgb_after_fit.named_steps['xgb']
# Validation features after the RF pipeline's "prep" step, with their target.
X_val = model_rf_after_fit.named_steps['prep'].transform(X_valid)
y_val = y_valid

## Анализ результатов №1

Оценка важности признаков и отбрасывание малозначимых

In [26]:
def create_feature_imp_table(model, title):
    """Build a feature-importance table for a fitted pipeline.

    Args:
        model: Fitted pipeline with a ``"prep"`` step and an estimator step
            named ``title``.
        title: Name of the estimator step whose ``feature_importances_`` are read.

    Returns:
        Whatever ``feature_imp_table`` returns for the feature names and
        importances.
    """
    steps = model.named_steps
    names = steps['prep'].get_feature_names_out()
    scores = steps[f'{title}'].feature_importances_
    return feature_imp_table(names, scores)

In [27]:
# Feature-importance tables for the fitted RF and XGB pipelines.
fi_table_rf = create_feature_imp_table(model_rf_after_fit, "rf")
fi_table_xgb = create_feature_imp_table(model_xgb_after_fit, "xgb")

In [28]:
fi_table_rf.head(20)

Unnamed: 0,Feature,Importance
0,smiles_MolWt,16.058438
1,smiles_TPSA,11.0923
2,smiles_ecfp_r1_1380,8.994978
3,smiles_MolLogP,6.720438
4,smiles_ecfp_r1_3921,5.572328
5,Group 15,5.186246
6,smiles_NumHDonors,4.097896
7,Group 2,3.616978
8,smiles_NumHAcceptors,2.995198
9,smiles_ecfp_r1_3798,1.615119


In [29]:
fi_table_xgb.head(20)

Unnamed: 0,Feature,Importance
0,smiles_ecfp_r1_1380,4.374469
1,smiles_ecfp_r1_3921,2.25964
2,Group 170,1.006094
3,Group 86,0.93946
4,Group 372,0.917649
5,Group 31,0.888564
6,Group 17,0.719548
7,smiles_ecfp_r1_764,0.715913
8,smiles_ecfp_r1_1792,0.705326
9,smiles_ecfp_r1_3661,0.676568


In [37]:
# For each model keep the features whose "Importance" value exceeds 0.01.
rf_is_strong = fi_table_rf["Importance"] > 0.01
xgb_is_strong = fi_table_xgb["Importance"] > 0.01
strong_feature_rf = fi_table_rf.loc[rf_is_strong, "Feature"].tolist()
strong_feature_xgb = fi_table_xgb.loc[xgb_is_strong, "Feature"].tolist()

print("Strong features RF:", len(strong_feature_rf))
print("Strong features XGB:", len(strong_feature_xgb))

Strong features RF: 349
Strong features XGB: 1094


Теперь переобучим модели на отобранных признаках

In [38]:
# Run both splits through the preprocessor fitted inside the RF pipeline and
# wrap the resulting matrices in DataFrames labelled with the feature names.
fitted_prep = model_rf_after_fit.named_steps['prep']

X_train_value = fitted_prep.transform(X_train)
X_train_names = fitted_prep.get_feature_names_out()
X_train_value_df = pd.DataFrame(X_train_value, columns=X_train_names)

X_valid_value = fitted_prep.transform(X_valid)
X_valid_names = fitted_prep.get_feature_names_out()
X_valid_value_df = pd.DataFrame(X_valid_value, columns=X_valid_names)

In [77]:
# Restrict the preprocessed training frame to each model's selected features.
X_train_rf = X_train_value_df[strong_feature_rf]
X_train_xgb = X_train_value_df[strong_feature_xgb]

# Same column subsets for the validation frame.
X_valid_rf = X_valid_value_df[strong_feature_rf]
X_valid_xgb = X_valid_value_df[strong_feature_xgb]

### Переобучение и просмотр новых MAE

In [40]:
def restudy(model, X_train, y_train, X_valid, y_valid):
    """Refit ``model`` on a reduced feature set and print its validation MAE.

    Args:
        model: Object with ``fit`` and ``predict`` methods; fitted in place.
        X_train, y_train: Training features and target.
        X_valid, y_valid: Validation features and target.

    Returns:
        The same ``model`` object, now fitted.
    """
    model.fit(X_train, y_train)
    validation_error = mean_absolute_error(y_valid, model.predict(X_valid))
    print(f"MAE after feature selection: {validation_error:.4f}")
    return model

In [79]:
# Fit a fresh clone: model_rf_ is the very object held as the "rf" step of
# the fitted RF pipeline, so refitting it here on the reduced feature set
# would silently overwrite that pipeline's estimator.
model_rf_strong = restudy(clone(model_rf_), X_train_rf, y_train, X_valid_rf, y_valid)

MAE after feature selection: 33.9668


In [42]:
# Fit a fresh clone: model_xgb_ is the very object held as the "xgb" step of
# the fitted XGB pipeline, so refitting it here on the reduced feature set
# would silently overwrite that pipeline's estimator.
model_xgb_strong = restudy(clone(model_xgb_), X_train_xgb, y_train, X_valid_xgb, y_valid)

MAE after feature selection: 28.6546


# Создаем мета-модель

In [71]:
# Meta-model: standardize the base-model predictions, then fit a ridge
# regression whose alpha is chosen from the listed candidates.
model_meta = make_pipeline(
    StandardScaler(with_mean=True, with_std=True),
    RidgeCV(alphas=[0.1, 1.0, 10.0])
)

In [None]:
kf = KFold(n_splits=4, shuffle=True, random_state=42)

In [64]:
def build_oof(model, X, y, kf, oof=None, *, desc="OOF"):
    """Collect out-of-fold predictions of ``model`` over the splits of ``kf``.

    Args:
        model: Estimator template; a fresh ``clone`` is fitted on every fold,
            so the passed object itself is never fitted.
        X: pandas features, indexed positionally via ``.iloc``.
        y: pandas target, indexed positionally via ``.iloc``.
        kf: Splitter providing ``split`` and ``get_n_splits`` (use the same
            one for all base models).
        oof: Optional pandas Series or DataFrame shaped for the predictions;
            it is filled in place. If ``None``, a NaN Series indexed like
            ``X`` is created.
        desc: Label shown on the progress bar.

    Returns:
        ``oof`` with the validation rows of every fold filled in.
    """
    if oof is None:
        oof = pd.Series(np.nan, index=X.index, name="oof")

    # Pass the data along: some splitters need it to count their splits.
    n_splits = kf.get_n_splits(X, y)
    folds = tqdm(kf.split(X, y), total=n_splits, desc=desc)

    for train_index, valid_index in folds:
        fold_model = clone(model)  # new clone for every fold
        fold_model.fit(X.iloc[train_index], y.iloc[train_index])
        preds = np.asarray(fold_model.predict(X.iloc[valid_index]))

        # Supports Series and DataFrame OOF containers.
        if isinstance(oof, pd.Series):
            oof.iloc[valid_index] = preds.ravel()
        else:
            # Force a 2-D block so 1-D predictions also fit a one-column frame.
            oof.iloc[valid_index, :] = preds.reshape(len(valid_index), -1)

    return oof


In [68]:
# Out-of-fold predictions of both base models on the training rows, using the
# same splitter and each model's selected feature columns.
oof_rf = build_oof(model_rf_, X_train_rf, y_train, kf)
oof_xgb = build_oof(model_xgb_, X_train_xgb, y_train, kf)

OOF: 100%|██████████| 4/4 [02:51<00:00, 42.87s/it]
OOF: 100%|██████████| 4/4 [00:59<00:00, 14.98s/it]


In [72]:
# One column of out-of-fold predictions per base model.
df_meta = pd.DataFrame({
    "rf_pred": oof_rf,
    "xgb_pred": oof_xgb,
})

# Fit the meta-model on the base models' out-of-fold predictions.
model_meta.fit(df_meta, y_train)

0,1,2
,steps,"[('standardscaler', ...), ('ridgecv', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alphas,"[0.1, 1.0, ...]"
,fit_intercept,True
,scoring,
,cv,
,gcv_mode,
,store_cv_results,False
,alpha_per_target,False


In [80]:
# Validation-set predictions of the refitted base models.
pred_xgb = model_xgb_strong.predict(X_valid_xgb)
pred_rf = model_rf_strong.predict(X_valid_rf)

# Same column names as the frame the meta-model was fitted on.
df_meta_valid = pd.DataFrame({
    "rf_pred": pred_rf,
    "xgb_pred": pred_xgb,
})

In [81]:
# Score the stacked model on the validation split.
meta_preds = model_meta.predict(df_meta_valid)
mae_meta = mean_absolute_error(y_valid, meta_preds)

In [82]:
print(f"Meta-model MAE: {mae_meta:.4f}")

Meta-model MAE: 28.7383


# Формируем submission

In [83]:
# Run the test table through the preprocessor fitted inside the RF pipeline
# and label the resulting columns with the feature names.
test_prep = model_rf_after_fit.named_steps['prep']
df_test_value = test_prep.transform(df_test)
df_test_names = test_prep.get_feature_names_out()
df_test_value_df = pd.DataFrame(df_test_value, columns=df_test_names)

### XGB

In [47]:
# Predict the test set with the refitted XGB model on its selected features.
df_test_strong = df_test_value_df[strong_feature_xgb]
pred = model_xgb_strong.predict(df_test_strong)

In [49]:
# Pair each test id with its predicted Tm and write the XGB-only submission.
submission = pd.DataFrame({
    "id": df_test["id"],
    "Tm": pred,})

submission.to_csv("submission.csv", index=False)

### Meta

In [85]:
# Test-set column subsets matching each base model's selected features.
df_test_xgb_strong = df_test_value_df[strong_feature_xgb]
df_test_rf_strong = df_test_value_df[strong_feature_rf]

In [86]:
# Base-model predictions on the test set become the meta-model's inputs;
# the column names match the frame the meta-model was fitted on.
pred_rf = model_rf_strong.predict(df_test_rf_strong)
pred_xgb = model_xgb_strong.predict(df_test_xgb_strong)
df_meta_test = pd.DataFrame({
    "rf_pred": pred_rf,
    "xgb_pred": pred_xgb,
})

In [87]:
pred = model_meta.predict(df_meta_test)

In [88]:
# Pair each test id with the current `pred` values and write the
# meta-model submission file.
submission = pd.DataFrame({
    "id": df_test["id"],
    "Tm": pred,})

submission.to_csv("submission_meta.csv", index=False)