# Предварительная настройка

In [125]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import  AllChem, Descriptors
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from rdkit.Chem import rdFingerprintGenerator as rfg

import sys
sys.path.append(r"C:\Users\Alexey\Documents\programming\VSCode\mymodules")

from feature_importances_table import feature_imp_table

In [126]:
# Notebook display settings: print DataFrames without truncation.
for _option, _value in (
    ("display.max_rows", None),      # show every row
    ("display.max_columns", None),   # show every column
    ("display.max_colwidth", None),  # do not clip cell contents
    ("display.width", 0),            # width is picked automatically (per the original note)
):
    pd.set_option(_option, _value)

# Подготовка данных

In [127]:
# Load the raw competition tables from the local data/ folder.
# The original cell bound `df_test` to the pandas module itself (`df_test = pd`),
# which is never a usable DataFrame; read the test table that sits next to
# train.csv instead.
df_test = pd.read_csv('data/test.csv')
df_train = pd.read_csv('data/train.csv')

In [128]:
# Target column is 'Tm'; predictors are everything except it and the row 'id'.
y = df_train['Tm']
X = df_train.drop(['id', 'Tm'], axis=1)

# Hold out 20% of the rows for validation; the seed is fixed so the split is
# reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Обработчик SMILES фичи

In [129]:
class SmilesFeaturizer(BaseEstimator, TransformerMixin):
    """Convert a column of SMILES strings into a dense numeric feature matrix.

    Every input string becomes one row consisting of a Morgan fingerprint bit
    vector of length ``nBits`` and, when ``add_physchem`` is true, five RDKit
    descriptors appended after the bits (MolWt, MolLogP, NumHDonors,
    NumHAcceptors, TPSA).

    Strings that RDKit cannot parse yield an all-zero fingerprint and NaN
    descriptors, so a downstream imputer can fill the descriptor columns.

    Parameters
    ----------
    nBits : int, default=4096
        Length of the fingerprint bit vector.
    radius : int, default=2
        Radius passed to the Morgan fingerprint generator.
    add_physchem : bool, default=True
        Whether to append the five descriptor columns.
    """

    # Output names of the descriptor columns, in the order they are computed.
    _PHYSCHEM_NAMES = (
        "smiles_MolWt",
        "smiles_MolLogP",
        "smiles_NumHDonors",
        "smiles_NumHAcceptors",
        "smiles_TPSA",
    )

    def __init__(self, nBits=4096, radius=2, add_physchem=True):
        # Hyper-parameters are stored exactly as given. scikit-learn's clone()
        # rejects estimators whose constructor modifies its arguments (e.g.
        # int(np.int64(...)) creates a different object), so the coercion to
        # int/bool is done at the point of use instead.
        self.nBits = nBits
        self.radius = radius
        self.add_physchem = add_physchem
        self._gen = None  # fingerprint generator, created in fit()

    def _n_bits(self):
        """Return the fingerprint length as a plain int."""
        return int(self.nBits)

    def _n_physchem(self):
        """Return how many descriptor columns follow the fingerprint bits."""
        return len(self._PHYSCHEM_NAMES) if bool(self.add_physchem) else 0

    def fit(self, X, y=None):
        """Build the Morgan fingerprint generator; nothing is learned from X.

        Returns
        -------
        self
        """
        self._gen = rfg.GetMorganGenerator(
            radius=int(self.radius),
            fpSize=self._n_bits(),
            includeChirality=False,
            useBondTypes=True,
        )
        return self

    def _fp_bits(self, mol):
        """Return the fingerprint of ``mol`` as a uint8 array of length nBits."""
        fp = self._gen.GetFingerprint(mol)
        bits = np.zeros((self._n_bits(),), dtype=np.uint8)
        DataStructs.ConvertToNumpyArray(fp, bits)
        return bits

    def _one(self, s: str):
        """Featurize a single SMILES string into a 1-D float array."""
        n_bits = self._n_bits()
        n_phys = self._n_physchem()
        feat = np.zeros(n_bits + n_phys, dtype=float)

        mol = Chem.MolFromSmiles(s)
        if mol is None:
            # Unparseable SMILES: bits stay zero, descriptors are marked
            # missing so they can be imputed later.
            if n_phys:
                feat[n_bits:] = np.nan
            return feat

        feat[:n_bits] = self._fp_bits(mol)
        if n_phys:
            feat[n_bits:] = [
                Descriptors.MolWt(mol),
                Descriptors.MolLogP(mol),
                Descriptors.NumHDonors(mol),
                Descriptors.NumHAcceptors(mol),
                Descriptors.TPSA(mol),
            ]
        return feat

    def transform(self, X):
        """Featurize every entry of ``X`` (flattened and cast to ``str``).

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(n_samples, nBits + n_descriptors)``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. This is a subclass of both
            ``AttributeError`` and ``ValueError``, so code that caught the
            previous bare ``AttributeError`` keeps working.
        """
        if getattr(self, "_gen", None) is None:
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "SmilesFeaturizer is not fitted yet; call fit() before transform()."
            )

        col = np.ravel(X).astype(str)
        if col.size == 0:
            # np.vstack([]) raises on an empty list; return a correctly
            # shaped empty matrix instead.
            return np.empty((0, self._n_bits() + self._n_physchem()), dtype=float)
        return np.vstack([self._one(s) for s in col])

    def get_feature_names_out(self, input_features=None):
        """Return output column names: fingerprint bits first, then descriptors."""
        radius = int(self.radius)
        names = [f"smiles_ecfp_r{radius}_{i}" for i in range(self._n_bits())]
        if bool(self.add_physchem):
            names.extend(self._PHYSCHEM_NAMES)
        return np.array(names, dtype=object)

In [130]:
# Column roles in the training table.
target = "Tm"
smiles_col = "SMILES"
num_cols = [name for name in df_train.columns if name.startswith("Group ")]

# Preprocessing: the SMILES column is featurized and then imputed, the
# "Group " columns are only imputed, and all remaining columns are dropped.
# The imputer is there in case of NaN values (including those produced by
# broken SMILES strings).
pre = ColumnTransformer(
    transformers=[
        (
            "smiles",
            Pipeline(steps=[
                ("feat", SmilesFeaturizer(nBits=4096, radius=1, add_physchem=True)),
                ("imp", SimpleImputer(strategy="median")),
            ]),
            [smiles_col],
        ),
        (
            "num",
            Pipeline(steps=[
                ("imp", SimpleImputer(strategy="median")),
            ]),
            num_cols,
        ),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

# Обучение

## Основные функции

In [160]:
def Export(root_path_data = "data"):
    """Read the train, test and sample-submission CSV files from a folder.

    Parameters
    ----------
    root_path_data : str, default="data"
        Folder that contains ``train.csv``, ``test.csv`` and
        ``sample_submission.csv``.

    Returns
    -------
    tuple
        ``(df_train, df_test, df_ex_sub)`` as pandas DataFrames, in that order.
    """
    file_names = ("train.csv", "test.csv", "sample_submission.csv")
    return tuple(
        pd.read_csv(f"{root_path_data}/{file_name}") for file_name in file_names
    )

In [155]:
def create_pipline(model, title):
    """Wrap ``model`` in a Pipeline that runs the shared preprocessing first.

    Parameters
    ----------
    model : estimator
        Final estimator placed after the preprocessing step.
    title : str
        Name of the estimator step inside the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two-step pipeline: ``"prep"`` followed by the step named ``title``.
    """
    # Local import: `clone` is not among the names this file imports at the top.
    from sklearn.base import clone

    # Each pipeline gets its own unfitted copy of the module-level `pre`.
    # Embedding the same object in several pipelines makes them share one
    # preprocessor, so fitting one pipeline silently re-fits the "prep" step
    # of every other pipeline built from it.
    return Pipeline([
        ("prep", clone(pre)),
        (f"{title}", model),
    ])

In [158]:
def study_model(model, X_train, y_train, X_valid, y_valid):
    """Fit ``model`` on the training split and report validation MAE.

    The model is fitted in place on ``(X_train, y_train)``; the mean absolute
    error of its predictions on ``X_valid`` against ``y_valid`` is printed
    with four decimals.

    Returns
    -------
    The same ``model`` object, now fitted.
    """
    model.fit(X_train, y_train)
    mae = mean_absolute_error(y_valid, model.predict(X_valid))
    print(f"MAE: {mae:.4f}")
    return model

## Начало работы, инициализация XGB и RF

In [161]:
# Load train / test / sample-submission tables from the default "data" folder.
# NOTE(review): this rebinds df_train, while X_train / X_valid above were
# derived from the earlier load — presumably the same file; confirm.
df_train, df_test, df_ex_sub = Export()

In [162]:
# Random forest baseline: 600 trees with unlimited depth, using all CPU cores
# and a fixed seed.
model_rf_ = RandomForestRegressor(
    n_estimators=600,
    max_depth=None,
    n_jobs=-1,
    random_state=42,
)
# Put the forest behind the shared preprocessing, then fit it and print the
# validation MAE.
model_rf_pipline = create_pipline(model_rf_, "rf")
model_rf_after_fit = study_model(
    model_rf_pipline,
    X_train,
    y_train,
    X_valid,
    y_valid,
)

MAE: 31.7720


In [163]:
# Keyword arguments later unpacked into XGBRegressor(**param_xgb).
param_xgb = dict(
    n_estimators=1200,
    n_jobs=-1,
    random_state=42,
    max_depth=14,
    subsample=0.8,
    reg_lambda=1.5,
    reg_alpha=0.5,
    learning_rate=0.03,
)

In [164]:
# Gradient-boosted trees configured from param_xgb, placed behind the shared
# preprocessing, then fitted with the validation MAE printed.
model_xgb_ = XGBRegressor(**param_xgb)
model_xgb_pipline = create_pipline(model_xgb_, "xgb")
model_xgb_after_fit = study_model(
    model_xgb_pipline,
    X_train,
    y_train,
    X_valid,
    y_valid,
)

MAE: 30.4720
