In [2]:

import pandas as pd

# Load the raw training and test tables.
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

# Impute missing AlogP values with the median computed on the training set
# only; the same value is reused for the test set.
# The result is assigned back to the column instead of calling
# ``fillna(..., inplace=True)`` on ``df['AlogP']``: in-place mutation through a
# column selection is deprecated in pandas 2.x and silently does nothing once
# Copy-on-Write is enabled.
median_value = train_data['AlogP'].median()
train_data['AlogP'] = train_data['AlogP'].fillna(median_value)
test_data['AlogP'] = test_data['AlogP'].fillna(median_value)


In [3]:

from sklearn.preprocessing import PolynomialFeatures

# Build the design matrices: identifier columns are removed from both tables,
# and the two target columns are additionally removed from the training table.
_id_columns = ['id', 'SMILES']
_target_columns = ['MLM', 'HLM']

X = train_data.drop(columns=_id_columns + _target_columns)
X_test = test_data.drop(columns=_id_columns)

# Degree-2 interaction terms only (no squared terms, no bias column).
# The expansion is fitted on the training features and then applied unchanged
# to the test features.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly.fit(X)
X_poly = poly.transform(X)
X_test_poly = poly.transform(X_test)


In [4]:

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor

# Define target variables
y_mlm = train_data['MLM']
y_hlm = train_data['HLM']

# Reduced hyperparameters for demonstration: every grid holds exactly one
# candidate, so each search below evaluates a single configuration.
param_dist_rf = {
    'n_estimators': [100],
    'max_depth': [10]
}

param_dist_gbr = {
    'n_estimators': [100],
    'learning_rate': [0.1]
}

param_dist_cat = {
    'iterations': [100],
    'learning_rate': [0.1],
    'depth': [10]
}


def _search_best_params(estimator, param_distributions, X, y):
    """Run a 3-fold randomized search and return ``(search, best_params)``.

    Parameters
    ----------
    estimator : regressor instance handed to ``RandomizedSearchCV``.
    param_distributions : dict mapping parameter name to a list of candidates.
    X, y : training features and target.

    Returns
    -------
    tuple
        The ``RandomizedSearchCV`` object and a parameter dict. When fitting
        fails, the dict falls back to the first candidate of every entry in
        ``param_distributions`` and a ``RuntimeWarning`` reports the error, so
        a failed search is never mistaken for a successful one.
    """
    import warnings

    search = RandomizedSearchCV(
        estimator,
        param_distributions=param_distributions,
        n_iter=1,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    try:
        search.fit(X, y)
    except Exception as exc:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop the notebook.
        fallback = {name: values[0] for name, values in param_distributions.items()}
        warnings.warn(
            f"Hyperparameter search for {type(estimator).__name__} failed "
            f"({type(exc).__name__}: {exc}); falling back to {fallback}.",
            RuntimeWarning,
        )
        return search, fallback
    return search, search.best_params_


# One search per (model family, target) pair. The ``*_search`` objects and
# ``best_*_params`` dicts keep their original module-level names.
rf_mlm_search, best_rf_mlm_params = _search_best_params(
    RandomForestRegressor(random_state=42), param_dist_rf, X_poly, y_mlm)
rf_hlm_search, best_rf_hlm_params = _search_best_params(
    RandomForestRegressor(random_state=42), param_dist_rf, X_poly, y_hlm)

gbr_mlm_search, best_gbr_mlm_params = _search_best_params(
    GradientBoostingRegressor(random_state=42), param_dist_gbr, X_poly, y_mlm)
gbr_hlm_search, best_gbr_hlm_params = _search_best_params(
    GradientBoostingRegressor(random_state=42), param_dist_gbr, X_poly, y_hlm)

cat_mlm_search, best_cat_mlm_params = _search_best_params(
    CatBoostRegressor(random_state=42, verbose=0), param_dist_cat, X_poly, y_mlm)
cat_hlm_search, best_cat_hlm_params = _search_best_params(
    CatBoostRegressor(random_state=42, verbose=0), param_dist_cat, X_poly, y_hlm)

best_rf_mlm_params, best_rf_hlm_params, best_gbr_mlm_params, best_gbr_hlm_params, best_cat_mlm_params, best_cat_hlm_params


({'n_estimators': 100, 'max_depth': 10},
 {'n_estimators': 100, 'max_depth': 10},
 {'n_estimators': 100, 'learning_rate': 0.1},
 {'n_estimators': 100, 'learning_rate': 0.1},
 {'learning_rate': 0.1, 'iterations': 100, 'depth': 10},
 {'learning_rate': 0.1, 'iterations': 100, 'depth': 10})

In [5]:
import pandas as pd
import numpy as np
import datamol as dm

from rdkit.Chem import SaltRemover
from molfeat.trans.fp import FPVecTransformer
from molfeat.trans.concat import FeatConcat
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from pycaret.regression import *

In [6]:
# Tabular descriptor columns that are imputed and appended to the fingerprints.
ETC_COLUMNS = [
    "AlogP",
    "Molecular_Weight",
    "Num_H_Acceptors",
    "Num_H_Donors",
    "Num_RotatableBonds",
    "LogD",
    "Molecular_PolarSurfaceArea",
]

# Fingerprint kinds passed one by one to FPVecTransformer in extract_features.
AVAILABLE_FPS = [
    'maccs', 'avalon', 'ecfp', 'fcfp', 'topological', 'atompair',
    'rdkit', 'pattern', 'layered', 'secfp', 'erg', 'estate',
    'avalon-count', 'rdkit-count', 'ecfp-count', 'fcfp-count',
    'topological-count', 'atompair-count',
    'cats2D', 'pharm2D', 'scaffoldkeys', 'skeys',
]

# Tag embedded in saved model and submission file names.
MODEL = "v9_2"
# Seed shared by the imputer and the PyCaret session.
SEED = 42

# Turn off RDKit logging through datamol before any molecule is parsed.
dm.disable_rdkit_log()

In [7]:
def preprocess_mol(row):
    """Attach a standardized SMILES string to a single table row.

    The ``"SMILES"`` entry of ``row`` is parsed with datamol, then passed
    through ``fix_mol``, ``sanitize_mol`` and ``standardize_mol`` in that
    order, and the resulting molecule is written back as a SMILES string.

    Parameters
    ----------
    row : row object supporting item access (a pandas Series when called
        through ``DataFrame.apply(..., axis=1)``); must contain ``"SMILES"``.

    Returns
    -------
    The same ``row`` object with an added ``"Standard_Smiles"`` entry.
    """
    standardize_options = dict(
        disconnect_metals=False,
        normalize=True,
        reionize=True,
        uncharge=False,
        stereo=True,
    )

    molecule = dm.to_mol(row["SMILES"], ordered=True)
    molecule = dm.sanitize_mol(dm.fix_mol(molecule), sanifix=True, charge_neutral=False)
    molecule = dm.standardize_mol(molecule, **standardize_options)

    # Salt stripping is currently switched off:
    # molecule = SaltRemover.SaltRemover().StripMol(molecule, dontRemoveEverything=True)
    row["Standard_Smiles"] = dm.to_smiles(molecule)

    return row

def fill_na(df, imputer=None):
    """Impute missing values in the ``ETC_COLUMNS`` descriptor columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing every column listed in ``ETC_COLUMNS``. It is not
        modified; the imputed values are written to a copy.
    imputer : fitted imputer or None
        When ``None``, a new ``IterativeImputer`` backed by a random forest is
        fitted on ``df``. Otherwise the given imputer is applied with
        ``transform`` only (no refitting).

    Returns
    -------
    ``(imputed_df, imputer)`` when ``imputer`` was ``None``; otherwise just
    ``imputed_df``.
    """
    fit_new = imputer is None
    if fit_new:
        # IterativeImputer's own random_state is not forwarded to the wrapped
        # estimator, so the forest is seeded explicitly to make the imputed
        # values reproducible across runs.
        imputer = IterativeImputer(
            estimator=RandomForestRegressor(n_jobs=-1, random_state=SEED),
            random_state=SEED,
        )

    # Work on a copy so the caller's frame keeps its original (raw) values.
    filled = df.copy()
    values = filled[ETC_COLUMNS].to_numpy()
    filled[ETC_COLUMNS] = imputer.fit_transform(values) if fit_new else imputer.transform(values)

    if fit_new:
        return filled, imputer
    return filled

def extract_features(df):
    """Turn a table of molecules into a purely numeric feature frame.

    Each row is first run through ``preprocess_mol`` to obtain a
    ``"Standard_Smiles"`` column. One ``FPVecTransformer`` per entry of
    ``AVAILABLE_FPS`` is combined with ``FeatConcat`` and applied to those
    SMILES; the ``ETC_COLUMNS`` values are then appended as the last columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"SMILES"`` and every column in ``ETC_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        Featurizer output followed by the ``ETC_COLUMNS`` values, with default
        integer column labels and a default integer index.
    """
    standardized = df.apply(preprocess_mol, axis=1)

    transformers = [
        FPVecTransformer(kind, dtype=np.float64, n_jobs=-1)
        for kind in AVAILABLE_FPS
    ]
    featurizer = FeatConcat(transformers, dtype=np.float64)

    fingerprint_block = featurizer(standardized["Standard_Smiles"].to_list())
    descriptor_block = standardized[ETC_COLUMNS].to_numpy()

    combined = np.concatenate([fingerprint_block, descriptor_block], axis=1)
    return pd.DataFrame(combined)

In [8]:
# Load the training table without its row identifier.
df_train = pd.read_csv("./data/train.csv").drop(columns=["id"])

# Give every row of a SMILES group that group's maximum MLM / HLM value, so
# rows that differed only in their targets become exact duplicates.
for target_col in ("MLM", "HLM"):
    df_train[target_col] = df_train.groupby(by=["SMILES"])[target_col].transform("max")

# Remove rows that are now fully identical and renumber the index from 0.
# NOTE(review): rows sharing a SMILES but differing in another column are kept.
df_train = df_train.drop_duplicates().reset_index(drop=True)

In [9]:
# Fit the imputer on the training descriptors, then featurize the imputed rows.
train_imputed, imputer = fill_na(df_train)
df = extract_features(train_imputed)

# Re-attach the targets; assignment aligns on the index, which is 0..n-1 in
# both frames (df_train was reset, extract_features builds a fresh frame).
for target_col in ("MLM", "HLM"):
    df[target_col] = df_train[target_col]

In [13]:
def train(df, target="MLM"):
    """Train, tune, blend and save a PyCaret regressor for one target.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame that also carries both ``"MLM"`` and ``"HLM"`` columns.
    target : str
        Name of the column to predict; the other target column is excluded
        from the features.

    Side effects
    ------------
    Saves the blended model under ``./models/blender_{target}_{MODEL}``.
    Returns nothing.
    """
    # Features only, plus the single target column being modelled.
    features = df.drop(columns=["MLM", "HLM"]).copy()
    features[target] = df[target]

    setup(
        data=features,
        target=target,
        train_size=0.8,
        session_id=SEED,
        transformation=False,
        normalize=False,
        use_gpu=False,
    )

    # Rank the two candidate model families by RMSE and keep both.
    candidates = compare_models(sort="RMSE", include=["rf", "catboost"], n_select=2)

    tuned = []
    for candidate in candidates:
        tuned.append(tune_model(candidate, optimize="RMSE"))

    ensemble = blend_models(tuned, optimize="RMSE")
    save_model(ensemble, f"./models/blender_{target}_{MODEL}")

In [14]:
# Train and save one blended model per target, MLM first and then HLM.
for target_name in ("MLM", "HLM"):
    train(df, target=target_name)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,MLM
2,Target type,Regression
3,Original data shape,"(3471, 32842)"
4,Transformed data shape,"(3471, 32842)"
5,Transformed train set shape,"(2776, 32842)"
6,Transformed test set shape,"(695, 32842)"
7,Numeric features,32841
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,25.1466,940.1651,30.6544,0.2683,1.5672,53.3367,285.763
rf,Random Forest Regressor,25.7266,950.7172,30.8254,0.2606,1.5997,53.4869,390.524


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.2602,1003.0787,31.6714,0.2525,1.5682,59.8191
1,27.882,1048.8496,32.3859,0.2361,1.6243,42.0399
2,27.1016,940.2512,30.6635,0.2697,1.6784,82.6811
3,27.019,982.097,31.3384,0.2599,1.6312,60.5138
4,26.9696,1016.49,31.8824,0.1998,1.7794,55.8285
5,25.8494,938.2141,30.6303,0.2438,1.6334,49.6216
6,27.2699,1021.9193,31.9675,0.2282,1.6759,63.4551
7,27.0878,981.5694,31.33,0.2147,1.6936,48.9699
8,25.7324,946.7571,30.7694,0.2251,1.5564,55.4167
9,26.8506,974.0936,31.2105,0.2064,1.7244,112.0464


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.5083,1042.2611,32.2841,0.2233,1.5836,63.2326
1,27.7336,1052.253,32.4384,0.2336,1.6282,40.6092
2,26.6931,935.2878,30.5825,0.2735,1.674,78.7893
3,26.8363,984.1964,31.3719,0.2583,1.6355,59.3225
4,26.7013,1032.5572,32.1334,0.1871,1.7754,55.7552
5,25.6375,929.7535,30.4919,0.2506,1.6287,51.3775
6,27.1284,1020.6828,31.9481,0.2292,1.6788,62.5992
7,26.9053,981.3792,31.327,0.2148,1.6935,49.2963
8,25.7079,952.1155,30.8564,0.2207,1.5619,57.63
9,26.7653,972.3068,31.1818,0.2078,1.7184,111.0669


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,25.0511,928.6214,30.4733,0.308,1.4548,42.5031
1,26.3221,1009.7995,31.7773,0.2645,1.5518,35.8782
2,25.0458,859.7237,29.321,0.3322,1.6032,61.9915
3,25.2435,899.9023,29.9984,0.3218,1.5524,48.6799
4,25.5979,975.1083,31.2267,0.2324,1.7158,50.088
5,24.5791,890.6811,29.8443,0.2821,1.585,44.4334
6,25.2071,934.5465,30.5704,0.2942,1.5843,56.0436
7,25.6986,923.788,30.3939,0.2609,1.6247,40.0476
8,24.7115,917.2979,30.2869,0.2492,1.5062,51.1889
9,25.1049,908.8874,30.1478,0.2595,1.632,102.3636


Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Description,Value
0,Session id,42
1,Target,HLM
2,Target type,Regression
3,Original data shape,"(3471, 32842)"
4,Transformed data shape,"(3471, 32842)"
5,Transformed train set shape,"(2776, 32842)"
6,Transformed test set shape,"(695, 32842)"
7,Numeric features,32841
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,26.8653,995.6693,31.5377,0.2383,1.3128,35.0741,296.656
catboost,CatBoost Regressor,26.4795,996.3748,31.5441,0.2375,1.293,33.6363,280.158


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28.0385,1016.7875,31.8871,0.2623,1.3522,23.5191
1,28.8313,1141.1459,33.7809,0.1579,1.4307,32.6964
2,28.0255,1032.7745,32.1368,0.2531,1.4223,75.1352
3,27.9625,1067.5416,32.6733,0.1883,1.3418,22.4113
4,27.9872,1091.9318,33.0444,0.1693,1.4642,60.4374
5,26.0964,909.3619,30.1556,0.2445,1.2707,15.7078
6,27.7476,1014.9918,31.8589,0.2374,1.4091,37.3632
7,28.2706,1082.902,32.9075,0.1653,1.3451,17.0349
8,26.2215,938.5613,30.6359,0.2227,1.1533,41.3266
9,26.6281,981.5014,31.3289,0.2369,1.3223,47.2627


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.8764,993.4704,31.5194,0.2792,1.3524,22.8037
1,28.0936,1080.2859,32.8677,0.2028,1.4185,31.9798
2,28.2355,1039.0673,32.2346,0.2486,1.4228,78.1776
3,28.2785,1059.9462,32.5568,0.1941,1.3363,23.5442
4,27.7804,1073.479,32.764,0.1833,1.4622,62.5189
5,26.2443,920.8255,30.3451,0.2349,1.2758,15.2247
6,28.0214,1023.2728,31.9886,0.2312,1.4105,35.8142
7,28.0909,1066.6862,32.6602,0.1778,1.3454,17.2071
8,26.0094,920.689,30.3429,0.2375,1.1423,41.6488
9,27.0858,1010.0676,31.7816,0.2147,1.3345,46.8668


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,26.3721,940.7808,30.6722,0.3175,1.2966,18.2857
1,27.7324,1089.8316,33.0126,0.1958,1.3891,29.3628
2,26.6776,974.3001,31.2138,0.2954,1.3695,71.2962
3,27.4656,1042.3567,32.2856,0.2074,1.2978,21.0867
4,26.6135,1023.0655,31.9854,0.2217,1.4067,58.3007
5,25.2062,862.1351,29.3621,0.2837,1.218,13.8838
6,26.4978,946.3511,30.7628,0.289,1.3601,33.9589
7,26.6823,1013.3614,31.8333,0.2189,1.2762,13.4964
8,25.1261,879.9926,29.6647,0.2712,1.1134,40.4714
9,26.3862,970.4101,31.1514,0.2455,1.2794,43.3372


Transformation Pipeline and Model Successfully Saved


In [15]:
def predict(df):
    """Score a feature frame with the saved MLM and HLM blenders.

    Both models are loaded from ``./models`` (file names include ``MODEL``)
    before any prediction is made.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame passed to ``predict_model`` for both models.

    Returns
    -------
    tuple
        ``(pred_MLM, pred_HLM)`` exactly as returned by ``predict_model``.
    """
    mlm_model, hlm_model = (
        load_model(f"./models/blender_MLM_{MODEL}"),
        load_model(f"./models/blender_HLM_{MODEL}"),
    )

    return predict_model(mlm_model, df), predict_model(hlm_model, df)

In [16]:
# Load the test table without its row identifier.
df_test = pd.read_csv("./data/test.csv").drop(columns=["id"])

# Reuse the imputer fitted on the training data (transform only), then build
# the same feature layout as for training.
df = extract_features(fill_na(df_test, imputer))

pred_MLM, pred_HLM = predict(df)

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


In [17]:
# Fill the sample submission with the predicted labels (aligned on the row
# index) and write it out under a file name tagged with MODEL.
df_submission = pd.read_csv("./submission/sample_submission.csv").assign(
    MLM=pred_MLM["prediction_label"],
    HLM=pred_HLM["prediction_label"],
)
df_submission.to_csv(f"./submission/{MODEL}_0917.csv", index=False)