In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [2]:
# Target and identifier columns; every remaining column is passed to the model as a feature.
non_feature_columns = ['Yield', 'Ligand_name', 'Ligand_No', 'Substrate_name', 'Substrate_No']
# Random-forest hyper-parameter grid, identical for every descriptor set.
param = {"n_estimators": [100, 500, 1000, 3000, 5000], "max_depth": [3, 4, 5]}

# Benchmark a grid-searched random forest on each descriptor table in turn.
for dataset in ("RDKit", "MF2", "MK", "mordred", "RDKit_pca", "MF2_pca", "MK_pca", "mordred_pca"):
    print(dataset)
    data = pd.read_csv('data/{}.csv'.format(dataset))
    X = data.drop(columns=non_feature_columns)
    y = data[['Yield']]

    # shuffle=False preserves file order: the leading rows become the
    # training half and the trailing rows the test half.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=False, test_size=0.5
    )

    # 5-fold cross-validated grid search, run on the training half only.
    reg = GridSearchCV(
        RandomForestRegressor(random_state=0),
        param_grid=param,
        cv=5,
        n_jobs=16,
    )
    reg.fit(X_train, y_train['Yield'])

    # Predict both halves with the estimator selected by the search.
    best = reg.best_estimator_
    y_pred1 = best.predict(X_train)
    y_pred2 = best.predict(X_test)

    # Report fit quality on the training half and on the held-out half.
    r2_train = metrics.r2_score(y_train, y_pred1)
    print('R2 (train)', r2_train)
    r2_test = metrics.r2_score(y_test, y_pred2)
    print('R2 (test)', r2_test)
    rmse_train = metrics.root_mean_squared_error(y_train, y_pred1)
    print('RMSE (train)', rmse_train)
    rmse_test = metrics.root_mean_squared_error(y_test, y_pred2)
    print('RMSE (test)', rmse_test)

RDKit
R2 (train) 0.8532268040251354
R2 (test) -7.734414815751691
RMSE (train) 15.405824660492081
RMSE (test) 35.80541441947967
MF2
R2 (train) 0.8978194482086881
R2 (test) -8.35564931075974
RMSE (train) 12.854219063801365
RMSE (test) 37.05687265093229
MK
R2 (train) 0.6102804540800826
R2 (test) -8.07410626842132
RMSE (train) 25.103696116470275
RMSE (test) 36.49503028822669
mordred
R2 (train) 0.8792728798781958
R2 (test) -7.535717656975958
RMSE (train) 13.972174327989146
RMSE (test) 35.39580700535306
RDKit_pca
R2 (train) 0.8293839951108052
R2 (test) -7.300040661088428
RMSE (train) 16.610069871398302
RMSE (test) 34.90373541567089
MF2_pca
R2 (train) 0.8102605533656477
R2 (test) -8.148142059338648
RMSE (train) 17.51621995061925
RMSE (test) 36.64360963206871
MK_pca
R2 (train) 0.8216814945094187
R2 (test) -8.022010606715417
RMSE (train) 16.980863970124478
RMSE (test) 36.39011805968015
mordred_pca
R2 (train) 0.7310681133248896
R2 (test) -5.990239634660946
RMSE (train) 20.85368093490403
RMSE (te