In [1]:
from pathlib import Path

import pandas as pd
from sklearn import metrics
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Benchmark HistGradientBoostingRegressor on each descriptor set.
#
# For every dataset the data is split 50/50 ten times (random_state 0..9).
# On each split a grid search (10-fold CV on the training half) selects the
# hyper-parameters, and the refitted best estimator is scored with R^2 and MAE
# on both halves. One CSV of per-split scores is written per dataset.

# Hyper-parameter grid; identical for every split, so it is built only once.
param = {
    "max_leaf_nodes": [3, 5, 7],
    "max_depth": [3, 4, 5, 6],
    "l2_regularization": [0, 0.01, 0.1, 1],
    "min_samples_leaf": [2, 3, 5],
}

# Create the output folder up front: without it, to_csv would raise only after
# all grid searches for a dataset had already finished, discarding the scores.
result_dir = Path('result/HGB')
result_dir.mkdir(parents=True, exist_ok=True)

for dataset in ['RDKit_pca', 'mordred_pca', 'RDKit', 'mordred']:
    print(dataset)
    data = pd.read_csv(f'data/data_{dataset}.csv')
    # Target stays a one-column DataFrame; features are every column except
    # the two identifier columns and the target itself.
    y = data[['Yield']]
    X = data.drop(columns=['Name', 'ID', 'Yield'])

    # One record (dict) per split; keys become the columns of the result table.
    scores = []
    for i in range(10):
        # The split seed is the loop index, so the ten splits are reproducible.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=i)
        reg = GridSearchCV(HistGradientBoostingRegressor(random_state=0), param_grid=param, cv=10, n_jobs=12)
        # fit() is given the 1-D 'Yield' column rather than the DataFrame.
        reg.fit(X_train, y_train['Yield'])
        best = reg.best_estimator_
        y_pred1 = best.predict(X_train)
        y_pred2 = best.predict(X_test)
        scores.append({
            'r2_train': metrics.r2_score(y_train, y_pred1),
            'r2_test': metrics.r2_score(y_test, y_pred2),
            'mae_train': metrics.mean_absolute_error(y_train, y_pred1),
            'mae_test': metrics.mean_absolute_error(y_test, y_pred2),
        })

    # Rows are indexed 0..9 by split; column order follows the dict key order.
    result = pd.DataFrame(scores, columns=['r2_train', 'r2_test', 'mae_train', 'mae_test'])
    print(result)
    result.to_csv(result_dir / f'result_{dataset}.csv')

RDKit_pca
   r2_train   r2_test  mae_train   mae_test
0  0.999590  0.632261   0.406960  11.498774
1  0.999661  0.479234   0.485535   7.880166
2  0.999956  0.455665   0.113175  17.079299
3  0.999406  0.790124   0.555498   8.385604
4  0.999836  0.534578   0.226934  13.229597
5  0.995442  0.233879   1.345097  13.768963
6  0.999634  0.646874   0.441027   8.795886
7  0.999986  0.626043   0.082086   9.689114
8  0.999993  0.527299   0.061879  10.177518
9  0.999839  0.587551   0.281357   9.868595
mordred_pca
   r2_train   r2_test  mae_train   mae_test
0  0.999964  0.511383   0.121380  13.380627
1  0.983626  0.117861   3.126943  13.718138
2  0.999063  0.391982   0.487240  17.842728
3  0.999942  0.681051   0.167480   9.289176
4  0.983604  0.203650   1.875058  16.739185
5  0.999487  0.508458   0.393141  12.330423
6  0.999833  0.443125   0.287432  12.019065
7  0.991377  0.522214   1.760067  11.295777
8  0.999950  0.474215   0.162810  12.177840
9  0.982553  0.530027   2.923568  10.035897
RDKit
   r