In [1]:
import os

import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Benchmark a grid-searched random forest on each descriptor set.
# For every dataset: draw 10 different 50/50 train/test splits (seeds 0-9),
# tune the forest with 10-fold cross-validation on the training half, then
# record R^2 and MAE of the selected estimator on both halves. One CSV of
# per-split scores is written per dataset.

# Hyper-parameter grid; it is the same for every split, so build it once.
param = {"n_estimators": [100, 1000, 5000], "max_depth": [3, 4, 5, 6]}

# Create the output directory before any fitting starts. Without this, to_csv
# fails on a fresh checkout only after the whole (expensive) grid search for
# the first dataset has already run.
os.makedirs('result/RF', exist_ok=True)

for dataset in ['RDKit_pca', 'mordred_pca', 'RDKit', 'mordred']:
    print(dataset)
    data = pd.read_csv(f'data/data_{dataset}.csv')
    # Target kept as a one-column frame; 'Name' and 'ID' are dropped from the
    # feature matrix together with the target itself.
    y = data[['Yield']]
    X = data.drop(columns=['Name', 'ID', 'Yield'])

    # One record per split; turned into a DataFrame in a single step below.
    scores = []
    for i in range(10):
        # The split seed varies with i; the forest seed stays fixed at 0, so
        # differences between rows come from the data partition.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=i)
        reg = GridSearchCV(RandomForestRegressor(random_state=0), param_grid=param, cv=10, n_jobs=6)
        # Fit on the 1-D target column rather than the one-column frame.
        reg.fit(X_train, y_train['Yield'])
        best = reg.best_estimator_
        y_pred1 = best.predict(X_train)
        y_pred2 = best.predict(X_test)
        scores.append({
            'r2_train': metrics.r2_score(y_train, y_pred1),
            'r2_test': metrics.r2_score(y_test, y_pred2),
            'mae_train': metrics.mean_absolute_error(y_train, y_pred1),
            'mae_test': metrics.mean_absolute_error(y_test, y_pred2),
        })

    # Explicit column list pins the column order of the printed table and CSV.
    result = pd.DataFrame(scores, columns=['r2_train', 'r2_test', 'mae_train', 'mae_test'])
    print(result)
    result.to_csv(f'result/RF/result_{dataset}.csv')

RDKit_pca
   r2_train   r2_test  mae_train   mae_test
0  0.885563  0.576981   5.922712  14.247677
1  0.947994  0.422283   4.924399  10.243433
2  0.895042  0.482464   3.823799  17.294410
3  0.922492  0.783300   5.060834   9.434829
4  0.955648  0.486026   2.438979  14.235339
5  0.963667  0.150741   3.615032  13.991110
6  0.963478  0.551338   3.586818  10.884889
7  0.972082  0.579802   2.776558  10.366935
8  0.945892  0.571114   4.773543  10.605812
9  0.945905  0.477103   5.049452  10.654257
mordred_pca
   r2_train   r2_test  mae_train   mae_test
0  0.900637  0.549993   5.608597  14.375713
1  0.944879  0.345474   5.165840  10.473707
2  0.887138  0.519601   4.040738  17.267424
3  0.917243  0.728155   5.427089   9.952995
4  0.957682  0.296094   2.720961  16.167035
5  0.965361  0.513932   3.694799  11.932659
6  0.965024  0.457817   3.437768  12.628935
7  0.962952  0.540370   3.452204  11.429866
8  0.913778  0.539386   6.127037  11.444245
9  0.933619  0.462668   5.787426  10.149073
RDKit
   r