In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [2]:
# For every descriptor set: tune a random forest on the "_ALL" table, then
# score it on 100 different random 50 % hold-outs of the base table and save
# the per-split R² values to result/result_<dataset>.csv.
#
# NOTE(review): if data_<dataset>_ALL.csv contains the rows of
# data_<dataset>.csv, the hold-out rows were seen during fitting and r2_test
# is optimistic -- confirm how the "_ALL" files were built.
for dataset in ['DFT', 'MF_pca', 'MK_pca', 'RDKit_pca', 'mordred_pca', 'MF', 'MK', 'RDKit', 'mordred']:
    # Base table: source of the random hold-out sets.
    data = pd.read_csv(f'data/data_{dataset}.csv')
    y = data[['Yield']]
    X = data.drop(columns=['Name', 'ID', 'Yield'])

    # "_ALL" table: the data the model is actually fitted on.
    data_s = pd.read_csv(f'data/data_{dataset}_ALL.csv')
    y_s = data_s[['Yield']]
    X_s = data_s.drop(columns=['Name', 'ID', 'Yield'])

    # The search does not depend on the split index: the fit data is always
    # (X_s, y_s), the forest is seeded with random_state=0 and an integer cv
    # yields the same unshuffled folds every time. Running it once per dataset
    # instead of once per split gives the same best estimator at ~1/100 the cost.
    param = {"n_estimators": [100, 1000, 5000], "max_depth": [5, 7, 9]}
    reg = GridSearchCV(RandomForestRegressor(random_state=0), param_grid=param, cv=5, n_jobs=16)
    reg.fit(X_s, y_s['Yield'])
    best = reg.best_estimator_

    # R² on the fit data is likewise identical for every split.
    y_pred1 = best.predict(X_s)
    r2_fit = metrics.r2_score(y_s, y_pred1)

    r2_train = []
    r2_test = []
    for i in range(100):
        # Only the test half is used; X_train / y_train are kept for parity
        # with the original cell but never fed to the model.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=i)
        y_pred2 = best.predict(X_test)
        # One row per split, so the output file keeps its 100-row layout.
        r2_train.append(r2_fit)
        r2_test.append(metrics.r2_score(y_test, y_pred2))

    r2_train = pd.DataFrame(data=r2_train, columns=['r2_train'])
    r2_test = pd.DataFrame(data=r2_test, columns=['r2_test'])
    result = pd.concat([r2_train, r2_test], axis=1, join='inner')
    result.to_csv(f'result/result_{dataset}.csv', index=False)

In [3]:
# Print the column means of each saved result file, one descriptor set at a time.
descriptor_sets = [
    'DFT',
    'MF_pca', 'MK_pca', 'RDKit_pca', 'mordred_pca',
    'MF', 'MK', 'RDKit', 'mordred',
]
for set_name in descriptor_sets:
    scores = pd.read_csv(f'result/result_{set_name}.csv')
    mean_scores = scores.mean()
    print(f'Dataset: {set_name}')
    print(mean_scores)
    print('=======================')

Dataset: DFT
r2_train    0.703451
r2_test     0.388861
dtype: float64
Dataset: MF_pca
r2_train    0.756550
r2_test     0.403518
dtype: float64
Dataset: MK_pca
r2_train    0.714070
r2_test     0.350631
dtype: float64
Dataset: RDKit_pca
r2_train    0.728676
r2_test     0.341707
dtype: float64
Dataset: mordred_pca
r2_train    0.910037
r2_test     0.390688
dtype: float64
Dataset: MF
r2_train    0.741348
r2_test     0.365412
dtype: float64
Dataset: MK
r2_train    0.679958
r2_test     0.279053
dtype: float64
Dataset: RDKit
r2_train    0.777394
r2_test     0.351333
dtype: float64
Dataset: mordred
r2_train    0.787231
r2_test     0.383747
dtype: float64
