In [1]:
import warnings
from pathlib import Path

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
def _summarize_runs(group, column):
    """Collect the per-run values and summary statistics of one metric column.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain ``column``.
    column : str
        Name of the metric column. It is also used as the prefix of the
        summary labels.

    Returns
    -------
    pandas.Series
        ``run0``, ``run1``, ... holding the values of ``column`` in row order,
        followed by ``<column>_mean``, ``<column>_max``, ``<column>_min`` and
        ``<column>_std`` (standard deviation computed with ``ddof=0``).
    """
    values = group[column]
    # Per-run entries come first so they precede the summary rows in the output.
    summary = {f'run{i}': value for i, value in enumerate(values)}
    summary[f'{column}_mean'] = np.mean(values)
    summary[f'{column}_max'] = np.max(values)
    summary[f'{column}_min'] = np.min(values)
    summary[f'{column}_std'] = np.std(values, ddof=0)
    return pd.Series(summary)


def calculate_statistics(group):
    """Summarise the ``r2_test`` column of ``group``.

    Returns a Series with one ``run<i>`` entry per row plus ``r2_test_mean``,
    ``r2_test_max``, ``r2_test_min`` and ``r2_test_std``.
    """
    return _summarize_runs(group, 'r2_test')


def calculate_statistics2(group):
    """Summarise the ``rmse_test`` column of ``group``.

    Returns a Series with one ``run<i>`` entry per row plus ``rmse_test_mean``,
    ``rmse_test_max``, ``rmse_test_min`` and ``rmse_test_std``.
    """
    return _summarize_runs(group, 'rmse_test')

In [3]:
# Evaluate a grid-searched random forest on every descriptor set over ten
# random 50/50 train/test splits, collecting the test-set R2 and RMSE of each run.
descriptor_sets = ['RDKit', 'mordred']

# Load each descriptor table once: the file contents do not depend on the
# split seed, so reading the CSV inside the repetition loop is wasted I/O.
tables = {}
for dataset in descriptor_sets:
    data = pd.read_csv(f'data/data_{dataset}.csv')
    # Features are every column left after dropping the identifier columns
    # and the target; the target is kept as a one-column frame.
    tables[dataset] = (data.drop(columns=['Name', 'ID', 'Yield']), data[['Yield']])

# The hyper-parameter grid is identical for every fit, so build it once.
param = {"n_estimators": [100, 1000, 5000], "max_depth": [3, 4, 5, 6]}

results_r2 = []
results_rmse = []
for i in range(10):  # i is also the seed of the train/test split
    for dataset in descriptor_sets:
        X, y = tables[dataset]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=i)
        reg = GridSearchCV(RandomForestRegressor(random_state=0), param_grid=param, cv=10, n_jobs=6)
        reg.fit(X_train, y_train['Yield'])
        # Only the held-out half is scored, using the best estimator found
        # by the grid search.
        best = reg.best_estimator_
        y_pred2 = best.predict(X_test)
        r2_test_score = metrics.r2_score(y_test, y_pred2)
        rmse_test_score = metrics.root_mean_squared_error(y_test, y_pred2)
        results_r2.append({'desc': dataset, 'r2_test': r2_test_score})
        results_rmse.append({'desc': dataset, 'rmse_test': rmse_test_score})

# One summary row per descriptor set: the individual runs plus mean/max/min/std.
results_df = pd.DataFrame(results_r2)
gen_results = results_df.groupby(['desc']).apply(calculate_statistics).reset_index()
results_df2 = pd.DataFrame(results_rmse)
gen_results2 = results_df2.groupby(['desc']).apply(calculate_statistics2).reset_index()

In [4]:
# Save the transposed R2 summary and display it.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (e.g. on a fresh checkout).
Path('result').mkdir(parents=True, exist_ok=True)
gen_results.T.to_csv('result/result_r2.csv', header=False)
gen_results.T

Unnamed: 0,0,1
desc,RDKit,mordred
run0,0.702534,0.670578
run1,0.513088,0.419287
run2,0.569439,0.589574
run3,0.623737,0.708651
run4,0.322614,0.445582
run5,0.225827,0.425795
run6,0.47365,0.617306
run7,0.550116,0.654575
run8,0.432336,0.548675


In [5]:
# Save the transposed RMSE summary and display it.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (e.g. on a fresh checkout).
Path('result').mkdir(parents=True, exist_ok=True)
gen_results2.T.to_csv('result/result_rmse.csv', header=False)
gen_results2.T

Unnamed: 0,0,1
desc,RDKit,mordred
run0,16.837799,17.719157
run1,13.814436,15.0865
run2,21.605288,21.094053
run3,18.047578,15.88108
run4,26.448645,23.927872
run5,24.636515,21.217475
run6,20.438763,17.427809
run7,19.698261,17.260534
run8,21.011239,18.734864
