In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn import metrics

In [2]:
# Benchmark a cross-validated ridge regression on each descriptor table.
# Each table is read from data/<name>.csv. The 'Yield' column is the target;
# the ligand/substrate identifier columns are removed from the feature matrix.
descriptor_sets = (
    "RDKit", "MF2", "MK", "mordred",
    "RDKit_pca", "MF2_pca", "MK_pca", "mordred_pca",
)
id_columns = ['Ligand_name', 'Ligand_No', 'Substrate_name', 'Substrate_No']

for dataset in descriptor_sets:
    print(dataset)

    # Load the table and separate the target (kept as a one-column frame)
    # from the descriptor columns.
    data = pd.read_csv('data/{}.csv'.format(dataset))
    y = data[['Yield']]
    X = data.drop(columns=['Yield'] + id_columns)

    # shuffle=False keeps the file order, so the leading half of the rows is
    # the training set and the trailing half is the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, shuffle=False
    )

    # Standardise both partitions with statistics taken from the training
    # rows only, so no information from the test rows enters the scaling.
    train_mean = X_train.mean()
    train_std = X_train.std()
    a_X_train = X_train.sub(train_mean).div(train_std)
    a_X_test = X_test.sub(train_mean).div(train_std)

    # Discard every feature that contains a NaN in the scaled training data
    # (presumably constant columns, whose zero std yields 0/0), then keep
    # exactly the surviving features in the test matrix.
    a_X_train = a_X_train.dropna(axis='columns', how='any')
    a_X_test = a_X_test.loc[:, a_X_train.columns]

    # Ridge regression; the penalty is chosen by 5-fold CV from 150 evenly
    # spaced candidates between 0.1 and 30.
    model = RidgeCV(alphas=np.linspace(0.1, 30, num=150), cv=5)
    model.fit(a_X_train, y_train)

    # y_pred1: predictions on the training rows; y_pred2: on the test rows.
    y_pred1 = model.predict(a_X_train)
    y_pred2 = model.predict(a_X_test)

    # Report goodness of fit for both partitions.
    print('R2 (train)', metrics.r2_score(y_train, y_pred1))
    print('R2 (test)', metrics.r2_score(y_test, y_pred2))
    print('RMSE (train)', metrics.root_mean_squared_error(y_train, y_pred1))
    print('RMSE (test)', metrics.root_mean_squared_error(y_test, y_pred2))

RDKit
R2 (train) 0.7433292270742082
R2 (test) -8.392765734367638
RMSE (train) 20.372756027840317
RMSE (test) 37.130307271609134
MF2
R2 (train) 0.9913695717580584
R2 (test) -10.4952372912325
RMSE (train) 3.7357495097057662
RMSE (test) 41.07624808963481
MK
R2 (train) 0.7521859579645602
R2 (test) -7.8707875941605
RMSE (train) 20.01817727278066
RMSE (test) 36.083851604841435
mordred
R2 (train) 0.9584932634184556
R2 (test) -9.810850963262684
RMSE (train) 8.192583369767444
RMSE (test) 39.834717173981176
RDKit_pca
R2 (train) 0.3481351574655608
R2 (test) -7.944454125417792
RMSE (train) 32.46686494452529
RMSE (test) 36.233369126351775
MF2_pca
R2 (train) 0.7039866913879513
R2 (test) -8.47645566168614
RMSE (train) 21.878482392237284
RMSE (test) 37.29535671671625
MK_pca
R2 (train) 0.4072661702271486
R2 (test) -8.23392647109593
RMSE (train) 30.959321388574477
RMSE (test) 36.815016875035646
mordred_pca
R2 (train) 0.3392349415120145
R2 (test) -7.869965260551682
RMSE (train) 32.687756173292144
RMSE (t