In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LassoCV
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.exceptions import ConvergenceWarning
# Globally silence UserWarning and ConvergenceWarning for every cell that runs after this one.
# NOTE(review): presumably these warnings come from the LassoCV fits later in the
# notebook; hiding ConvergenceWarning can mask fits that did not converge -- confirm
# this is intended.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
# For each descriptor set: fit one LassoCV model on the standardized `_ALL`
# table, then score it on 100 random 50% subsets of the base table and write
# the per-split R^2 values to result/result_<dataset>.csv.
for dataset in ['DFT', 'MF_pca', 'MK_pca', 'RDKit_pca', 'mordred_pca', 'MF', 'MK', 'RDKit', 'mordred']:
    # Base table: the evaluation subsets below are drawn from it.
    data = pd.read_csv(f'data/data_{dataset}.csv')
    y = pd.DataFrame(data['Yield'],columns=['Yield'])
    X = data.drop(columns=['Name', 'ID', 'Yield'])

    # `_ALL` table: the only data the scaler and the model are fitted on.
    data_s = pd.read_csv(f'data/data_{dataset}_ALL.csv')
    y_s = pd.DataFrame(data_s['Yield'],columns=['Yield'])
    X_s = data_s.drop(columns=['Name', 'ID', 'Yield'])
    scaler = StandardScaler()
    a_X_s = scaler.fit_transform(X_s)

    # The fit uses only (a_X_s, y_s) and never the split index, and LassoCV
    # with an integer `cv` and the default cyclic coordinate descent has no
    # random component, so fitting once gives the same model as refitting in
    # every iteration of the split loop.
    # NOTE(review): the alpha grid starts at 0, which scikit-learn discourages
    # for Lasso; the resulting warnings are silenced by the filters in the
    # import cell. Left unchanged here because altering the grid changes the
    # reported scores -- confirm whether alpha=0 is wanted.
    reg = LassoCV(alphas=np.linspace(0, 10, num=101), cv=5, n_jobs=16, max_iter=10000)
    reg.fit(a_X_s, y_s['Yield'])
    fit_r2 = metrics.r2_score(y_s, reg.predict(a_X_s))

    r2_train = []
    r2_test = []
    for i in range(100):
        # Only the test half of each split is used; the train half is discarded.
        # NOTE(review): if the `_ALL` file contains the rows of the base table,
        # r2_test is not a held-out score -- confirm the intended protocol.
        _, X_test, _, y_test = train_test_split(X, y, test_size=0.5, random_state=i)
        a_X_test = scaler.transform(X_test)
        # Same value every iteration; repeated so the CSV keeps one row per split.
        r2_train.append(fit_r2)
        r2_test.append(metrics.r2_score(y_test, reg.predict(a_X_test)))

    r2_train = pd.DataFrame(data=r2_train, columns=['r2_train'])
    r2_test = pd.DataFrame(data=r2_test, columns=['r2_test'])
    result = pd.concat([r2_train, r2_test], axis=1, join='inner')
    result.to_csv(f'result/result_{dataset}.csv', index = False)

In [3]:
# Print the column means (r2_train, r2_test) of each per-dataset result CSV
# under result/. The loop variable is deliberately not called `re`, which
# would shadow the standard-library `re` module name in the notebook's
# global namespace.
for dataset_name in ['DFT', 'MF_pca', 'MK_pca', 'RDKit_pca', 'mordred_pca', 'MF', 'MK', 'RDKit', 'mordred']:
    scores = pd.read_csv(f'result/result_{dataset_name}.csv')
    print(f'Dataset: {dataset_name}')
    print(scores.mean())
    print('=======================')

Dataset: DFT
r2_train    0.371480
r2_test     0.023707
dtype: float64
Dataset: MF_pca
r2_train    0.606386
r2_test     0.272892
dtype: float64
Dataset: MK_pca
r2_train    0.494814
r2_test     0.228808
dtype: float64
Dataset: RDKit_pca
r2_train    0.541653
r2_test     0.260429
dtype: float64
Dataset: mordred_pca
r2_train    0.318373
r2_test     0.157156
dtype: float64
Dataset: MF
r2_train    0.673391
r2_test     0.325065
dtype: float64
Dataset: MK
r2_train    0.628814
r2_test     0.250134
dtype: float64
Dataset: RDKit
r2_train    0.646097
r2_test     0.313256
dtype: float64
Dataset: mordred
r2_train    0.68273
r2_test     0.37145
dtype: float64
