In [1]:
#import
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Reading dataset
df=pd.read_csv('Hybrid.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Cate_name          68 non-null     object 
 1   Cate_HOMO          68 non-null     float64
 2   Cate_LUMO          68 non-null     float64
 3   Cate_MC_O1         68 non-null     float64
 4   Cate_MC_H1         68 non-null     float64
 5   Cate_MC_O2         68 non-null     float64
 6   Cate_MC_H2         68 non-null     float64
 7   Cate_BL_ArO1       68 non-null     float64
 8   Cate_BL_O1H1       68 non-null     float64
 9   Cate_BL_ArO2       68 non-null     float64
 10  Cate_BL_O2H2       68 non-null     float64
 11  Cate_total_dipole  68 non-null     float64
 12  Cate_RDKit_PC_0    68 non-null     float64
 13  Cate_RDKit_PC_1    68 non-null     float64
 14  Cate_RDKit_PC_2    68 non-null     float64
 15  Cate_RDKit_PC_3    68 non-null     float64
 16  Cate_RDKit_PC_4    68 non-nu

In [3]:
#Building descriptors
X = df.drop(columns=['Yield', 'Cate_name', 'P_name' ])
print('---Descriptors---')
print(X.head())

y = pd.DataFrame(df['Yield'],columns=['Yield'])
print('---Objective---')
print(y.head())

---Descriptors---
   Cate_HOMO  Cate_LUMO  Cate_MC_O1  Cate_MC_H1  Cate_MC_O2  Cate_MC_H2  \
0  -5.940433  -0.165443   -0.374537    0.264452   -0.416200    0.277039   
1  -5.940433  -0.165443   -0.374537    0.264452   -0.416200    0.277039   
2  -5.940433  -0.165443   -0.374537    0.264452   -0.416200    0.277039   
3  -5.940433  -0.165443   -0.374537    0.264452   -0.416200    0.277039   
4  -5.807916  -0.157824   -0.376813    0.263887   -0.418652    0.276114   

   Cate_BL_ArO1  Cate_BL_O1H1  Cate_BL_ArO2  Cate_BL_O2H2  ...    P_HOMO  \
0       1.36318       0.96622       1.37543       0.96253  ... -8.171191   
1       1.36318       0.96622       1.37543       0.96253  ... -8.457723   
2       1.36318       0.96622       1.37543       0.96253  ... -7.460440   
3       1.36318       0.96622       1.37543       0.96253  ... -8.082211   
4       1.36386       0.96625       1.37631       0.96239  ... -8.171191   

     P_LUMO    P_MC_P   P_MC_=O    P_MC_H  P_BL_P=O   P_BL_PH  P_total_dip

In [4]:
#storage 
data_r2_train =[]
data_RMSE_train = []
data_MAE_train = []
data_r2_test = []
data_RMSE_test = []
data_MAE_test = []
cum_coef = np.zeros((1,X.shape[1]))

#Lasso
for i in range(10):
    seed=i
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
    
    a_X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_X_test = (X_test - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    
    n_lambda = 1000
    alphas = np.logspace(-6, 0, n_lambda)
    model = LassoCV(cv=5, alphas=alphas)
    model.fit(a_X_train, y_train['Yield'])
    
    cum_coef += model.coef_
    
    y_pred1 = model.predict(a_X_train)
    y_pred2 = model.predict(a_X_test)
    
    #train
    r2_train = metrics.r2_score(y_train, y_pred1)
    RMSE_train = metrics.root_mean_squared_error(y_train, y_pred1)
    MAE_train =  metrics.mean_absolute_error(y_train, y_pred1)
    #test
    r2_test = metrics.r2_score(y_test, y_pred2)
    RMSE_test = metrics.root_mean_squared_error(y_test, y_pred2)
    MAE_test = metrics.mean_absolute_error(y_test, y_pred2)
    
    data_r2_train.append(r2_train)
    data_RMSE_train.append(RMSE_train)
    data_MAE_train.append(MAE_train)
    data_r2_test.append(r2_test)
    data_RMSE_test.append(RMSE_test)
    data_MAE_test.append(MAE_test)

    print('----------------------')
    print('seed:', seed)
    print('R2_train:', r2_train)
    print('R2_test:', r2_test)

print('R2_train_means:', sum(data_r2_train)/10)
print('R2_test_means:', sum(data_r2_test)/10)

----------------------
seed: 0
R2_train: 0.8552269050607917
R2_test: 0.6937625018013396
----------------------
seed: 1
R2_train: 0.795707403343855
R2_test: 0.8266948899568507
----------------------
seed: 2
R2_train: 0.8379176548837481
R2_test: 0.7926990272553077
----------------------
seed: 3
R2_train: 0.8388221282171142
R2_test: 0.8180746402397528
----------------------
seed: 4
R2_train: 0.8556340097051194
R2_test: 0.7549698963535131
----------------------
seed: 5
R2_train: 0.8301127924624971
R2_test: 0.6742097840590031
----------------------
seed: 6
R2_train: 0.8676454451153909
R2_test: 0.7279564664287199
----------------------
seed: 7
R2_train: 0.8547062758478361
R2_test: 0.7554796143146876
----------------------
seed: 8
R2_train: 0.8669658697094507
R2_test: 0.7141927020848375
----------------------
seed: 9
R2_train: 0.8142286285804096
R2_test: 0.8321055374742143
R2_train_means: 0.8416967112926214
R2_test_means: 0.7590145059968225


In [5]:
data_r2_train_pd = pd.DataFrame(data= data_r2_train, columns=['r2_train'])
data_RMSE_train_pd = pd.DataFrame(data=data_RMSE_train, columns=['RMSE_train'])
data_MAE_train_pd = pd.DataFrame(data=data_MAE_train, columns=['MAE_train'])
data_r2_test_pd = pd.DataFrame(data=data_r2_test, columns=['r2_test'])
data_RMSE_test_pd = pd.DataFrame(data=data_RMSE_test, columns=['RMSE_test'])
data_MAE_test_pd = pd.DataFrame(data=data_MAE_test, columns=['MAE_test'])

data_all = pd.concat([data_r2_train_pd, data_RMSE_train_pd, data_MAE_train_pd, data_r2_test_pd, 
                      data_RMSE_test_pd, data_MAE_test_pd],
                     axis=1, join='inner')

data_all.loc['mean'] = data_all.mean()
data_all.loc['std'] = data_all.std(ddof=0)
print(data_all)

data_all.to_csv('../../score/Hybrid/Lasso_score.csv')

      r2_train  RMSE_train  MAE_train   r2_test  RMSE_test   MAE_test
0     0.855227   10.331323   6.413199  0.693763  20.013580  13.197410
1     0.795707   13.592048   8.720580  0.826695  12.957867   9.605282
2     0.837918   12.245090   8.030971  0.792699  14.158839   9.176094
3     0.838822   12.261009   7.806728  0.818075  13.213710   9.035978
4     0.855634   11.156051   7.337818  0.754970  16.141451  10.027264
5     0.830113   11.728776   7.039772  0.674210  19.905461  12.422260
6     0.867645   10.422087   6.542202  0.727956  18.094021  12.002616
7     0.854706   11.788687   8.139060  0.755480  14.888239  10.651350
8     0.866966   10.820609   6.910464  0.714193  16.806100  10.834166
9     0.814229   13.721105   9.758642  0.832106  10.876953   8.929774
mean  0.841697   11.806678   7.669944  0.759015  15.705622  10.588219
std   0.021073    1.076484   0.942485  0.051344   2.754155   1.370846
