In [1]:
#import
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Reading dataset
df=pd.read_csv('Hybrid.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Cate_name          68 non-null     object 
 1   Cate_HOMO          68 non-null     float64
 2   Cate_LUMO          68 non-null     float64
 3   Cate_ESP_O1        68 non-null     float64
 4   Cate_ESP_H1        68 non-null     float64
 5   Cate_ESP_O2        68 non-null     float64
 6   Cate_ESP_H2        68 non-null     float64
 7   Cate_BL_ArO1       68 non-null     float64
 8   Cate_BL_O1H1       68 non-null     float64
 9   Cate_BL_ArO2       68 non-null     float64
 10  Cate_BL_O2H2       68 non-null     float64
 11  Cate_total_dipole  68 non-null     float64
 12  Cate_RDKit_PC_0    68 non-null     float64
 13  Cate_RDKit_PC_1    68 non-null     float64
 14  Cate_RDKit_PC_2    68 non-null     float64
 15  Cate_RDKit_PC_3    68 non-null     float64
 16  Cate_RDKit_PC_4    68 non-nu

In [3]:
#Building descriptors
X = df.drop(columns=['Yield', 'Cate_name', 'P_name' ])
print('---Descriptors---')
print(X.head())

y = pd.DataFrame(df['Yield'],columns=['Yield'])
print('---Objective---')
print(y.head())

---Descriptors---
   Cate_HOMO  Cate_LUMO  Cate_ESP_O1  Cate_ESP_H1  Cate_ESP_O2  Cate_ESP_H2  \
0      -5.63       0.21    -0.555170     0.441278    -0.594799     0.444770   
1      -5.63       0.21    -0.555170     0.441278    -0.594799     0.444770   
2      -5.63       0.21    -0.555170     0.441278    -0.594799     0.444770   
3      -5.63       0.21    -0.555170     0.441278    -0.594799     0.444770   
4      -5.51       0.22    -0.542028     0.433177    -0.582447     0.440296   

   Cate_BL_ArO1  Cate_BL_O1H1  Cate_BL_ArO2  Cate_BL_O2H2  ...  P_HOMO  \
0         1.363         0.969         1.378         0.965  ...   -7.85   
1         1.363         0.969         1.378         0.965  ...   -8.16   
2         1.363         0.969         1.378         0.965  ...   -7.22   
3         1.363         0.969         1.378         0.965  ...   -7.85   
4         1.364         0.969         1.379         0.965  ...   -7.85   

   P_LUMO   P_ESP_P  P_ESP_=O   P_ESP_H  P_BL_P=O  P_BL_PH  P_

In [4]:
#storage 
data_r2_train =[]
data_RMSE_train = []
data_MAE_train = []
data_r2_test = []
data_RMSE_test = []
data_MAE_test = []

#LR
for i in range(10):
    seed=i
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
    
    a_X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_X_test = (X_test - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    
    model = LinearRegression()
    model.fit(a_X_train, y_train['Yield'])
    
    y_pred1 = model.predict(a_X_train)
    y_pred2 = model.predict(a_X_test)
    
    #train
    r2_train = metrics.r2_score(y_train, y_pred1)
    RMSE_train = metrics.root_mean_squared_error(y_train, y_pred1)
    MAE_train =  metrics.mean_absolute_error(y_train, y_pred1)
    #test
    r2_test = metrics.r2_score(y_test, y_pred2)
    RMSE_test = metrics.root_mean_squared_error(y_test, y_pred2)
    MAE_test = metrics.mean_absolute_error(y_test, y_pred2)
    
    data_r2_train.append(r2_train)
    data_RMSE_train.append(RMSE_train)
    data_MAE_train.append(MAE_train)
    data_r2_test.append(r2_test)
    data_RMSE_test.append(RMSE_test)
    data_MAE_test.append(MAE_test)
    
    print('----------------------')
    print('seed:', seed)
    print('R2_train:', r2_train)
    print('R2_test:', r2_test)
print('R2_train_means:', sum(data_r2_train)/10)
print('R2_test_means:', sum(data_r2_test)/10)

----------------------
seed: 0
R2_train: 0.9122257278728144
R2_test: 0.6755611490811266
----------------------
seed: 1
R2_train: 0.8549700226345089
R2_test: 0.7244947854618289
----------------------
seed: 2
R2_train: 0.8748964696494153
R2_test: 0.750308782577876
----------------------
seed: 3
R2_train: 0.8797646073647605
R2_test: 0.6888667105237489
----------------------
seed: 4
R2_train: 0.8729915793453391
R2_test: 0.7630187204410193
----------------------
seed: 5
R2_train: 0.875929541592793
R2_test: 0.7212568106335258
----------------------
seed: 6
R2_train: 0.8926813669849165
R2_test: 0.7169077633041612
----------------------
seed: 7
R2_train: 0.8880450982890163
R2_test: 0.6297730333738505
----------------------
seed: 8
R2_train: 0.8963330349217938
R2_test: 0.6313626475567438
----------------------
seed: 9
R2_train: 0.8679324365852158
R2_test: 0.40084006120053084
R2_train_means: 0.8815769885240574
R2_test_means: 0.6702390464154412


In [5]:
data_r2_train_pd = pd.DataFrame(data= data_r2_train, columns=['r2_train'])
data_RMSE_train_pd = pd.DataFrame(data=data_RMSE_train, columns=['RMSE_train'])
data_MAE_train_pd = pd.DataFrame(data=data_MAE_train, columns=['MAE_train'])
data_r2_test_pd = pd.DataFrame(data=data_r2_test, columns=['r2_test'])
data_RMSE_test_pd = pd.DataFrame(data=data_RMSE_test, columns=['RMSE_test'])
data_MAE_test_pd = pd.DataFrame(data=data_MAE_test, columns=['MAE_test'])

data_all = pd.concat([data_r2_train_pd, data_RMSE_train_pd, data_MAE_train_pd, data_r2_test_pd, 
                      data_RMSE_test_pd, data_MAE_test_pd],
                     axis=1, join='inner')

data_all.loc['mean'] = data_all.mean()
data_all.loc['std'] = data_all.std(ddof=0)
print(data_all)

data_all.to_csv('../../score/Hybrid/LR_score.csv')

      r2_train  RMSE_train  MAE_train   r2_test  RMSE_test   MAE_test
0     0.912226    8.044437   5.589650  0.675561  20.599753  13.890465
1     0.854970   11.452162   8.044448  0.724495  16.337769  12.628152
2     0.874896   10.757935   8.045175  0.750309  15.539199  10.492183
3     0.879765   10.589846   7.700839  0.688867  17.280305  12.894044
4     0.872992   10.463917   8.019133  0.763019  15.874128  12.461443
5     0.875930   10.023203   6.787320  0.721257  18.412186  12.366808
6     0.892681    9.384752   6.618675  0.716908  18.457797  13.512302
7     0.888045   10.348171   7.605485  0.629773  18.319762  14.244783
8     0.896333    9.551913   6.872337  0.631363  19.086661  14.174567
9     0.867932   11.569052   8.632064  0.400840  20.547569  14.546339
mean  0.881577   10.218539   7.391513  0.670239  18.045513  13.121109
std   0.014678    0.941759   0.820561  0.094793   1.616576   1.102006
