In [1]:
#import
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Reading dataset
df=pd.read_csv('RDKit.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Columns: 224 entries, Cate_name to Yield
dtypes: float64(177), int64(45), object(2)
memory usage: 119.1+ KB


In [3]:
#Building descriptors
X = df.drop(columns=['Yield', 'Cate_name', 'P_name' ])
print('---Descriptors---')
print(X.head())

y = pd.DataFrame(df['Yield'],columns=['Yield'])
print('---Objective---')
print(y.head())

---Descriptors---
   Cate_MaxAbsEStateIndex  Cate_MaxEStateIndex  Cate_MinAbsEStateIndex  \
0                8.669259             8.669259                0.076389   
1                8.669259             8.669259                0.076389   
2                8.669259             8.669259                0.076389   
3                8.669259             8.669259                0.076389   
4                8.850093             8.850093                0.060185   

   Cate_MinEStateIndex  Cate_qed  Cate_SPS  Cate_MolWt  Cate_HeavyAtomMolWt  \
0            -0.076389  0.490728  9.000000     110.112              104.064   
1            -0.076389  0.490728  9.000000     110.112              104.064   
2            -0.076389  0.490728  9.000000     110.112              104.064   
3            -0.076389  0.490728  9.000000     110.112              104.064   
4            -0.068889  0.513122  9.444444     124.139              116.075   

   Cate_ExactMolWt  Cate_NumValenceElectrons  ...  P_NumAromat

In [4]:
#storage 
data_r2_train =[]
data_RMSE_train = []
data_MAE_train = []
data_r2_test = []
data_RMSE_test = []
data_MAE_test = []

#LR
for i in range(10):
    seed=i
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
    
    a_X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_X_test = (X_test - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_X_train = a_X_train.dropna(how='any', axis=1)
    a_X_test = a_X_test[a_X_train.columns]
    
    model = LinearRegression()
    model.fit(a_X_train, y_train['Yield'])
    
    y_pred1 = model.predict(a_X_train)
    y_pred2 = model.predict(a_X_test)
    
    #train
    r2_train = metrics.r2_score(y_train, y_pred1)
    RMSE_train = metrics.root_mean_squared_error(y_train, y_pred1)
    MAE_train =  metrics.mean_absolute_error(y_train, y_pred1)
    #test
    r2_test = metrics.r2_score(y_test, y_pred2)
    RMSE_test = metrics.root_mean_squared_error(y_test, y_pred2)
    MAE_test = metrics.mean_absolute_error(y_test, y_pred2)
    
    data_r2_train.append(r2_train)
    data_RMSE_train.append(RMSE_train)
    data_MAE_train.append(MAE_train)
    data_r2_test.append(r2_test)
    data_RMSE_test.append(RMSE_test)
    data_MAE_test.append(MAE_test)
    
    print('----------------------')
    print('seed:', seed)
    print('R2_train:', r2_train)
    print('R2_test:', r2_test)
print('R2_train_means:', sum(data_r2_train)/10)
print('R2_test_means:', sum(data_r2_test)/10)

----------------------
seed: 0
R2_train: 0.9122257278728144
R2_test: 0.675561149081127
----------------------
seed: 1
R2_train: 0.8549700226345089
R2_test: 0.7244947854618298
----------------------
seed: 2
R2_train: 0.8717611987299334
R2_test: 0.7496651852434159
----------------------
seed: 3
R2_train: 0.8797646073647605
R2_test: 0.6888667105237467
----------------------
seed: 4
R2_train: 0.8576124722840733
R2_test: -2.9364577379194668e+23
----------------------
seed: 5
R2_train: 0.8759256259419356
R2_test: 0.7213769520283939
----------------------
seed: 6
R2_train: 0.8926813669849165
R2_test: 0.6860089260305834
----------------------
seed: 7
R2_train: 0.8880450982890163
R2_test: 0.6297730333738512
----------------------
seed: 8
R2_train: 0.896333034921794
R2_test: 0.6313626475567445
----------------------
seed: 9
R2_train: 0.8679324365852158
R2_test: 0.40084006120053006
R2_train_means: 0.8797251591608969
R2_test_means: -2.936457737919467e+22


In [5]:
data_r2_train_pd = pd.DataFrame(data= data_r2_train, columns=['r2_train'])
data_RMSE_train_pd = pd.DataFrame(data=data_RMSE_train, columns=['RMSE_train'])
data_MAE_train_pd = pd.DataFrame(data=data_MAE_train, columns=['MAE_train'])
data_r2_test_pd = pd.DataFrame(data=data_r2_test, columns=['r2_test'])
data_RMSE_test_pd = pd.DataFrame(data=data_RMSE_test, columns=['RMSE_test'])
data_MAE_test_pd = pd.DataFrame(data=data_MAE_test, columns=['MAE_test'])

data_all = pd.concat([data_r2_train_pd, data_RMSE_train_pd, data_MAE_train_pd, data_r2_test_pd, 
                      data_RMSE_test_pd, data_MAE_test_pd],
                     axis=1, join='inner')

data_all.loc['mean'] = data_all.mean()
data_all.loc['std'] = data_all.std(ddof=0)
print(data_all)

data_all.to_csv('../../score/RDKit/LR_score.csv')

      r2_train  RMSE_train  MAE_train       r2_test     RMSE_test  \
0     0.912226    8.044437   5.589650  6.755611e-01  2.059975e+01   
1     0.854970   11.452162   8.044448  7.244948e-01  1.633777e+01   
2     0.871761   10.891905   8.486959  7.496652e-01  1.555921e+01   
3     0.879765   10.589846   7.700839  6.888667e-01  1.728030e+01   
4     0.857612   11.079343   8.622444 -2.936458e+23  1.767033e+13   
5     0.875926   10.023362   6.789728  7.213770e-01  1.840822e+01   
6     0.892681    9.384752   6.618675  6.860089e-01  1.943903e+01   
7     0.888045   10.348171   7.605485  6.297730e-01  1.831976e+01   
8     0.896333    9.551913   6.872337  6.313626e-01  1.908666e+01   
9     0.867932   11.569052   8.632064  4.008401e-01  2.054757e+01   
mean  0.879725   10.293494   7.496263 -2.936458e+22  1.767033e+12   
std   0.016203    0.979008   0.913928  8.399408e+22  5.054400e+12   

          MAE_test  
0     1.389047e+01  
1     1.262815e+01  
2     1.073061e+01  
3     1.289404e+01