In [None]:
#import
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics

In [None]:
#Reading dataset
df=pd.read_csv('RDKit.csv')
df.info()

In [None]:
#Preparing

# delete variables with zero variance
Var0Variable = np.where( df.var(axis=0) == 0 )
if len(Var0Variable[0]) == 0:
    print( "There is no variable with zero variance." )
    print( "" )
else:
    print( "There were {0} variables with zero variance".format(len(Var0Variable[0])))
    print( "the number of variable is: {0}".format(Var0Variable[0]) )
    print( "remove them" )
    print( "" )
    #df_var0 = np.delete(df, Var0Variable, 1)    
    df_var0 = df.drop(df.columns[Var0Variable], axis=1)

#removal of columns which data has all 0
def remove_all_zero_col(df):
    df = df.copy()
    for col in df.columns:
        if (df[col] == 0).all():
            df.drop(col, axis=1, inplace=True)
    return df
df_var0 = remove_all_zero_col(df_var0)
df_var0.info()

In [None]:
#Building descriptors
X = df_var0.drop(columns=['Yield', 'Cate_name', 'P_name'])

print('---Descriptors---')
print(X.head())

y = pd.DataFrame(df_var0['Yield'],columns=['Yield'])
print('---Objective---')
print(y.head())

In [5]:
X=X.astype('float16')
print(X.isnull().any())
X.isnull().values.sum()
X.info()

Cate_MaxEStateIndex         False
Cate_MinEStateIndex         False
Cate_MaxAbsEStateIndex      False
Cate_MinAbsEStateIndex      False
Cate_qed                    False
                            ...  
P_NumAromaticCarbocycles    False
P_NumAromaticRings          False
P_NumHAcceptors             False
P_NumHDonors                False
P_RingCount                 False
Length: 160, dtype: bool
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Columns: 160 entries, Cate_MaxEStateIndex to P_RingCount
dtypes: float16(160)
memory usage: 21.4 KB


In [6]:
#storage 
data_r2_train =[]
data_RMSE_train = []
data_MAE_train = []
data_r2_test = []
data_RMSE_test = []
data_MAE_test = []
best_model_parametors = []
data_cv_score = []

#knn
for i in range(10):
    seed=i
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
    
    #autoscaling
    a_X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    a_X_test = (X_test - X_train.mean(axis=0)) / X_train.std(axis=0, ddof=1)
    
    a_y_train = (y_train - y_train.mean(axis=0)) / y_train.std(axis=0, ddof=1)
    
    param_knn={'n_neighbors':[1,2,3,4,5,6,7,8,9,10], 'p':[1,2,3,4,5]}
    reg_knn = GridSearchCV(KNeighborsRegressor(), param_grid=param_knn, cv=5, n_jobs=2)
    reg_knn.fit(a_X_train,a_y_train['Yield'])
    reg_best = reg_knn.best_estimator_
    y_pred1 = reg_best.predict(a_X_train)*y_train.std(axis=0, ddof=1).values+y_train.mean(axis=0).values
    y_pred2 = reg_best.predict(a_X_test)*y_train.std(axis=0, ddof=1).values+y_train.mean(axis=0).values

    
    #train
    r2_train = metrics.r2_score(y_train, y_pred1)
    RMSE_train = metrics.mean_squared_error(y_train, y_pred1)
    MAE_train =  metrics.mean_absolute_error(y_train, y_pred1)
    #test
    r2_test = metrics.r2_score(y_test, y_pred2)
    RMSE_test = metrics.mean_squared_error(y_test, y_pred2)
    MAE_test = metrics.mean_absolute_error(y_test, y_pred2)

    data_r2_train.append(r2_train)
    data_RMSE_train.append(RMSE_train)
    data_MAE_train.append(MAE_train)
    data_r2_test.append(r2_test)
    data_RMSE_test.append(RMSE_test)
    data_MAE_test.append(MAE_test)
    
    parametors = reg_knn.best_params_
    best_model_parametors.append(parametors)
    cv_score = reg_knn.best_score_
    data_cv_score.append(cv_score)
 
    print('----------------------')
    print('seed:', seed)
    print("Best Model Parameter:",reg_knn.best_params_)
    print("Best Model Score:",reg_knn.best_score_)
    print('R2_test:', r2_test)
print('R2_train_means:', sum(data_r2_train)/10)
print('CV_score_means:', sum(data_cv_score)/10)
print('R2_test_means:', sum(data_r2_test)/10)

----------------------
seed: 0
Best Model Parameter: {'n_neighbors': 9, 'p': 1}
Best Model Score: 0.7444399970897215
R2_test: 0.5726622122516063
----------------------
seed: 1
Best Model Parameter: {'n_neighbors': 5, 'p': 1}
Best Model Score: 0.7050905261056044
R2_test: 0.6002400400690912
----------------------
seed: 2
Best Model Parameter: {'n_neighbors': 4, 'p': 1}
Best Model Score: 0.6341453505884326
R2_test: 0.545613419575402
----------------------
seed: 3
Best Model Parameter: {'n_neighbors': 7, 'p': 1}
Best Model Score: 0.5930872407400527
R2_test: 0.7006668835832824
----------------------
seed: 4
Best Model Parameter: {'n_neighbors': 8, 'p': 1}
Best Model Score: 0.6869219334466836
R2_test: 0.677924228716258
----------------------
seed: 5
Best Model Parameter: {'n_neighbors': 6, 'p': 1}
Best Model Score: 0.7034714952471195
R2_test: 0.6123727345159033
----------------------
seed: 6
Best Model Parameter: {'n_neighbors': 6, 'p': 1}
Best Model Score: 0.6508465578610372
R2_test: 0.6401

In [9]:
data_r2_train_pd = pd.DataFrame(data= data_r2_train, columns=['r2_train'])
data_RMSE_train_pd = pd.DataFrame(data=data_RMSE_train, columns=['MSE_train'])
data_MAE_train_pd = pd.DataFrame(data=data_MAE_train, columns=['MAE_train'])
data_r2_test_pd = pd.DataFrame(data=data_r2_test, columns=['r2_test'])
data_RMSE_test_pd = pd.DataFrame(data=data_RMSE_test, columns=['RMSE_test'])
data_MAE_test_pd = pd.DataFrame(data=data_MAE_test, columns=['MAE_test'])
data_cv_score_pd = pd.DataFrame(data=data_cv_score, columns=['cv_score'])
data_parametors = pd.DataFrame([best_model_parametors])
        
data_all = pd.concat([data_r2_train_pd, data_RMSE_train_pd, data_MAE_train_pd, data_r2_test_pd, data_RMSE_test_pd, data_MAE_test_pd, data_cv_score_pd], 
                     axis=1, join='inner')
data_all.loc['mean'] = data_all.mean()
print(data_all)

data_all.to_csv('../../score/RDKit/knn_score.csv')

      r2_train   MSE_train  MAE_train   r2_test   RMSE_test   MAE_test  \
0     0.776878  164.500394   9.595745  0.572662  558.936508  19.105820   
1     0.741215  234.022128  11.395745  0.600240  387.306667  14.533333   
2     0.763638  218.658245   9.920213  0.545613  439.419643  17.535714   
3     0.743125  239.590100  10.401216  0.700667  287.283771  11.993197   
4     0.697183  261.057846  11.664894  0.677924  342.470982  12.577381   
5     0.758436  195.603428   9.776596  0.612373  471.433862  14.936508   
6     0.736308  216.405437  10.432624  0.640181  433.027778  13.880952   
7     0.724511  263.504993  11.155015  0.673886  295.624879  14.163265   
8     0.753808  216.677377   9.920973  0.602502  392.822157  14.435374   
9     0.777540  225.450282  11.130699  0.718655  198.251701  10.578231   
mean  0.747264  223.547023  10.539372  0.634470  380.657795  14.373978   

      cv_score  
0     0.744440  
1     0.705091  
2     0.634145  
3     0.593087  
4     0.686922  
5     0.7