In [21]:
###########import packages##########
import tensorflow as tf
import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.backend.tensorflow_backend import set_session
from keras import optimizers
from keras import regularizers
from keras import backend as K
from keras.callbacks import EarlyStopping
from keras.callbacks import TensorBoard
from keras.constraints import max_norm
from keras.models import Sequential 
from keras.layers import Dense 
from keras.layers import Dropout 
from keras.layers import BatchNormalization
from keras.wrappers.scikit_learn import KerasClassifier 
from keras.wrappers.scikit_learn import KerasRegressor
from keras.constraints import maxnorm 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import GridSearchCV
%matplotlib

###########fix random seed for reproducability##########
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import sys
sys.path.append("../../lib")
csv = "../../database/database.csv"



seed=1
np.random.seed(seed)
###########wrapping root mean square error for later calls##########
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())
###########loading data##########
fdata=pd.read_csv(csv,encoding="gbk", index_col=0)
raw_data = fdata.iloc[:, 1:]

##########handling missing values##########
median_raw_data=raw_data.median()
dict_median_raw_data=median_raw_data.to_dict()
data=raw_data.fillna(dict_median_raw_data)

###########data standardization##########
standardized_data = (data-np.mean(data,axis=0))/np.std(data,axis=0)#即简单实现标准化
###########train test splitting##########
parame=standardized_data.iloc[:,0:614]
degradation=standardized_data.iloc[:,614]
X=parame.values.astype(np.float32)
y=degradation.values.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=751)

###########defining a wrapper function for later call from each machine learning algorithms##########
def try_different_method(model):
    model.fit(X_train,y_train)
    score = model.score(X_test, y_test)
    result = model.predict(X_test)
    x_prediction_degradation_ann=result*np.std(data,axis=0)[614]+np.mean(data,axis=0)[614]
    y_real_degradation=y_test*np.std(data,axis=0)[614]+np.mean(data,axis=0)[614]
    print(x_prediction_degradation_ann)
    x_prediction_degradation_ann_series=pd.Series(x_prediction_degradation_ann)
    y_real_degradation_series=pd.Series(y_real_degradation)
    ###########evaluating the regression quality##########
    corr_ann = round(x_prediction_degradation_ann_series.corr(y_real_degradation_series), 4)
    rmse_val= rmse(x_prediction_degradation_ann,y_real_degradation)
    print(rmse_val)
    print(corr_ann)
    print(y_real_degradation)
    ###########generating a figure##########
    x_y_x=np.arange(0,100,10)
    x_y_y=np.arange(0,100,10)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(x_prediction_degradation_ann,y_real_degradation,color='red',label='Artificial Neural Network')
    plt.legend()
    ax.plot(x_y_x,x_y_y)
    plt.xlabel(u"Predicted_Degradable_Conductivity %")
    plt.ylabel(u"Real_Degradable_Conductivity %")
    plt.show()
####import machine learning algorithms packages and define the corresponding models####
####Support Vector Regressor####
from sklearn import svm
model_SVR = svm.SVR()

####Random Forest####
from sklearn import ensemble
model_RandomForestRegressor = ensemble.RandomForestRegressor()

####Gaussian Process####
from sklearn.gaussian_process import GaussianProcessRegressor
model_GaussianProcessRegressor=GaussianProcessRegressor()

####XGBoost####
import xgboost as xgb
model_XGboostRegressor=xgb.XGBRegressor()

Using matplotlib backend: Qt5Agg


In [3]:
seed=1
np.random.seed(seed)
####XGboost####
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)]
)

###########defining the parameters dictionary##########
param = {
         'learning_rate':[0.01],
   'n_estimators':[1000],
    'max_depth':[5],  
    'objective':['reg:squarederror'],
    'subsample':[0.5],
    'lambda':[0.1],
    'alpha':[0.1]
       }
grid = GridSearchCV(model_XGboostRegressor,param_grid=param,cv=5)
grid.fit(X_train,y_train)
print('Best Regressor:',grid.best_params_,'Best Score:', grid.best_score_) 
best_model=grid.best_estimator_

algorithm_name='XGBoost Regressor'
try_different_method(best_model)

Best Regressor: {'alpha': 0.1, 'lambda': 0.1, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000, 'objective': 'reg:squarederror', 'subsample': 0.5} Best Score: 0.6516226352327188
[107.09476   65.10108  108.14436   81.54621   47.10421  110.313934
  92.540245  35.609585  70.35246   94.63491   59.373993 137.57312
  25.139233 120.95519  109.757904  61.73006   67.09707   69.350586
  63.29402   80.367165  55.081078  39.155373 104.38531   82.74753
 136.53838   85.169426  62.933014  61.502625  73.960434  39.894142
  29.799255  38.883472  81.588104  62.27542   50.210205  82.13156
 152.49138   51.105637  62.317734  52.375454  80.01731   56.15576
  92.96692   50.49855   56.120613  56.08187   63.460144  72.464455
  85.856316  35.002674 104.47423   63.44604   74.289345  48.08413
  69.65373   36.2612    71.40812  113.05168   40.985535 116.883835
  89.813515  91.57865   82.41396  122.43112   46.69287   66.21391
  74.30173   85.66831   70.25325  112.45877   44.40496   52.113785
  67.0771    

In [4]:
####GPU####
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)]
)

from sklearn.feature_selection import SelectFromModel
from numpy import sort
thresholds = sort(best_model.feature_importances_)
print(thresholds)

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         

In [5]:
max_corr = 0
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
valiable_model=best_model
for thresh in thresholds:
    if thresh < 0.0001:
        continue
    selection = SelectFromModel(valiable_model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    select_X_test  = selection.transform(X_test)
    # train model
    param = {
         'learning_rate':[0.01,0.1,1],
   'n_estimators':[100,200,400,1000,2000,4000,8000],
    'max_depth':[5,7,9,11],  
    'objective':['reg:squarederror'],
    'subsample':[0.5,0.6,0.7,0.8,0.9],
    'lambda':[0.1],
    'alpha':[0.1]
       }
    grid = GridSearchCV(model_XGboostRegressor,param_grid=param,cv=5)
    grid.fit(X_train,y_train)
    selection_model = grid.best_estimator_
    #selection_model = XGBClassifier(reg_alpha = 0.1,reg_lambda=0.1,learning_rate=0.01, max_depth= 5, n_estimators=200, objective='reg:squarederror', subsample=0.5)
    selection_model.fit(select_X_train, y_train)  
    # eval model
    result = selection_model.predict(select_X_test)
    x_prediction_degradation_ann=result*np.std(data,axis=0)[614]+np.mean(data,axis=0)[614]
    y_real_degradation=y_test*np.std(data,axis=0)[614]+np.mean(data,axis=0)[614]
    x_prediction_degradation_ann_series=pd.Series(x_prediction_degradation_ann)
    y_real_degradation_series=pd.Series(y_real_degradation)
    ###########evaluating the regression quality##########
    corr_ann = round(x_prediction_degradation_ann_series.corr(y_real_degradation_series), 4)
    rmse_val= rmse(x_prediction_degradation_ann,y_real_degradation)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], corr_ann*100.0))
    if max_corr < corr_ann:
        max_corr = corr_ann
        max_thresh = thresh
        max_n = select_X_train.shape[1]
print("max_Thresh=%.3f, max_n=%d, Max_Accuracy: %.2f%%" % (max_thresh, max_n, max_corr*100.0))

Thresh=0.000, n=424, Accuracy: 83.80%
Thresh=0.000, n=423, Accuracy: 83.79%


KeyboardInterrupt: 

In [6]:
max_corr = 0
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
valiable_model=best_model
for thresh in thresholds:
    if thresh < 0.0001:
        continue
    selection = SelectFromModel(valiable_model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    select_X_test  = selection.transform(X_test)
    # train model
    param = {
         'learning_rate':[0.01],
   'n_estimators':[1000],
    'max_depth':[5],  
    'objective':['reg:squarederror'],
    'subsample':[0.5],
    'lambda':[0.1],
    'alpha':[0.1]
       }
    grid = GridSearchCV(model_XGboostRegressor,param_grid=param,cv=5)
    grid.fit(X_train,y_train)
    selection_model = grid.best_estimator_
    #selection_model = XGBClassifier(reg_alpha = 0.1,reg_lambda=0.1,learning_rate=0.01, max_depth= 5, n_estimators=200, objective='reg:squarederror', subsample=0.5)
    selection_model.fit(select_X_train, y_train)  
    # eval model
    result = selection_model.predict(select_X_test)
    x_prediction_degradation_ann=result*np.std(data,axis=0)[614]+np.mean(data,axis=0)[614]
    y_real_degradation=y_test*np.std(data,axis=0)[614]+np.mean(data,axis=0)[614]
    x_prediction_degradation_ann_series=pd.Series(x_prediction_degradation_ann)
    y_real_degradation_series=pd.Series(y_real_degradation)
    ###########evaluating the regression quality##########
    corr_ann = round(x_prediction_degradation_ann_series.corr(y_real_degradation_series), 4)
    rmse_val= rmse(x_prediction_degradation_ann,y_real_degradation)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], corr_ann*100.0))
    if max_corr < corr_ann:
        max_corr = corr_ann
        max_thresh = thresh
        max_n = select_X_train.shape[1]
print("max_Thresh=%.3f, max_n=%d, Max_Accuracy: %.2f%%" % (max_thresh, max_n, max_corr*100.0))

Thresh=0.000, n=424, Accuracy: 83.86%
Thresh=0.000, n=423, Accuracy: 83.87%
Thresh=0.000, n=422, Accuracy: 84.00%
Thresh=0.000, n=421, Accuracy: 83.90%
Thresh=0.000, n=420, Accuracy: 83.87%
Thresh=0.000, n=419, Accuracy: 83.95%
Thresh=0.000, n=418, Accuracy: 83.70%
Thresh=0.000, n=417, Accuracy: 83.90%
Thresh=0.000, n=416, Accuracy: 83.90%
Thresh=0.000, n=415, Accuracy: 83.90%
Thresh=0.001, n=414, Accuracy: 83.76%
Thresh=0.001, n=413, Accuracy: 83.75%
Thresh=0.001, n=412, Accuracy: 83.86%
Thresh=0.001, n=411, Accuracy: 83.86%
Thresh=0.001, n=410, Accuracy: 83.85%
Thresh=0.001, n=409, Accuracy: 83.98%
Thresh=0.001, n=408, Accuracy: 83.70%
Thresh=0.001, n=407, Accuracy: 83.70%
Thresh=0.001, n=406, Accuracy: 83.70%
Thresh=0.001, n=405, Accuracy: 83.70%
Thresh=0.001, n=404, Accuracy: 83.51%
Thresh=0.001, n=403, Accuracy: 83.60%
Thresh=0.001, n=402, Accuracy: 83.64%
Thresh=0.001, n=401, Accuracy: 83.56%
Thresh=0.001, n=400, Accuracy: 83.63%
Thresh=0.001, n=399, Accuracy: 83.80%
Thresh=0.001

Thresh=0.002, n=208, Accuracy: 82.88%
Thresh=0.002, n=207, Accuracy: 82.74%
Thresh=0.002, n=206, Accuracy: 82.63%
Thresh=0.002, n=205, Accuracy: 82.63%
Thresh=0.002, n=204, Accuracy: 82.79%
Thresh=0.002, n=203, Accuracy: 82.58%
Thresh=0.002, n=202, Accuracy: 82.65%
Thresh=0.002, n=201, Accuracy: 82.99%
Thresh=0.002, n=200, Accuracy: 82.77%
Thresh=0.002, n=199, Accuracy: 82.87%
Thresh=0.002, n=198, Accuracy: 82.78%
Thresh=0.002, n=197, Accuracy: 82.83%
Thresh=0.002, n=196, Accuracy: 82.66%
Thresh=0.002, n=195, Accuracy: 82.62%
Thresh=0.002, n=194, Accuracy: 82.69%
Thresh=0.002, n=193, Accuracy: 82.73%
Thresh=0.002, n=192, Accuracy: 82.84%
Thresh=0.002, n=191, Accuracy: 82.97%
Thresh=0.002, n=190, Accuracy: 82.96%
Thresh=0.002, n=189, Accuracy: 82.59%
Thresh=0.002, n=188, Accuracy: 82.71%
Thresh=0.002, n=187, Accuracy: 82.88%
Thresh=0.002, n=186, Accuracy: 82.39%
Thresh=0.002, n=185, Accuracy: 82.73%
Thresh=0.002, n=184, Accuracy: 82.49%
Thresh=0.002, n=183, Accuracy: 82.63%
Thresh=0.002

In [7]:
import joblib
joblib.dump(valiable_model, filename='../../results/Importance/filepath.xgb')

['../../results/Importance/filepath.xgb']

In [22]:
model = joblib.load('../../results/Importance/filepath.xgb')
selection = SelectFromModel(model, max_features=422,threshold=-1, prefit=True)
X_transform = selection.transform(X)
X_transform.shape

(790, 422)

In [23]:
data_pri = data.iloc[:,0:614]
X_transform = selection.transform(data_pri)
X_transform.shape

(790, 422)

In [24]:
all_name = data_pri.columns.values.tolist()  # 获得所有的特征名称
select_name_index0 = selection.get_support(indices=True)  # 留下特征的索引值，list格式
select_name0 = []
for i in select_name_index0:
    select_name0.append(all_name[i])
print(select_name0)
len(select_name0)

['MaxEStateIndex of 1', 'MinEStateIndex of 1', 'MinAbsEStateIndex of 1', 'qed of 1', 'MolWt of 1', 'HeavyAtomMolWt of 1', 'NumValenceElectrons of 1', 'MaxPartialCharge of 1', 'MinPartialCharge of 1', 'MinAbsPartialCharge of 1', 'FpDensityMorgan1 of 1', 'FpDensityMorgan2 of 1', 'FpDensityMorgan3 of 1', 'BCUT2D_MWHI of 1', 'BCUT2D_MWLOW of 1', 'BCUT2D_CHGHI of 1', 'BCUT2D_CHGLO of 1', 'BCUT2D_LOGPHI of 1', 'BCUT2D_LOGPLOW of 1', 'BCUT2D_MRHI of 1', 'BCUT2D_MRLOW of 1', 'BalabanJ of 1', 'BertzCT of 1', 'Chi0 of 1', 'Chi0n of 1', 'Chi0v of 1', 'Chi1 of 1', 'Chi1n of 1', 'Chi1v of 1', 'Chi2n of 1', 'Chi2v of 1', 'Chi3n of 1', 'Chi3v of 1', 'Chi4n of 1', 'Chi4v of 1', 'HallKierAlpha of 1', 'Ipc of 1', 'Kappa1 of 1', 'Kappa2 of 1', 'Kappa3 of 1', 'LabuteASA of 1', 'PEOE_VSA1 of 1', 'PEOE_VSA10 of 1', 'PEOE_VSA11 of 1', 'PEOE_VSA2 of 1', 'PEOE_VSA3 of 1', 'PEOE_VSA5 of 1', 'PEOE_VSA6 of 1', 'PEOE_VSA7 of 1', 'PEOE_VSA8 of 1', 'PEOE_VSA9 of 1', 'SMR_VSA1 of 1', 'SMR_VSA10 of 1', 'SMR_VSA3 of 1'

422

In [26]:
X_transform=pd.DataFrame(X_transform) 
X_transform.columns =select_name0
X_transform.to_csv("../../results/Importance/Importance-42x.csv")

In [28]:
import joblib
###########import packages##########
import tensorflow as tf
import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.backend.tensorflow_backend import set_session
from keras import optimizers
from keras import regularizers
from keras import backend as K
from keras.callbacks import EarlyStopping
from keras.callbacks import TensorBoard
from keras.constraints import max_norm
from keras.models import Sequential 
from keras.layers import Dense 
from keras.layers import Dropout 
from keras.layers import BatchNormalization
from keras.wrappers.scikit_learn import KerasClassifier 
from keras.wrappers.scikit_learn import KerasRegressor
from keras.constraints import maxnorm 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import GridSearchCV
%matplotlib

###########fix random seed for reproducability##########

import sys
sys.path.append("../../lib")
csv = "../../database/database.csv"



seed=1
np.random.seed(seed)
###########wrapping root mean square error for later calls##########
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())
###########loading data##########
fdata=pd.read_csv(csv,encoding="gbk", index_col=0)
raw_data = fdata.iloc[:, 1:]

##########handling missing values##########
median_raw_data=raw_data.median()
dict_median_raw_data=median_raw_data.to_dict()
data=raw_data.fillna(dict_median_raw_data)

###########data standardization##########
standardized_data = (data-np.mean(data,axis=0))/np.std(data,axis=0)#即简单实现标准化
###########train test splitting##########
parame=standardized_data.iloc[:,0:614]
degradation=standardized_data.iloc[:,614]
X=parame.values.astype(np.float32)
y=degradation.values.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=751)

max_corr = 0
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from numpy import sort
import xgboost as xgb
model_XGboostRegressor=xgb.XGBRegressor()

model = joblib.load('../../results/Importance/filepath.xgb')


for nmax_features in range(50,613,50):
    selection = SelectFromModel(model, max_features=nmax_features,threshold=-1, prefit=True)
    select_X_train = selection.transform(X_train)
    select_X_test  = selection.transform(X_test)
    # train model
    param = {
         'learning_rate':[0.01],
   'n_estimators':[1000],
    'max_depth':[5],  
    'objective':['reg:squarederror'],
    'subsample':[0.5],
    'lambda':[0.1],
    'alpha':[0.1]
           }
    grid = GridSearchCV(model_XGboostRegressor,param_grid=param,cv=5)
    grid.fit(X_train,y_train)
    selection_model = grid.best_estimator_
    #selection_model = XGBClassifier(reg_alpha = 0.1,reg_lambda=0.1,learning_rate=0.01, max_depth= 5, n_estimators=200, objective='reg:squarederror', subsample=0.5)
    selection_model.fit(select_X_train, y_train)  
    #eval model
    result = selection_model.predict(select_X_test)
    x_prediction_degradation_ann = result*np.std(data,axis=0)[614]+np.mean(data,axis=0)[614]
    y_real_degradation = y_test*np.std(data,axis=0)[614]+np.mean(data,axis=0)[614]
    x_prediction_degradation_ann_series = pd.Series(x_prediction_degradation_ann)
    y_real_degradation_series = pd.Series(y_real_degradation)
    ###########evaluating the regression quality##########
    corr_ann = round(x_prediction_degradation_ann_series.corr(y_real_degradation_series), 4)
    rmse_val = rmse(x_prediction_degradation_ann,y_real_degradation)
    print("n=%d,RMSE=%.2f, Accuracy: %.2f%%" % (select_X_train.shape[1],rmse_val, corr_ann*100.0))
    if max_corr < corr_ann:
        max_corr = corr_ann
        max_n = select_X_train.shape[1]
print("max_n=%d, Max_Accuracy: %.2f%%" % (max_n, max_corr*100.0))

Using matplotlib backend: Qt5Agg
n=50,RMSE=25.51, Accuracy: 75.89%
n=100,RMSE=22.01, Accuracy: 82.86%
n=150,RMSE=22.08, Accuracy: 82.72%
n=200,RMSE=22.06, Accuracy: 82.77%
n=250,RMSE=21.78, Accuracy: 83.29%
n=300,RMSE=21.82, Accuracy: 83.27%
n=350,RMSE=21.74, Accuracy: 83.42%
n=400,RMSE=21.60, Accuracy: 83.63%
n=450,RMSE=21.49, Accuracy: 83.86%
n=500,RMSE=21.49, Accuracy: 83.86%
n=550,RMSE=21.49, Accuracy: 83.86%
n=600,RMSE=21.49, Accuracy: 83.86%
max_n=450, Max_Accuracy: 83.86%


In [3]:
for nmax_features in range(95,126,5):
    selection = SelectFromModel(model, max_features=nmax_features,threshold=-1, prefit=True)
    select_X_train = selection.transform(X_train)
    select_X_test  = selection.transform(X_test)
    # train model
    param = {
         'learning_rate':[0.01],
   'n_estimators':[1000],
    'max_depth':[5],  
    'objective':['reg:squarederror'],
    'subsample':[0.5],
    'lambda':[0.1],
    'alpha':[0.1]
           }}
    grid = GridSearchCV(model_XGboostRegressor,param_grid=param,cv=5)
    grid.fit(X_train,y_train)
    selection_model = grid.best_estimator_
    #selection_model = XGBClassifier(reg_alpha = 0.1,reg_lambda=0.1,learning_rate=0.01, max_depth= 5, n_estimators=200, objective='reg:squarederror', subsample=0.5)
    selection_model.fit(select_X_train, y_train)  
    #eval model
    result = selection_model.predict(select_X_test)
    x_prediction_degradation_ann = result*np.std(data,axis=0)[614]+np.mean(data,axis=0)[614]
    y_real_degradation = y_test*np.std(data,axis=0)[614]+np.mean(data,axis=0)[614]
    x_prediction_degradation_ann_series = pd.Series(x_prediction_degradation_ann)
    y_real_degradation_series = pd.Series(y_real_degradation)
    ###########evaluating the regression quality##########
    corr_ann = round(x_prediction_degradation_ann_series.corr(y_real_degradation_series), 4)
    rmse_val = rmse(x_prediction_degradation_ann,y_real_degradation)
    print("n=%d,RMSE=%.2f, Accuracy: %.2f%%" % (select_X_train.shape[1],rmse_val, corr_ann*100.0))
    if max_corr < corr_ann:
        max_corr = corr_ann
        max_n = select_X_train.shape[1]
print("max_n=%d, Max_Accuracy: %.2f%%" % (max_n, max_corr*100.0))

n=95,RMSE=13.33, Accuracy: 93.61%
n=100,RMSE=13.39, Accuracy: 93.54%
n=105,RMSE=13.46, Accuracy: 93.47%
n=110,RMSE=13.39, Accuracy: 93.54%
n=115,RMSE=13.40, Accuracy: 93.53%
n=120,RMSE=13.40, Accuracy: 93.52%
n=125,RMSE=13.39, Accuracy: 93.53%
max_n=40, Max_Accuracy: 94.46%


In [4]:
seed=1
np.random.seed(seed)
####XGboost####
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)]
)

###########defining the parameters dictionary##########
param = {
        'learning_rate':[0.01,0.1,1],
        'n_estimators':[100,200,400,1000,2000,4000,8000],
        'max_depth':[5,7,9,11],  
        'objective':['reg:squarederror'],
        'subsample':[0.5,0.6,0.7,0.8,0.9],
        'lambda':[0.1],
        'alpha':[0.1]
       }
grid = GridSearchCV(model_XGboostRegressor,param_grid=param,cv=5)
grid.fit(X_train,y_train)
print('Best Regressor:',grid.best_params_,'Best Score:', grid.best_score_) 
best_model=grid.best_estimator_

algorithm_name='XGBoost Regressor'
try_different_method(best_model)

Best Regressor: {'alpha': 0.1, 'lambda': 0.1, 'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 8000, 'objective': 'reg:squarederror', 'subsample': 0.7} Best Score: 0.794175938602262
[ 5.20944519e+01  4.94044456e+01  9.54802704e+01  7.02759705e+01
  2.74203682e+00  5.81717606e+01  4.29696274e+01  8.96507263e-01
  1.87556992e+01  4.99090195e+01  1.73628883e+01  5.77849617e+01
  5.27315140e+00  5.00202408e+01  5.23217316e+01  2.08559990e+00
  1.02717758e+02 -3.20625305e-01  1.02700577e+01  2.47988586e+01
  9.07813454e+00 -1.29755402e+00  4.97042465e+01 -2.24706650e+00
  6.18386078e+00  9.99134636e+00  8.85013771e+00  2.25002537e+01
  4.36925354e+01  7.17431259e+01 -1.58802795e+00  4.47654724e-02
  3.01697922e+00  2.63240395e+01  3.57955780e+01  1.27488937e+01
  4.81299896e+01  2.34669495e+00  9.27228394e+01  8.38053322e+00
  8.86372528e+01  1.99347839e+01  1.44039154e-01  1.00095688e+02
  3.05782700e+01  4.90900993e+00  9.52668953e+00  6.92062759e+00
  2.55799675e+01  1.24372864e+00

In [12]:
max_corr = 0
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from numpy import sort
model=best_model

for nmax_features in range(5,100,5):
    selection = SelectFromModel(model, max_features=nmax_features,threshold=-1, prefit=True)
    select_X_train = selection.transform(X_train)
    select_X_test  = selection.transform(X_test)
    # train model
    param = {
        'learning_rate':[0.1],
        'n_estimators':[8000],
        'max_depth':[9],  
        'objective':['reg:squarederror'],
        'subsample':[0.8],
        'lambda':[0.1],
        'alpha':[0.1]
        }
    grid = GridSearchCV(model_XGboostRegressor,param_grid=param,cv=5)
    grid.fit(X_train,y_train)
    selection_model = grid.best_estimator_
    #selection_model = XGBClassifier(reg_alpha = 0.1,reg_lambda=0.1,learning_rate=0.01, max_depth= 5, n_estimators=200, objective='reg:squarederror', subsample=0.5)
    selection_model.fit(select_X_train, y_train)  
    #eval model
    result = selection_model.predict(select_X_test)
    x_prediction_degradation_ann = result*np.std(data,axis=0)[163]+np.mean(data,axis=0)[163]
    y_real_degradation = y_test*np.std(data,axis=0)[163]+np.mean(data,axis=0)[163]
    x_prediction_degradation_ann_series = pd.Series(x_prediction_degradation_ann)
    y_real_degradation_series = pd.Series(y_real_degradation)
    ###########evaluating the regression quality##########
    corr_ann = round(x_prediction_degradation_ann_series.corr(y_real_degradation_series), 4)
    rmse_val = rmse(x_prediction_degradation_ann,y_real_degradation)
    print("n=%d,RMSE=%.2f, Accuracy: %.2f%%" % (select_X_train.shape[1],rmse_val, corr_ann*100.0))
    if max_corr < corr_ann:
        max_corr = corr_ann
        max_n = select_X_train.shape[1]
print("max_n=%d, Max_Accuracy: %.2f%%" % (max_n, max_corr*100.0))

n=5,RMSE=75.05, Accuracy: 24.73%
n=10,RMSE=75.53, Accuracy: 25.55%
n=15,RMSE=74.53, Accuracy: 29.12%
n=20,RMSE=75.45, Accuracy: 27.25%
n=25,RMSE=73.90, Accuracy: 30.73%
n=30,RMSE=69.36, Accuracy: 44.00%
n=35,RMSE=69.18, Accuracy: 44.45%


KeyboardInterrupt: 