## Import packages

In [3]:
from keras.layers import Dense

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import text
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense,Activation,Flatten,Dropout
from keras.layers.normalization import BatchNormalization
import numpy as np
from keras.layers import Input, Conv1D, MaxPooling1D, Concatenate
from keras.optimizers import adam, SGD, rmsprop
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from keras.models import model_from_json
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
#from feature_selector import FeatureSelector
from sklearn.ensemble import RandomForestRegressor
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor

## Read data

In [4]:
#%% Read data
# Open the workbook once and pull both sheets from the same handle.
xls = pd.ExcelFile('CA_MO_combined.xlsx')
df1, df2 = (
    pd.read_excel(xls, sheet)
    for sheet in ('CA_MO_combined_raw', 'CA_MO_combined_1nm')
)

# Work on the raw sheet as a plain array.
dataset = df1.values

# Inputs are columns 1..201 of the raw sheet; the target is column 209
# (original author's note: "Gs 205; Fs 209").
X, Y = dataset[:, 1:202], dataset[:, 209]

## RFR-based feature selection

In [None]:
#%% Module-1: RFR-based Variable Importance selection
# Fit a random forest once and rank the input features by importance.
# A fixed seed makes the ranking (and therefore the selected features) repeatable.
rf = RandomForestRegressor(n_estimators=1000, random_state=0)
rf.fit(X, Y)

# X may be a plain ndarray (no .columns); fall back to positional labels then.
feature_labels = X.columns if hasattr(X, 'columns') else pd.RangeIndex(X.shape[1])
feature_importances = pd.DataFrame(
    {'importance': rf.feature_importances_}, index=feature_labels
).sort_values('importance', ascending=False)
accum_importance = feature_importances.cumsum()

# Last feature (in descending-importance order) whose cumulative importance is
# still within 99 %. If the top feature alone exceeds 99 %, keep just that one.
within_budget = accum_importance.index[accum_importance['importance'] <= 0.99]
idx = within_budget[-1] if len(within_budget) else feature_importances.index[0]

# SelectFromModel needs a scalar threshold, not a one-row Series.
threshold = float(feature_importances.loc[idx, 'importance'])

# prefit=True reuses the forest fitted above, so the threshold is applied to the
# same importances it was derived from (a refit would produce different ones).
sfm = SelectFromModel(rf, threshold=threshold, prefit=True)
X = sfm.transform(X)  # keep only features with importance >= threshold
X = pd.DataFrame(X)   # convert data format

## PCA-based feature selection

In [None]:
#%% Module-2: PCA-based feature selection
# A float n_components in (0, 1) is passed to PCA; here 0.95.
pca = PCA(n_components=0.95)
# fit() returns the estimator itself, so the projection can be chained onto it.
X = pca.fit(X).transform(X)
# Report how many components were kept.
print(pca.n_components_)


## Normalize and split training/test data

In [5]:
#%% split, then normalize data
# Split BEFORE scaling so that the scaler never sees the held-out samples;
# fitting it on the full matrix would leak test-set min/max into training.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Learn the per-feature min/max on the training rows only, then apply the same
# mapping everywhere. Held-out values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Keep the full matrix on the same scale: it is fed to the model later on.
X = scaler.transform(X)

num_features = X_train.shape[1]

print (X_train.shape,Y_train.shape,Y_test.shape,X_test.shape, num_features)

(170, 201) (170,) (73,) (73, 201) 201




## Build DNN layers

In [6]:
#%% Function to create model, required for KerasRegressor
def create_model(learn_rate,dropout_rate):
    """Build and compile the fully connected regression network.

    The network stacks Dense layers of 64, 128, 256 and 512 ReLU units, with
    batch normalization after the 128- and 512-unit layers, followed by
    dropout and a single linear output unit. The input width is read from the
    module-level ``num_features``.

    Parameters
    ----------
    learn_rate : float
        Learning rate passed to the Adam optimizer.
    dropout_rate : float
        Rate passed to the Dropout layer placed before the output unit.

    Returns
    -------
    The compiled Sequential model (MSE loss, MSE reported as metric).
    """
    model = Sequential()
    # Only the first layer needs the input size; later layers infer theirs.
    model.add(Dense(64, input_dim=num_features, kernel_initializer='normal', activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(256, activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())

    model.add(Dropout(dropout_rate))
    model.add(Dense(1))

    # Compile model
    opt = adam(lr=learn_rate)
    model.summary()
    # 'accuracy' is a classification metric and carries no meaning for a
    # continuous target, so only the squared error is tracked.
    model.compile(loss='mse', optimizer=opt, metrics=['mean_squared_error'])

    return model

In [8]:
## Run DNN for different batch sizes, learning rates, and dropout rates

In [7]:
#%% Grid search over batch size, learning rate and dropout rate

# NOTE(review): X_test/Y_test are used both for checkpoint selection (val_loss)
# and for choosing the best hyper-parameters, so the "testing" metrics printed
# below are optimistically biased. Consider a separate validation split.
batches = [16,32]
learn_rates = [0.0005,0.005,0.009,0.09,0.01]
dropout_rates = [0.23,0.25]

# Single checkpoint file shared by all runs; rewritten whenever val_loss improves.
filepath = "weights.best.hdf5"

# Maps test R2 -> (test R2, train R2, batch size, learning rate, dropout rate).
para = {}
for bt in batches:
    for lr in learn_rates:
        for dr in dropout_rates:
            print('batch_size: '+str(bt),'learn_rate: '+str(lr),'dropout_rate: '+str(dr))
            # Release the graph built for the previous configuration; otherwise
            # every run adds another network to the same backend session.
            K.clear_session()
            model = create_model(lr, dr)

            # CheckPoint: keep only the weights of the epoch with the lowest val_loss
            checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
            callbacks_list = [checkpoint]
            # Fit the model
            hist = model.fit(X_train, Y_train, epochs=300, batch_size=bt, validation_data = (X_test, Y_test),callbacks=callbacks_list, verbose=0)

            # If val_loss was never finite (diverged training) the checkpoint was
            # not written during this run, and the file on disk still belongs to
            # an earlier configuration. Skip instead of scoring stale weights.
            val_losses = np.asarray(hist.history['val_loss'], dtype=float)
            if not np.isfinite(val_losses).any():
                print("Training diverged (no finite val_loss); skipping this configuration")
                continue

            # Load the optimal weights (no recompile needed just to predict)
            model.load_weights(filepath)
            print("Created model and loaded weights from file")

            # Prediction; ravel() turns the (n, 1) network output into shape (n,)
            y_pred = model.predict(X_test).ravel()
            R2 = r2_score(Y_test, y_pred)

            y_pred_train = model.predict(X_train).ravel()
            R2_train = r2_score(Y_train, y_pred_train)

            # Save the model (architecture as JSON, weights as HDF5), named by test R2
            model_json = model.to_json()
            with open(str(R2)+ "_model.json", "w") as json_file:
                json_file.write(model_json)
            model.save_weights(str(R2)+ "_model.h5")
            para[R2] = (R2, R2_train, bt, lr, dr)

if not para:
    raise RuntimeError("No configuration produced a usable model; nothing to evaluate")

# Extract the best parameters (highest test R2)
R2_max = max(para)
bt, lr, dr = para[R2_max][2:]

# Rebuild the best model from the files written during the search
with open(str(R2_max)+'_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights(str(R2_max)+"_model.h5")
print("Loaded best_model from disk")

# Prediction
y_pred = loaded_model.predict(X_test).ravel()

# Test-set metrics: RMSE, RMSE relative to the mean target (in %), and R2
MSE = mean_squared_error(Y_test, y_pred)
RMSE = np.sqrt (MSE)
print("Tesing RMSE: %.3f" % RMSE)
testY_mean = np.mean(Y_test)
RRMSE = 100*RMSE/testY_mean
print("Tesing Relative RMSE: %.3f" % RRMSE)
R2 = r2_score(Y_test, y_pred)
print('Tesing R2: %.3f' % R2)

## Training accuracy metrics
y_pred_train = loaded_model.predict(X_train).ravel()
MSE = mean_squared_error(Y_train, y_pred_train)
RMSE = np.sqrt (MSE)
print("RMSE of Training: %.3f" % RMSE)
trainY_mean = np.mean(Y_train)
RRMSE = 100*RMSE/trainY_mean
print("Relative RMSE of Trainging: %.3f" % RRMSE)
# R2: 1 is perfect prediction
print('Traing R2: %.3f' % r2_score(Y_train, y_pred_train))

y_pred_all = loaded_model.predict(X).ravel()

# Visualization: observed vs. predicted for the test, training and full sets
for observed, predicted, title in (
        (Y_test, y_pred, 'Test set'),
        (Y_train, y_pred_train, 'Training set'),
        (Y, y_pred_all, 'All samples')):
    fig, ax = plt.subplots()
    ax.scatter(observed, predicted)
    ax.set(xlabel='Observed', ylabel='Predicted', title=title)
    plt.show()

batch_size: 16 learn_rate: 0.0005 dropout_rate: 0.23
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                12928     
_________________________________________________________________
dense_2 (Dense)              (None, 128)               8320      
_________________________________________________________________
batch_normalization_1 (Batch (None, 128)               512       
_________________________________________________________________
dense_3 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_4 (Dense)              (None, 512)               131584    
_________________________________________________________________
batch_normalization_2 (Batch (None, 512)               2048      
_________________________________________________________________
dropout_1 (Dropout)    


Epoch 00106: val_loss did not improve from 284495.83155

Epoch 00107: val_loss did not improve from 284495.83155

Epoch 00108: val_loss did not improve from 284495.83155

Epoch 00109: val_loss did not improve from 284495.83155

Epoch 00110: val_loss did not improve from 284495.83155

Epoch 00111: val_loss did not improve from 284495.83155

Epoch 00112: val_loss did not improve from 284495.83155

Epoch 00113: val_loss did not improve from 284495.83155

Epoch 00114: val_loss did not improve from 284495.83155

Epoch 00115: val_loss did not improve from 284495.83155

Epoch 00116: val_loss did not improve from 284495.83155

Epoch 00117: val_loss did not improve from 284495.83155

Epoch 00118: val_loss did not improve from 284495.83155

Epoch 00119: val_loss improved from 284495.83155 to 263007.22624, saving model to weights.best.hdf5

Epoch 00120: val_loss did not improve from 263007.22624

Epoch 00121: val_loss improved from 263007.22624 to 261145.26284, saving model to weights.best.hdf5


KeyboardInterrupt: 