In [1]:
import os
import random
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split
seed_value = 42
os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

In [2]:
pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [3]:
from keras_tuner import BayesianOptimization

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
from google.colab import drive
df1 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data.csv')

In [6]:
df1['Address'] = df1['Address'].apply(lambda x : 'NA' if pd.isnull(x) else x)
df1['if_return'] = df1['Visit.No'].apply(lambda x: True if x>1 else False)
cal_session = min(df1[df1['Session']==2].index)
df1['duration_history2'] = pd.NA
for i in range(cal_session,len(df1)):#The patient's past visits were calculated from the second session
    session = df1.loc[i,'Session']
    if df1.loc[i,'if_return']:#If s/he is a return patient
        id = df1.loc[i,'ID']
        df_temp = df1[(df1['ID']==id)&(df1['Session']<session)]['ServTime'].values#Excluding the data of the same session, only the data of previous sessions can be used for calculation
        if len(df_temp)!=0:
            df1.loc[i,'duration_history2'] = np.mean(df_temp)
        else:
            df1.loc[i,'duration_history2'] = pd.NA#Leave the missing pieces intact and fill in the next 15 minutes
    else:#If s/he is a new patient
        df_temp = df1[(df1['if_return']==False)&(df1['Session']<session)]['ServTime'].values
        df1.loc[i,'duration_history2'] = np.mean(df_temp)

In [7]:
df1['duration_history2'] = df1['duration_history2'].fillna(900)#Take 15 minutes to fill in the missing values

In [8]:
train_start = min(df1[df1['Session']==2].index)
train_end = max(df1[df1['Session']==194].index)
session1_len = len(df1[df1['Session']==1])
def data_process(df,feature,duration_history,y):
    data = df[feature]
    data = pd.get_dummies(data,columns = ['Gender','Address'], drop_first=True)#one-hot code
    data_x = data.drop([y],axis = 1)
    data_y = data[y]
    x_train = data_x.loc[:train_end,].copy()
    x_test = data_x.loc[train_end+1:,].copy()
    y_train = data_y.loc[:train_end,].copy()
    y_test = data_y.loc[train_end+1:,].copy()
    for i in [duration_history,'Visit.No']:#normalization
        x1 = np.array(x_train.loc[:,i]).reshape(-1,1)
        scaler = StandardScaler()
        scaler.fit(x1)
        x_train[i+'_scaled'] = scaler.transform(x1).reshape(1,-1)[0]
        x2 = np.array(x_test.loc[:,i]).reshape(-1,1)
        x_test[i+'_scaled'] = scaler.transform(x2).reshape(1,-1)[0]
    x_train = x_train.drop(['Visit.No',duration_history],axis = 1)
    x_test = x_test.drop(['Visit.No',duration_history],axis = 1)
    return x_train, x_test, y_train, y_test

In [9]:
input_feature = ['Visit.No','M.Cancer','S.Cancer','Gender','ServTime','if_return','duration_history2','Address']
x_train, x_test, y_train, y_test = data_process(df1,input_feature,'duration_history2','ServTime')

In [10]:
def MAPE(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
def eval_model(xtrain_pred,ytrain,predictions,groundtrue):
    print('--------------training set----------------')
    print('RMSE',np.sqrt(mean_squared_error(ytrain,xtrain_pred)))
    print('MAPE',MAPE(ytrain,xtrain_pred))
    print('MAE',mean_absolute_error(ytrain,xtrain_pred))
    print('R^2 test: %.3f' % (r2_score(ytrain,xtrain_pred)))
    print('--------------test set----------------')
    print('RMSE',np.sqrt(mean_squared_error(groundtrue,predictions)))
    print('MAPE',MAPE(groundtrue,predictions))
    print('MAE',mean_absolute_error(groundtrue,predictions))
    print('R^2 test: %.3f' % (r2_score(groundtrue,predictions)))
    print('--------------Post-group assessment--------------')
    predictions =  predictions.reshape(1,-1)[0]
    eval = pd.DataFrame({'pred':predictions,'true':groundtrue})
    eval['class'] = eval['pred'].apply(lambda x: 'short' if x<=630.5 else ('long' if x>975.5 else 'median'))
    df = eval.groupby('class').agg({'true':['count',lambda x: x.count() / len(eval),'mean','median','var']})
    print(df)

In [11]:
INPUT_SIZE = x_train.shape[1]
def build_model(hp):
  model = keras.Sequential()
  model.add(keras.layers.Flatten(input_shape = [INPUT_SIZE,]))
  model.add(keras.layers.Dense(
      hp.Choice('layer1_units',[2,4,6,8,10,12]),
      #hp.Int('units', min_value=32, max_value=512, step=32),
      activation = 'relu'
  ))
  model.add(keras.layers.Dropout(
      hp.Choice('dropout1',[0.1,0.2,0.3,0.4,0.5])
  ))
  model.add(keras.layers.Dense(
      hp.Choice('layer2_units',[2,4,6,8,10,12]),
      activation = 'relu'
  ))
  model.add(keras.layers.Dropout(
      hp.Choice('dropout2',[0.1,0.2,0.3,0.4,0.5])
  ))
  #model.add(keras.layers.Dropout(0.5))
  model.add(keras.layers.Dense(1,activation = 'linear'))
  model.compile(
      optimizer = keras.optimizers.Adam(learning_rate = 0.01),
      loss= 'mse')
  return model

tuner = BayesianOptimization(
    build_model,
    objective='val_loss',
    seed = 55)

In [12]:
tuner.search(x_train.astype(np.float32), y_train.astype(np.float32), epochs=30, validation_split=0.2,  verbose=2)

Trial 10 Complete [00h 00m 11s]
val_loss: 109720.8203125

Best val_loss So Far: 107098.7265625
Total elapsed time: 00h 01m 38s


In [13]:
tuner.results_summary(1)

Results summary
Results in ./untitled_project
Showing 1 best trials
Objective(name="val_loss", direction="min")

Trial 06 summary
Hyperparameters:
layer1_units: 2
dropout1: 0.2
layer2_units: 8
dropout2: 0.1
Score: 107098.7265625


In [14]:
x_train2, x_val2, y_train2, y_val2 = train_test_split(x_train.astype(np.float32), y_train.astype(np.float32), test_size=0.2, random_state=123)
model_final2 = keras.Sequential([
    keras.layers.Dense(2, activation='relu', input_shape=[INPUT_SIZE,]),
    keras.layers.Dropout(0.2, seed=42),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dropout(0.1, seed=42),
    keras.layers.Dense(1, activation='linear')
])

model_final2.compile(
      optimizer = keras.optimizers.Adam(learning_rate = 0.01),
      loss= 'mse')
model_final2.fit(x_train2, y_train2,epochs = 20,validation_data=(x_val2, y_val2), verbose=2)

Epoch 1/20
83/83 - 1s - loss: 758939.7500 - val_loss: 759322.7500 - 1s/epoch - 12ms/step
Epoch 2/20
83/83 - 0s - loss: 571371.8125 - val_loss: 385302.5625 - 210ms/epoch - 3ms/step
Epoch 3/20
83/83 - 0s - loss: 287525.9062 - val_loss: 209322.1406 - 173ms/epoch - 2ms/step
Epoch 4/20
83/83 - 0s - loss: 255410.6250 - val_loss: 203074.7969 - 175ms/epoch - 2ms/step
Epoch 5/20
83/83 - 0s - loss: 249674.3906 - val_loss: 188423.5156 - 207ms/epoch - 2ms/step
Epoch 6/20
83/83 - 0s - loss: 238492.3750 - val_loss: 182297.4531 - 192ms/epoch - 2ms/step
Epoch 7/20
83/83 - 0s - loss: 231774.9219 - val_loss: 172752.7812 - 164ms/epoch - 2ms/step
Epoch 8/20
83/83 - 0s - loss: 223567.0000 - val_loss: 168309.5469 - 207ms/epoch - 2ms/step
Epoch 9/20
83/83 - 0s - loss: 222026.1250 - val_loss: 162131.1094 - 213ms/epoch - 3ms/step
Epoch 10/20
83/83 - 0s - loss: 219890.2656 - val_loss: 156278.3750 - 209ms/epoch - 3ms/step
Epoch 11/20
83/83 - 0s - loss: 205516.1562 - val_loss: 148361.8281 - 215ms/epoch - 3ms/step

<keras.src.callbacks.History at 0x7f5f8d98d480>

In [15]:
dnn_predictions = model_final2.predict(x_test.astype(np.float32))
dnn_train =  model_final2.predict(x_train.astype(np.float32))
eval_model(dnn_train,y_train,dnn_predictions ,y_test)

--------------training set----------------
RMSE 355.9569642173712
MAPE 37.068483205337955
MAE 250.29271744661713
R^2 test: 0.034
--------------test set----------------
RMSE 378.4270090848161
MAPE 40.427932068883905
MAE 266.51024894875314
R^2 test: 0.025
--------------Post-group assessment--------------
        true                                               
       count <lambda_0>         mean  median            var
class                                                      
long      60   0.018083  1071.016667  1007.0  178389.237006
median  2114   0.637131   857.083728   787.5  164336.210213
short   1144   0.344786   676.623252   609.0   88468.518479


In [17]:
from sklearn.utils import resample
test_data = np.hstack([x_test, np.array(y_test).reshape(-1,1)])

# Bootstrap
n_bootstraps = 1000

# Save the model performance indicators obtained from each resampling
score_rmse = []
score_r_square = []
score_mape = []
score_mae = []


for i in range(n_bootstraps):
    # Take a number of samples from the test set with put backs to form a new dataset
    test_data_resampled = resample(test_data, replace=True, n_samples=len(test_data), random_state=i)
    x_test_resampled = test_data_resampled[:, :-1]
    y_test_resampled = test_data_resampled[:, -1]

    # Model Evaluation on Bootstrap Resampled Test Sets
    y_pred = model_final2.predict(x_test_resampled.astype(np.float32))
    y_pred = y_pred.reshape(-1,)
    rmse = np.sqrt(mean_squared_error(y_test_resampled, y_pred))
    r2 = r2_score(y_test_resampled,y_pred)
    mape = MAPE(y_test_resampled,y_pred)
    mae = mean_absolute_error(y_test_resampled,y_pred)
    score_rmse.append(rmse)
    score_r_square.append(r2)
    score_mape.append(mape)
    score_mae.append(mae)

# Calculate confidence intervals and means for model performance metrics after Bootstrap resampling
confidence_interval1 = np.percentile(score_rmse, [2.5, 97.5])
mean_score1 = np.mean(score_rmse)
confidence_interval2 = np.percentile(score_r_square, [2.5, 97.5])
mean_score2 = np.mean(score_r_square)
confidence_interval3 = np.percentile(score_mape, [2.5, 97.5])
mean_score3 = np.mean(score_mape)
confidence_interval4 = np.percentile(score_mae,[2.5, 97.5])
mean_score4 = np.mean(score_mae)
print('rmse')
print("Mean performance metrics after Bootstrap resampling: {:.3f}".format(mean_score1))
print("Confidence interval of performance metrics after Bootstrap resampling: [{:.3f}, {:.3f}]".format(confidence_interval1[0], confidence_interval1[1]))
print('r2')
print("Mean performance metrics after Bootstrap resampling: {:.3f}".format(mean_score2))
print("Confidence interval of performance metrics after Bootstrap resampling: [{:.3f}, {:.3f}]".format(confidence_interval2[0], confidence_interval2[1]))
print('mape')
print("Mean performance metrics after Bootstrap resampling: {:.3f}".format(mean_score3))
print("Confidence interval of performance metrics after Bootstrap resampling: [{:.3f}, {:.3f}]".format(confidence_interval3[0], confidence_interval3[1]))
print('mae')
print("Mean performance metrics after Bootstrap resampling: {:.3f}".format(mean_score4))
print("Confidence interval of performance metrics after Bootstrap resampling: [{:.3f}, {:.3f}]".format(confidence_interval4[0], confidence_interval4[1]))

rmse
Mean performance metrics after Bootstrap resampling: 377.803
Confidence interval of performance metrics after Bootstrap resampling: [360.571, 395.093]
r2
Mean performance metrics after Bootstrap resampling: 0.026
Confidence interval of performance metrics after Bootstrap resampling: [0.004, 0.049]
mape
Mean performance metrics after Bootstrap resampling: 35.670
Confidence interval of performance metrics after Bootstrap resampling: [34.435, 36.889]
mae
Mean performance metrics after Bootstrap resampling: 266.323
Confidence interval of performance metrics after Bootstrap resampling: [257.005, 275.086]
