In [3]:
import pandas as pd
import numpy as np 
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
import xgboost as xgb
from skopt.space import Real, Categorical, Integer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from skopt import BayesSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,accuracy_score,roc_auc_score

## Feature engineering

In [4]:
df1 = pd.read_csv('Data.csv')

In [5]:
df1['Address'] = df1['Address'].apply(lambda x : 'NA' if pd.isnull(x) else x)
df1['if_return'] = df1['Visit.No'].apply(lambda x: True if x>1 else False)

In [6]:
cal_session = min(df1[df1['Session']==2].index)
df1['duration_history2'] = pd.NA
for i in range(cal_session,len(df1)):#The patient's past visits were calculated from the second session
    session = df1.loc[i,'Session']
    if df1.loc[i,'if_return']:#If s/he is a return patient
        id = df1.loc[i,'ID']
        df_temp = df1[(df1['ID']==id)&(df1['Session']<session)]['ServTime'].values#Excluding the data of the same session, only the data of previous sessions can be used for calculation
        if len(df_temp)!=0:
            df1.loc[i,'duration_history2'] = np.mean(df_temp)
        else:
            df1.loc[i,'duration_history2'] = pd.NA#Leave the missing pieces intact and fill in the next 15 minutes
    else:#If s/he is a new patient
        df_temp = df1[(df1['if_return']==False)&(df1['Session']<session)]['ServTime'].values
        df1.loc[i,'duration_history2'] = np.mean(df_temp)

In [7]:
df1.isnull().sum()

ID                     0
Session                0
Month                  0
DayOfWeek              0
WorkingDay             0
AM_PM                  0
Visit.No               0
Gender                 0
M.Cancer               0
S.Cancer               0
StartTime              0
PayTime              279
Address                0
ServTime               0
if_return              0
duration_history2    403
dtype: int64

In [8]:
df1['duration_history2'] = df1['duration_history2'].fillna(900)#Take 15 minutes to fill in the missing values
#df1['duration_history2b'] = df1['duration_history2b'].fillna(900)#Take 15 minutes to fill in the missing values

In [9]:
train_start = min(df1[df1['Session']==2].index)
train_end = max(df1[df1['Session']==194].index)
session1_len = len(df1[df1['Session']==1])
def data_process(df,feature,duration_history,y):
    data = df[feature]
    data = pd.get_dummies(data,columns = ['Gender','Address'], drop_first=True)#one-hot code
    data_x = data.drop([y],axis = 1)
    data_y = data[y]
    x_train = data_x.loc[:train_end,]
    x_test = data_x.loc[train_end+1:,]
    y_train = data_y.loc[:train_end,]
    y_test = data_y.loc[train_end+1:,]
    for i in [duration_history,'Visit.No']:#normalization
        x1 = np.array(x_train.loc[:,i]).reshape(-1,1)
        scaler = StandardScaler()
        scaler.fit(x1)
        x_train[i+'_scaled'] = scaler.transform(x1).reshape(1,-1)[0]
        x2 = np.array(x_test.loc[:,i]).reshape(-1,1)
        x_test[i+'_scaled'] = scaler.transform(x2).reshape(1,-1)[0]
    x_train = x_train.drop(['Visit.No',duration_history],axis = 1)
    x_test = x_test.drop(['Visit.No',duration_history],axis = 1)
    return x_train, x_test, y_train, y_test

In [10]:
input_feature2 = ['Visit.No','M.Cancer','S.Cancer','Gender','ServTime','if_return','duration_history2','Address']
x_train2, x_test2, y_train2, y_test2 = data_process(df1,input_feature2,'duration_history2','ServTime')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[i+'_scaled'] = scaler.transform(x1).reshape(1,-1)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[i+'_scaled'] = scaler.transform(x2).reshape(1,-1)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[i+'_scaled'] = scaler.transform(x1).reshape(1,-1)[0]
A value is trying to be

## Regression model training

In [11]:
def MAPE(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [12]:
def eval_model(xtrain_pred,ytrain,predictions,groundtrue):
    print('--------------training set----------------')
    print('RMSE',np.sqrt(mean_squared_error(ytrain,xtrain_pred)))
    print('MAPE',MAPE(ytrain,xtrain_pred))
    print('MAE',mean_absolute_error(ytrain,xtrain_pred))
    print('R^2 test: %.3f' % (r2_score(ytrain,xtrain_pred)))
    print('--------------test set----------------')
    print('RMSE',np.sqrt(mean_squared_error(groundtrue,predictions)))
    print('MAPE',MAPE(groundtrue,predictions))
    print('MAE',mean_absolute_error(groundtrue,predictions))
    print('R^2 test: %.3f' % (r2_score(groundtrue,predictions)))
    print('--------------Post-group assessment--------------')
    eval = pd.DataFrame({'pred':predictions,'true':groundtrue})
    eval['class'] = eval['pred'].apply(lambda x: 'short' if x<=630.5 else ('long' if x>975.5 else 'median'))
    eval['class2'] = eval['pred'].apply(lambda x: 'short' if x<=811.5 else 'long')
    df = eval.groupby('class').agg({'true':['count',lambda x: x.count() / len(eval),'mean','median','var']})
    print(df)
    df1 = eval.groupby('class2').agg({'true':['count',lambda x: x.count() / len(eval),'mean','median','var']})
    print(df1)

In [13]:
def model_fit(model,params,train_x,train_y,test_x,seed):
    bayes_search = BayesSearchCV(model, params, scoring='neg_mean_squared_error',verbose=2,cv=10,random_state=seed)
    bayes_search.fit(train_x,train_y)
    bestparam = bayes_search.best_params_
    print('bestparam:',bestparam)
    bestmodel = bayes_search.best_estimator_
    pred_test = bestmodel.predict(test_x)
    pred_train = bestmodel.predict(train_x)
    return bayes_search,bestmodel,bestparam,pred_test,pred_train
param_space_xgb = {
    'colsample_bytree': Real(0.4,0.9),
    'colsample_bylevel':Real(0.4,0.9),
    'learning_rate': Categorical([0.1]),
    'lambda':Real(0,200),
    'alpha':Real(0,200),
    'gamma':Real(0,200),
    'n_estimators': Categorical([100,300,500]),
    'max_depth': Integer(2,6),
    'min_child_weight':Integer(1,10)
}

### xgb

In [15]:
xgb_reg = xgb.XGBRegressor(random_state=42)
xgb_bayse,xgb_model2_42,xgb_model2_42_param,xgb_predictions2_42,xgb_train2_42 = model_fit(xgb_reg,param_space_xgb,x_train2, y_train2, x_test2, seed = 55)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END alpha=158.807024300404, colsample_bylevel=0.7800647945904826, colsample_bytree=0.8501681628173743, gamma=15.910167904632159, lambda=54.557374029227525, learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=300; total time=   0.1s
[CV] END alpha=158.807024300404, colsample_bylevel=0.7800647945904826, colsample_bytree=0.8501681628173743, gamma=15.910167904632159, lambda=54.557374029227525, learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=300; total time=   0.1s
[CV] END alpha=158.807024300404, colsample_bylevel=0.7800647945904826, colsample_bytree=0.8501681628173743, gamma=15.910167904632159, lambda=54.557374029227525, learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=300; total time=   0.1s
[CV] END alpha=158.807024300404, colsample_bylevel=0.7800647945904826, colsample_bytree=0.8501681628173743, gamma=15.910167904632159, lambda=54.557374029227525, learning_rate=0.1, max_depth

[CV] END alpha=32.04469069560395, colsample_bylevel=0.840641421534533, colsample_bytree=0.7945408610395379, gamma=38.047288145785586, lambda=49.405183506691955, learning_rate=0.1, max_depth=4, min_child_weight=3, n_estimators=300; total time=   0.1s
[CV] END alpha=32.04469069560395, colsample_bylevel=0.840641421534533, colsample_bytree=0.7945408610395379, gamma=38.047288145785586, lambda=49.405183506691955, learning_rate=0.1, max_depth=4, min_child_weight=3, n_estimators=300; total time=   0.1s
[CV] END alpha=32.04469069560395, colsample_bylevel=0.840641421534533, colsample_bytree=0.7945408610395379, gamma=38.047288145785586, lambda=49.405183506691955, learning_rate=0.1, max_depth=4, min_child_weight=3, n_estimators=300; total time=   0.1s
[CV] END alpha=32.04469069560395, colsample_bylevel=0.840641421534533, colsample_bytree=0.7945408610395379, gamma=38.047288145785586, lambda=49.405183506691955, learning_rate=0.1, max_depth=4, min_child_weight=3, n_estimators=300; total time=   0.1s


[CV] END alpha=51.6009805754902, colsample_bylevel=0.4408660339717803, colsample_bytree=0.6040742532506413, gamma=148.74267327700682, lambda=141.639322954778, learning_rate=0.1, max_depth=2, min_child_weight=8, n_estimators=300; total time=   0.0s
[CV] END alpha=51.6009805754902, colsample_bylevel=0.4408660339717803, colsample_bytree=0.6040742532506413, gamma=148.74267327700682, lambda=141.639322954778, learning_rate=0.1, max_depth=2, min_child_weight=8, n_estimators=300; total time=   0.0s
[CV] END alpha=51.6009805754902, colsample_bylevel=0.4408660339717803, colsample_bytree=0.6040742532506413, gamma=148.74267327700682, lambda=141.639322954778, learning_rate=0.1, max_depth=2, min_child_weight=8, n_estimators=300; total time=   0.0s
[CV] END alpha=51.6009805754902, colsample_bylevel=0.4408660339717803, colsample_bytree=0.6040742532506413, gamma=148.74267327700682, lambda=141.639322954778, learning_rate=0.1, max_depth=2, min_child_weight=8, n_estimators=300; total time=   0.1s
[CV] END

[CV] END alpha=14.678892833121985, colsample_bylevel=0.750676304570058, colsample_bytree=0.8109145268781603, gamma=52.382338019302935, lambda=69.36923308576722, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=500; total time=   0.3s
[CV] END alpha=14.678892833121985, colsample_bylevel=0.750676304570058, colsample_bytree=0.8109145268781603, gamma=52.382338019302935, lambda=69.36923308576722, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=500; total time=   0.3s
[CV] END alpha=14.678892833121985, colsample_bylevel=0.750676304570058, colsample_bytree=0.8109145268781603, gamma=52.382338019302935, lambda=69.36923308576722, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=500; total time=   0.3s
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END alpha=72.36638974360142, colsample_bylevel=0.9, colsample_bytree=0.4164566999619614, gamma=166.51098830974433, lambda=64.1805668417217, learning_rate=0.1, max_depth=2, min_child_we

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END alpha=39.90986391134921, colsample_bylevel=0.9, colsample_bytree=0.9, gamma=200.0, lambda=0.0, learning_rate=0.1, max_depth=2, min_child_weight=1, n_estimators=500; total time=   0.2s
[CV] END alpha=39.90986391134921, colsample_bylevel=0.9, colsample_bytree=0.9, gamma=200.0, lambda=0.0, learning_rate=0.1, max_depth=2, min_child_weight=1, n_estimators=500; total time=   0.2s
[CV] END alpha=39.90986391134921, colsample_bylevel=0.9, colsample_bytree=0.9, gamma=200.0, lambda=0.0, learning_rate=0.1, max_depth=2, min_child_weight=1, n_estimators=500; total time=   0.2s
[CV] END alpha=39.90986391134921, colsample_bylevel=0.9, colsample_bytree=0.9, gamma=200.0, lambda=0.0, learning_rate=0.1, max_depth=2, min_child_weight=1, n_estimators=500; total time=   0.1s
[CV] END alpha=39.90986391134921, colsample_bylevel=0.9, colsample_bytree=0.9, gamma=200.0, lambda=0.0, learning_rate=0.1, max_depth=2, min_child_weight=1, n_estimator

[CV] END alpha=51.08595993561678, colsample_bylevel=0.42852352266048377, colsample_bytree=0.654290657398142, gamma=163.5110924121251, lambda=141.72406321766385, learning_rate=0.1, max_depth=2, min_child_weight=8, n_estimators=300; total time=   0.1s
[CV] END alpha=51.08595993561678, colsample_bylevel=0.42852352266048377, colsample_bytree=0.654290657398142, gamma=163.5110924121251, lambda=141.72406321766385, learning_rate=0.1, max_depth=2, min_child_weight=8, n_estimators=300; total time=   0.0s
[CV] END alpha=51.08595993561678, colsample_bylevel=0.42852352266048377, colsample_bytree=0.654290657398142, gamma=163.5110924121251, lambda=141.72406321766385, learning_rate=0.1, max_depth=2, min_child_weight=8, n_estimators=300; total time=   0.0s
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END alpha=142.1509588462965, colsample_bylevel=0.6994438531561099, colsample_bytree=0.6287966594155026, gamma=200.0, lambda=142.09142099103252, learning_rate=0.1, max_depth=2, min_chil

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.6512196912067164, gamma=198.52731066901538, lambda=141.43749036693418, learning_rate=0.1, max_depth=2, min_child_weight=6, n_estimators=300; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.6512196912067164, gamma=198.52731066901538, lambda=141.43749036693418, learning_rate=0.1, max_depth=2, min_child_weight=6, n_estimators=300; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.6512196912067164, gamma=198.52731066901538, lambda=141.43749036693418, learning_rate=0.1, max_depth=2, min_child_weight=6, n_estimators=300; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.6512196912067164, gamma=198.52731066901538, lambda=141.43749036693418, learning_rate=0.1, max_depth=2, min_child_weight=6, n_estimators=300; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsampl

[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.657327630127335, gamma=136.73767532913757, lambda=141.63577873467958, learning_rate=0.1, max_depth=2, min_child_weight=8, n_estimators=300; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.657327630127335, gamma=136.73767532913757, lambda=141.63577873467958, learning_rate=0.1, max_depth=2, min_child_weight=8, n_estimators=300; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.657327630127335, gamma=136.73767532913757, lambda=141.63577873467958, learning_rate=0.1, max_depth=2, min_child_weight=8, n_estimators=300; total time=   0.0s
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END alpha=111.41106996723771, colsample_bylevel=0.4, colsample_bytree=0.6160837918640258, gamma=188.23035481520677, lambda=141.81576952470297, learning_rate=0.1, max_depth=2, min_child_weight=6, n_estimators=300; total time=   0.0s
[CV] END alpha=111.41106996723771, cols

[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.9, gamma=0.0, lambda=76.05107240814431, learning_rate=0.1, max_depth=2, min_child_weight=3, n_estimators=500; total time=   0.1s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.9, gamma=0.0, lambda=76.05107240814431, learning_rate=0.1, max_depth=2, min_child_weight=3, n_estimators=500; total time=   0.1s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.9, gamma=0.0, lambda=76.05107240814431, learning_rate=0.1, max_depth=2, min_child_weight=3, n_estimators=500; total time=   0.1s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.9, gamma=0.0, lambda=76.05107240814431, learning_rate=0.1, max_depth=2, min_child_weight=3, n_estimators=500; total time=   0.1s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.9, gamma=0.0, lambda=76.05107240814431, learning_rate=0.1, max_depth=2, min_child_weight=3, n_estimators=500; total time=   0.1s
[CV] END alpha=0.0, colsample_bylevel=0.4, co

[CV] END alpha=74.69120012243691, colsample_bylevel=0.4316939552933059, colsample_bytree=0.4, gamma=56.74503962241183, lambda=61.68855756277256, learning_rate=0.1, max_depth=2, min_child_weight=6, n_estimators=500; total time=   0.1s
[CV] END alpha=74.69120012243691, colsample_bylevel=0.4316939552933059, colsample_bytree=0.4, gamma=56.74503962241183, lambda=61.68855756277256, learning_rate=0.1, max_depth=2, min_child_weight=6, n_estimators=500; total time=   0.1s
[CV] END alpha=74.69120012243691, colsample_bylevel=0.4316939552933059, colsample_bytree=0.4, gamma=56.74503962241183, lambda=61.68855756277256, learning_rate=0.1, max_depth=2, min_child_weight=6, n_estimators=500; total time=   0.1s
[CV] END alpha=74.69120012243691, colsample_bylevel=0.4316939552933059, colsample_bytree=0.4, gamma=56.74503962241183, lambda=61.68855756277256, learning_rate=0.1, max_depth=2, min_child_weight=6, n_estimators=500; total time=   0.1s
[CV] END alpha=74.69120012243691, colsample_bylevel=0.4316939552

[CV] END alpha=93.37559767426742, colsample_bylevel=0.5622225686968445, colsample_bytree=0.5505981316142925, gamma=62.263247708036985, lambda=17.977924849829368, learning_rate=0.1, max_depth=5, min_child_weight=4, n_estimators=100; total time=   0.0s
[CV] END alpha=93.37559767426742, colsample_bylevel=0.5622225686968445, colsample_bytree=0.5505981316142925, gamma=62.263247708036985, lambda=17.977924849829368, learning_rate=0.1, max_depth=5, min_child_weight=4, n_estimators=100; total time=   0.0s
[CV] END alpha=93.37559767426742, colsample_bylevel=0.5622225686968445, colsample_bytree=0.5505981316142925, gamma=62.263247708036985, lambda=17.977924849829368, learning_rate=0.1, max_depth=5, min_child_weight=4, n_estimators=100; total time=   0.0s
[CV] END alpha=93.37559767426742, colsample_bylevel=0.5622225686968445, colsample_bytree=0.5505981316142925, gamma=62.263247708036985, lambda=17.977924849829368, learning_rate=0.1, max_depth=5, min_child_weight=4, n_estimators=100; total time=   0

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.5970202699684851, gamma=0.0, lambda=14.686596012225333, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.5970202699684851, gamma=0.0, lambda=14.686596012225333, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.5970202699684851, gamma=0.0, lambda=14.686596012225333, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.5970202699684851, gamma=0.0, lambda=14.686596012225333, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.5970202699684851, gamma=0.0, lambda=14.6865960122

[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.5756483433616537, gamma=2.2355186229375095, lambda=14.961291486900933, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.4, colsample_bytree=0.5756483433616537, gamma=2.2355186229375095, lambda=14.961291486900933, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.0s
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END alpha=43.906070163094896, colsample_bylevel=0.4, colsample_bytree=0.5681028525195139, gamma=83.74301394378537, lambda=14.21848182237229, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END alpha=43.906070163094896, colsample_bylevel=0.4, colsample_bytree=0.5681028525195139, gamma=83.74301394378537, lambda=14.21848182237229, learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END alpha=43.9060701

[CV] END alpha=144.0428925361077, colsample_bylevel=0.9, colsample_bytree=0.5766889373371988, gamma=0.0, lambda=13.173748680169442, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100; total time=   0.0s
[CV] END alpha=144.0428925361077, colsample_bylevel=0.9, colsample_bytree=0.5766889373371988, gamma=0.0, lambda=13.173748680169442, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100; total time=   0.0s
[CV] END alpha=144.0428925361077, colsample_bylevel=0.9, colsample_bytree=0.5766889373371988, gamma=0.0, lambda=13.173748680169442, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100; total time=   0.0s
[CV] END alpha=144.0428925361077, colsample_bylevel=0.9, colsample_bytree=0.5766889373371988, gamma=0.0, lambda=13.173748680169442, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100; total time=   0.0s
[CV] END alpha=144.0428925361077, colsample_bylevel=0.9, colsample_bytree=0.5766889373371988, gamma=0.0, lambda=

[CV] END alpha=0.0, colsample_bylevel=0.48979045119827236, colsample_bytree=0.5742098785341617, gamma=0.0, lambda=7.793129012334621, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=100; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.48979045119827236, colsample_bytree=0.5742098785341617, gamma=0.0, lambda=7.793129012334621, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=100; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.48979045119827236, colsample_bytree=0.5742098785341617, gamma=0.0, lambda=7.793129012334621, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=100; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.48979045119827236, colsample_bytree=0.5742098785341617, gamma=0.0, lambda=7.793129012334621, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=100; total time=   0.0s
[CV] END alpha=0.0, colsample_bylevel=0.48979045119827236, colsample_bytree=0.5742098785341617, gamma=0.0, lambd

In [16]:
#model2 random state = 42
eval_model(xgb_train2_42,y_train2,xgb_predictions2_42,y_test2)

--------------training set----------------
RMSE 322.6323646526369
MAPE 34.53599088817166
MAE 238.2566093129902
R^2 test: 0.206
--------------test set----------------
RMSE 367.26467089519053
MAPE 40.50636603761225
MAE 269.2009605053608
R^2 test: 0.082
--------------Post-group assessment--------------
        true                                               
       count <lambda_0>         mean  median            var
class                                                      
long     177   0.053345  1072.598870  1015.0  233871.105226
median  2722   0.820374   809.987877   738.0  142042.479456
short    419   0.126281   609.918854   561.0   75057.605840
        true                                             
       count <lambda_0>        mean median            var
class2                                                   
long    1679   0.506028  893.959500  818.0  173058.956642
short   1639   0.493972  701.180598  638.0  101572.138305


In [17]:
from sklearn.utils import resample
test_data = np.hstack([x_test2, np.array(y_test2).reshape(-1,1)])

# Bootstrap 
n_bootstraps = 1000

# Save the model performance indicators obtained from each resampling
score_rmse = []
score_r_square = []
score_mape = []
score_mae = []

#  Bootstrap 
for i in range(n_bootstraps):
    # Take a number of samples from the test set with put backs to form a new dataset
    test_data_resampled = resample(test_data, replace=True, n_samples=len(test_data), random_state=i)
    x_test_resampled = test_data_resampled[:, :-1]
    y_test_resampled = test_data_resampled[:, -1]

    # Model Evaluation on Bootstrap Resampled Test Sets
    y_pred = xgb_model2_42.predict(x_test_resampled)
    #print(y_unsclaed_xgbpred.shape)
    #y_test_resampled = y_test_resampled.reshape(-1,1)
    #print(y_test_resampled.shape)
    rmse = np.sqrt(mean_squared_error(y_test_resampled, y_pred))
    r2 = r2_score(y_test_resampled,y_pred)
    mape = MAPE(y_test_resampled,y_pred)
    mae = mean_absolute_error(y_test_resampled,y_pred)
    score_rmse.append(rmse)
    score_r_square.append(r2)
    score_mape.append(mape)
    score_mae.append(mae)

# Calculate confidence intervals and means for model performance metrics after Bootstrap resampling
confidence_interval1 = np.percentile(score_rmse, [2.5, 97.5])
mean_score1 = np.mean(score_rmse)
confidence_interval2 = np.percentile(score_r_square, [2.5, 97.5])
mean_score2 = np.mean(score_r_square)
confidence_interval3 = np.percentile(score_mape, [2.5, 97.5])
mean_score3 = np.mean(score_mape)
confidence_interval4 = np.percentile(score_mae,[2.5, 97.5])
mean_score4 = np.mean(score_mae)
print('rmse')
print("Mean performance metrics after Bootstrap resampling: {:.3f}".format(mean_score1))
print("Confidence interval of performance metrics after Bootstrap resampling: [{:.3f}, {:.3f}]".format(confidence_interval1[0], confidence_interval1[1]))
print('r2')
print("Mean performance metrics after Bootstrap resampling: {:.3f}".format(mean_score2))
print("Confidence interval of performance metrics after Bootstrap resampling: [{:.3f}, {:.3f}]".format(confidence_interval2[0], confidence_interval2[1]))
print('mape')
print("Mean performance metrics after Bootstrap resampling: {:.3f}".format(mean_score3))
print("Confidence interval of performance metrics after Bootstrap resampling: [{:.3f}, {:.3f}]".format(confidence_interval3[0], confidence_interval3[1]))
print('mae')
print("Mean performance metrics after Bootstrap resampling: {:.3f}".format(mean_score4))
print("Confidence interval of performance metrics after Bootstrap resampling: [{:.3f}, {:.3f}]".format(confidence_interval4[0], confidence_interval4[1]))

rmse
Mean performance metrics after Bootstrap resampling: 366.758
Confidence interval of performance metrics after Bootstrap resampling: [349.940, 382.999]
r2
Mean performance metrics after Bootstrap resampling: 0.082
Confidence interval of performance metrics after Bootstrap resampling: [0.055, 0.108]
mape
Mean performance metrics after Bootstrap resampling: 40.526
Confidence interval of performance metrics after Bootstrap resampling: [39.020, 42.059]
mae
Mean performance metrics after Bootstrap resampling: 269.041
Confidence interval of performance metrics after Bootstrap resampling: [260.715, 277.501]


### svr

In [18]:
param_space_svr = {
    'C': Real(0.01,100),
    'gamma':Real(0.01,100),
    'kernel': Categorical(['rbf'])
}
svr = SVR()
svr_bayse,svr_model2,svr_model2_param,svr_predictions2,svr_train2 = model_fit(svr,param_space_svr,x_train2, y_train2, x_test2, seed = 55)


Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.6s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.5s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.6s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.5s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.5s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.5s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.5s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.5s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.5s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.5s
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END C=43.54358330440522

[CV] END C=9.96371675916949, gamma=21.192983604008408, kernel=rbf; total time=   0.4s
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END C=7.3487124719193355, gamma=70.13824738792017, kernel=rbf; total time=   0.5s
[CV] END C=7.3487124719193355, gamma=70.13824738792017, kernel=rbf; total time=   0.5s
[CV] END C=7.3487124719193355, gamma=70.13824738792017, kernel=rbf; total time=   0.4s
[CV] END C=7.3487124719193355, gamma=70.13824738792017, kernel=rbf; total time=   0.4s
[CV] END C=7.3487124719193355, gamma=70.13824738792017, kernel=rbf; total time=   0.5s
[CV] END C=7.3487124719193355, gamma=70.13824738792017, kernel=rbf; total time=   0.5s
[CV] END C=7.3487124719193355, gamma=70.13824738792017, kernel=rbf; total time=   0.5s
[CV] END C=7.3487124719193355, gamma=70.13824738792017, kernel=rbf; total time=   0.6s
[CV] END C=7.3487124719193355, gamma=70.13824738792017, kernel=rbf; total time=   0.5s
[CV] END C=7.3487124719193355, gamma=70.13824738792017, kernel=rbf; to

[CV] END C=76.28326138640108, gamma=11.031774999223487, kernel=rbf; total time=   0.5s
[CV] END C=76.28326138640108, gamma=11.031774999223487, kernel=rbf; total time=   0.4s
[CV] END C=76.28326138640108, gamma=11.031774999223487, kernel=rbf; total time=   0.4s
[CV] END C=76.28326138640108, gamma=11.031774999223487, kernel=rbf; total time=   0.5s
[CV] END C=76.28326138640108, gamma=11.031774999223487, kernel=rbf; total time=   0.4s
[CV] END C=76.28326138640108, gamma=11.031774999223487, kernel=rbf; total time=   0.4s
[CV] END C=76.28326138640108, gamma=11.031774999223487, kernel=rbf; total time=   0.4s
[CV] END C=76.28326138640108, gamma=11.031774999223487, kernel=rbf; total time=   0.4s
[CV] END C=76.28326138640108, gamma=11.031774999223487, kernel=rbf; total time=   0.4s
[CV] END C=76.28326138640108, gamma=11.031774999223487, kernel=rbf; total time=   0.4s
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END C=77.25679457329643, gamma=11.208533343742685, kernel=rbf; t

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END C=91.78794114787458, gamma=6.599950377025495, kernel=rbf; total time=   0.4s
[CV] END C=91.78794114787458, gamma=6.599950377025495, kernel=rbf; total time=   0.4s
[CV] END C=91.78794114787458, gamma=6.599950377025495, kernel=rbf; total time=   0.4s
[CV] END C=91.78794114787458, gamma=6.599950377025495, kernel=rbf; total time=   0.4s
[CV] END C=91.78794114787458, gamma=6.599950377025495, kernel=rbf; total time=   0.4s
[CV] END C=91.78794114787458, gamma=6.599950377025495, kernel=rbf; total time=   0.4s
[CV] END C=91.78794114787458, gamma=6.599950377025495, kernel=rbf; total time=   0.4s
[CV] END C=91.78794114787458, gamma=6.599950377025495, kernel=rbf; total time=   0.4s
[CV] END C=91.78794114787458, gamma=6.599950377025495, kernel=rbf; total time=   0.4s
[CV] END C=91.78794114787458, gamma=6.599950377025495, kernel=rbf; total time=   0.4s
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END C=95.1306

[CV] END ......C=100.0, gamma=5.9514109487657185, kernel=rbf; total time=   0.4s
[CV] END ......C=100.0, gamma=5.9514109487657185, kernel=rbf; total time=   0.4s
[CV] END ......C=100.0, gamma=5.9514109487657185, kernel=rbf; total time=   0.4s
[CV] END ......C=100.0, gamma=5.9514109487657185, kernel=rbf; total time=   0.4s
[CV] END ......C=100.0, gamma=5.9514109487657185, kernel=rbf; total time=   0.4s
[CV] END ......C=100.0, gamma=5.9514109487657185, kernel=rbf; total time=   0.4s
[CV] END ......C=100.0, gamma=5.9514109487657185, kernel=rbf; total time=   0.4s
[CV] END ......C=100.0, gamma=5.9514109487657185, kernel=rbf; total time=   0.4s
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END .......C=100.0, gamma=5.346239273315319, kernel=rbf; total time=   0.5s
[CV] END .......C=100.0, gamma=5.346239273315319, kernel=rbf; total time=   0.4s
[CV] END .......C=100.0, gamma=5.346239273315319, kernel=rbf; total time=   0.4s
[CV] END .......C=100.0, gamma=5.346239273315319

[CV] END .......C=100.0, gamma=3.155773681873303, kernel=rbf; total time=   0.4s
[CV] END .......C=100.0, gamma=3.155773681873303, kernel=rbf; total time=   0.4s
[CV] END .......C=100.0, gamma=3.155773681873303, kernel=rbf; total time=   0.4s
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END ........C=0.01, gamma=51.40967058512804, kernel=rbf; total time=   0.5s
[CV] END ........C=0.01, gamma=51.40967058512804, kernel=rbf; total time=   0.5s
[CV] END ........C=0.01, gamma=51.40967058512804, kernel=rbf; total time=   0.4s
[CV] END ........C=0.01, gamma=51.40967058512804, kernel=rbf; total time=   0.4s
[CV] END ........C=0.01, gamma=51.40967058512804, kernel=rbf; total time=   0.5s
[CV] END ........C=0.01, gamma=51.40967058512804, kernel=rbf; total time=   0.4s
[CV] END ........C=0.01, gamma=51.40967058512804, kernel=rbf; total time=   0.4s
[CV] END ........C=0.01, gamma=51.40967058512804, kernel=rbf; total time=   0.4s
[CV] END ........C=0.01, gamma=51.40967058512804

In [19]:
eval_model(svr_train2,y_train2,svr_predictions2,y_test2)

--------------training set----------------
RMSE 331.4641804071228
MAPE 30.376564905726806
MAE 227.7230841974008
R^2 test: 0.162
--------------test set----------------
RMSE 374.3311647205253
MAPE 37.435657421778465
MAE 266.9791738557738
R^2 test: 0.046
--------------Post-group assessment--------------
        true                                               
       count <lambda_0>         mean  median            var
class                                                      
long      73   0.022001  1219.027397  1278.0  207585.943683
median  2559   0.771248   821.266120   750.0  147100.200067
short    686   0.206751   669.947522   590.0  103343.133008
        true                                             
       count <lambda_0>        mean median            var
class2                                                   
long    1087   0.327607  903.388224  829.0  174375.232200
short   2231   0.672393  747.740923  677.0  125790.480831


### lr

In [20]:
lr = LinearRegression()
lr.fit(x_train2,y_train2)
print('coef',lr.coef_)
print('intercept',lr.intercept_)

lr_predictions2 = lr.predict(x_test2)
lr_train2 = lr.predict(x_train2)
eval_model(lr_train2,y_train2,lr_predictions2,y_test2)


coef [ 157.15177678  115.43417208 -187.4018734   -13.77195628   14.40759647
   46.69267981  163.23042705   52.88481447   11.17336462]
intercept 887.4299963720218
--------------training set----------------
RMSE 340.8302635222001
MAPE 36.83851080096318
MAE 252.81531198287513
R^2 test: 0.114
--------------test set----------------
RMSE 367.1574547891542
MAPE 41.45027744171663
MAE 272.91128917986737
R^2 test: 0.083
--------------Post-group assessment--------------
        true                                              
       count <lambda_0>         mean median            var
class                                                     
long     255   0.076854  1029.101961  963.0  200058.170665
median  2874   0.866184   792.733820  720.0  138756.654846
short    189   0.056962   579.126984  492.0   81793.324215
        true                                             
       count <lambda_0>        mean median            var
class2                                                   
long    

## dnn

In [24]:
import random
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
random.seed(42)
np.random.seed(123)
tf.random.set_seed(42)
# from tensorflow.keras.initializers import RandomUniform#GlorotUniform
INPUT_SIZE2 = x_train2.shape[1]
x_train2_1, x_val2, y_train2_1, y_val2 = train_test_split(x_train2.astype(np.float32), y_train2.astype(np.float32), test_size=0.2, random_state=123)
model_final2 = keras.Sequential([
    keras.layers.Dense(12, activation='relu', input_shape=[INPUT_SIZE2,]),
    keras.layers.Dropout(0.2, seed=42),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dropout(0.1, seed=42),
    keras.layers.Dense(1, activation='linear')
])

model_final2.compile(
      optimizer = keras.optimizers.Adam(learning_rate = 0.01),
      loss= 'mse')
model_final2.fit(x_train2_1, y_train2_1,epochs = 20,validation_data=(x_val2, y_val2), verbose=2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
83/83 - 1s - 15ms/step - loss: 717382.5625 - val_loss: 563264.5625
Epoch 2/20
83/83 - 0s - 2ms/step - loss: 274958.4062 - val_loss: 185743.7500
Epoch 3/20
83/83 - 0s - 2ms/step - loss: 196514.4688 - val_loss: 166044.8594
Epoch 4/20
83/83 - 0s - 2ms/step - loss: 177839.9219 - val_loss: 149783.2344
Epoch 5/20
83/83 - 0s - 2ms/step - loss: 165895.9062 - val_loss: 140657.7500
Epoch 6/20
83/83 - 0s - 2ms/step - loss: 155662.1094 - val_loss: 134558.9219
Epoch 7/20
83/83 - 0s - 2ms/step - loss: 152092.3750 - val_loss: 128494.0391
Epoch 8/20
83/83 - 0s - 2ms/step - loss: 150297.4844 - val_loss: 127368.7500
Epoch 9/20
83/83 - 0s - 2ms/step - loss: 147031.8281 - val_loss: 126848.1562
Epoch 10/20
83/83 - 0s - 2ms/step - loss: 144949.7500 - val_loss: 124621.2500
Epoch 11/20
83/83 - 0s - 2ms/step - loss: 143290.9688 - val_loss: 123416.4062
Epoch 12/20
83/83 - 0s - 2ms/step - loss: 147238.4531 - val_loss: 124569.0156
Epoch 13/20
83/83 - 0s - 2ms/step - loss: 140816.4688 - val_loss: 123887

<keras.src.callbacks.history.History at 0x1c1b0c9b040>

In [25]:
def eval_model_dnn(xtrain_pred,ytrain,predictions,groundtrue):
    print('--------------训练集----------------')
    print('RMSE',np.sqrt(mean_squared_error(ytrain,xtrain_pred)))
    print('MAPE',MAPE(ytrain,xtrain_pred))
    print('MAE',mean_absolute_error(ytrain,xtrain_pred))
    print('R^2 test: %.3f' % (r2_score(ytrain,xtrain_pred)))
    print('--------------测试集----------------')
    print('RMSE',np.sqrt(mean_squared_error(groundtrue,predictions)))
    print('MAPE',MAPE(groundtrue,predictions))
    print('MAE',mean_absolute_error(groundtrue,predictions))
    print('R^2 test: %.3f' % (r2_score(groundtrue,predictions)))
    print('--------------分组后评估--------------')
    predictions =  predictions.reshape(1,-1)[0]
    eval = pd.DataFrame({'pred':predictions,'true':groundtrue})
    eval['class'] = eval['pred'].apply(lambda x: 'short' if x<=630.5 else ('long' if x>975.5 else 'median'))
    df = eval.groupby('class').agg({'true':['count',lambda x: x.count() / len(eval),'mean','median','var']})
    print(df)

In [26]:
dnn_predictions2 = model_final2.predict(x_test2.astype(np.float32))
dnn_train2 =  model_final2.predict(x_train2.astype(np.float32))
eval_model_dnn(dnn_train2,y_train2,dnn_predictions2 ,y_test2)

[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 940us/step
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 815us/step
--------------训练集----------------
RMSE 343.5306778161649
MAPE 40.014985766533506
MAE 249.57616652361992
R^2 test: 0.100
--------------测试集----------------
RMSE 368.88348399776214
MAPE 44.017236603373675
MAE 268.89008947996723
R^2 test: 0.074
--------------分组后评估--------------
        true                                              
       count <lambda_0>         mean median            var
class                                                     
long     223   0.067209  1038.192825  963.0  206062.381570
median  2636   0.794454   810.413126  739.0  143373.053930
short    459   0.138336   615.309368  560.0   77440.384432
