In [1]:
import pandas as pd
import numpy as np 
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
import xgboost as xgb
from skopt.space import Real, Categorical, Integer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from skopt import BayesSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,accuracy_score,roc_auc_score

## 特征工程

In [2]:
df1 = pd.read_csv('Data.csv')

In [3]:
df1['Address'] = df1['Address'].apply(lambda x : 'NA' if pd.isnull(x) else x)
df1['if_return'] = df1['Visit.No'].apply(lambda x: True if x>1 else False)

In [4]:
cal_session = min(df1[df1['Session']==2].index)
df1['duration_history2'] = pd.NA
for i in range(cal_session,len(df1)):#从第二个session开始计算患者过去历史看诊时长
    session = df1.loc[i,'Session']
    if df1.loc[i,'if_return']:#如果是复诊患者
        id = df1.loc[i,'ID']
        df_temp = df1[(df1['ID']==id)&(df1['Session']<session)]['ServTime'].values#排除同一个session的数据，只能使用之前session的数据计算
        if len(df_temp)!=0:
            df1.loc[i,'duration_history2'] = np.mean(df_temp)
        else:
            df1.loc[i,'duration_history2'] = pd.NA#保留缺失，后续用15分钟填补
    else:#如果是初诊患者
        df_temp = df1[(df1['if_return']==False)&(df1['Session']<session)]['ServTime'].values
        df1.loc[i,'duration_history2'] = np.mean(df_temp)

In [5]:
df1.isnull().sum()

Unnamed: 0             0
ID                     0
Session                0
Month                  0
DayOfWeek              0
WorkingDay             0
AM_PM                  0
Visit.No               0
Gender                 0
M.Cancer               0
S.Cancer               0
StartTime              0
PayTime              279
Address                0
ServTime               0
if_return              0
duration_history2    403
dtype: int64

In [6]:
df1['duration_history2'] = df1['duration_history2'].fillna(900)#先用15分钟填补缺失值
#df1['duration_history2b'] = df1['duration_history2b'].fillna(900)#先用15分钟填补缺失值

In [7]:
train_start = min(df1[df1['Session']==2].index)
train_end = max(df1[df1['Session']==194].index)
session1_len = len(df1[df1['Session']==1])
def data_process(df,feature,duration_history,y):
    data = df[feature]
    data = pd.get_dummies(data,columns = ['Gender','Address'], drop_first=True)#one-hot编码
    data_x = data.drop([y],axis = 1)
    data_y = data[y]
    x_train = data_x.loc[:train_end,]
    x_test = data_x.loc[train_end+1:,]
    y_train = data_y.loc[:train_end,]
    y_test = data_y.loc[train_end+1:,]
    for i in [duration_history,'Visit.No']:#归一化
        x1 = np.array(x_train.loc[:,i]).reshape(-1,1)
        scaler = StandardScaler()
        scaler.fit(x1)
        x_train[i+'_scaled'] = scaler.transform(x1).reshape(1,-1)[0]
        x2 = np.array(x_test.loc[:,i]).reshape(-1,1)
        x_test[i+'_scaled'] = scaler.transform(x2).reshape(1,-1)[0]
    x_train = x_train.drop(['Visit.No',duration_history],axis = 1)
    x_test = x_test.drop(['Visit.No',duration_history],axis = 1)
    return x_train, x_test, y_train, y_test

In [8]:
input_feature2 = ['Visit.No','M.Cancer','S.Cancer','Gender','ServTime','if_return','duration_history2','Address']
x_train2, x_test2, y_train2, y_test2 = data_process(df1,input_feature2,'duration_history2','ServTime')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[i+'_scaled'] = scaler.transform(x1).reshape(1,-1)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[i+'_scaled'] = scaler.transform(x2).reshape(1,-1)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[i+'_scaled'] = scaler.transform(x1).reshape(1,-1)[0]
A value is trying to be

## 回归模型训练

In [9]:
def MAPE(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [10]:
def eval_model(xtrain_pred,ytrain,predictions,groundtrue):
    print('--------------训练集----------------')
    print('RMSE',np.sqrt(mean_squared_error(ytrain,xtrain_pred)))
    print('MAPE',MAPE(ytrain,xtrain_pred))
    print('MAE',mean_absolute_error(ytrain,xtrain_pred))
    print('R^2 test: %.3f' % (r2_score(ytrain,xtrain_pred)))
    print('--------------测试集----------------')
    print('RMSE',np.sqrt(mean_squared_error(groundtrue,predictions)))
    print('MAPE',MAPE(groundtrue,predictions))
    print('MAE',mean_absolute_error(groundtrue,predictions))
    print('R^2 test: %.3f' % (r2_score(groundtrue,predictions)))
    print('--------------分组后评估--------------')
    eval = pd.DataFrame({'pred':predictions,'true':groundtrue})
    eval['class'] = eval['pred'].apply(lambda x: 'short' if x<=630.5 else ('long' if x>975.5 else 'median'))
    eval['class2'] = eval['pred'].apply(lambda x: 'short' if x<=811.5 else 'long')
    df = eval.groupby('class').agg({'true':['count',lambda x: x.count() / len(eval),'mean','median','var']})
    print(df)
    df1 = eval.groupby('class2').agg({'true':['count',lambda x: x.count() / len(eval),'mean','median','var']})
    print(df1)

In [11]:
def model_fit(model,params,train_x,train_y,test_x,seed):
    bayes_search = BayesSearchCV(model, params, scoring='neg_mean_squared_error',verbose=2,cv=10,random_state=seed)
    bayes_search.fit(train_x,train_y)
    bestparam = bayes_search.best_params_
    print('bestparam:',bestparam)
    bestmodel = bayes_search.best_estimator_
    pred_test = bestmodel.predict(test_x)
    pred_train = bestmodel.predict(train_x)
    return bayes_search,bestmodel,bestparam,pred_test,pred_train
param_space_xgb = {
    'colsample_bytree': Real(0.4,0.9),
    'colsample_bylevel':Real(0.4,0.9),
    'learning_rate': Categorical([0.1]),
    'lambda':Real(0,200),
    'alpha':Real(0,200),
    'gamma':Real(0,200),
    'n_estimators': Categorical([100,300,500]),
    'max_depth': Integer(2,6),
    'min_child_weight':Integer(1,10)
}

### xgb

In [None]:
xgb_reg = xgb.XGBRegressor(random_state=42)
xgb_bayse,xgb_model2_42,xgb_model2_42_param,xgb_predictions2_42,xgb_train2_42 = model_fit(xgb_reg,param_space_xgb,x_train2, y_train2, x_test2, seed = 42)

In [12]:
xgb_reg = xgb.XGBRegressor(random_state=42)
xgb_bayse,xgb_model2_42,xgb_model2_42_param,xgb_predictions2_42,xgb_train2_42 = model_fit(xgb_reg,param_space_xgb,x_train2, y_train2, x_test2, seed = 55)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END alpha=158.807024300404, colsample_bylevel=0.7800647945904826, colsample_bytree=0.8501681628173743, gamma=15.910167904632159, lambda=54.557374029227525, learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=300; total time=   0.5s
[CV] END alpha=158.807024300404, colsample_bylevel=0.7800647945904826, colsample_bytree=0.8501681628173743, gamma=15.910167904632159, lambda=54.557374029227525, learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=300; total time=   0.3s
[CV] END alpha=158.807024300404, colsample_bylevel=0.7800647945904826, colsample_bytree=0.8501681628173743, gamma=15.910167904632159, lambda=54.557374029227525, learning_rate=0.1, max_depth=3, min_child_weight=4, n_estimators=300; total time=   0.3s
[CV] END alpha=158.807024300404, colsample_bylevel=0.7800647945904826, colsample_bytree=0.8501681628173743, gamma=15.910167904632159, lambda=54.557374029227525, learning_rate=0.1, max_depth

In [13]:
#model2 random state = 42
eval_model(xgb_train2_42,y_train2,xgb_predictions2_42,y_test2)

--------------训练集----------------
RMSE 326.42679730513106
MAPE 34.89118219779518
MAE 240.93698417109587
R^2 test: 0.187
--------------测试集----------------
RMSE 366.6359920470354
MAPE 40.41809229207963
MAE 268.78459992503605
R^2 test: 0.085
--------------分组后评估--------------
        true                                               
       count <lambda_0>         mean  median            var
class                                                      
long     193   0.058168  1050.176166  1003.0  220534.541721
median  2698   0.813140   810.690141   739.0  142286.123455
short    427   0.128692   609.524590   559.0   78939.353267
        true                                             
       count <lambda_0>        mean median            var
class2                                                   
long    1665   0.501808  900.890691  820.0  179974.159919
short   1653   0.498192  695.831821  633.0   92757.020125


In [32]:
from sklearn.utils import resample
test_data = np.hstack([x_test2, np.array(y_test2).reshape(-1,1)])

# Bootstrap 重采样次数
n_bootstraps = 1000

# 保存每次重采样得到的模型性能指标
score_rmse = []
score_r_square = []
score_mape = []
score_mae = []

# 进行 Bootstrap 重采样
for i in range(n_bootstraps):
    # 从测试集中有放回地抽取若干个样本，组成一个新的数据集
    test_data_resampled = resample(test_data, replace=True, n_samples=len(test_data), random_state=i)
    x_test_resampled = test_data_resampled[:, :-1]
    y_test_resampled = test_data_resampled[:, -1]

    # 在 Bootstrap 重采样后的测试集上进行模型评估
    y_pred = xgb_model2_42.predict(x_test_resampled)
    #print(y_unsclaed_xgbpred.shape)
    #y_test_resampled = y_test_resampled.reshape(-1,1)
    #print(y_test_resampled.shape)
    rmse = np.sqrt(mean_squared_error(y_test_resampled, y_pred))
    r2 = r2_score(y_test_resampled,y_pred)
    mape = MAPE(y_test_resampled,y_pred)
    mae = mean_absolute_error(y_test_resampled,y_pred)
    score_rmse.append(rmse)
    score_r_square.append(r2)
    score_mape.append(mape)
    score_mae.append(mae)

# 计算 Bootstrap 重采样后的模型性能指标的置信区间和均值
confidence_interval1 = np.percentile(score_rmse, [2.5, 97.5])
mean_score1 = np.mean(score_rmse)
confidence_interval2 = np.percentile(score_r_square, [2.5, 97.5])
mean_score2 = np.mean(score_r_square)
confidence_interval3 = np.percentile(score_mape, [2.5, 97.5])
mean_score3 = np.mean(score_mape)
confidence_interval4 = np.percentile(score_mae,[2.5, 97.5])
mean_score4 = np.mean(score_mae)
print('rmse')
print("Bootstrap 重采样后的模型性能指标均值: {:.3f}".format(mean_score1))
print("Bootstrap 重采样后的模型性能指标置信区间: [{:.3f}, {:.3f}]".format(confidence_interval1[0], confidence_interval1[1]))
print('r2')
print("Bootstrap 重采样后的模型性能指标均值: {:.3f}".format(mean_score2))
print("Bootstrap 重采样后的模型性能指标置信区间: [{:.3f}, {:.3f}]".format(confidence_interval2[0], confidence_interval2[1]))
print('mape')
print("Bootstrap 重采样后的模型性能指标均值: {:.3f}".format(mean_score3))
print("Bootstrap 重采样后的模型性能指标置信区间: [{:.3f}, {:.3f}]".format(confidence_interval3[0], confidence_interval3[1]))
print('mae')
print("Bootstrap 重采样后的模型性能指标均值: {:.3f}".format(mean_score4))
print("Bootstrap 重采样后的模型性能指标置信区间: [{:.3f}, {:.3f}]".format(confidence_interval4[0], confidence_interval4[1]))

rmse
Bootstrap 重采样后的模型性能指标均值: 366.150
Bootstrap 重采样后的模型性能指标置信区间: [349.105, 382.330]
r2
Bootstrap 重采样后的模型性能指标均值: 0.085
Bootstrap 重采样后的模型性能指标置信区间: [0.059, 0.111]
mape
Bootstrap 重采样后的模型性能指标均值: 40.441
Bootstrap 重采样后的模型性能指标置信区间: [38.949, 41.972]
mae
Bootstrap 重采样后的模型性能指标均值: 268.646
Bootstrap 重采样后的模型性能指标置信区间: [260.193, 276.885]


### svr

In [13]:
param_space_svr = {
    'C': Real(0.01,100),
    'gamma':Real(0.01,100),
    'kernel': Categorical(['rbf'])
}
svr = SVR()
svr_bayse,svr_model2,svr_model2_param,svr_predictions2,svr_train2 = model_fit(svr,param_space_svr,x_train2, y_train2, x_test2, seed = 55)


Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.4s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.4s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.4s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.4s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.4s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.4s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.4s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.4s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.4s
[CV] END C=79.40557179898698, gamma=76.0153576222047, kernel=rbf; total time=   0.4s
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END C=43.54358330440522



Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END ....................C=100.0, gamma=0.01, kernel=rbf; total time=   0.4s
[CV] END ....................C=100.0, gamma=0.01, kernel=rbf; total time=   0.4s
[CV] END ....................C=100.0, gamma=0.01, kernel=rbf; total time=   0.4s
[CV] END ....................C=100.0, gamma=0.01, kernel=rbf; total time=   0.4s
[CV] END ....................C=100.0, gamma=0.01, kernel=rbf; total time=   0.4s
[CV] END ....................C=100.0, gamma=0.01, kernel=rbf; total time=   0.4s
[CV] END ....................C=100.0, gamma=0.01, kernel=rbf; total time=   0.4s
[CV] END ....................C=100.0, gamma=0.01, kernel=rbf; total time=   0.4s
[CV] END ....................C=100.0, gamma=0.01, kernel=rbf; total time=   0.4s
[CV] END ....................C=100.0, gamma=0.01, kernel=rbf; total time=   0.4s
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END C=78.39069152191642, gamma=12.319651313561161, kernel=rbf; 

In [14]:
eval_model(svr_train2,y_train2,svr_predictions2,y_test2)

--------------训练集----------------
RMSE 331.78024557872106
MAPE 30.422580349163603
MAE 228.0508738344116
R^2 test: 0.160
--------------测试集----------------
RMSE 374.2866873417175
MAPE 37.424124374385734
MAE 266.97405350676826
R^2 test: 0.047
--------------分组后评估--------------
        true                                               
       count <lambda_0>         mean  median            var
class                                                      
long      73   0.022001  1219.027397  1278.0  207585.943683
median  2557   0.770645   821.620258   750.0  147170.978979
short    688   0.207354   669.071221   590.0  102873.702344
        true                                             
       count <lambda_0>        mean median            var
class2                                                   
long    1086   0.327306  903.795580  829.5  174355.405179
short   2232   0.672694  747.612455  676.5  125770.934905


### lr

In [67]:
lr = LinearRegression()
lr.fit(x_train2,y_train2)
print('coef',lr.coef_)
print('intercept',lr.intercept_)

lr_predictions2 = lr.predict(x_test2)
lr_train2 = lr.predict(x_train2)
eval_model(lr_train2,y_train2,lr_predictions2,y_test2)


coef [ 157.15177678  115.43417208 -187.4018734   -13.77195628   14.40759647
   46.69267981  163.23042705   52.88481447   11.17336462]
intercept 887.4299963720251
--------------训练集----------------
RMSE 340.8302635222
MAPE 36.838510800963164
MAE 252.81531198287507
R^2 test: 0.114
--------------测试集----------------
RMSE 367.15745478915426
MAPE 41.45027744171662
MAE 272.9112891798674
R^2 test: 0.083
--------------分组后评估--------------
        true                                              
       count <lambda_0>         mean median            var
class                                                     
long     255   0.076854  1029.101961  963.0  200058.170665
median  2874   0.866184   792.733820  720.0  138756.654846
short    189   0.056962   579.126984  492.0   81793.324215
        true                                             
       count <lambda_0>        mean median            var
class2                                                   
long    1743   0.525316  884.572002  803

## dnn

In [None]:
import random
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
random.seed(42)
np.random.seed(123)
tf.random.set_seed(42)
# from tensorflow.keras.initializers import RandomUniform#GlorotUniform
INPUT_SIZE2 = x_train2.shape[1]
x_train2_1, x_val2, y_train2_1, y_val2 = train_test_split(x_train2.astype(np.float32), y_train2.astype(np.float32), test_size=0.2, random_state=123)
model_final2 = keras.Sequential([
    keras.layers.Dense(12, activation='relu', input_shape=[INPUT_SIZE2,]),
    keras.layers.Dropout(0.2, seed=42),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dropout(0.1, seed=42),
    keras.layers.Dense(1, activation='linear')
])

model_final2.compile(
      optimizer = keras.optimizers.Adam(learning_rate = 0.01),
      loss= 'mse')
model_final2.fit(x_train2_1, y_train2_1,epochs = 20,validation_data=(x_val2, y_val2), verbose=2)

In [None]:
def eval_model_dnn(xtrain_pred,ytrain,predictions,groundtrue):
    print('--------------训练集----------------')
    print('RMSE',np.sqrt(mean_squared_error(ytrain,xtrain_pred)))
    print('MAPE',MAPE(ytrain,xtrain_pred))
    print('MAE',mean_absolute_error(ytrain,xtrain_pred))
    print('R^2 test: %.3f' % (r2_score(ytrain,xtrain_pred)))
    print('--------------测试集----------------')
    print('RMSE',np.sqrt(mean_squared_error(groundtrue,predictions)))
    print('MAPE',MAPE(groundtrue,predictions))
    print('MAE',mean_absolute_error(groundtrue,predictions))
    print('R^2 test: %.3f' % (r2_score(groundtrue,predictions)))
    print('--------------分组后评估--------------')
    predictions =  predictions.reshape(1,-1)[0]
    eval = pd.DataFrame({'pred':predictions,'true':groundtrue})
    eval['class'] = eval['pred'].apply(lambda x: 'short' if x<=630.5 else ('long' if x>975.5 else 'median'))
    df = eval.groupby('class').agg({'true':['count',lambda x: x.count() / len(eval),'mean','median','var']})
    print(df)

In [None]:
dnn_predictions2 = model_final2.predict(x_test2.astype(np.float32))
dnn_train2 =  model_final2.predict(x_train2.astype(np.float32))
eval_model_dnn(dnn_train2,y_train2,dnn_predictions2 ,y_test2)