In [3]:
set.seed(123)

require(dplyr)
require(ggplot2)
require(caret)
library(bigrquery)
library(httpuv)
library(xgboost)
require(Matrix)
require(data.table)
library(ROCR)

Loading required package: dplyr

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: ggplot2
Loading required package: caret
Loading required package: lattice

Attaching package: ‘xgboost’

The following object is masked from ‘package:dplyr’:

    slice

Loading required package: Matrix
Loading required package: data.table

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last

Loading required package: gplots

Attaching package: ‘gplots’

The following object is masked from ‘package:stats’:

    lowess



In [4]:
library(rBayesianOptimization)

## make dataset of 2-class

In [5]:
## converting to 2 class
convert_cnt2 <- function(x){
    if(x>2) { return(1)
    }else{return(0)}
}

In [6]:
##　分析対象の遊び体験ユーザーの抽出　　##

load("repeater_asb.Rdata")


dat_fil <- 
    dat_asb %>%
    filter(f_year <= 2016) %>%
    filter(age>=18 & age < 70) %>%
    #filter(cnt ==1 | cnt > 3) %>%
    filter(cnt <= 10) 

#重複データのドロップ
dat_fil <- dat_fil[!duplicated(dat_fil$cap),]

## $x$の作成

load(file = "asb_usr_jln_booking.Rdata")

## 　分析対象ユーザーのじゃらん使用状況　　##

# じゃらんをアホみたいに多く使っているユーザーがいる（30回以上とか）ので、それらは除去
dat_ovl_jln <- 
    dat_jln %>%
    filter(c_checkout_total < 30 & c_checkout_total>= 4) %>%
    filter(cap_member_id %in% dat_fil$cap) %>%
    arrange(cap_member_id)

# yの数をxの数に合わせる
dat_ovl_asb <- 
    dat_fil %>%
    filter(cap %in% dat_ovl_jln$cap_member_id) %>%
    arrange(cap)

# 回数になっているデータを割合変換
dat_ovl_jln[,2:62] <- dat_ovl_jln[2:62]/dat_ovl_jln$c_checkout_total

## 2-class化
dat_set <-  cbind(dat_ovl_jln[ , -1], label = sapply(dat_ovl_asb$cnt,convert_cnt2) )
dat_set[is.na(dat_set)] <- 0


In [8]:
## make data
dat <- dat_set
num_dat <- nrow(dat)
ind_train <- sample(num_dat, round(0.8 *num_dat) )
df_train <- dat[ind_train, ]
df_test <- dat[-ind_train, ]
train.mx <- sparse.model.matrix(label~. , df_train)
test.mx <- sparse.model.matrix(label~. , df_test)
dtrain <- xgb.DMatrix( train.mx, label = df_train$label )
dtest <- xgb.DMatrix( test.mx, label = df_test$label )
weight <- nrow(filter(dat_set,label==0)) / nrow(filter(dat_set,label==1))

In [24]:

xgb_holdout <- function( ex, xweight, xdepth, nr ){
    model <- xgb.train( 
        params = list( eta = ex/100, min_child_weight = xweight, max_depth = xdepth, gamma = 0.6, silent = 0, lambda = 2, objective = 'binary:logistic' ,
              eval_metric = 'auc', scale_pos_weight = weight ,  subsample = 0.7, colsample_bytree = 0.7 ), data = dtrain, nrounds=nr ,nthread = 2)
        
        ##eval score
        t.pred <- rbind (predict( model, newdata = dtest ) , df_test$label )
        pred_roc <- prediction(t.pred[1,],t.pred[2,])
        auc.tmp <- performance(pred_roc,"auc")
        auc <- as.numeric(auc.tmp@y.values)
        list( Score = auc , Pred = auc )
}

In [25]:
## early stopping ver.

watchlist <- list( eval = dtest, train = dtrain ) 

xgb_holdout2 <- function( ex, xweight, xdepth, nr, xg, xcol ){
    model <- xgb.train( 
        params = list( eta = ex/100, min_child_weight = xweight, max_depth = xdepth, gamma = xg/10 ,##serch parameters 
                              colsample_bytree = xcol/50, ##serch parameters 
                              silent = 1, lambda = 2, objective = 'binary:logistic' ,
                              eval_metric = 'auc', scale_pos_weight = weight ,  subsample = 0.7 ), 
                data = dtrain, nrounds=nr, nthread = 2,
                early_stopping_rounds = 10, watchlist = watchlist)
        
        ##eval score
        t.pred <- rbind (predict( model, newdata = dtest ) , df_test$label )
        pred_roc <- prediction(t.pred[1,],t.pred[2,])
        auc.tmp <- performance(pred_roc,"auc")
        auc <- as.numeric(auc.tmp@y.values)
        list( Score = auc , Pred = auc )
}

In [26]:
opt_xgb <- rBayesianOptimization :: BayesianOptimization(xgb_holdout2,
                                                        bounds = list(ex = c(1L,20L), xweight = c(3L, 10L), xdepth = c(3L , 10L), nr = c(30L,1000L), 
                                                                     xg = c(1L, 30L), xcol = c(35L, 50L)),
                                                         init_points = 20, n_iter = 1, acq = "ei" , kappa = 2.576 , eps = 0.0 ,verbose = TRUE
                                                        )

elapsed = 216.08	Round = 1	ex = 5.0000	xweight = 7.0000	xdepth = 9.0000	nr = 477.0000	xg = 9.0000	xcol = 48.0000	Value = 0.6650 
elapsed = 146.21	Round = 2	ex = 17.0000	xweight = 8.0000	xdepth = 6.0000	nr = 479.0000	xg = 13.0000	xcol = 44.0000	Value = 0.6518 
elapsed = 148.93	Round = 3	ex = 11.0000	xweight = 8.0000	xdepth = 4.0000	nr = 660.0000	xg = 4.0000	xcol = 45.0000	Value = 0.6704 
elapsed = 120.00	Round = 4	ex = 3.0000	xweight = 4.0000	xdepth = 4.0000	nr = 546.0000	xg = 2.0000	xcol = 46.0000	Value = 0.6949 
elapsed = 68.68	Round = 5	ex = 15.0000	xweight = 8.0000	xdepth = 3.0000	nr = 453.0000	xg = 23.0000	xcol = 35.0000	Value = 0.6750 
elapsed = 44.91	Round = 6	ex = 12.0000	xweight = 7.0000	xdepth = 8.0000	nr = 106.0000	xg = 28.0000	xcol = 48.0000	Value = 0.6668 
elapsed = 97.03	Round = 7	ex = 17.0000	xweight = 6.0000	xdepth = 4.0000	nr = 508.0000	xg = 24.0000	xcol = 38.0000	Value = 0.6670 
elapsed = 178.81	Round = 8	ex = 5.0000	xweight = 7.0000	xdepth = 6.0000	nr = 665.0000	xg = 