# Setup

In [1]:
library(party)
library(tidyverse)
library(randomForest)
# library(class)
# library(e1071)

Loading required package: grid
Loading required package: mvtnorm
Loading required package: modeltools
Loading required package: stats4
Loading required package: strucchange
Loading required package: zoo

Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric

Loading required package: sandwich
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.6     ✔ dplyr   1.0.9
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ stringr::boundary() masks strucchange::boundary()
✖ dplyr::filter()     masks stats::filter()
✖ dplyr::lag()        masks stats::lag()
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.

Attaching package: ‘randomForest’

The following object is masked from ‘package:dplyr’:

    combine

The following object is masked 

In [2]:
data <- read.csv("data_cleaned.csv")
dim(data)

In [3]:
head(data)

hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
Resort Hotel,0,7,2015,Jul,27,1,0,1,1,...,C,0,No Deposit,0,0,0,Transient,75,0,0
Resort Hotel,0,13,2015,Jul,27,1,0,1,1,...,A,0,No Deposit,304,0,0,Transient,75,0,0
Resort Hotel,0,14,2015,Jul,27,1,0,2,2,...,A,0,No Deposit,240,0,0,Transient,98,0,1
Resort Hotel,0,14,2015,Jul,27,1,0,2,2,...,A,0,No Deposit,240,0,0,Transient,98,0,1
Resort Hotel,0,0,2015,Jul,27,1,0,2,2,...,C,0,No Deposit,0,0,0,Transient,107,0,0
Resort Hotel,0,9,2015,Jul,27,1,0,2,2,...,C,0,No Deposit,303,0,0,Transient,103,0,1


### Changing values of Arrival date months to a smaller representation

In [4]:
data$arrival_date_month <- factor(data$arrival_date_month)   
levels(data$arrival_date_month) <- list(Jan = "January",
                                       Feb = "February",
                                       Mar = "March",
                                       Apr = "April",
                                       May = "May",
                                       Jun = "June",
                                       Jul = "July",
                                       Aug = "August",
                                       Sep = "September",
                                       Oct = "October",
                                       Nov = "November",
                                       Dec = "December")

levels(data$arrival_date_month)
table(data$arrival_date_month)


  Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov   Dec 
 5758  7886  9607 11008 11418 10771 12158 13503 10132 10794  6594  6512 

### Convert character and is_canceled to factor

In [5]:
char_colnames <- colnames(data %>% select_if(is.character))
char_colnames

data[char_colnames] <- lapply(data[char_colnames], factor)
# data[,"is_canceled"] <- as.factor(data[,"is_canceled"])


In [6]:
str(data)

'data.frame':	116141 obs. of  30 variables:
 $ hotel                         : Factor w/ 2 levels "City Hotel","Resort Hotel": 2 2 2 2 2 2 2 2 2 2 ...
 $ is_canceled                   : int  0 0 0 0 0 0 1 1 1 0 ...
 $ lead_time                     : int  7 13 14 14 0 9 85 75 23 35 ...
 $ arrival_date_year             : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
 $ arrival_date_month            : Factor w/ 12 levels "Jan","Feb","Mar",..: 7 7 7 7 7 7 7 7 7 7 ...
 $ arrival_date_week_number      : int  27 27 27 27 27 27 27 27 27 27 ...
 $ arrival_date_day_of_month     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ stays_in_weekend_nights       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ stays_in_week_nights          : int  1 1 2 2 2 2 3 3 4 4 ...
 $ adults                        : int  1 1 2 2 2 2 2 2 2 2 ...
 $ children                      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ babies                        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ meal                          : Factor w/ 5 levels "BB","F

### Reduce country levels

In [7]:
reduce_levels <- function(level, threshold = 500, new_level_value = "OTHER") {
  freq <- table(level)

  levels(level) <- c(levels(level), new_level_value)
  
  
  # Add a new level to assign for countries appearing less than a threshold
  level[level %in% names(freq[freq < threshold])] <- new_level_value

  # Drop levels that were replaced
  level <- droplevels(level)

  return(level)
}

# print frequency of levels appearing more than or equal a certain threshold and their number
view_thresh <- function(level, threshold) {
  cnt <-table(level)
  print(length(cnt[cnt >= threshold]))  
  print(cnt[cnt >= threshold])
}

data$country <- reduce_levels(data$country, 100)
# data$company <- reduce_levels(data$company, 30)

# data$agent <- reduce_levels(data$agent, 250)


# Classifiers 

### Data splitting

In [8]:
set.seed(101) # Set Seed so that same sample can be reproduced in future also
# Now Selecting 75% of data as sample from total 'n' rows of the data  
sample <- sample.int(n = nrow(data), size = floor(.8*nrow(data)), replace = F)
train <- data[sample, ]
test  <- data[-sample, ]
dim(train)
dim(test)

In [9]:
tree <- ctree(is_canceled ~ . , train)

In [10]:
tpred <- predict(tree, train, type= "response")
table(tpred, train$is_canceled)
acc <- mean(tpred == train$is_canceled)*100
sprintf("Decision tree accuracy on train data is %f", acc)

                      
tpred                      0     1
  0                    12895     0
  0.000924214417744917  1081     1
  0.00144717800289436    690     1
  0.00190766882869134   2616     5
  0.00280898876404494    355     1
  0.00589970501474926    337     2
  0.00666666666666667    149     1
  0.00761697497279652    912     7
  0.00821917808219178    724     6
  0.00847457627118644    234     2
  0.00858895705521472    808     7
  0.00868055555555556   1142    10
  0.00961538461538462    103     1
  0.0105263157894737      94     1
  0.0107913669064748     275     3
  0.010989010989011       90     1
  0.0166666666666667     118     2
  0.0175438596491228      56     1
  0.0178571428571429     385     7
  0.025                   39     1
  0.027027027027027       36     1
  0.0279503105590062     313     9
  0.0311614730878187     342    11
  0.03125                 93     3
  0.0324149108589951     597    20
  0.037037037037037      104     4
  0.0384615384615385      75    

In [11]:
tpred <- predict(tree, test, type="response")
mean(tpred == test$is_canceled)*100

In [None]:
rf <- randomForest(is_canceled ~., train)
print(rf)

“The response has five or fewer unique values.  Are you sure you want to do regression?”

In [None]:
varImpPlot(rf)

In [None]:
tpred <- predict(rf, test, type="response")
mean(tpred == test$is_canceled)*100

In [None]:
tpred <- predict(rf, train, type="response")
mean(tpred == train$is_canceled)*100