In [1]:
# Setup:

# common packages
import numpy as np
import pandas as pd
import random

# ML packages
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

# display options

# seed setup
# One seed drives every source of randomness in the notebook: it is passed as
# `random_state` to the train/test split, and it also seeds the global Python
# and NumPy generators so that any stochastic call made without an explicit
# random_state is reproducible after Restart & Run All.
seed = 2020
random.seed(seed)
np.random.seed(seed)

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org


In [2]:
# Load data:
# Read the raw hotel-bookings CSV (path is relative to this notebook) into a
# single DataFrame holding the bookings of both hotels.
hotels = pd.read_csv(
    "../data/raw/hotels_dataset.csv",
)

In [3]:
# Check full data:
# Display the (rows, columns) dimensions of the complete, unsplit data set.
hotels.shape

(119390, 32)

In [4]:
# Number of observations for the resort hotel:
# keep only the rows whose "hotel" column equals "Resort Hotel", then show the
# (rows, columns) shape of that subset.
resort = hotels[hotels["hotel"].eq("Resort Hotel")]
resort.shape

(40060, 32)

In [5]:
# Number of observations for the city hotel:
# keep only the rows whose "hotel" column equals "City Hotel", then show the
# (rows, columns) shape of that subset.
city = hotels[hotels["hotel"].eq("City Hotel")]
city.shape

(79330, 32)

## Summary of the data set

The data set used in this project comes from the Hotel Booking Demand datasets of [Antonio, Almeida and Nunes, 2019](https://www.sciencedirect.com/science/article/pii/S2352340918315191#ack0005), and the data can be found in the GitHub repository [here](https://github.com/rfordatascience/tidytuesday/tree/master/data/2020/2020-02-11). The data set contains real-world data obtained from two hotels: one resort hotel and one city hotel. Each row represents an individual hotel booking due to arrive between July 1st, 2015 and August 31st, 2017. There are 119390 observations in the data set and 31 features (32 columns including the target `is_canceled`). The following table shows the counts of observations for each hotel.

| Resort Hotel | City Hotel |
| -----------: | ---------: |
|        40060 |      79330 |

Table 1: Counts of observations for each hotel.

In [6]:
# Split data:
# Hold out 20% of the bookings as the test set; the remaining 80% form the
# training set. A fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split(
    hotels,
    test_size=0.2,
    random_state=seed,
)

In [7]:
# Data inspection:
# Preview the first rows of the training partition.
train_df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
74159,City Hotel,1,349,2015,October,40,1,0,2,2,...,Non Refund,1.0,,0,Contract,62.0,0,0,Canceled,2015-01-01
59282,City Hotel,1,254,2016,October,44,25,0,5,2,...,No Deposit,9.0,,0,Transient,90.95,0,0,Canceled,2016-02-17
56463,City Hotel,1,174,2016,September,37,7,0,1,2,...,No Deposit,9.0,,0,Transient,126.9,0,2,Canceled,2016-08-24
94823,City Hotel,0,169,2016,August,32,5,2,4,2,...,No Deposit,9.0,,0,Transient,99.45,0,1,Check-Out,2016-08-11
247,Resort Hotel,1,69,2015,July,28,9,2,6,2,...,No Deposit,240.0,,0,Transient,118.13,0,2,Canceled,2015-05-13


In [8]:
# Data inspection:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the training partition.
train_df.describe()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,95512.0,95512.0,95512.0,95512.0,95512.0,95512.0,95512.0,95512.0,95508.0,95512.0,95512.0,95512.0,95512.0,95512.0,82411.0,5441.0,95512.0,95512.0,95512.0,95512.0
mean,0.370707,104.108049,2016.157907,27.16243,15.792717,0.927171,2.499246,1.855421,0.103552,0.007873,0.031609,0.088282,0.136684,0.221899,86.547136,188.947988,2.284278,101.701882,0.062568,0.570975
std,0.482997,106.77405,0.706551,13.615684,8.773898,1.000599,1.913424,0.589122,0.398279,0.094784,0.174957,0.870309,1.479877,0.658002,110.632486,131.549836,17.31498,51.089236,0.245705,0.792781
min,0.0,0.0,2015.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,0.0,0.0,0.0
25%,0.0,18.0,2016.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,62.0,0.0,69.0,0.0,0.0
50%,0.0,69.0,2016.0,28.0,16.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,179.0,0.0,94.5,0.0,0.0
75%,1.0,161.0,2017.0,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,229.0,270.0,0.0,126.0,0.0,1.0
max,1.0,737.0,2017.0,53.0,31.0,19.0,50.0,55.0,10.0,10.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0


In [9]:
# Data inspection:
# List every column of the training partition with its non-null count and dtype.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95512 entries, 74159 to 41824
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           95512 non-null  object 
 1   is_canceled                     95512 non-null  int64  
 2   lead_time                       95512 non-null  int64  
 3   arrival_date_year               95512 non-null  int64  
 4   arrival_date_month              95512 non-null  object 
 5   arrival_date_week_number        95512 non-null  int64  
 6   arrival_date_day_of_month       95512 non-null  int64  
 7   stays_in_weekend_nights         95512 non-null  int64  
 8   stays_in_week_nights            95512 non-null  int64  
 9   adults                          95512 non-null  int64  
 10  children                        95508 non-null  float64
 11  babies                          95512 non-null  int64  
 12  meal                        

In [10]:
# Number of observations for the resort hotel in each data partition.
# Build both subsets first, then print the training shape followed by the
# test shape.
train_df_resort = train_df[train_df["hotel"].eq("Resort Hotel")]
test_df_resort = test_df[test_df["hotel"].eq("Resort Hotel")]
print(train_df_resort.shape)
print(test_df_resort.shape)

(31994, 32)
(8066, 32)


In [11]:
# Number of observations for the city hotel in each data partition.
# Build both subsets first, then print the training shape followed by the
# test shape.
train_df_city = train_df[train_df["hotel"].eq("City Hotel")]
test_df_city = test_df[test_df["hotel"].eq("City Hotel")]
print(train_df_city.shape)
print(test_df_city.shape)

(63518, 32)
(15812, 32)


In [12]:
# Check missing values:
# Count the missing entries in each column of the training partition.
train_df.isna().sum()

hotel                                 0
is_canceled                           0
lead_time                             0
arrival_date_year                     0
arrival_date_month                    0
arrival_date_week_number              0
arrival_date_day_of_month             0
stays_in_weekend_nights               0
stays_in_week_nights                  0
adults                                0
children                              4
babies                                0
meal                                  0
country                             383
market_segment                        0
distribution_channel                  0
is_repeated_guest                     0
previous_cancellations                0
previous_bookings_not_canceled        0
reserved_room_type                    0
assigned_room_type                    0
booking_changes                       0
deposit_type                          0
agent                             13101
company                           90071


In [13]:
# Replace missing values:
# for "children": a missing count is treated as 0 children.
# for "country": a missing country is labelled "Unknown".
# for "agent": if no agency is given, we assume the booking was made without agency.
# for "company": if no company is given, we assume the booking was made by individuals.
# The keys must match the column names exactly: fillna silently skips a key
# that matches no column (a stray "children:" key left the NaNs in place).
replacements = {"children": 0.0, "country": "Unknown", "agent": 0, "company": 0}
train_df_nan = train_df.fillna(replacements)

# Combine same meaning values:
# for "meal": "Undefined" and "SC" have the same meaning.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection, which is chained
# assignment and is not guaranteed to update the frame under pandas
# Copy-on-Write.
train_df_nan["meal"] = train_df_nan["meal"].replace("Undefined", "SC")

# Some rooms have 0 adults, 0 children and 0 babies,
# uncomment below to drop these "ghost" rooms.
# NOTE(review): the earlier version of this snippet referred to a frame named
# `full_data_cln`, which is not defined in the cells visible here; it is
# rewritten against `train_df_nan` - confirm that is the intended frame.
# no_guests = (train_df_nan["adults"]
#              + train_df_nan["children"]
#              + train_df_nan["babies"]) == 0
# train_df_nan = train_df_nan.loc[~no_guests]

In [14]:
# Check data left:
# Show the (rows, columns) shape after cleaning, to confirm that no rows or
# columns were dropped.
train_df_nan.shape

(95512, 32)

## Splitting the data set into training and test data sets

- 80% of observations are in the training set and 20% of observations are in the test set

| Data partition | Resort Hotel | City Hotel |
| :------------- | -----------: | ---------: |
| Training       |        31994 |      63518 |
| Test           |         8066 |      15812 |

Table 2: Counts of observations for each hotel in each data partition.

- There is a minor class imbalance in the target `is_canceled`: about 37% of the bookings in the training set were canceled.
- There are 4 observations with missing values in the `children` variable, replaced with 0;
- 383 observations with missing values in the `country` variable, replaced with "Unknown";
- 13101 observations with missing values in the `agent` variable, replaced with 0, as we assume those bookings were made without an agency;
- 90071 observations with missing values in the `company` variable, replaced with 0, as we assume those bookings were made by individuals.

## Exploratory data analysis on the training data set

In [15]:
# Split the features and targets:
# "is_canceled" is the target; every other column is a feature.
X_train = train_df_nan.drop(columns=["is_canceled"])
y_train = train_df_nan["is_canceled"]

# Give the test partition the same cleaning as the training partition (the
# same constant fill values and the same "Undefined" -> "SC" meal recoding),
# so both partitions share one preprocessing. The fills are fixed constants,
# so nothing is learned from the test data.
test_df_nan = test_df.fillna(replacements)
test_df_nan["meal"] = test_df_nan["meal"].replace("Undefined", "SC")
X_test = test_df_nan.drop(columns=["is_canceled"])
y_test = test_df_nan["is_canceled"]

In [16]:
# Separate the Resort Hotel and City Hotel rows of the training features:
resort_train = X_train[X_train["hotel"].eq("Resort Hotel")]
city_train = X_train[X_train["hotel"].eq("City Hotel")]