# Table of Content
1. Importing libraries and dataset
2. Understanding data
3. Data quality checks
4. Data Cleaning after data quality checks¶
5. Data sense checks
6. Data Cleaning after data sense checks
7. Exporting cleaned dataframe¶

# 1. Importing libraries and dataset

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [3]:
# Defining path for data import
path = r'/Users/frederikeschulz-mullensiefen/Desktop/Masterfolder_Hotel Bookings'

In [4]:
# Define needed columns of dataframe
bookings_list = ['hotel', 'is_canceled', 'lead_time', 'arrival_date_year', 'arrival_date_month', 'arrival_date_week_number', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'meal', 'country', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'deposit_type', 'days_in_waiting_list', 'customer_type', 'adr', 'reservation_status']

In [5]:
# Import hotel bookings dataset 
df_bookings = pd.read_csv(os.path.join(path, '02_Data', 'Original Data', 'hotel_bookings.csv'), usecols = bookings_list)

# 2. Understanding data

In [6]:
# Checking dataframe shape
df_bookings.shape

(119390, 22)

In [7]:
# Setting no limit on max columns to display
pd.set_option('display.max_columns', None)

In [8]:
# Checking first rows of dataframe
df_bookings.head(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,reservation_status
0,Resort Hotel,0,342,2015,July,27,0,0,2,0.0,0,BB,PRT,0,0,0,3,No Deposit,0,Transient,0.0,Check-Out
1,Resort Hotel,0,737,2015,July,27,0,0,2,0.0,0,BB,PRT,0,0,0,4,No Deposit,0,Transient,0.0,Check-Out
2,Resort Hotel,0,7,2015,July,27,0,1,1,0.0,0,BB,GBR,0,0,0,0,No Deposit,0,Transient,75.0,Check-Out
3,Resort Hotel,0,13,2015,July,27,0,1,1,0.0,0,BB,GBR,0,0,0,0,No Deposit,0,Transient,75.0,Check-Out
4,Resort Hotel,0,14,2015,July,27,0,2,2,0.0,0,BB,GBR,0,0,0,0,No Deposit,0,Transient,98.0,Check-Out
5,Resort Hotel,0,14,2015,July,27,0,2,2,0.0,0,BB,GBR,0,0,0,0,No Deposit,0,Transient,98.0,Check-Out
6,Resort Hotel,0,0,2015,July,27,0,2,2,0.0,0,BB,PRT,0,0,0,0,No Deposit,0,Transient,107.0,Check-Out
7,Resort Hotel,0,9,2015,July,27,0,2,2,0.0,0,FB,PRT,0,0,0,0,No Deposit,0,Transient,103.0,Check-Out
8,Resort Hotel,1,85,2015,July,27,0,3,2,0.0,0,BB,PRT,0,0,0,0,No Deposit,0,Transient,82.0,Canceled
9,Resort Hotel,1,75,2015,July,27,0,3,2,0.0,0,HB,PRT,0,0,0,0,No Deposit,0,Transient,105.5,Canceled


# 3. Data quality checks

In [9]:
# Checking data types 
df_bookings.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
booking_changes                     int64
deposit_type                       object
days_in_waiting_list                int64
customer_type                      object
adr                               float64
reservation_status                 object
dtype: object

In [10]:
# Finding missing data 
df_bookings.isnull().sum()

hotel                               0
is_canceled                         0
lead_time                           0
arrival_date_year                   0
arrival_date_month                  0
arrival_date_week_number            0
stays_in_weekend_nights             0
stays_in_week_nights                0
adults                              0
children                            4
babies                              0
meal                                0
country                           488
is_repeated_guest                   0
previous_cancellations              0
previous_bookings_not_canceled      0
booking_changes                     0
deposit_type                        0
days_in_waiting_list                0
customer_type                       0
adr                                 0
reservation_status                  0
dtype: int64

In [10]:
# Finding duplicates
df_dups = df_bookings[df_bookings.duplicated()]

In [11]:
# Showing duplicates
df_dups

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,reservation_status
5,Resort Hotel,0,14,2015,July,27,0,2,2,0.0,0,BB,GBR,0,0,0,0,No Deposit,0,Transient,98.00,Check-Out
22,Resort Hotel,0,72,2015,July,27,2,4,2,0.0,0,BB,PRT,0,0,0,1,No Deposit,0,Transient,84.67,Check-Out
43,Resort Hotel,0,70,2015,July,27,2,3,2,0.0,0,HB,ROU,0,0,0,0,No Deposit,0,Transient,137.00,Check-Out
129,Resort Hotel,0,100,2015,July,27,1,1,2,0.0,0,BB,FRA,0,0,0,0,No Deposit,0,Transient-Party,73.80,Check-Out
130,Resort Hotel,0,100,2015,July,27,1,1,2,0.0,0,BB,FRA,0,0,0,0,No Deposit,0,Transient-Party,73.80,Check-Out
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119349,City Hotel,0,186,2017,August,35,0,3,2,0.0,0,BB,DEU,0,0,0,0,No Deposit,0,Transient,126.00,Check-Out
119352,City Hotel,0,63,2017,August,35,0,3,3,0.0,0,BB,SWE,0,0,0,0,No Deposit,0,Transient-Party,195.33,Check-Out
119353,City Hotel,0,63,2017,August,35,0,3,3,0.0,0,BB,SWE,0,0,0,0,No Deposit,0,Transient-Party,195.33,Check-Out
119354,City Hotel,0,63,2017,August,35,0,3,3,0.0,0,BB,SWE,0,0,0,0,No Deposit,0,Transient-Party,195.33,Check-Out


In [12]:
# Checking for mixed-type data
for col in df_bookings.columns.tolist():
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_bookings[weird]) > 0:
    print (col)

  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(t

country


  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)


# 4. Data Cleaning after data quality checks

In [13]:
# Correcting datatypes
df_bookings['is_canceled'] = df_bookings['is_canceled'].astype('str')
df_bookings['is_repeated_guest'] = df_bookings['is_repeated_guest'].astype('str')
df_bookings['country'] = df_bookings['country'].astype('str')

In [14]:
# Checking if data type change was sucessful
df_bookings.dtypes

hotel                              object
is_canceled                        object
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
is_repeated_guest                  object
previous_cancellations              int64
previous_bookings_not_canceled      int64
booking_changes                     int64
deposit_type                       object
days_in_waiting_list                int64
customer_type                      object
adr                               float64
reservation_status                 object
dtype: object

In [15]:
# Checking if mixed-type data are gone
for col in df_bookings.columns.tolist():
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_bookings[weird]) > 0:
    print (col)

  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_bookings[[col]].applymap(type) != df_bookings[[col]].iloc[0].apply(t

In [16]:
# Removing missing values in country column
df_bookings_clean1 = df_bookings[df_bookings['country'].isnull() == False]

In [17]:
df_bookings_clean1.isnull().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          4
babies                            0
meal                              0
country                           0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
reservation_status                0
dtype: int64

In [18]:
# Removing missing values in children column
df_bookings_clean2 = df_bookings_clean1[df_bookings_clean1['children'].isnull() == False]

In [19]:
# Checking successul removal of null values
df_bookings_clean2.isnull().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
reservation_status                0
dtype: int64

# 5. Data sense checks

In [20]:
# Checking descriptive statistics
df_bookings_clean2.describe()

Unnamed: 0,lead_time,arrival_date_year,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr
count,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0,119386.0
mean,104.014801,2016.156593,27.165003,0.927605,2.50031,1.85639,0.10389,0.007949,0.087121,0.137102,0.221131,2.321227,101.833541
std,106.863286,0.707456,13.605334,0.998618,1.908289,0.579261,0.398561,0.097438,0.84435,1.497462,0.652315,17.595011,50.534664
min,0.0,2015.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.38
25%,18.0,2016.0,16.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,69.29
50%,69.0,2016.0,28.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,94.59
75%,160.0,2017.0,38.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,126.0
max,737.0,2017.0,53.0,19.0,50.0,55.0,10.0,10.0,26.0,72.0,21.0,391.0,5400.0


From the descriptive statistics, almost all columns seem to make sense. There are a few things that would need further investigation:

-55 adults for one booking seems excessive (though not impossible)

-391 days in the waiting list seems rather long (though not impossible)

-An average daily rate of below zero does not make sense

-It should be checked whether the total number of nights of stay is above zero

-It should be checked whether the total number of guests is above zero

In [21]:
# Further sense check for adr below zero
sense_check_adr = df_bookings_clean2[df_bookings_clean2['adr']<0]

In [22]:
# Checking count of observations where adr < 0
sense_check_adr

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,reservation_status
14969,Resort Hotel,0,195,2017,March,10,4,6,2,0.0,0,BB,GBR,1,0,2,2,No Deposit,0,Transient-Party,-6.38,Check-Out


As there is only one record with a negative adr, this record should be removed.

In [23]:
# Deriving new variable "total_guests"
df_bookings_clean2['total_guests'] = (df_bookings_clean2['adults'] + df_bookings_clean2['children'] + df_bookings_clean2['babies'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bookings_clean2['total_guests'] = (df_bookings_clean2['adults'] + df_bookings_clean2['children'] + df_bookings_clean2['babies'])


In [24]:
# Deriving new variable "total_number_nights"
df_bookings_clean2['total_number_nights'] = (df_bookings_clean2['stays_in_weekend_nights'] + df_bookings_clean2['stays_in_week_nights'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bookings_clean2['total_number_nights'] = (df_bookings_clean2['stays_in_weekend_nights'] + df_bookings_clean2['stays_in_week_nights'])


In [25]:
# Check if derivation of new variables worked
df_bookings_clean2.head(25)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,reservation_status,total_guests,total_number_nights
0,Resort Hotel,0,342,2015,July,27,0,0,2,0.0,0,BB,PRT,0,0,0,3,No Deposit,0,Transient,0.0,Check-Out,2.0,0
1,Resort Hotel,0,737,2015,July,27,0,0,2,0.0,0,BB,PRT,0,0,0,4,No Deposit,0,Transient,0.0,Check-Out,2.0,0
2,Resort Hotel,0,7,2015,July,27,0,1,1,0.0,0,BB,GBR,0,0,0,0,No Deposit,0,Transient,75.0,Check-Out,1.0,1
3,Resort Hotel,0,13,2015,July,27,0,1,1,0.0,0,BB,GBR,0,0,0,0,No Deposit,0,Transient,75.0,Check-Out,1.0,1
4,Resort Hotel,0,14,2015,July,27,0,2,2,0.0,0,BB,GBR,0,0,0,0,No Deposit,0,Transient,98.0,Check-Out,2.0,2
5,Resort Hotel,0,14,2015,July,27,0,2,2,0.0,0,BB,GBR,0,0,0,0,No Deposit,0,Transient,98.0,Check-Out,2.0,2
6,Resort Hotel,0,0,2015,July,27,0,2,2,0.0,0,BB,PRT,0,0,0,0,No Deposit,0,Transient,107.0,Check-Out,2.0,2
7,Resort Hotel,0,9,2015,July,27,0,2,2,0.0,0,FB,PRT,0,0,0,0,No Deposit,0,Transient,103.0,Check-Out,2.0,2
8,Resort Hotel,1,85,2015,July,27,0,3,2,0.0,0,BB,PRT,0,0,0,0,No Deposit,0,Transient,82.0,Canceled,2.0,3
9,Resort Hotel,1,75,2015,July,27,0,3,2,0.0,0,HB,PRT,0,0,0,0,No Deposit,0,Transient,105.5,Canceled,2.0,3


In [26]:
# Sense check for observations with zero total guests
sense_check_guests = df_bookings_clean2[df_bookings_clean2['total_guests']<1]

In [27]:
sense_check_guests

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,reservation_status,total_guests,total_number_nights
2224,Resort Hotel,0,1,2015,October,41,0,3,0,0.0,0,SC,PRT,0,0,0,1,No Deposit,0,Transient-Party,0.00,Check-Out,0.0,3
2409,Resort Hotel,0,0,2015,October,42,0,0,0,0.0,0,SC,PRT,0,0,0,0,No Deposit,0,Transient,0.00,Check-Out,0.0,0
3181,Resort Hotel,0,36,2015,November,47,1,2,0,0.0,0,SC,ESP,0,0,0,0,No Deposit,0,Transient-Party,0.00,Check-Out,0.0,3
3684,Resort Hotel,0,165,2015,December,53,1,4,0,0.0,0,SC,PRT,0,0,0,1,No Deposit,122,Transient-Party,0.00,Check-Out,0.0,5
3708,Resort Hotel,0,165,2015,December,53,2,4,0,0.0,0,SC,PRT,0,0,0,1,No Deposit,122,Transient-Party,0.00,Check-Out,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115029,City Hotel,0,107,2017,June,26,0,3,0,0.0,0,BB,CHE,0,0,0,1,No Deposit,0,Transient,100.80,Check-Out,0.0,3
115091,City Hotel,0,1,2017,June,26,0,1,0,0.0,0,SC,PRT,0,0,0,0,No Deposit,0,Transient,0.00,Check-Out,0.0,1
116251,City Hotel,0,44,2017,July,28,1,1,0,0.0,0,SC,SWE,0,0,0,2,No Deposit,0,Transient,73.80,Check-Out,0.0,2
116534,City Hotel,0,2,2017,July,28,2,5,0,0.0,0,SC,RUS,0,0,0,1,No Deposit,0,Transient-Party,22.86,Check-Out,0.0,7


It does not make sense to have a booking with 0 guests. As 180 rows is not a lot compared to total number of observations (0,15%), these rows should be removed.

In [28]:
# Sense check for observations with zero total number of nights
sense_check_nights = df_bookings_clean2[df_bookings_clean2['total_number_nights']<1]

In [29]:
sense_check_nights

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,reservation_status,total_guests,total_number_nights
0,Resort Hotel,0,342,2015,July,27,0,0,2,0.0,0,BB,PRT,0,0,0,3,No Deposit,0,Transient,0.0,Check-Out,2.0,0
1,Resort Hotel,0,737,2015,July,27,0,0,2,0.0,0,BB,PRT,0,0,0,4,No Deposit,0,Transient,0.0,Check-Out,2.0,0
167,Resort Hotel,0,111,2015,July,28,0,0,2,0.0,0,BB,PRT,0,0,0,0,No Deposit,0,Transient,0.0,Check-Out,2.0,0
168,Resort Hotel,0,0,2015,July,28,0,0,1,0.0,0,BB,PRT,0,0,0,0,No Deposit,0,Transient,0.0,Check-Out,1.0,0
196,Resort Hotel,0,8,2015,July,28,0,0,2,0.0,0,BB,PRT,0,0,0,0,No Deposit,0,Transient,0.0,Check-Out,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115483,City Hotel,0,15,2017,July,27,0,0,1,0.0,0,SC,FRA,0,0,0,0,No Deposit,0,Transient-Party,0.0,Check-Out,1.0,0
117701,City Hotel,0,0,2017,August,32,0,0,2,0.0,0,BB,PRT,1,0,0,0,No Deposit,0,Transient,0.0,Check-Out,2.0,0
118029,City Hotel,0,0,2017,August,33,0,0,2,0.0,0,BB,PRT,1,0,0,0,No Deposit,0,Transient,0.0,Check-Out,2.0,0
118631,City Hotel,0,78,2017,August,34,0,0,1,0.0,0,BB,PRT,0,0,0,7,No Deposit,0,Transient-Party,0.0,Check-Out,1.0,0


It does not make sense to have a booking with 0 nights. As 715 rows is not a lot compared to total number of observations (0,59%), these rows should be removed.

# 6. Data Cleaning after data sense checks

In [30]:
# Removing observations with adr < 0
df_bookings_clean3 = df_bookings_clean2[(df_bookings_clean2['adr'] < 0) == False]

In [31]:
# Removing observations with total guests < 1
df_bookings_clean4 = df_bookings_clean3[(df_bookings_clean3['total_guests'] < 1) == False]

In [32]:
# Removing observations with total guests < 1
df_bookings_clean5 = df_bookings_clean4[(df_bookings_clean4['total_number_nights'] < 1) == False]

In [33]:
# Sense check after data cleaning
df_bookings_clean5.describe()

Unnamed: 0,lead_time,arrival_date_year,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,total_guests,total_number_nights
count,118560.0,118560.0,118560.0,118560.0,118560.0,118560.0,118560.0,118560.0,118560.0,118560.0,118560.0,118560.0,118560.0,118560.0,118560.0
mean,104.508494,2016.157667,27.157001,0.932077,2.51277,1.859995,0.1042,0.007962,0.087635,0.137171,0.218354,2.333114,102.527187,1.972158,3.444846
std,106.915823,0.707692,13.589513,0.995431,1.893242,0.575443,0.399127,0.097562,0.84719,1.500721,0.63762,17.643149,50.003554,0.719098,2.534904
min,0.0,2015.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,18.0,2016.0,16.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0,2.0,2.0
50%,70.0,2016.0,28.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,95.0,2.0,3.0
75%,161.0,2017.0,38.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,126.0,2.0,4.0
max,709.0,2017.0,53.0,19.0,50.0,55.0,10.0,10.0,26.0,72.0,18.0,391.0,5400.0,55.0,69.0


In [34]:
# Removing observations with adr > 1000
df_bookings_clean6 = df_bookings_clean5[(df_bookings_clean5['adr'] > 1000) == False]

In [35]:
# Datatypes after data cleanign
df_bookings_clean6.dtypes

hotel                              object
is_canceled                        object
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
is_repeated_guest                  object
previous_cancellations              int64
previous_bookings_not_canceled      int64
booking_changes                     int64
deposit_type                       object
days_in_waiting_list                int64
customer_type                      object
adr                               float64
reservation_status                 object
total_guests                      float64
total_number_nights               

In [36]:
# Check of final shape
df_bookings_clean6.shape

(118559, 24)

# 7. Exporting cleaned dataframe

In [37]:
# Exporting dataframe
df_bookings_clean6.to_pickle(os.path.join(path,'02_Data','Prepared Data', 'hotel_bookings_cleaned.pkl'))