# Read data

------

In [1]:
import numpy as np
import pandas as pd

# Load the two source files; each one holds the bookings of a single hotel.
h1 = pd.read_csv("../data/H1.csv")
h2 = pd.read_csv("../data/H2.csv")

# Tag every row with the hotel it came from before the frames are merged,
# so the origin is still recoverable from the combined frame.
h1["HotelType"] = "Resort"
h2["HotelType"] = "City"

# Stack both hotels into one frame with a fresh 0..n-1 index, then display
# summary statistics of the numeric columns as the cell output.
df = pd.concat([h1, h2], ignore_index=True)
df.describe()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,Babies,IsRepeatedGuest,PreviousCancellations,PreviousBookingsNotCanceled,BookingChanges,DaysInWaitingList,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests
count,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119386.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0
mean,0.370416,104.011416,2016.156554,27.165173,15.798241,0.927599,2.500302,1.856403,0.10389,0.007949,0.031912,0.087118,0.137097,0.221124,2.321149,101.831122,0.062518,0.571363
std,0.482918,106.863097,0.707476,13.605138,8.780829,0.998613,1.908286,0.579261,0.398561,0.097436,0.175767,0.844336,1.497437,0.652306,17.594721,50.53579,0.245291,0.792798
min,0.0,0.0,2015.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.38,0.0,0.0
25%,0.0,18.0,2016.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.29,0.0,0.0
50%,0.0,69.0,2016.0,28.0,16.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94.575,0.0,0.0
75%,1.0,160.0,2017.0,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,126.0,0.0,1.0
max,1.0,737.0,2017.0,53.0,31.0,19.0,50.0,55.0,10.0,10.0,1.0,26.0,72.0,21.0,391.0,5400.0,8.0,5.0


In [2]:
# Display the dtype of every column in the combined frame.
df.dtypes

IsCanceled                       int64
LeadTime                         int64
ArrivalDateYear                  int64
ArrivalDateMonth                object
ArrivalDateWeekNumber            int64
ArrivalDateDayOfMonth            int64
StaysInWeekendNights             int64
StaysInWeekNights                int64
Adults                           int64
Children                       float64
Babies                           int64
Meal                            object
Country                         object
MarketSegment                   object
DistributionChannel             object
IsRepeatedGuest                  int64
PreviousCancellations            int64
PreviousBookingsNotCanceled      int64
ReservedRoomType                object
AssignedRoomType                object
BookingChanges                   int64
DepositType                     object
Agent                           object
Company                         object
DaysInWaitingList                int64
CustomerType             

# Cleaning data
------

In [3]:
# Remove leading and trailing whitespace from the values of every
# object-dtype column (str.strip trims both ends).
object_columns = df.select_dtypes(include='object').columns
for column_name in object_columns:
    df[column_name] = df[column_name].str.strip()

Check missing values

In [4]:
# Number of null entries per column, displayed from fewest to most.
missing_values_count = df.isna().sum()
missing_values_count.sort_values(ascending=True)

IsCanceled                       0
ReservationStatus                0
TotalOfSpecialRequests           0
RequiredCarParkingSpaces         0
ADR                              0
CustomerType                     0
DaysInWaitingList                0
Company                          0
Agent                            0
DepositType                      0
BookingChanges                   0
AssignedRoomType                 0
ReservedRoomType                 0
PreviousBookingsNotCanceled      0
PreviousCancellations            0
IsRepeatedGuest                  0
DistributionChannel              0
MarketSegment                    0
Meal                             0
Babies                           0
Adults                           0
StaysInWeekNights                0
StaysInWeekendNights             0
ArrivalDateDayOfMonth            0
ArrivalDateWeekNumber            0
ArrivalDateMonth                 0
ArrivalDateYear                  0
LeadTime                         0
ReservationStatusDat

Fill missing values

In [5]:
# Fill missing values.
# The results are assigned back to the column instead of calling
# fillna(..., inplace=True) on df[col]: that form mutates an intermediate
# Series, raises a FutureWarning in recent pandas, and leaves df unchanged
# once Copy-on-Write is active.
# NOTE(review): the missing-value summary in this notebook reports 0 nulls
# for Agent and Company, so these two fills look like no-ops. If missing
# entries are encoded as a sentinel string in those columns, they need an
# explicit replace instead - confirm against the raw data.
df['Agent'] = df['Agent'].fillna(0)
df['Company'] = df['Company'].fillna(0)
df['Country'] = df['Country'].fillna("Undefined")

# Children is float64 only because of its missing entries; once they are
# filled with 0 the column can be stored as an integer count.
df['Children'] = df['Children'].fillna(0).astype(np.int64)

In [6]:
# Replace inconsistent data.
# Meal: fold the "Undefined" label into "SC". The result is assigned back
# rather than using replace(..., inplace=True) on df["Meal"], which acts on
# an intermediate Series and does not update df under Copy-on-Write.
df["Meal"] = df["Meal"].replace(["Undefined"], "SC")

# Negative rates are clamped to zero.
df.loc[df["ADR"] < 0, "ADR"] = 0

# A booking with any recorded booking history (canceled or not) is marked
# as coming from a repeated guest, even if the flag was not set.
df.loc[(df["IsRepeatedGuest"] == 0) & (df["PreviousBookingsNotCanceled"] + df["PreviousCancellations"] > 0), "IsRepeatedGuest"] = 1

# Delete records with 0 guests
df = df[(df['Adults'] + df['Children'] + df['Babies']) > 0]

# Delete records with 0 night stay.
# .copy() makes the filtered result an independent frame, so later column
# assignments on df do not act on a slice of the unfiltered data.
df = df[df["StaysInWeekendNights"] + df["StaysInWeekNights"] > 0].copy()

In [7]:
# Save the cleaned frame as CSV; index=False keeps the row index out of the file.
df.to_csv("../data/booking.csv", index=False)