In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data Analysis and Cleaning:

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read
# by the cells below.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Step 1: Basic understanding of the data



In [None]:
# Read the hotel bookings CSV from the mounted Google Drive into a DataFrame.
file_path = "/content/drive/MyDrive/hotel_bookings-1.csv"
df = pd.read_csv(filepath_or_buffer=file_path)


In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(119390, 32)

In [None]:
# Print a per-column summary (non-null counts and dtypes); columns whose
# non-null count is below the row count contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [None]:
# Preview the first five rows to sanity-check the parsed values.
df.head(n=5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


## Detail Explanation of Column



* hotel :                          Type of hotel: Resort Hotel or City Hotel.
* is_canceled :                    Indicates whether the booking was cancelled (1) or not cancelled (0).
* lead_time :                      Number of days that elapsed between the date the booking was entered into the system and the arrival date.
* arrival_date_year :              Year of arrival date.
* arrival_date_month :             Month of arrival date.
* arrival_date_week_number :       Week number of year for arrival date.
* arrival_date_day_of_month :      Day of arrival date.
* stays_in_weekend_nights :        Number of weekend nights (Saturday or Sunday) the guest stayed or booked to stay.
* stays_in_week_nights :           Number of week nights (Monday to Friday) the guest stayed or booked to stay.
* adults :                         Number of adults.
* children :                       Number of children.
* babies :                         Number of babies.
* meal :                           Type of meal booked. Undefined/SC – no meal package; BB – Bed & Breakfast; HB – Half board (breakfast and one other meal – usually dinner); FB – Full board (breakfast, lunch and dinner).
* country:                         Country of origin.
* market_segment :                 Market segment designation. Here the term TA means Travel Agents and TO means Tour Operators.
* distribution_channel :           Booking distribution channel. Here the term TA means Travel Agents and TO means Tour Operators.
* is_repeated_guest :              Indicates whether the booking was made by a repeated guest (1) or not (0).
* previous_cancellations :         Number of previous bookings that were cancelled by the customer prior to the current booking.
* previous_bookings_not_canceled : Number of previous bookings not cancelled by the customer prior to the current booking.
* reserved_room_type :             Code of room type reserved.
* assigned_room_type :             Code for the type of room assigned to the booking.   
* booking_changes :                Number of changes made to the booking from the moment entered until the moment of check-in or cancellation.
* deposit_type :                   If the customer made a deposit to guarantee the booking. There are three categories: No Deposit, Non Refund and Refundable.  
* agent :                          ID of the travel agency that made the booking.
* company :                        ID of the company that made the booking or responsible for paying the booking.  
* days_in_waiting_list :           Number of days the booking was in the waiting list before it was confirmed to the customer.   
* customer_type :                  Type of booking, there are four categories: Contract, Group, Transient and Transient-party.
* adr :                            Average Daily Rate as defined by dividing the sum of all lodging transactions by the total number of staying nights.               
* required_car_parking_spaces :    Number of car parking spaces required by the customer.
* total_of_special_requests :      Number of special requests made by the customer.
* reservation_status :             Reservation last status, there are three categories: Canceled, Check-Out and No-Show.
* reservation_status_date :        Date at which the last status was set.  

In [None]:
# List every column label in the frame.
df.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')

In [None]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each original column becomes one row of the summary.
df.describe(include="all").transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
hotel,119390.0,2.0,City Hotel,79330.0,,,,,,,
is_canceled,119390.0,,,,0.370416,0.482918,0.0,0.0,0.0,1.0,1.0
lead_time,119390.0,,,,104.011416,106.863097,0.0,18.0,69.0,160.0,737.0
arrival_date_year,119390.0,,,,2016.156554,0.707476,2015.0,2016.0,2016.0,2017.0,2017.0
arrival_date_month,119390.0,12.0,August,13877.0,,,,,,,
arrival_date_week_number,119390.0,,,,27.165173,13.605138,1.0,16.0,28.0,38.0,53.0
arrival_date_day_of_month,119390.0,,,,15.798241,8.780829,1.0,8.0,16.0,23.0,31.0
stays_in_weekend_nights,119390.0,,,,0.927599,0.998613,0.0,0.0,1.0,2.0,19.0
stays_in_week_nights,119390.0,,,,2.500302,1.908286,0.0,1.0,2.0,3.0,50.0
adults,119390.0,,,,1.856403,0.579261,0.0,2.0,2.0,2.0,55.0


## Step 2: Cleaning of data