Global Tourism Analysis Part 1
---

In [18]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



Cleaning Hotel Bookings Dataset
---

In [16]:
# Location of the hotel bookings CSV (a relative path, resolved against the
# current working directory)
hotel_booking_file = Path("Data_Resources/hotel_bookings_dataset.csv")

# Load the CSV contents into a pandas DataFrame
hotel_data = pd.read_csv(filepath_or_buffer=hotel_booking_file)

In [17]:
# Columns to discard from the bookings table, one label per line.
columns_to_remove = [
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'meal',
    'market_segment',
    'distribution_channel',
    'previous_bookings_not_canceled',
    'reserved_room_type',
    'assigned_room_type',
    'booking_changes',
    'deposit_type',
    'agent',
    'company',
    'days_in_waiting_list',
    'customer_type',
    'required_car_parking_spaces',
    'total_of_special_requests',
    'reservation_status',
    'reservation_status_date',
    'lead_time',
    'previous_cancellations',
]

# drop() returns a new frame; rebind the name so later cells see the slimmer table.
# A label that is not present raises KeyError.
hotel_data = hotel_data.drop(labels=columns_to_remove, axis="columns")

# Preview the first rows
hotel_data.head()

Unnamed: 0,hotel,is_canceled,arrival_date_year,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,country,is_repeated_guest,adr
0,Resort Hotel,0,2015,July,0,0,2,0.0,0,PRT,0,0.0
1,Resort Hotel,0,2015,July,0,0,2,0.0,0,PRT,0,0.0
2,Resort Hotel,0,2015,July,0,1,1,0.0,0,GBR,0,75.0
3,Resort Hotel,0,2015,July,0,1,1,0.0,0,GBR,0,75.0
4,Resort Hotel,0,2015,July,0,2,2,0.0,0,GBR,0,98.0


In [19]:
# Derive per-booking stay metrics and keep only non-canceled, fully populated rows.
#
# Every column listed here is read below and then dropped at the end of the chain,
# so this cell cannot be run twice in a row without re-running the cells above it.
# Verify the inputs up front and fail with an actionable message instead of a
# KeyError raised partway through.
required_columns = [
    "stays_in_weekend_nights", "stays_in_week_nights",
    "adults", "children", "babies", "is_canceled",
]
missing_columns = [col for col in required_columns if col not in hotel_data.columns]
if missing_columns:
    raise KeyError(
        f"hotel_data is missing expected columns {missing_columns}; "
        "re-run the cells above to reload the dataset before running this cell."
    )

hotel_data = (
    hotel_data
    .assign(
        # Total nights = weekend nights + week nights
        stays_nights=lambda df: df["stays_in_weekend_nights"] + df["stays_in_week_nights"],
        # Party size; a null in any of the three inputs propagates to the sum
        number_of_guests=lambda df: df["adults"] + df["children"] + df["babies"],
        # Guest nights = nights stayed x party size (assign evaluates keywords in
        # order, so the two columns created above are available here)
        guest_nights=lambda df: df["stays_nights"] * df["number_of_guests"],
    )
    # Get only bookings that were not canceled
    .loc[lambda df: df["is_canceled"] == 0]
    # Drop rows with a null in any column
    .dropna(how="any")
    # The source columns are redundant once the derived metrics exist
    .drop(columns=required_columns)
)

# Display the final DataFrame
hotel_data.head()

Unnamed: 0,hotel,arrival_date_year,arrival_date_month,country,is_repeated_guest,adr,stays_nights,number_of_guests,guest_nights
0,Resort Hotel,2015,July,PRT,0,0.0,0,2.0,0.0
1,Resort Hotel,2015,July,PRT,0,0.0,0,2.0,0.0
2,Resort Hotel,2015,July,GBR,0,75.0,1,1.0,1.0
3,Resort Hotel,2015,July,GBR,0,75.0,1,1.0,1.0
4,Resort Hotel,2015,July,GBR,0,98.0,2,2.0,4.0
