In [None]:
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

from pathlib import Path

# Locate the project root: two directory levels above the current working
# directory (assumes the notebook is launched from its own folder, which sits
# two levels below the root -- TODO confirm).
notebook_dir = Path.cwd()
project_root = notebook_dir.parents[1]

# Put <project_root>/data/raw on the module search path. The membership check
# keeps the cell idempotent: re-running it no longer appends duplicate entries.
raw_data_dir = str(project_root / 'data' / 'raw')
if raw_data_dir not in sys.path:
    sys.path.append(raw_data_dir)

In [None]:
# Load the raw hotel reservations CSV into a DataFrame.
# The path is built from project_root (defined in the setup cell) instead of a
# second hard-coded '../../' string, so the project location is stated once.
raw_csv_path = project_root / 'data' / 'raw' / 'hotel_reservations.csv'

# Drop the 'Booking_ID' column as part of the load, so re-running this cell
# always rebuilds base_hotel from the file.
base_hotel = pd.read_csv(raw_csv_path).drop(columns='Booking_ID')
base_hotel

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) produced by
# DataFrame.describe() with its default settings.
summary_stats = base_hotel.describe()
summary_stats

In [None]:
# Count the missing entries per column (isna is the documented alias of isnull).
missing_per_column = base_hotel.isna().sum()
missing_per_column

In [None]:
# One-hot encode the listed categorical columns; every other column of
# base_hotel is carried over unchanged into base_hotel_dummies.
categorical_columns = [
    'type_of_meal_plan',
    'room_type_reserved',
    'market_segment_type',
    'booking_status',
]
base_hotel_dummies = pd.get_dummies(base_hotel, columns=categorical_columns)

In [None]:
# Pairwise correlation matrix of the dummy-encoded frame.
# 'pearson' is pandas' default method; it is spelled out here for the reader.
base_hotel_dummies.corr(method='pearson')

In [None]:
# Rank every column by its correlation with 'avg_price_per_room',
# largest value first, and print the ranking.
correlation_matrix = base_hotel_dummies.corr()
price_correlations = correlation_matrix['avg_price_per_room']
ranked_price_correlations = price_correlations.sort_values(ascending=False)
print(ranked_price_correlations)


In [None]:
# Scatter plot of 'no_of_children' against 'avg_price_per_room'.
# The axis labels and the title are built from the column names themselves, so
# the text on the figure always matches the data that is actually plotted.
x_col = 'no_of_children'
y_col = 'avg_price_per_room'

fig, ax = plt.subplots()
ax.scatter(base_hotel[x_col], base_hotel[y_col])
ax.set_xlabel(x_col)
ax.set_ylabel(y_col)
ax.set_title(f'Relação entre {x_col} e {y_col}')
plt.show()


In [None]:
# Remove the categorical columns from base_hotel.
# errors='ignore' makes the cell safe to re-run: the name base_hotel is rebound
# to the reduced frame, so on a second run the columns are already gone and a
# plain drop would raise KeyError.
# NOTE: because base_hotel is rebound here, the get_dummies cell above (which
# needs these columns) must be executed before this one.
categorical_columns = [
    'type_of_meal_plan',
    'room_type_reserved',
    'market_segment_type',
    'booking_status',
]
base_hotel = base_hotel.drop(columns=categorical_columns, errors='ignore')

In [None]:
# Annotated heatmap of the pairwise correlations between the columns that
# remain in base_hotel.
# The axes are created explicitly and passed to seaborn, the figure gets a
# title so it can be read on its own, and plt.show() ends the cell so the
# Axes repr is not echoed as cell output.
figura, eixo = plt.subplots(figsize=(10, 10))
sns.heatmap(base_hotel.corr(), annot=True, ax=eixo)
eixo.set_title('Mapa de calor das correlações')
plt.show()
