In [11]:
import os

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from prophet import Prophet

In [None]:
# Load every CSV in <parent of cwd>/data/pull-pesquisas-city-2851556 and stack
# them row-wise into `combined_df`.

# The notebook's working directory and its parent folder
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Relative location of the CSV files. Built with os.path.join rather than a
# literal backslash: '\p' is an invalid escape sequence in a normal string, and
# a backslash is only a path separator on Windows.
data = os.path.join('data', 'pull-pesquisas-city-2851556')

# Absolute folder that holds the CSV files
data_dir = os.path.join(parent_dir, data)

# One DataFrame per CSV file, in the order the files were read
dataframes = []

# Running total of rows read, used to sanity-check the concatenation
total_rows = 0

try:
    # Sorted so the row order of the combined frame does not depend on the
    # arbitrary order in which os.listdir returns entries.
    for file_name in sorted(os.listdir(data_dir)):

        # Only CSV files are loaded; everything else in the folder is ignored
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(data_dir, file_name)

        # Read the CSV file into a DataFrame and keep it for concatenation
        df = pd.read_csv(file_path)
        dataframes.append(df)

        # Print dimensions of the current file
        print(f"File: {file_name} | Dimensions: {df.shape}")

        # Add the number of rows to the total count
        total_rows += df.shape[0]

    # Concatenate all DataFrames by binding rows; the index is renumbered 0..n-1.
    # pd.concat raises ValueError when no CSV was found, which is reported below.
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Print dimensions of the combined DataFrame
    print(f"Combined DataFrame Dimensions: {combined_df.shape}")

    # Verify the combined row count equals the sum of the per-file row counts
    if total_rows == combined_df.shape[0]:
        print("Row count verification successful! Total rows match.")
    else:
        print("Row count verification failed! Mismatch in row count.")

    print(combined_df.head())  # Display the first few rows of the combined DataFrame

except FileNotFoundError:
    # Note: `combined_df` is not defined after this branch, so later cells will fail.
    print(f"Folder '{data_dir}' not found.")
except Exception as e:
    # Note: `combined_df` may not be defined after this branch either.
    print(f"An error occurred: {e}")

In [6]:
# Price normalisation constants.
# NOTE(review): the forecast plot labels prices as USD, so Moeda_ID 16 is
# presumably a non-USD currency and the factor its exchange rate — confirm the
# currency and the date the rate was taken.
MOEDA_ID_TO_CONVERT = 16
CONVERSION_FACTOR = 0.16483969339817028

# Vectorised conversion: scale DiariaMedia only on rows whose Moeda_ID matches,
# leave every other row untouched. This avoids a Python-level call per row.
needs_conversion = combined_df['Moeda_ID'] == MOEDA_ID_TO_CONVERT
combined_df['DiariaMedia'] = np.where(
    needs_conversion,
    combined_df['DiariaMedia'] * CONVERSION_FACTOR,
    combined_df['DiariaMedia'],
)

# Keep only the columns used by the rest of the notebook. Moeda_ID is dropped
# here, so this cell cannot be re-run without reloading combined_df.
combined_df = combined_df.loc[:, ['Data', 'Hotel_ID', 'Ocupacao_ID', 'DiariaMedia', "Estadia", 'Reservas']]

# Repeat each row `Reservas` times so that every row stands for one reservation,
# then renumber the index and set the per-row reservation count to 1.
combined_df = combined_df.loc[combined_df.index.repeat(combined_df['Reservas'])].reset_index(drop=True)
combined_df['Reservas'] = 1

In [143]:
# Count rows per hotel, order hotels from most to fewest rows, and keep the
# Hotel_ID values of the first 100.
hotel_counts = combined_df.groupby('Hotel_ID').size().reset_index(name='Counts')
hotels_by_volume = hotel_counts.sort_values(by='Counts', ascending=False)
hotels_top_100 = hotels_by_volume.head(100)['Hotel_ID']

In [None]:
# Show the selected Hotel_ID values as the cell output.
hotels_top_100

In [None]:
# Median DiariaMedia per hotel, one row per Hotel_ID
median_diaria_media = combined_df.groupby('Hotel_ID', as_index=False)['DiariaMedia'].median()

# Order hotels from highest to lowest median price and keep the first 100.
# NOTE(review): the variable name says "top_10" but head(100) is what the code
# takes — confirm which was intended before relying on the name.
ranked_by_median = median_diaria_media.sort_values(by='DiariaMedia', ascending=False)
top_10_median_diaria_media = ranked_by_median.head(100)

print(top_10_median_diaria_media)

In [255]:
# Pick a random hotel from the top 100 by volume (not by price)

In [None]:
# Draw a single Hotel_ID at random from hotels_top_100 (no seed is set, so the
# pick changes on every run).
random_hotel = hotels_top_100.sample(n=1)
print(random_hotel)

In [None]:
# Draw one random row from the median-price ranking and keep its Hotel_ID.
# NOTE(review): this rebinds random_hotel, replacing any pick previously drawn
# from hotels_top_100 — confirm which selection the analysis should use.
sampled_row = top_10_median_diaria_media.sample()
random_hotel = sampled_row['Hotel_ID']
random_hotel

In [None]:
# Boolean mask marking the rows whose Hotel_ID is among the sampled value(s)
belongs_to_hotel = combined_df['Hotel_ID'].isin(random_hotel)

# Rows of combined_df for the sampled hotel; the shape is shown as the cell output
data = combined_df[belongs_to_hotel]
data.shape

In [None]:
# Display the current contents of `data` as the cell output.
data

In [371]:
# Rename the date, hotel and price columns to ds / hotel_id / y and keep only
# those three, in that order. Done as one chained expression that returns a new
# frame, instead of an in-place rename on a frame derived from a slice.
data = (
    data
    .rename(columns={'Data': 'ds', 'Hotel_ID': 'hotel_id', 'DiariaMedia': 'y'})
    .loc[:, ['hotel_id', 'ds', 'y']]
)

In [None]:
# Display the current contents of `data` as the cell output.
data

In [373]:
# Collapse to one row per distinct `ds` value, with `y` averaged over that value's rows.
hotel_data = data.groupby('ds', as_index=False)['y'].mean()

In [None]:
# Parse `ds` as datetimes and strip any timezone so the column is tz-naive.
naive_timestamps = pd.to_datetime(hotel_data['ds']).dt.tz_localize(None)
hotel_data['ds'] = naive_timestamps
hotel_data

In [None]:
# Largest `ds` value in hotel_data, shown as the cell output.
hotel_data['ds'].max()

In [None]:
# Build a Prophet model with default settings and fit it on the ds / y frame.
model = Prophet()
model.fit(df=hotel_data)

In [377]:
# Number of periods to forecast past the last observed date (Prophet's default
# frequency is daily, so this is 365 days).
FORECAST_PERIODS = 365

# Dates to predict for: built by the fitted model from its history plus the horizon
future = model.make_future_dataframe(periods=FORECAST_PERIODS)

# Predictions (yhat with lower/upper bounds) for every date in `future`
forecast = model.predict(future)


In [None]:
# Echo the first (only) sampled Hotel_ID value.
selected_hotel_id = random_hotel.values[0]
print(f" Hello {selected_hotel_id}")

In [None]:
# Observed series vs. Prophet forecast for the sampled hotel.
# (matplotlib.dates as mdates and seaborn as sns come from the notebook's import cell.)

# Set the seaborn theme before the figure is created so it applies to it
sns.set_theme()

# Fixed price range shown on the y-axis; values outside it are clipped from view.
# Change the range as needed.
Y_AXIS_LIMITS = (0, 400)

fig, ax = plt.subplots(figsize=(16, 8))

# Observed data and predictions in different colours
ax.plot(hotel_data['ds'], hotel_data['y'], label="Observed Data", color="navy", alpha=0.8)
ax.plot(forecast['ds'], forecast['yhat'], label="Predicted Data", color="firebrick", alpha=0.6)

# Shaded band between the lower and upper prediction bounds
ax.fill_between(
    forecast['ds'],
    forecast['yhat_lower'],
    forecast['yhat_upper'],
    color="firebrick",
    alpha=0.2,
    label="Uncertainty Interval",
)

# One x-axis tick per month, formatted as Year-Month and rotated vertically
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
fig.autofmt_xdate(rotation=90)

# Title, axis labels, legend and grid
ax.set_title(f"Hotel Data Forecast - Hotel ID {random_hotel.values[0]}", fontsize=16)
ax.set_xlabel("\n Exact Date", fontsize=14)
ax.set_ylabel("Average Daily Price (USD) \n", fontsize=14)
ax.legend(fontsize=12)
ax.grid(True, linestyle="--", alpha=0.6)

ax.set_ylim(*Y_AXIS_LIMITS)

# Show the plot
fig.tight_layout()
plt.show()


In [None]:
# Plot the forecast components produced by Prophet, recoloured to match the
# forecast plot.

# Apply the seaborn theme BEFORE the figure is created: the theme works by
# changing matplotlib's global defaults, which only affect figures created
# afterwards.
sns.set_theme()

fig = model.plot_components(forecast)
fig.set_size_inches(12, 16)  # Resize the entire figure

# Recolour every subplot: lines become firebrick, filled bands become a
# translucent firebrick.
for component_ax in fig.axes:
    for line in component_ax.get_lines():
        line.set_color("firebrick")
    for collection in component_ax.collections:
        collection.set_facecolor("firebrick")
        collection.set_alpha(0.2)  # Keep the bands transparent

# Display the figure
plt.show()