In [None]:
import pandas as pd
import numpy as np  
import os
from prophet import Prophet
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

In [None]:
# Load every CSV from the raw price folder (one level above the working
# directory) and stack them into a single frame, `combined_df`.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Folder, relative to the parent directory, that holds the per-period CSVs
data = "data/other/From2021_to_Jan2025_PricesHotelsDates"
data_dir = os.path.join(parent_dir, data)

# Per-file frames, plus a running row count used to cross-check the concat
dataframes = []
total_rows = 0

try:
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of combined_df reproducible across runs and machines.
    for file_name in sorted(os.listdir(data_dir)):

        # Anything that is not a CSV is ignored
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(data_dir, file_name)

        df = pd.read_csv(file_path)
        dataframes.append(df)

        print(f"File: {file_name} | Dimensions: {df.shape}")

        total_rows += df.shape[0]

    # Row-bind all files; raises ValueError when no CSV was found
    combined_df = pd.concat(dataframes, ignore_index=True)

    print(f"Combined DataFrame Dimensions: {combined_df.shape}")

    # The combined frame must have exactly as many rows as the files together
    if total_rows == combined_df.shape[0]:
        print("Row count verification successful! Total rows match.")
    else:
        print("Row count verification failed! Mismatch in row count.")

    print(combined_df.head())

except FileNotFoundError:
    print(f"Folder '{data_dir}' not found.")
    # Re-raise: continuing would leave combined_df undefined (or stale from an
    # earlier run) and the following cells would fail in a confusing way.
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise

In [None]:
# (rows, columns) of the concatenated raw frame
combined_df.shape

In [None]:
# Remove the loader's temporary variables from the kernel namespace;
# combined_df is kept. Raises NameError if the loader cell did not run fully.
del current_dir, data, data_dir, dataframes, df, file_name, file_path, parent_dir, total_rows

In [None]:
# Lookup table read from data/lookups; the 'Moeda' and 'Cotacao_USD' columns
# are joined onto the hotel table further down.
# os.path.join keeps the path valid on non-Windows systems (the literal
# backslash form only resolves on Windows).
data_lake_prd_314410_cz_moedas = pd.read_csv(
    os.path.join('..', 'data', 'lookups', 'data-lake-prd-314410.cz.moedas.csv')
)
# head must be called; the bare attribute only displays the bound method
data_lake_prd_314410_cz_moedas.head()

In [None]:
# Hotel lookup table read from data/lookups; 'Hotel_ID' and 'Moeda' are the
# columns used by the following cells.
# os.path.join keeps the path valid on non-Windows systems (the literal
# backslash form only resolves on Windows).
data_lake_prd_314410_cz_hoteis = pd.read_csv(
    os.path.join('..', 'data', 'lookups', 'data-lake-prd-314410.cz.hoteis.csv')
)
# head must be called; the bare attribute only displays the bound method
data_lake_prd_314410_cz_hoteis.head()

In [None]:
# Keep only the hotel identifier and 'Moeda', the key used to join the
# moedas lookup in the next cell.
data_lake_prd_314410_cz_hoteis = data_lake_prd_314410_cz_hoteis[['Hotel_ID', 'Moeda']]

In [None]:
# Left-join 'Cotacao_USD' onto every hotel via its 'Moeda' value; hotels whose
# 'Moeda' has no match in the lookup get NaN.
data_lake_prd_314410_cz_hoteis = data_lake_prd_314410_cz_hoteis.merge(
    data_lake_prd_314410_cz_moedas[['Moeda', 'Cotacao_USD']],
    how='left',
    on='Moeda',
)

In [None]:
# Drop rows with any missing value, e.g. hotels left without a 'Cotacao_USD'
# by the left join above.
data_lake_prd_314410_cz_hoteis = data_lake_prd_314410_cz_hoteis.dropna()

In [None]:
# Display the hotel -> 'Cotacao_USD' table. It is not referenced by any later
# cell shown here; the plots below label prices as local currency.
data_lake_prd_314410_cz_hoteis # if wanting to convert it all to dollars

In [None]:
# Display the raw combined frame (Jupyter truncates the rendering)
combined_df

In [None]:
# Parse 'Data' into datetimes and drop any timezone so the column is tz-naive
combined_df['Data'] = pd.to_datetime(combined_df['Data']).dt.tz_localize(None)
# head must be called; the bare attribute only displays the bound method
combined_df.head()

In [None]:
# Ids of hotels that have at least one row dated in 2024
hotels_2024 = combined_df.loc[combined_df['Data'].dt.year == 2024, 'Hotel_ID'].unique()

# Keep the full history, but only for those hotels
combined_df = combined_df[combined_df['Hotel_ID'].isin(hotels_2024)]

# Show the result
combined_df

In [None]:
# Number of distinct hotels with data in 2024 (hotels_2024 is a 1-D array)
hotels_2024.shape

In [None]:
# Reduce to the three columns needed for forecasting, rename them to
# hotel_id / ds / y (Prophet reads 'ds' and 'y'), and put hotel_id first.
combined_df = (
    combined_df[['Data', 'Hotel_ID', 'DiariaMedia']]
    .rename(columns={'Data': 'ds', 'Hotel_ID': 'hotel_id', 'DiariaMedia': 'y'})
    [['hotel_id', 'ds', 'y']]
)

In [None]:
# Collapse duplicate (hotel_id, ds) rows into one row holding the mean of y
combined_df = (
    combined_df
    .groupby(['hotel_id', 'ds'])['y']
    .mean()
    .reset_index()
)
combined_df.shape

In [None]:
# First and last date in the data, one per line
print(combined_df['ds'].min(), combined_df['ds'].max(), sep="\n")

In [None]:
# Display the per-hotel, per-date frame (columns hotel_id, ds, y)
combined_df

In [None]:
# Number of dated rows available for each hotel
hotel_counts = combined_df.groupby('hotel_id').size()

# Ids of hotels with more than 183 dated rows
hotels_with_more_than_183_rows = hotel_counts.loc[hotel_counts > 183].index

# Rows of combined_df that belong to those hotels
keep_mask = combined_df['hotel_id'].isin(hotels_with_more_than_183_rows)
filtered_combined_df = combined_df[keep_mask]

# Show the result
filtered_combined_df

In [None]:
# Shapes before and after the minimum-history filter, one per line
print(combined_df.shape, filtered_combined_df.shape, sep="\n")

In [None]:
# Distinct hotel ids that passed the minimum-history filter, kept as a Series
# so it can be sliced with boolean conditions in the chunking cell.
hotel_list = filtered_combined_df['hotel_id'].drop_duplicates()
print(hotel_list.shape)

In [None]:
# Largest and smallest hotel id; the chunk boundaries in the next cell are
# fixed id ranges, so ids outside those ranges would not be forecast.
print(hotel_list.max())
print(hotel_list.min())

In [None]:
# Split the hotel ids into 12 fixed id ranges; each range is forecast and
# written to its own CSV. Ids of 25000 and above fall outside every range
# and are therefore not forecast.
chunk_1 = hotel_list[hotel_list < 5000]
chunk_2 = hotel_list[(hotel_list >= 5000) & (hotel_list < 10000)]
chunk_3 = hotel_list[(hotel_list >= 10000) & (hotel_list < 12500)]
chunk_4 = hotel_list[(hotel_list >= 12500) & (hotel_list < 13000)]
chunk_5 = hotel_list[(hotel_list >= 13000) & (hotel_list < 13500)]
chunk_6 = hotel_list[(hotel_list >= 13500) & (hotel_list < 14000)]
chunk_7 = hotel_list[(hotel_list >= 14000) & (hotel_list < 14500)]
chunk_8 = hotel_list[(hotel_list >= 14500) & (hotel_list < 14750)]
chunk_9 = hotel_list[(hotel_list >= 14750) & (hotel_list < 15000)]
chunk_10 = hotel_list[(hotel_list >= 15000) & (hotel_list < 17500)]
chunk_11 = hotel_list[(hotel_list >= 17500) & (hotel_list < 20000)]
chunk_12 = hotel_list[(hotel_list >= 20000) & (hotel_list < 25000)]


# List of chunks to process
chunks = [ chunk_1, chunk_2, chunk_3, chunk_4, chunk_5, chunk_6, chunk_7, chunk_8, chunk_9, chunk_10, chunk_11, chunk_12]

for idx, chunk in enumerate(chunks):
    print(f"Processing chunk {idx + 1} with {len(chunk)} hotels...")

    # Per-hotel forecasts are collected in a list and concatenated once per
    # chunk; growing a DataFrame with concat inside the loop re-copies every
    # earlier row on each iteration.
    chunk_forecasts = []

    for hotel in chunk:
        print(f"Processing forecast for Hotel {hotel}...")

        # History (hotel_id, ds, y) of this hotel
        hotel_data = combined_df[combined_df['hotel_id'] == hotel]

        # Fit a default Prophet model on this hotel's history
        model = Prophet()
        model.fit(hotel_data)

        # Historical dates plus 365 future daily periods
        future = model.make_future_dataframe(periods=365)

        forecast = model.predict(future)

        # Tag every row with the hotel it belongs to
        forecast["hotel"] = hotel

        # Rows up to the last observed date are "historical", later rows are
        # "forecast". The cutoff is computed once instead of once per row.
        last_observed = hotel_data["ds"].max()
        forecast["data_type"] = np.where(forecast["ds"] <= last_observed, "historical", "forecast")

        chunk_forecasts.append(forecast)

    # An id range without hotels still produces an (empty) output file
    if chunk_forecasts:
        all_forecasts_chunk = pd.concat(chunk_forecasts, ignore_index=True)
    else:
        all_forecasts_chunk = pd.DataFrame()

    # Save the results for this chunk to a CSV file
    all_forecasts_chunk.to_csv(f"../out/hotel_forecasts_From2021_to_Jan2025onwards_chunk_{idx + 1}.csv", index=False)
    print(f"Chunk {idx + 1} forecasting complete. Results saved as 'hotel_forecasts_From2021_to_Jan2025onwards_chunk_{idx + 1}.csv'.")

print("All chunks processed successfully.")

In [None]:

# Read back every per-chunk forecast CSV from the output folder (one level
# above the working directory) and stack them into `combined_df`, replacing
# the historical frame that name held until now.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Folder, relative to the parent directory, where the chunk CSVs were written
data = "out/"
data_dir = os.path.join(parent_dir, data)

# Per-file frames, plus a running row count used to cross-check the concat
dataframes = []
total_rows = 0

try:
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of the combined frame reproducible.
    for file_name in sorted(os.listdir(data_dir)):
        # Only the per-chunk forecast CSVs are of interest
        if not (file_name.endswith('.csv') and 'chunk' in file_name):
            continue

        file_path = os.path.join(data_dir, file_name)

        # Zero-byte files cannot be parsed; skip them up front
        if os.path.getsize(file_path) == 0:
            print(f"Skipping empty file (0 bytes): {file_name}")
            continue

        # A single unreadable chunk is reported and skipped, not fatal
        try:
            # on_bad_lines='skip' drops malformed rows instead of failing
            df = pd.read_csv(file_path, on_bad_lines='skip')

            if not df.empty and len(df.columns) > 0:
                dataframes.append(df)
                print(f"File: {file_name} | Dimensions: {df.shape}")
                total_rows += df.shape[0]
            else:
                print(f"Skipping file with no valid data: {file_name}")

        except pd.errors.EmptyDataError:
            print(f"Skipping empty/corrupt file: {file_name}")
        except pd.errors.ParserError:
            print(f"Skipping file with parsing issues: {file_name}")
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

except FileNotFoundError:
    print(f"Folder '{data_dir}' not found.")
    # Re-raise: continuing would leave combined_df pointing at the historical
    # data, which a later cell would then save as if it were the forecasts.
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise

# Without any readable chunk there is nothing to combine. Stop here instead
# of silently keeping the previous contents of combined_df.
if not dataframes:
    raise ValueError("No valid CSV files found.")

combined_df = pd.concat(dataframes, ignore_index=True)
print(f"Combined DataFrame Dimensions: {combined_df.shape}")

# The combined frame must have exactly as many rows as the files together
if total_rows == combined_df.shape[0]:
    print("Row count verification successful! Total rows match.")
else:
    print("Row count verification failed! Mismatch in row count.")

print(combined_df.head())  # Display the first few rows


In [None]:
# Distinct labels in 'data_type'; the forecasting cell writes "historical"
# and "forecast".
combined_df['data_type'].unique()

In [None]:
# Persist the stacked chunk forecasts as a single CSV (row index not written)
combined_df.to_csv("../out/all_hotel_forecasts_From2021_to_Jan2025onwards.csv", index=False)

In [None]:
# Read the combined forecasts back from disk under a dedicated name; columns
# come back with CSV-inferred dtypes ('ds' is converted to datetime later).
all_hotel_forecasts_From2021_to_Jan2025onwards = pd.read_csv("../out/all_hotel_forecasts_From2021_to_Jan2025onwards.csv")

In [None]:
# Reload the raw price CSVs into `combined_df`. That name was reused for the
# stacked forecasts above, so the historical data has to be rebuilt here.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Folder, relative to the parent directory, that holds the per-period CSVs
data = "data/other/From2021_to_Jan2025_PricesHotelsDates"
data_dir = os.path.join(parent_dir, data)

# Per-file frames, plus a running row count used to cross-check the concat
dataframes = []
total_rows = 0

try:
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # of combined_df reproducible across runs and machines.
    for file_name in sorted(os.listdir(data_dir)):

        # Anything that is not a CSV is ignored
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(data_dir, file_name)

        df = pd.read_csv(file_path)
        dataframes.append(df)

        print(f"File: {file_name} | Dimensions: {df.shape}")

        total_rows += df.shape[0]

    # Row-bind all files; raises ValueError when no CSV was found
    combined_df = pd.concat(dataframes, ignore_index=True)

    print(f"Combined DataFrame Dimensions: {combined_df.shape}")

    # The combined frame must have exactly as many rows as the files together
    if total_rows == combined_df.shape[0]:
        print("Row count verification successful! Total rows match.")
    else:
        print("Row count verification failed! Mismatch in row count.")

    print(combined_df.head())

except FileNotFoundError:
    print(f"Folder '{data_dir}' not found.")
    # Re-raise: continuing would leave combined_df holding the forecast frame
    # and the following cells would fail on its missing 'Data' column.
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise

In [None]:
# Parse 'Data' into datetimes and drop any timezone so the column is tz-naive
combined_df['Data'] = pd.to_datetime(combined_df['Data']).dt.tz_localize(None)
# head must be called; the bare attribute only displays the bound method
combined_df.head()

In [None]:

# Distinct hotels that have at least one row whose date falls in 2024
hotels_2024 = combined_df.loc[combined_df['Data'].dt.year.eq(2024), 'Hotel_ID'].unique()

# Retain every row (any year) of the hotels found above
combined_df = combined_df[combined_df['Hotel_ID'].isin(hotels_2024)]

# Show the result
combined_df

In [None]:
# Keep date, hotel and average daily rate, renamed to ds / hotel_id / y and
# ordered as hotel_id, ds, y.
combined_df = (
    combined_df[['Data', 'Hotel_ID', 'DiariaMedia']]
    .rename(columns={'Data': 'ds', 'Hotel_ID': 'hotel_id', 'DiariaMedia': 'y'})
    [['hotel_id', 'ds', 'y']]
)

In [None]:
# One row per (hotel_id, ds): duplicates are replaced by the mean of y
combined_df = (
    combined_df
    .groupby(['hotel_id', 'ds'])['y']
    .mean()
    .reset_index()
)
combined_df.shape

In [None]:
# Rows per hotel
hotel_counts = combined_df.groupby('hotel_id').size()

# Hotels with more than 183 rows
hotels_with_more_than_183_rows = hotel_counts.loc[hotel_counts > 183].index

# Restrict the table to those hotels
keep_mask = combined_df['hotel_id'].isin(hotels_with_more_than_183_rows)
filtered_combined_df = combined_df[keep_mask]

# Show the result
filtered_combined_df

In [None]:
# Distinct hotel ids present in the saved forecasts (a Series, in order of
# first appearance); rebinds the earlier hotel_list.
hotel_list = all_hotel_forecasts_From2021_to_Jan2025onwards['hotel'].drop_duplicates()

In [None]:
# Name the id column 'hotel' so it matches the forecast table. Rebinding to
# the renamed result (instead of inplace=True) yields an independent frame
# rather than mutating a filtered slice of combined_df.
filtered_combined_df = filtered_combined_df.rename(columns={'hotel_id': 'hotel'})

In [None]:
# Make sure 'ds' is a datetime on both tables so they can be matched by date
# (the forecast table was read from CSV without date parsing).
# assign() returns a new frame; writing a column into filtered_combined_df
# directly can trigger SettingWithCopyWarning because it is a filtered slice.
filtered_combined_df = filtered_combined_df.assign(ds=pd.to_datetime(filtered_combined_df["ds"]))
all_hotel_forecasts_From2021_to_Jan2025onwards["ds"] = pd.to_datetime(all_hotel_forecasts_From2021_to_Jan2025onwards["ds"])

In [None]:
# Limit both tables to the hotels listed in hotel_list
filtered_combined_df = filtered_combined_df.loc[filtered_combined_df['hotel'].isin(hotel_list)]
all_hotel_forecasts_From2021_to_Jan2025onwards = all_hotel_forecasts_From2021_to_Jan2025onwards.loc[
    all_hotel_forecasts_From2021_to_Jan2025onwards['hotel'].isin(hotel_list)
]


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

# Per-hotel MAE, RMSE and MAPE of yhat against the observed y, computed on
# the dates present in both the history and the forecast table.
# NOTE(review): these dates appear to be the ones the models were fitted on,
# so the figures look like in-sample fit rather than hold-out accuracy.
metric_columns = ["hotel", "MAE", "RMSE", "MAPE"]
error_metrics = []

for hotel in hotel_list:
    hotel_forecast = all_hotel_forecasts_From2021_to_Jan2025onwards[all_hotel_forecasts_From2021_to_Jan2025onwards["hotel"] == hotel]
    historical_data = filtered_combined_df[filtered_combined_df["hotel"] == hotel]

    # Pair observed and predicted values by date
    comparison = historical_data.merge(hotel_forecast, on="ds", how="inner")

    # Nothing to score when the two tables share no date
    if comparison.empty:
        print(f"Skipping hotel {hotel}: No matching dates found.")
        continue

    mae = mean_absolute_error(comparison["y"], comparison["yhat"])
    rmse = np.sqrt(mean_squared_error(comparison["y"], comparison["yhat"]))
    # Division by y: an observed value of 0 makes this inf (or NaN)
    mape = np.mean(np.abs((comparison["y"] - comparison["yhat"]) / comparison["y"])) * 100

    error_metrics.append({"hotel": hotel, "MAE": mae, "RMSE": rmse, "MAPE": mape})


# Fixing the columns keeps the table's schema even when every hotel was
# skipped, so later cells that read error_df['MAPE'] do not hit a KeyError.
error_df = pd.DataFrame(error_metrics, columns=metric_columns)

# Display the error metrics table
print(error_df)

# Save to CSV for further analysis
error_df.to_csv("../out/hotel_error_metrics_From2021_to_Jan2025onwards.csv", index=False)

print("Error metrics saved to 'hotel_error_metrics_From2021_to_Jan2025onwards.csv'.")


In [None]:
# Keep hotels whose MAPE lies strictly between 0 and 100; inf and NaN values
# fail the comparisons and are dropped as well.
error_df = error_df[ (error_df['MAPE'] < 100) & (error_df['MAPE'] > 0) ]

In [None]:
# Display the filtered error table
error_df

In [None]:
# Summary statistics of MAPE for the retained hotels
error_df['MAPE'].describe()

In [None]:
# Turn +/-inf into NaN and drop rows whose MAPE is NaN. After the (0, 100)
# filter applied above no such rows should remain, so this acts as a safeguard.
error_df = error_df.replace([np.inf, -np.inf], np.nan).dropna(subset=["MAPE"])


In [None]:
# Summary statistics of MAPE after removing non-finite values
error_df['MAPE'].describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of per-hotel MAPE, restricted to values strictly inside (0, 100)
mape_in_range = error_df[ (error_df['MAPE']<100) & (error_df['MAPE']>0) ]

fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=mape_in_range, x="MAPE", bins=500, kde=False, ax=ax)

# Axis labels and title
ax.set_xlabel("\n MAPE")
ax.set_ylabel("Frequency \n")
ax.set_title("Histogram of Mean Absolute Percent Error [MAPE]")

plt.show()


In [None]:
# Both 'ds' columns must be datetimes for the join below.
# assign() returns a new frame; writing a column into filtered_combined_df
# directly can trigger SettingWithCopyWarning because it is a filtered slice.
filtered_combined_df = filtered_combined_df.assign(ds=pd.to_datetime(filtered_combined_df["ds"]))
all_hotel_forecasts_From2021_to_Jan2025onwards["ds"] = pd.to_datetime(all_hotel_forecasts_From2021_to_Jan2025onwards["ds"])

# One row per (hotel, ds) present in both tables, carrying the observed y
# next to the forecast columns.
comparison = filtered_combined_df.merge(
    all_hotel_forecasts_From2021_to_Jan2025onwards,
    on=["hotel", "ds"],
    how="inner"
)

In [None]:
# Hotels that survived the MAPE filters; used below to restrict the tables
hotels_to_keep_error = error_df['hotel'].drop_duplicates()

In [None]:
# Size of the observed-vs-forecast table before restricting the hotels
comparison.shape

In [None]:
# Keep only the hotels that passed the error filters
comparison = comparison[comparison['hotel'].isin(hotels_to_keep_error)]


In [None]:
# Pick up to 10 hotels at random for the panel plot below (hotel_list becomes
# a plain Python list here).
# - random_state makes the selection repeatable across runs
# - n is capped so fewer than 10 available hotels does not raise ValueError
# - no sort beforehand: ordering has no effect on a random draw
hotel_sizes = comparison.groupby('hotel').size()
hotel_list = hotel_sizes.sample(n=min(10, len(hotel_sizes)), random_state=42).index.tolist()

In [None]:
# Show which hotels were sampled
print(hotel_list)

In [None]:

# Seaborn default theme
sns.set_theme()

# One stacked panel per sampled hotel, sharing the date axis
n_panels = len(hotel_list)
fig, axes = plt.subplots(n_panels, 1, figsize=(12, 6 * n_panels), sharex=True)

# With a single panel plt.subplots returns a bare Axes; wrap it so that
# axes[i] works in both cases
if n_panels == 1:
    axes = [axes]

# Line colours for observed and predicted values
colors = {"historical": "navy", "forecast": "firebrick"}

# (column to plot, legend label, colour key, extra line options)
series_specs = (
    ("y", "Historical", "historical", {}),
    ("yhat", "Forecast", "forecast", {"linestyle": "dashed"}),
)

for i, hotel in enumerate(hotel_list):
    ax = axes[i]

    # Observed and predicted values of this hotel on their shared dates
    hotel_forecast = comparison.loc[comparison["hotel"] == hotel]

    # Observed line first, then the dashed prediction line
    for column, label, color_key, extra in series_specs:
        sns.lineplot(ax=ax,
                     data=hotel_forecast,
                     x="ds", y=column, label=label, color=colors[color_key], **extra)

    # Panel cosmetics
    ax.set_title(f"Hotel {hotel} Forecast")
    ax.set_xlabel("Date")
    ax.set_ylabel("Prices [Local Currency]")
    ax.legend()
    ax.grid(True)

    # y-axis from zero up to the largest observed or predicted value
    upper = hotel_forecast[["y", "yhat"]].max().max()
    ax.set_ylim(0, upper)


# Avoid overlapping labels between panels
plt.tight_layout()

plt.show()



In [None]:
# Number of distinct hotels in the forecast table
all_hotel_forecasts_From2021_to_Jan2025onwards['hotel'].nunique()

In [None]:

# Restrict the history (keyed by 'hotel_id') and the forecasts (keyed by
# 'hotel') to the hotels that passed the error filters
combined_df = combined_df.loc[combined_df['hotel_id'].isin(hotels_to_keep_error)]

all_hotel_forecasts_From2021_to_Jan2025onwards = all_hotel_forecasts_From2021_to_Jan2025onwards.loc[
    all_hotel_forecasts_From2021_to_Jan2025onwards['hotel'].isin(hotels_to_keep_error)
]



In [None]:
# Extract one hotel (id 4937) for the detailed plot in the next cell: its
# observed history and all of its forecast rows (no filter on data_type, so
# the latter covers historical dates as well as future ones).
combined_df_hist = combined_df.loc[combined_df['hotel_id'] == 4937]
all_hotel_forecasts_From2021_to_Jan2025onwards_future = all_hotel_forecasts_From2021_to_Jan2025onwards.loc[
    all_hotel_forecasts_From2021_to_Jan2025onwards['hotel'] == 4937
]

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

# Seaborn default theme
sns.set_theme()

# Short alias for the single-hotel forecast rows
hotel_fc = all_hotel_forecasts_From2021_to_Jan2025onwards_future

fig, ax = plt.subplots(figsize=(16, 8))

# Observed prices and Prophet's point prediction
ax.plot(combined_df_hist['ds'], combined_df_hist['y'], label="Observed Data", color="navy", alpha=0.8)
ax.plot(hotel_fc['ds'], hotel_fc['yhat'], label="Predicted Data", color="firebrick", alpha=0.6)

# Shaded band between the lower and upper prediction bounds
ax.fill_between(
    hotel_fc['ds'], hotel_fc['yhat_lower'], hotel_fc['yhat_upper'],
    color="firebrick", alpha=0.2, label="Uncertainty Interval",
)

# One tick per month, labelled Year-Month and rotated vertically
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
fig.autofmt_xdate(rotation=90)

# Title, axis labels, legend and grid
ax.set_title(f"Hotel Data Forecast - Hotel ID {combined_df_hist['hotel_id'].unique()}", fontsize=16)
ax.set_xlabel("\n Exact Date", fontsize=14)
ax.set_ylabel("Average Daily Price (Local Currency) \n", fontsize=14)
ax.legend(fontsize=12)
ax.grid(True, linestyle="--", alpha=0.6)


# Write the figure to disk before displaying it; the id is formatted from
# .unique(), i.e. as an array
fig.savefig(f"../out/figures/hotel_forecast_{combined_df_hist['hotel_id'].unique()}.png", dpi=300, bbox_inches="tight")

fig.tight_layout()
plt.show()


In [None]:

# Persist the history and the forecasts of the selected hotels.
# NOTE(review): the historic file name repeats "_final_selected" - confirm
# whether that is intended before anything depends on the name.
combined_df.to_csv("../out/all_hotel_historic_From2021_to_Jan2025_final_selected_final_selected.csv", index=False)
all_hotel_forecasts_From2021_to_Jan2025onwards.to_csv("../out/all_hotel_forecasts_From2021_to_Jan2025onwards_final_selected.csv", index=False)



In [None]:
# Write one PNG per selected hotel showing observed prices, Prophet's
# prediction and its uncertainty band.
sns.set_theme()

# savefig does not create missing folders; make sure the target exists
os.makedirs("../out/figures/png", exist_ok=True)

# Hotels to plot, in order of first appearance
hotel_ids = combined_df["hotel_id"].unique()

for hotel_id in hotel_ids:
    # History and forecast rows of the current hotel
    hotel_hist = combined_df[combined_df["hotel_id"] == hotel_id]
    hotel_forecast = all_hotel_forecasts_From2021_to_Jan2025onwards[
        all_hotel_forecasts_From2021_to_Jan2025onwards["hotel"] == hotel_id
    ]

    # Explicit figure/axes so every call targets this hotel's figure
    fig, ax = plt.subplots(figsize=(16, 8))

    # Observed prices and point prediction
    ax.plot(hotel_hist["ds"], hotel_hist["y"], label="Observed Data", color="navy", alpha=0.8)
    ax.plot(hotel_forecast["ds"], hotel_forecast["yhat"], label="Predicted Data", color="firebrick", alpha=0.6)

    # Shaded band between the lower and upper prediction bounds
    ax.fill_between(
        hotel_forecast["ds"],
        hotel_forecast["yhat_lower"],
        hotel_forecast["yhat_upper"],
        color="firebrick", alpha=0.2, label="Uncertainty Interval"
    )

    # One tick per month, labelled Year-Month and rotated vertically
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    fig.autofmt_xdate(rotation=90)

    # Title, axis labels, legend and grid
    ax.set_title(f"Hotel Data Forecast - Hotel ID {hotel_id}", fontsize=16)
    ax.set_xlabel("\n Exact Date", fontsize=14)
    ax.set_ylabel("Average Daily Price (Local Currency) \n", fontsize=14)
    ax.legend(fontsize=12)
    ax.grid(True, linestyle="--", alpha=0.6)

    # File name carries the hotel id
    fig.savefig(f"../out/figures/png/hotel_forecast_{hotel_id}.png", dpi=300, bbox_inches="tight")

    # Release the figure before the next iteration to keep memory flat
    plt.close(fig)

print("All plots saved successfully!")

In [None]:
# Write all per-hotel forecast plots into a single multi-page PDF.
sns.set_theme()

# Hotels to plot, in order of first appearance
hotel_ids = combined_df["hotel_id"].unique()

# PdfPages does not create missing folders; make sure the target exists
pdf_path = "../out/figures/pdf/hotel_forecasts_prophet_365_2025.pdf"
os.makedirs(os.path.dirname(pdf_path), exist_ok=True)

with PdfPages(pdf_path) as pdf:
    for hotel_id in hotel_ids:
        # History and forecast rows of the current hotel
        hotel_hist = combined_df[combined_df["hotel_id"] == hotel_id]
        hotel_forecast = all_hotel_forecasts_From2021_to_Jan2025onwards[
            all_hotel_forecasts_From2021_to_Jan2025onwards["hotel"] == hotel_id
        ]

        # Explicit figure/axes so every call targets this hotel's page
        fig, ax = plt.subplots(figsize=(16, 8))

        # Observed prices and point prediction
        ax.plot(hotel_hist["ds"], hotel_hist["y"], label="Observed Data", color="navy", alpha=0.8)
        ax.plot(hotel_forecast["ds"], hotel_forecast["yhat"], label="Predicted Data", color="firebrick", alpha=0.6)

        # Shaded band between the lower and upper prediction bounds
        ax.fill_between(
            hotel_forecast["ds"],
            hotel_forecast["yhat_lower"],
            hotel_forecast["yhat_upper"],
            color="firebrick", alpha=0.2, label="Uncertainty Interval"
        )

        # One tick per month, labelled Year-Month and rotated vertically
        ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
        fig.autofmt_xdate(rotation=90)

        # Title, axis labels, legend and grid
        ax.set_title(f"Hotel Data Forecast - Hotel ID {hotel_id}", fontsize=16)
        ax.set_xlabel("\n Exact Date", fontsize=14)
        ax.set_ylabel("Average Daily Price (Local Currency) \n", fontsize=14)
        ax.legend(fontsize=12)
        ax.grid(True, linestyle="--", alpha=0.6)

        # Append this figure as a new page, then release it
        pdf.savefig(fig)
        plt.close(fig)

# Report the file that was actually written
print(f"All plots saved successfully in '{os.path.basename(pdf_path)}'!")

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Write one interactive HTML plot per selected hotel.

# Hotels to plot, in order of first appearance
hotel_ids = combined_df["hotel_id"].unique()

output_dir = "../out/figures/html/"
# write_html does not create missing folders; make sure the target exists
os.makedirs(output_dir, exist_ok=True)


for hotel_id in hotel_ids:
    # History and forecast rows of the current hotel
    hotel_hist = combined_df[combined_df["hotel_id"] == hotel_id]
    hotel_forecast = all_hotel_forecasts_From2021_to_Jan2025onwards[
        all_hotel_forecasts_From2021_to_Jan2025onwards["hotel"] == hotel_id
    ]

    fig = go.Figure()

    # Observed prices
    fig.add_trace(go.Scatter(
        x=hotel_hist["ds"],
        y=hotel_hist["y"],
        mode="lines",
        name="Observed Data",
        line=dict(color="navy")
    ))

    # Point prediction
    fig.add_trace(go.Scatter(
        x=hotel_forecast["ds"],
        y=hotel_forecast["yhat"],
        mode="lines",
        name="Predicted Data",
        line=dict(color="firebrick", dash="dash")
    ))

    # Uncertainty band drawn as one closed polygon: the upper bound left to
    # right, followed by the lower bound right to left
    fig.add_trace(go.Scatter(
        x=hotel_forecast["ds"].tolist() + hotel_forecast["ds"].tolist()[::-1],
        y=hotel_forecast["yhat_upper"].tolist() + hotel_forecast["yhat_lower"].tolist()[::-1],
        fill="toself",
        fillcolor="rgba(178,34,34,0.2)",
        line=dict(color="rgba(255,255,255,0)"),
        name="Uncertainty Interval"
    ))

    # Titles, tick format and theme
    fig.update_layout(
        title=f"Hotel Data Forecast - Hotel ID {hotel_id}",
        xaxis_title="Exact Date",
        yaxis_title="Average Daily Price (Local Currency)",
        xaxis=dict(tickformat="%Y-%m", tickangle=90),
        template="plotly_white"
    )

    # File name carries the hotel id
    output_path = os.path.join(output_dir, f"hotel_forecast_{hotel_id}.html")
    fig.write_html(output_path)

print("All interactive plots saved successfully!")
