In [None]:
import pandas as pd
import numpy as np  
import os
from prophet import Prophet
import matplotlib.pyplot as plt

In [None]:
# Load every CSV found in <parent of working dir>/data/other/... and stack them
# row-wise into `combined_df`, checking that no rows were lost on the way.

# Build the input folder path relative to the parent of the working directory
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data = "data/other/2021_Jan2025_PricesHotelsDates"
data_dir = os.path.join(parent_dir, data)

# Per-file DataFrames and a running row count used to verify the concatenation
dataframes = []
total_rows = 0

try:
    # os.listdir returns entries in arbitrary order; sorting makes the row order
    # of combined_df reproducible across machines and runs
    for file_name in sorted(os.listdir(data_dir)):

        if not file_name.endswith('.csv'):  # Only CSV files are loaded
            continue

        file_path = os.path.join(data_dir, file_name)

        # Read the CSV file and keep it for the final concatenation
        df = pd.read_csv(file_path)
        dataframes.append(df)

        # Print dimensions of the current file
        print(f"File: {file_name} | Dimensions: {df.shape}")

        # Add the number of rows to the total count
        total_rows += df.shape[0]

    # Report an empty folder explicitly instead of relying on pandas'
    # "No objects to concatenate" error
    if not dataframes:
        raise ValueError(f"No CSV files found in '{data_dir}'.")

    # Concatenate all DataFrames in the list by binding rows
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Print dimensions of the combined DataFrame
    print(f"Combined DataFrame Dimensions: {combined_df.shape}")

    # Verify the sum of rows matches
    if total_rows == combined_df.shape[0]:
        print("Row count verification successful! Total rows match.")
    else:
        print("Row count verification failed! Mismatch in row count.")

    print(combined_df.head())  # Display the first few rows of the combined DataFrame

# NOTE: when loading fails, combined_df is never assigned, so the cells that
# follow will raise NameError until the problem reported here is fixed.
except FileNotFoundError:
    print(f"Folder '{data_dir}' not found.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# Dimensions (rows, columns) of the concatenated DataFrame
combined_df.shape

In [6]:
# Remove the loader's temporary names from the notebook namespace (combined_df is kept).
# Raises NameError if any of them was never assigned, e.g. when no CSV file was read.
del current_dir, data, data_dir, dataframes, df, file_name, file_path, parent_dir, total_rows

In [None]:
# Load the "moedas" lookup CSV (supplies the 'Moeda' and 'Cotacao_USD' columns used
# further down). The path is built with os.path.join so it also resolves on
# non-Windows systems; the hard-coded backslash form only worked on Windows.
data_lake_prd_314410_cz_moedas = pd.read_csv(
    os.path.join(os.pardir, 'data', 'lookups', 'data-lake-prd-314410.cz.moedas.csv')
)
# Call .head() - the bare attribute only displayed the bound method, not the rows
data_lake_prd_314410_cz_moedas.head()

In [None]:
# Load the "hoteis" lookup CSV (supplies the 'Hotel_ID' and 'Moeda' columns used
# further down). The path is built with os.path.join so it also resolves on
# non-Windows systems; the hard-coded backslash form only worked on Windows.
data_lake_prd_314410_cz_hoteis = pd.read_csv(
    os.path.join(os.pardir, 'data', 'lookups', 'data-lake-prd-314410.cz.hoteis.csv')
)
# Call .head() - the bare attribute only displayed the bound method, not the rows
data_lake_prd_314410_cz_hoteis.head()

In [None]:
# Keep only the hotel identifier and its 'Moeda' column from the hotel lookup
data_lake_prd_314410_cz_hoteis = data_lake_prd_314410_cz_hoteis.loc[:, ['Hotel_ID', 'Moeda']]

In [16]:
# Left-join 'Cotacao_USD' (presumably the USD exchange rate - confirm) onto every
# hotel row via the shared 'Moeda' key; hotels without a match get NaN.
usd_rates = data_lake_prd_314410_cz_moedas[['Moeda', 'Cotacao_USD']]
data_lake_prd_314410_cz_hoteis = data_lake_prd_314410_cz_hoteis.merge(
    usd_rates,
    how='left',
    on='Moeda',
)


In [18]:
# Discard every hotel row that has a missing value in any column
data_lake_prd_314410_cz_hoteis = data_lake_prd_314410_cz_hoteis.dropna(axis=0, how='any')

In [None]:
data_lake_prd_314410_cz_hoteis # Hotel_ID / Moeda / Cotacao_USD lookup, kept in case all prices should be converted to dollars

In [None]:
# Inspect the concatenated data before any cleaning
combined_df

In [None]:
# Parse 'Data' into datetimes and strip any timezone so the values are tz-naive
combined_df['Data'] = pd.to_datetime(combined_df['Data']).dt.tz_localize(None)
# Call .head() - the bare attribute only displayed the bound method, not the rows
combined_df.head()

In [None]:

# Ids of hotels that have at least one record dated in 2024
is_2024 = combined_df['Data'].dt.year.eq(2024)
hotels_2024 = combined_df.loc[is_2024, 'Hotel_ID'].unique()

# Keep the full history (all years) of those hotels only
combined_df = combined_df.loc[combined_df['Hotel_ID'].isin(hotels_2024)]

# Show the result
combined_df

In [None]:
# Number of distinct hotels that have at least one 2024 record
hotels_2024.shape

In [26]:
# Reduce the table to the three columns the forecasting cells use and give them
# the names Prophet expects ('ds' for the date, 'y' for the value).
combined_df = (
    combined_df[['Data', 'Hotel_ID', 'DiariaMedia']]
    .rename(columns={'Data': 'ds', 'Hotel_ID': 'hotel_id', 'DiariaMedia': 'y'})
    .loc[:, ['hotel_id', 'ds', 'y']]
)

In [None]:
# Collapse duplicates: one row per (hotel_id, ds) holding the mean of 'y'
combined_df = combined_df.groupby(['hotel_id', 'ds'], as_index=False)['y'].mean()
combined_df.shape

In [None]:
# Date range covered by the data: earliest date first, then the latest
ds_values = combined_df['ds']
print(ds_values.min())
print(ds_values.max())

In [None]:
# Inspect the table after aggregation (columns: hotel_id, ds, y)
combined_df

In [None]:


# Rows of two hand-picked hotels (ids 8274 and 15392)
hotel_data = combined_df.loc[combined_df['hotel_id'].isin([8274, 15392])]


In [None]:
# Reproducible random pick of 10 distinct hotel ids (fixed seed 42)
unique_hotels = combined_df['hotel_id'].drop_duplicates()
hotel_list = unique_hotels.sample(n=10, random_state=42).tolist()
print(hotel_list)

In [None]:
# Group by 'hotel_id' and count the number of rows for each hotel
hotel_list = combined_df.groupby('hotel_id').size().sort_values(ascending=False).head(10).index.tolist()

print(hotel_list)

In [None]:
# Every distinct hotel id as a pandas Series; replaces any hotel_list built earlier
hotel_list = combined_df['hotel_id'].drop_duplicates()
print(hotel_list)

In [None]:

# Fit one Prophet model per hotel in `hotel_list`, forecast one year ahead, and
# collect everything in `all_forecasts` (also written to ../out/hotel_forecasts.csv).

# Per-hotel forecast frames; concatenated once after the loop because calling
# pd.concat inside the loop re-copies all earlier results on every iteration
forecast_frames = []

for hotel in hotel_list:
    print(f"Processing forecast for {hotel}...")

    # History (hotel_id, ds, y) of the current hotel
    hotel_data = combined_df[combined_df['hotel_id'] == hotel]

    # Prophet refuses to fit fewer than 2 non-NaN observations; skip such hotels
    # instead of losing the whole run to a ValueError
    if hotel_data['y'].notna().sum() < 2:
        print(f"Skipping {hotel}: fewer than 2 non-NaN observations.")
        continue

    # Fit the model
    model = Prophet()
    model.fit(hotel_data)

    # Forecast 1 year (365 daily periods) past the end of the history
    future = model.make_future_dataframe(periods=365)
    forecast = model.predict(future)

    # Tag every row with the hotel it belongs to
    forecast["hotel"] = hotel

    # Mark historical vs. forecasted rows: anything up to the hotel's last
    # observed date counts as historical
    last_observed = hotel_data["ds"].max()
    forecast["data_type"] = np.where(forecast["ds"] <= last_observed, "historical", "forecast")

    forecast_frames.append(forecast)

# pd.concat rejects an empty list, so fall back to an empty frame
all_forecasts = pd.concat(forecast_frames, ignore_index=True) if forecast_frames else pd.DataFrame()

# Write the results; the path is built portably (the old "..\out\..." literal relied
# on invalid escape sequences and Windows separators) and the folder is created
# if missing so a long run is not lost at the very end
out_dir = os.path.join(os.pardir, "out")
os.makedirs(out_dir, exist_ok=True)
all_forecasts.to_csv(os.path.join(out_dir, "hotel_forecasts.csv"), index=False)

print("Forecasting complete. Results saved in 'hotel_forecasts.csv'.")


In [None]:
# Number of rows available for each hotel
hotel_counts = combined_df.groupby('hotel_id').size()

# Ids of the hotels that have strictly more than 30 rows
hotels_with_more_than_30_rows = hotel_counts.loc[hotel_counts.gt(30)].index

# Restrict the table to those hotels
filtered_combined_df = combined_df.loc[combined_df['hotel_id'].isin(hotels_with_more_than_30_rows)]

# Show the result
filtered_combined_df

In [None]:
# Table size before and after keeping only hotels with more than 30 rows
print(combined_df.shape)
print(filtered_combined_df.shape)

In [None]:
# Distinct ids of the hotels that passed the row-count filter, as a pandas Series
hotel_list = filtered_combined_df['hotel_id'].drop_duplicates()
print(hotel_list.shape)

In [None]:
# Forecast the hotels in `hotel_list` chunk by chunk, writing one CSV per chunk so
# that a failure late in the run does not discard the chunks already finished.
# `hotel_list` must be a pandas Series of hotel ids (boolean masks are used below).

# Exclusive upper bounds of the 12 hotel-id ranges. Chunk 1 is every id < 5000;
# each later chunk is [previous bound, this bound).
chunk_upper_bounds = [5000, 10000, 12500, 13000, 13500, 14000,
                      14500, 14750, 15000, 17500, 20000, 25000]

# List of chunks to process
chunks = []
lower_bound = None  # the first chunk has no lower bound
for upper_bound in chunk_upper_bounds:
    in_range = hotel_list < upper_bound
    if lower_bound is not None:
        in_range &= hotel_list >= lower_bound
    chunks.append(hotel_list[in_range])
    lower_bound = upper_bound

# Ids at or above the last bound fall into no chunk; say so instead of silently
# leaving those hotels without a forecast
n_unassigned = len(hotel_list) - sum(len(chunk) for chunk in chunks)
if n_unassigned:
    print(f"Warning: {n_unassigned} hotels fall outside the chunk ranges "
          f"(ids >= {chunk_upper_bounds[-1]}) and will not be forecast.")

# Create the output folder up front so hours of fitting are not lost on save
out_dir = os.path.join(os.pardir, "out")
os.makedirs(out_dir, exist_ok=True)

# Loop through each chunk
for idx, chunk in enumerate(chunks):
    print(f"Processing chunk {idx + 1} with {len(chunk)} hotels...")

    # Per-hotel forecast frames of this chunk; concatenated once after the inner
    # loop because pd.concat inside the loop re-copies all earlier results
    chunk_frames = []

    # Loop through each hotel in the current chunk
    for hotel in chunk:
        print(f"Processing forecast for Hotel {hotel}...")

        # History (hotel_id, ds, y) of the current hotel
        hotel_data = combined_df[combined_df['hotel_id'] == hotel]

        # Prophet refuses to fit fewer than 2 non-NaN observations; skip such
        # hotels instead of aborting the whole chunk with a ValueError
        if hotel_data['y'].notna().sum() < 2:
            print(f"Skipping Hotel {hotel}: fewer than 2 non-NaN observations.")
            continue

        # Fit the model
        model = Prophet()
        model.fit(hotel_data)

        # Forecast 1 year (365 daily periods) past the end of the history
        future = model.make_future_dataframe(periods=365)
        forecast = model.predict(future)

        # Tag every row with the hotel it belongs to
        forecast["hotel"] = hotel

        # Mark historical vs. forecasted rows: anything up to the hotel's last
        # observed date counts as historical
        last_observed = hotel_data["ds"].max()
        forecast["data_type"] = np.where(forecast["ds"] <= last_observed, "historical", "forecast")

        chunk_frames.append(forecast)

    # pd.concat rejects an empty list, so fall back to an empty frame
    all_forecasts_chunk = pd.concat(chunk_frames, ignore_index=True) if chunk_frames else pd.DataFrame()

    # Save the results for this chunk to a CSV file
    all_forecasts_chunk.to_csv(os.path.join(out_dir, f"hotel_forecasts_chunk_{idx + 1}.csv"), index=False)
    print(f"Chunk {idx + 1} forecasting complete. Results saved as 'hotel_forecasts_chunk_{idx + 1}.csv'.")

print("All chunks processed successfully.")

In [None]:
# Spot-check the rows of a single hotel (id 14358)
combined_df[combined_df['hotel_id']==14358]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One stacked panel per hotel in `hotel_list`, drawing Prophet's fitted values
# (yhat) for the historical period and for the forecast horizon.

sns.set_theme()

# squeeze=False always returns a 2-D array of axes, so the single-hotel case
# needs no special handling; take the only column to index panels by position
n_hotels = len(hotel_list)
fig, axes = plt.subplots(n_hotels, 1, figsize=(12, 6 * n_hotels), sharex=True, squeeze=False)
axes = axes[:, 0]

# Line colour per data_type value
colors = {"historical": "navy", "forecast": "firebrick"}

# (data_type value, legend label, extra line styling) for the two series drawn
series_specs = (
    ("historical", "Historical", {}),
    ("forecast", "Forecast", {"linestyle": "dashed"}),
)

for i, hotel in enumerate(hotel_list):
    ax = axes[i]

    # Forecast rows belonging to this hotel
    hotel_forecast = all_forecasts[all_forecasts["hotel"] == hotel]

    # Historical part first, then the forecast part
    for data_type, label, extra_style in series_specs:
        sns.lineplot(
            ax=ax,
            data=hotel_forecast[hotel_forecast["data_type"] == data_type],
            x="ds",
            y="yhat",
            label=label,
            color=colors[data_type],
            **extra_style,
        )

    # Panel cosmetics
    ax.set_title(f"Hotel {hotel} Forecast")
    ax.set_xlabel("Date")
    ax.set_ylabel("Prices [Local Currency]")
    ax.legend()
    ax.grid(True)

# Keep titles and labels from overlapping, then render
plt.tight_layout()
plt.show()



In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd

# In-sample error (MAE, RMSE, MAPE) of the Prophet fit for every hotel in
# `hotel_list`, collected in `error_df`.

# Set to True to also write the metrics table to hotel_error_metrics.csv
SAVE_ERROR_METRICS = False

# One dict of metrics per hotel
error_metrics = []

for hotel_id in hotel_list:
    # Predictions and observed values of this hotel
    hotel_forecast = all_forecasts[all_forecasts["hotel"] == hotel_id]
    historical_data = combined_df[combined_df["hotel_id"] == hotel_id]

    # Align observed and predicted values on the date; rows missing either value
    # are dropped because the sklearn metrics reject NaN
    comparison = (
        historical_data
        .merge(hotel_forecast, on="ds", how="inner")
        .dropna(subset=["y", "yhat"])
    )

    # No overlap (e.g. the hotel was never forecast): record NaN metrics instead
    # of letting sklearn raise on empty input
    if comparison.empty:
        error_metrics.append({"hotel_id": hotel_id, "MAE": np.nan, "RMSE": np.nan, "MAPE": np.nan})
        continue

    mae = mean_absolute_error(comparison["y"], comparison["yhat"])
    rmse = np.sqrt(mean_squared_error(comparison["y"], comparison["yhat"]))

    # MAPE is undefined where the observed value is 0; leave those rows out so a
    # single zero does not turn the metric into inf/NaN
    nonzero = comparison["y"] != 0
    if nonzero.any():
        observed = comparison.loc[nonzero, "y"]
        predicted = comparison.loc[nonzero, "yhat"]
        mape = np.mean(np.abs((observed - predicted) / observed)) * 100
    else:
        mape = np.nan

    error_metrics.append({"hotel_id": hotel_id, "MAE": mae, "RMSE": rmse, "MAPE": mape})

# Convert results to a DataFrame
error_df = pd.DataFrame(error_metrics)

# Display the error metrics table
print(error_df)

# Only claim the file was saved when it actually was (the message used to be
# printed even though the to_csv call was commented out)
if SAVE_ERROR_METRICS:
    error_df.to_csv("hotel_error_metrics.csv", index=False)
    print("Error metrics saved to 'hotel_error_metrics.csv'.")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One stacked panel per hotel in `hotel_list`: observed values (y) against
# Prophet's fitted values (yhat) over the dates where both exist.

sns.set_theme()

# squeeze=False always returns a 2-D array of axes, which covers the
# single-hotel case without a special branch
n_panels = len(hotel_list)
fig, axes = plt.subplots(n_panels, 1, figsize=(12, 6 * n_panels), sharex=True, squeeze=False)
axes = axes.ravel()

for ax, hotel_id in zip(axes, hotel_list):
    # Observed history and model output for this hotel
    is_this_forecast = all_forecasts["hotel"] == hotel_id
    is_this_history = combined_df["hotel_id"] == hotel_id
    hotel_forecast = all_forecasts[is_this_forecast]
    historical_data = combined_df[is_this_history]

    # Inner join on the date keeps only days with both an actual and a prediction
    comparison = historical_data.merge(hotel_forecast, on="ds", how="inner")

    # Actual values as a solid line, predictions as a dashed one
    sns.lineplot(ax=ax, data=comparison, x="ds", y="y", label="Actual", color="navy")
    sns.lineplot(
        ax=ax, data=comparison, x="ds", y="yhat",
        label="Predicted", color="firebrick", linestyle="dashed",
    )

    # Panel cosmetics
    ax.set_title(f"Actual vs Predicted for Hotel {hotel_id}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Value")
    ax.legend()
    ax.grid(True)

# Keep titles and labels from overlapping, then render
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One scatter panel per hotel in `hotel_list`: observed value on x, predicted
# value on y, with the diagonal of perfect predictions drawn for reference.

sns.set_theme()

# Independent axes per panel; squeeze=False always yields a 2-D array of axes,
# so one hotel is handled like many
panel_count = len(hotel_list)
fig, axes = plt.subplots(
    panel_count, 1,
    figsize=(8, 8 * panel_count),
    sharex=False, sharey=False,
    squeeze=False,
)
axes = axes[:, 0]

for i, hotel_id in enumerate(hotel_list):
    panel = axes[i]

    # Observed history and model output for this hotel
    hotel_forecast = all_forecasts.loc[all_forecasts["hotel"] == hotel_id]
    historical_data = combined_df.loc[combined_df["hotel_id"] == hotel_id]

    # Inner join on the date keeps only days with both an actual and a prediction
    comparison = historical_data.merge(hotel_forecast, on="ds", how="inner")
    observed = comparison["y"]
    predicted = comparison["yhat"]

    # Actual vs. predicted point cloud
    sns.scatterplot(ax=panel, x=observed, y=predicted, alpha=0.4)

    # Diagonal spanning the joint value range marks where prediction == observation
    min_val = min(observed.min(), predicted.min())
    max_val = max(observed.max(), predicted.max())
    diagonal = [min_val, max_val]
    panel.plot(diagonal, diagonal, linestyle="dashed", color="red", label="Perfect Fit")

    # Panel cosmetics
    panel.set_title(f"Observed vs. Predicted for Hotel {hotel_id}")
    panel.set_xlabel("Observed (Actual) Values")
    panel.set_ylabel("Predicted Values")
    panel.legend()
    panel.grid(True)

# Keep titles and labels from overlapping, then render
plt.tight_layout()
plt.show()


In [None]:
# TODO
# - Run for all the hotels
# - Calculate the error for all the hotels and create a new summary error table
# - Plot some over time and some actual vs. predicted
# - Store everything
#
# - REDO all using the lag version