In [None]:
import pandas as pd
import numpy as np  
import os
from prophet import Prophet
import matplotlib.pyplot as plt

In [None]:
# Load every CSV file from the data folder and stack them into `combined_df`.
#
# The folder is resolved relative to the parent of the current working
# directory, so the notebook is expected to run from a sub-folder of the project.

# Get the current directory and step one folder up
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Where the files are located (relative to the parent directory)
data = "data/other/2021_Jan2025_PricesHotelsDates"
data_dir = os.path.join(parent_dir, data)

# One DataFrame per CSV file, plus a running row count used to verify the concat
dataframes = []
total_rows = 0

try:
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the combined frame reproducible from run to run.
    for file_name in sorted(os.listdir(data_dir)):

        if not file_name.endswith('.csv'):  # Skip anything that is not a CSV
            continue

        file_path = os.path.join(data_dir, file_name)

        # Read the CSV file into a DataFrame and keep it for the final concat
        df = pd.read_csv(file_path)
        dataframes.append(df)

        # Print dimensions of the current file
        print(f"File: {file_name} | Dimensions: {df.shape}")

        # Add the number of rows to the total count
        total_rows += df.shape[0]

    # Fail with a clear message instead of pandas' "No objects to concatenate"
    if not dataframes:
        raise ValueError(f"No CSV files found in '{data_dir}'.")

    # Concatenate all DataFrames in the list by binding rows
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Print dimensions of the combined DataFrame
    print(f"Combined DataFrame Dimensions: {combined_df.shape}")

    # Verify the sum of rows matches
    if total_rows == combined_df.shape[0]:
        print("Row count verification successful! Total rows match.")
    else:
        print("Row count verification failed! Mismatch in row count.")

    print(combined_df.head())  # Display the first few rows of the combined DataFrame

except FileNotFoundError:
    print(f"Folder '{data_dir}' not found.")
    # Re-raise: every later cell needs `combined_df`, so continuing would only
    # produce a confusing NameError (or silently reuse a stale frame).
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise

In [None]:
# Show the (rows, columns) shape of the combined DataFrame
combined_df.shape

In [4]:
# Remove the temporary names created while loading the CSV files;
# only `combined_df` is needed from here on.
del (
    current_dir, data, data_dir, dataframes, df,
    file_name, file_path, parent_dir, total_rows,
)

In [None]:
# Display the combined DataFrame
combined_df

In [None]:
# Parse the 'Data' column as datetimes and strip any timezone information,
# leaving timezone-naive timestamps.
combined_df['Data'] = pd.to_datetime(combined_df['Data']).dt.tz_localize(None)

# Preview the first rows. head must be *called*: without the parentheses the
# cell only displays the bound method's repr, not the data.
combined_df.head()

In [None]:

# Collect the IDs of every hotel that has at least one record dated in 2024
in_2024 = combined_df['Data'].dt.year.eq(2024)
hotels_2024 = combined_df.loc[in_2024, 'Hotel_ID'].unique()

# Keep all rows (from any year) that belong to one of those hotels
belongs_to_2024_hotel = combined_df['Hotel_ID'].isin(hotels_2024)
combined_df = combined_df.loc[belongs_to_2024_hotel]

# Show the filtered DataFrame
combined_df

In [None]:
# Shape of the array of distinct Hotel_ID values that appear in 2024
hotels_2024.shape

In [10]:
# Reduce the frame to the three columns used for forecasting, renamed to
# hotel_id / ds / y and ordered with the hotel identifier first.
combined_df = (
    combined_df
    .rename(columns={'Data': 'ds', 'Hotel_ID': 'hotel_id', 'DiariaMedia': 'y'})
    .loc[:, ['hotel_id', 'ds', 'y']]
)

In [None]:
# Collapse rows sharing the same (hotel_id, ds) pair into one row holding the mean of y
combined_df = (
    combined_df
    .groupby(['hotel_id', 'ds'])['y']
    .mean()
    .reset_index()
)
combined_df.shape

In [None]:
# Print the earliest and then the latest date present in the data
date_column = combined_df['ds']
print(date_column.min())
print(date_column.max())

In [None]:
# Drop every row dated after the cutoff; rows with ds <= cutoff are kept
cutoff_date = '2024-06-30'
on_or_before_cutoff = combined_df['ds'] <= cutoff_date
combined_df = combined_df.loc[on_or_before_cutoff]
combined_df

In [None]:
# Number of rows available for each hotel
hotel_counts = combined_df.groupby('hotel_id').size()

# IDs of the hotels that have more than 30 rows
hotels_with_more_than_30_rows = hotel_counts.loc[hotel_counts.gt(30)].index

# Keep only the rows that belong to those hotels
has_enough_rows = combined_df['hotel_id'].isin(hotels_with_more_than_30_rows)
filtered_combined_df = combined_df.loc[has_enough_rows]

# Show the filtered DataFrame
filtered_combined_df

In [None]:
# Compare the frame sizes before and after the row-count filter
for frame in (combined_df, filtered_combined_df):
    print(frame.shape)

In [None]:
# One entry per hotel that passed the row-count filter
# (first occurrence kept, original index labels preserved)
hotel_ids = filtered_combined_df['hotel_id']
hotel_list = hotel_ids.loc[~hotel_ids.duplicated()]
print(hotel_list.shape)

In [None]:
# Fit one Prophet model per hotel and write the forecasts to CSV, one file per
# chunk of hotels.
#
# The hotel list is split into 12 chunks by hotel_id range; each chunk's
# results are saved separately so a failure only costs the chunk in progress.
chunk_1 = hotel_list[hotel_list < 5000]
chunk_2 = hotel_list[(hotel_list >= 5000) & (hotel_list < 10000)]
chunk_3 = hotel_list[(hotel_list >= 10000) & (hotel_list < 12500)]
chunk_4 = hotel_list[(hotel_list >= 12500) & (hotel_list < 13000)]
chunk_5 = hotel_list[(hotel_list >= 13000) & (hotel_list < 13500)]
chunk_6 = hotel_list[(hotel_list >= 13500) & (hotel_list < 14000)]
chunk_7 = hotel_list[(hotel_list >= 14000) & (hotel_list < 14500)]
chunk_8 = hotel_list[(hotel_list >= 14500) & (hotel_list < 14750)]
chunk_9 = hotel_list[(hotel_list >= 14750) & (hotel_list < 15000)]
chunk_10 = hotel_list[(hotel_list >= 15000) & (hotel_list < 17500)]
chunk_11 = hotel_list[(hotel_list >= 17500) & (hotel_list < 20000)]
chunk_12 = hotel_list[(hotel_list >= 20000) & (hotel_list < 25000)]


# List of chunks to process
chunks = [ chunk_1, chunk_2, chunk_3, chunk_4, chunk_5, chunk_6, chunk_7, chunk_8, chunk_9, chunk_10, chunk_11, chunk_12]

# The ranges above end at 25000, so any hotel_id outside them would be skipped
# without notice. Report such hotels instead of dropping them silently.
covered_hotels = pd.concat(chunks)
uncovered_hotels = hotel_list[~hotel_list.isin(covered_hotels)]
if not uncovered_hotels.empty:
    print(
        f"WARNING: {len(uncovered_hotels)} hotels fall outside every chunk range "
        f"and will NOT be forecast: {uncovered_hotels.tolist()}"
    )

# Make sure the output folder exists up front, so a missing directory cannot
# fail the first save after a whole chunk has already been fitted.
output_dir = "../out"
os.makedirs(output_dir, exist_ok=True)

# Loop through each chunk
for idx, chunk in enumerate(chunks):
    print(f"Processing chunk {idx + 1} with {len(chunk)} hotels...")

    # Per-hotel forecasts for this chunk; concatenated once after the loop
    # (concatenating inside the loop re-copies all earlier rows every time).
    chunk_forecasts = []

    # Loop through each hotel in the current chunk
    for hotel in chunk:
        print(f"Processing forecast for Hotel {hotel}...")

        # Rows (hotel_id, ds, y) of this hotel only
        hotel_data = combined_df[combined_df['hotel_id'] == hotel]

        # Fit the model
        model = Prophet()
        model.fit(hotel_data)

        # Create future dataframe for predictions
        future = model.make_future_dataframe(periods=365)  # Forecast 1 year ahead

        # Make predictions
        forecast = model.predict(future)

        # Add hotel id for identification
        forecast["hotel"] = hotel

        # Mark historical vs. forecasted rows: anything up to the hotel's last
        # observed date is historical. The date is computed once, not per row.
        last_observed = hotel_data["ds"].max()
        forecast["data_type"] = np.where(forecast["ds"] <= last_observed, "historical", "forecast")

        chunk_forecasts.append(forecast)

    # An empty chunk still produces an (empty) output file, as before
    if chunk_forecasts:
        all_forecasts_chunk = pd.concat(chunk_forecasts, ignore_index=True)
    else:
        all_forecasts_chunk = pd.DataFrame()

    # Save the results for this chunk to a CSV file
    all_forecasts_chunk.to_csv(f"{output_dir}/hotel_forecasts_chunk_{idx + 1}.csv", index=False)

    print(f"Chunk {idx + 1} forecasting complete. Results saved as 'hotel_forecasts_chunk_{idx + 1}.csv'.")

print("All chunks processed successfully.")