In [2]:
import pandas as pd
import numpy as np  
import os

In [3]:
# Get the current directory
current_dir = os.getcwd()

# Navigate one folder up
parent_dir = os.path.dirname(current_dir)

# Where the files are located
data = 'data\pull-pesquisas-city-2851556'

# Navigate down into the "data" folder
data_dir = os.path.join(parent_dir, data)


In [4]:
# Initialize an empty list to store DataFrames
dataframes = []

# Variable to track total rows
total_rows = 0

# Loop through all files in the "data" folder
try:
    for file_name in os.listdir(data_dir):
        
        if file_name.endswith('.csv'):  # Check if the file is a CSV
            
            file_path = os.path.join(data_dir, file_name)
            
            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)
            dataframes.append(df)  # Append the DataFrame to the list
            
             # Print dimensions of the current file
            print(f"File: {file_name} | Dimensions: {df.shape}")
            
            # Add the number of rows to the total count
            total_rows += df.shape[0]

    # Concatenate all DataFrames in the list by binding rows
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Print dimensions of the combined DataFrame
    print(f"Combined DataFrame Dimensions: {combined_df.shape}")

    # Verify the sum of rows matches
    if total_rows == combined_df.shape[0]:
        print("Row count verification successful! Total rows match.")
    else:
        print("Row count verification failed! Mismatch in row count.")

    print(combined_df.head())  # Display the first few rows of the combined DataFrame

except FileNotFoundError:
    print(f"Folder '{data_dir}' not found.")
except Exception as e:
    print(f"An error occurred: {e}")

File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2020.csv | Dimensions: (1, 22)
File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2021.csv | Dimensions: (115243, 22)
File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2022.csv | Dimensions: (1247986, 22)
File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2023.csv | Dimensions: (1430037, 22)
File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2024.csv | Dimensions: (1579543, 22)
Combined DataFrame Dimensions: (4372810, 22)
Row count verification successful! Total rows match.
                      Data   Data_ID  Canal_ID  Credencial_ID  Integrador_ID  \
0  2020-06-05 00:00:00 UTC  20200605       367              0             50   
1  2021-12-24 00:00:00 UTC  20211224         1              0              0   
2  2021-12-24 00:00:00 UTC  20211224        10              0              0   
3  2021-12-24 00:00:00 UTC  20211224       909           8112            120   
4  2021-12-24 00:00:00 UTC  202112

In [5]:
data_id_min = combined_df['Data_ID'].min()
data_id_max = combined_df['Data_ID'].max()
print(f"Min Data_ID: {data_id_min}, Max Data_ID: {data_id_max}")

Min Data_ID: 20200605, Max Data_ID: 20241231


In [6]:
distinct_hotel_ids = combined_df['Hotel_ID'].nunique()
print(f"Number of distinct Hotel_IDs: {distinct_hotel_ids}")

Number of distinct Hotel_IDs: 414


In [7]:
print(f"Number of distinct Hotel_IDs: {combined_df['Hotel_ID'].nunique()}")
print(f"Number of distinct Moeda_IDs: {combined_df['Moeda_ID'].nunique()}")
print(f"Number of distinct Canal_IDs: {combined_df['Canal_ID'].nunique()}")
print(f"Number of distinct Reservas: {combined_df['Reservas'].nunique()}")
print(f"Number of distinct DiariaMedia: {combined_df['DiariaMedia'].nunique()}")
print(f"Number of distinct Estadias: {combined_df['Estadia'].nunique()}")

Number of distinct Hotel_IDs: 414
Number of distinct Moeda_IDs: 2
Number of distinct Canal_IDs: 366
Number of distinct Reservas: 56
Number of distinct DiariaMedia: 472812
Number of distinct Estadias: 89


In [8]:
combined_df['DiariaMedia'] = combined_df.apply(
    lambda row: row['DiariaMedia'] * 0.16483969339817028 if row['Moeda_ID'] == 16 else row['DiariaMedia'], 
    axis=1
)

In [9]:
combined_df = combined_df.loc[:, ['Data',  'Hotel_ID', 'Ocupacao_ID', 'DiariaMedia', "Estadia", 'Reservas']]

In [11]:
combined_df = combined_df.loc[combined_df.index.repeat(combined_df['Reservas'])].reset_index(drop=True)
combined_df.Reservas = 1

In [12]:
combined_df.Reservas.sum() # 4925449 OK

4925449

In [21]:
hotels_top_5 = combined_df.groupby('Hotel_ID').size().reset_index(name='Counts').sort_values(by='Counts', ascending=False).head(1)['Hotel_ID']

In [22]:
hotels_top_5

69    2094
Name: Hotel_ID, dtype: int64

In [23]:
data = combined_df[combined_df['Hotel_ID'].isin(hotels_top_5)]
data.shape

(102270, 6)

In [25]:
data = data.loc[:, ['Data',  'Hotel_ID', 'DiariaMedia']]

In [26]:
data

Unnamed: 0,Data,Hotel_ID,DiariaMedia
29,2021-12-24 00:00:00 UTC,2094,15.332564
31,2021-12-24 00:00:00 UTC,2094,24.453969
33,2021-12-24 00:00:00 UTC,2094,25.835875
357,2021-12-29 00:00:00 UTC,2094,33.112998
362,2021-12-29 00:00:00 UTC,2094,23.792961
...,...,...,...
4925253,2024-09-23 00:00:00 UTC,2094,59.015907
4925265,2024-09-23 00:00:00 UTC,2094,72.859144
4925310,2024-01-27 00:00:00 UTC,2094,42.569268
4925331,2024-08-26 00:00:00 UTC,2094,27.563669


In [28]:
data.rename(columns={'Data': 'date', 'Hotel_ID': 'hotel_id', 'DiariaMedia': 'price'}, inplace=True)
data = data.loc[:, ['hotel_id',  'date', 'price']]

In [27]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [29]:
# Convert to datetime and sort
data['date'] = pd.to_datetime(data['date'])
data = data.sort_values(by=['hotel_id', 'date'])

In [33]:
# Normalize prices per hotel
scalers = {}
data['price_normalized'] = 0

for hotel_id, group in data.groupby('hotel_id'):
    # Create a new MinMaxScaler instance for each hotel
    scaler = MinMaxScaler()
    normalized_prices  = scaler.fit_transform(group[['price']])
    scalers[hotel_id] = scaler
    
    # Assign the normalized prices back to the corresponding rows in the dataframe
    data.loc[group.index, 'price_normalized'] = normalized_prices

  data.loc[group.index, 'price_normalized'] = normalized_prices


In [35]:
data

Unnamed: 0,hotel_id,date,price,price_normalized
1096,2094,2021-11-19 00:00:00+00:00,30.857991,0.058961
1224,2094,2021-11-19 00:00:00+00:00,32.709140,0.062498
1240,2094,2021-11-19 00:00:00+00:00,28.840353,0.055106
1279,2094,2021-11-19 00:00:00+00:00,23.143493,0.044220
1303,2094,2021-11-19 00:00:00+00:00,17.802687,0.034016
...,...,...,...,...
4847854,2094,2024-12-31 00:00:00+00:00,53.190472,0.101631
4847858,2094,2024-12-31 00:00:00+00:00,51.017885,0.097480
4847859,2094,2024-12-31 00:00:00+00:00,48.321981,0.092329
4847860,2094,2024-12-31 00:00:00+00:00,29.176626,0.055748


In [39]:
# Create sequences for lookback
lookback = 30  # Days used for prediction
forecast_horizon = 7  # Each step predicts 7 days

def create_sequences(data, lookback):
    X, y = [], []
    for i in range(len(data) - lookback - forecast_horizon + 1):
        X.append(data[i:i + lookback])
        y.append(data[i + lookback:i + lookback + forecast_horizon])
    return np.array(X), np.array(y)

# Create sequences per hotel
X_all, y_all = [], []
for hotel_id, group in data.groupby('hotel_id'):
    prices = group['price_normalized'].values
    X, y = create_sequences(prices, lookback)
    X_all.append(X)
    y_all.append(y)

# Combine all sequences
X_all = np.vstack(X_all)
y_all = np.vstack(y_all)

# Split into training and validation
X_train, X_val, y_train, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)  # (num_samples, lookback, 1)
print("Shape of y_train:", y_train.shape)  # (num_samples, forecast_horizon)

# Reshape targets to remove the extra dimension
y_train = y_train.reshape(y_train.shape[0], -1)  # Shape: (num_samples, forecast_horizon)
y_val = y_val.reshape(y_val.shape[0], -1)        # Shape: (num_samples, forecast_horizon)

print("New Shape of y_train:", y_train.shape)  # Should be (num_samples, forecast_horizon)
print("New Shape of y_val:", y_val.shape)      # Should be (num_samples, forecast_horizon)


Shape of X_train: (81787, 30)
Shape of y_train: (81787, 7)
New Shape of y_train: (81787, 7)
New Shape of y_val: (20447, 7)


In [40]:
import tensorflow as tf
import tensorflow_addons as tfa

# N-BEATS Block
def create_nbeats_block(input_shape, forecast_horizon):
    input_layer = tf.keras.layers.Input(shape=input_shape)
    
    # Fully connected layers
    x = tf.keras.layers.Dense(512, activation="relu")(input_layer)
    x = tf.keras.layers.Dense(512, activation="relu")(x)
    x = tf.keras.layers.Dense(512, activation="relu")(x)
    
    # Backcast and Forecast
    backcast = tf.keras.layers.Dense(input_shape[0])(x)
    forecast = tf.keras.layers.Dense(forecast_horizon)(x)
    
    model = tf.keras.models.Model(inputs=input_layer, outputs=[backcast, forecast])
    return model

# Full N-BEATS Model
def create_nbeats_model(input_shape, forecast_horizon):
    input_layer = tf.keras.layers.Input(shape=input_shape)
    backcast, forecast = create_nbeats_block(input_shape, forecast_horizon)(input_layer)
    model = tf.keras.models.Model(inputs=input_layer, outputs=forecast)
    return model

# Define model
input_shape = (lookback, 1)
model = create_nbeats_model(input_shape, forecast_horizon)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="mse",  # Mean squared error
    metrics=["mae"]  # Mean absolute error
)

model.summary()


Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 30, 1)]           0         
                                                                 
 model_2 (Functional)        [(None, 30, 30),          545317    
                              (None, 30, 7)]                     
                                                                 
Total params: 545,317
Trainable params: 545,317
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    verbose=1
)
