In [None]:
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [1]:
import torch
x = torch.rand(5, 3)
print(x)

tensor([[0.1969, 0.6745, 0.1240],
        [0.7551, 0.7888, 0.3326],
        [0.4613, 0.8625, 0.8245],
        [0.9027, 0.6587, 0.4636],
        [0.4063, 0.6300, 0.6503]])


In [2]:
print(torch.__version__)

2.5.1+cu121


In [3]:
print(torch.cuda.is_available())

True


In [4]:
import pandas as pd
import numpy as np  
import os

In [23]:
# Get the current directory
current_dir = os.getcwd()

# Navigate one folder up
parent_dir = os.path.dirname(current_dir)

# Where the files are located
data = 'data\pull-pesquisas-city-2851556'

# Navigate down into the "data" folder
data_dir = os.path.join(parent_dir, data)


In [24]:
# Initialize an empty list to store DataFrames
dataframes = []

# Variable to track total rows
total_rows = 0

# Loop through all files in the "data" folder
try:
    for file_name in os.listdir(data_dir):
        
        if file_name.endswith('.csv'):  # Check if the file is a CSV
            
            file_path = os.path.join(data_dir, file_name)
            
            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)
            dataframes.append(df)  # Append the DataFrame to the list
            
             # Print dimensions of the current file
            print(f"File: {file_name} | Dimensions: {df.shape}")
            
            # Add the number of rows to the total count
            total_rows += df.shape[0]

    # Concatenate all DataFrames in the list by binding rows
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Print dimensions of the combined DataFrame
    print(f"Combined DataFrame Dimensions: {combined_df.shape}")

    # Verify the sum of rows matches
    if total_rows == combined_df.shape[0]:
        print("Row count verification successful! Total rows match.")
    else:
        print("Row count verification failed! Mismatch in row count.")

    print(combined_df.head())  # Display the first few rows of the combined DataFrame

except FileNotFoundError:
    print(f"Folder '{data_dir}' not found.")
except Exception as e:
    print(f"An error occurred: {e}")

File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2020.csv | Dimensions: (1, 22)
File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2021.csv | Dimensions: (115243, 22)
File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2022.csv | Dimensions: (1247986, 22)
File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2023.csv | Dimensions: (1430037, 22)
File: data-lake-prd-314410.cz.pull-pesquisas_city_2851556_2024.csv | Dimensions: (1579543, 22)
Combined DataFrame Dimensions: (4372810, 22)
Row count verification successful! Total rows match.
                      Data   Data_ID  Canal_ID  Credencial_ID  Integrador_ID  \
0  2020-06-05 00:00:00 UTC  20200605       367              0             50   
1  2021-12-24 00:00:00 UTC  20211224         1              0              0   
2  2021-12-24 00:00:00 UTC  20211224        10              0              0   
3  2021-12-24 00:00:00 UTC  20211224       909           8112            120   
4  2021-12-24 00:00:00 UTC  202112

In [25]:
combined_df['DiariaMedia'] = combined_df.apply(
    lambda row: row['DiariaMedia'] * 0.16483969339817028 if row['Moeda_ID'] == 16 else row['DiariaMedia'], 
    axis=1
)

In [26]:
combined_df = combined_df.loc[:, ['Data',  'Hotel_ID', 'Ocupacao_ID', 'DiariaMedia', "Estadia", 'Reservas']]

In [27]:
combined_df = combined_df.loc[combined_df.index.repeat(combined_df['Reservas'])].reset_index(drop=True)
combined_df.Reservas = 1

In [28]:
hotels_top_5 = combined_df.groupby('Hotel_ID').size().reset_index(name='Counts').sort_values(by='Counts', ascending=False).head(1)['Hotel_ID']

In [29]:
data = combined_df[combined_df['Hotel_ID'].isin(hotels_top_5)]
data.shape

(102270, 6)

In [30]:
data = data.loc[:, ['Data',  'Hotel_ID', 'DiariaMedia']]

In [31]:
data.rename(columns={'Data': 'date', 'Hotel_ID': 'hotel_id', 'DiariaMedia': 'price'}, inplace=True)
data = data.loc[:, ['hotel_id',  'date', 'price']]

In [32]:
from datetime import datetime
#import holidays

In [33]:

# Convert date to datetime
data['date'] = pd.to_datetime(data['date'])

# Feature engineering
data['day_of_week'] = data['date'].dt.dayofweek
data['month'] = data['date'].dt.month
data['year'] = data['date'].dt.year
data['is_weekend'] = data['day_of_week'].isin([5, 6]).astype(int)

# Add holiday feature
#br_holidays = holidays.BR()
#data['is_holiday'] = data['date'].isin(br_holidays).astype(int)

# Lag features
data = data.sort_values(by=['hotel_id', 'date'])
for lag in [1, 7, 30]:
    data[f'lag_{lag}'] = data.groupby('hotel_id')['price'].shift(lag)

# Rolling statistics
data['rolling_mean_7'] = data.groupby('hotel_id')['price'].rolling(window=7, min_periods=1).mean().reset_index(0, drop=True)


In [34]:
data

Unnamed: 0,hotel_id,date,price,day_of_week,month,year,is_weekend,lag_1,lag_7,lag_30,rolling_mean_7
1096,2094,2021-11-19 00:00:00+00:00,30.857991,4,11,2021,0,,,,30.857991
1224,2094,2021-11-19 00:00:00+00:00,32.709140,4,11,2021,0,30.857991,,,31.783565
1240,2094,2021-11-19 00:00:00+00:00,28.840353,4,11,2021,0,32.709140,,,30.802495
1279,2094,2021-11-19 00:00:00+00:00,23.143493,4,11,2021,0,28.840353,,,28.887744
1303,2094,2021-11-19 00:00:00+00:00,17.802687,4,11,2021,0,23.143493,,,26.670733
...,...,...,...,...,...,...,...,...,...,...,...
4847854,2094,2024-12-31 00:00:00+00:00,53.190472,1,12,2024,0,23.595022,36.884530,42.198962,46.416484
4847858,2094,2024-12-31 00:00:00+00:00,51.017885,1,12,2024,0,53.190472,61.686310,48.574961,44.892423
4847859,2094,2024-12-31 00:00:00+00:00,48.321981,1,12,2024,0,51.017885,61.715981,30.825023,42.978995
4847860,2094,2024-12-31 00:00:00+00:00,29.176626,1,12,2024,0,48.321981,48.574961,30.825023,40.207804


In [41]:
# Ensure 'date' is a pandas datetime type
data["date"] = pd.to_datetime(data["date"])

# Create a sequential time index
data = data.sort_values(["hotel_id", "date"])  # Sort by group and time
data["time_idx"] = data.groupby("hotel_id").cumcount()

In [50]:
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_lightning import Trainer
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split



# Ensure 'time_idx' exists
data = data.sort_values(["hotel_id", "date"])  # Sort by group and time
data["time_idx"] = data.groupby("hotel_id").cumcount()

# Add date-based features
data["day_of_week"] = data["date"].dt.dayofweek
data["month"] = data["date"].dt.month
data["year"] = data["date"].dt.year
data["is_weekend"] = (data["day_of_week"] >= 5).astype(int)

# Split data into train and validation indices using sklearn's train_test_split
train_indices, val_indices = train_test_split(data.index, test_size=0.2, shuffle=False)

# Create the TimeSeriesDataSet for train and validation based on the split indices
train_data = data.loc[train_indices]
val_data = data.loc[val_indices]

# Define the TimeSeriesDataSet
max_encoder_length = 90  # Lookback window
max_prediction_length = 30  # Forecast window

train_dataset = TimeSeriesDataSet(
    train_data,
    time_idx="time_idx",
    target="price",
    group_ids=["hotel_id"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    time_varying_known_reals=["day_of_week", "month", "year", "is_weekend"],
    time_varying_unknown_reals=["price"],
    target_normalizer=GroupNormalizer(groups=["hotel_id"]),
)

val_dataset = TimeSeriesDataSet(
    val_data,
    time_idx="time_idx",
    target="price",
    group_ids=["hotel_id"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    time_varying_known_reals=["day_of_week", "month", "year", "is_weekend"],
    time_varying_unknown_reals=["price"],
    target_normalizer=GroupNormalizer(groups=["hotel_id"]),
)

# Create DataLoaders for train and validation
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Model definition
tft = TemporalFusionTransformer.from_dataset(
    train_dataset, learning_rate=0.03, hidden_size=16, attention_head_size=1
)

# Train model
trainer = Trainer(max_epochs=10)  # Adjust GPU usage
trainer.fit(tft, train_dataloader, val_dataloader)

# Forecast
predictions = tft.predict(val_dataloader)

print(predictions)

c:\Users\paulo\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\utilities\parsing.py:209: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
c:\Users\paulo\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\utilities\parsing.py:209: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
  super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


TypeError: `model` must be a `LightningModule` or `torch._dynamo.OptimizedModule`, got `TemporalFusionTransformer`