In [1]:
import torch

# Environment sanity check: the installed PyTorch build, then whether CUDA is usable.
print(torch.__version__, torch.cuda.is_available(), sep="\n")

2.6.0+cu126
True


In [2]:
import random

import numpy as np
import torch

# One seed shared by every random number generator used in this notebook.
SEED = 69

random.seed(SEED)                 # Python standard-library RNG
np.random.seed(SEED)              # NumPy legacy global RNG
torch.manual_seed(SEED)           # PyTorch RNG
torch.cuda.manual_seed(SEED)      # current CUDA device
torch.cuda.manual_seed_all(SEED)  # all CUDA devices (relevant for multi-GPU)

# Release cached, unused CUDA memory before the heavy cells run.
torch.cuda.empty_cache()

# Ask cuDNN for deterministic algorithms ...
torch.backends.cudnn.deterministic = True
# ... and switch the cuDNN autotuner OFF: with benchmark enabled cuDNN may pick
# different kernels from run to run, which defeats the setting above.
torch.backends.cudnn.benchmark = False
# TF32 matmuls trade some float32 precision for speed; kept enabled as in the original setup.
torch.backends.cuda.matmul.allow_tf32 = True


In [3]:
import torch

# Summarize the CUDA setup: availability, the CUDA version PyTorch reports,
# and the name of device 0 (or a placeholder when no GPU is usable).
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)
# The value printed here is the device *name*, not a version, so it is labelled accordingly.
print("GPU Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")


CUDA Available: True
CUDA Version: 12.6
Torch CUDA Version: NVIDIA RTX A6000


In [4]:
import torch

# Probe CUDA once and keep the answer in `gpu_available`.
gpu_available = torch.cuda.is_available()
print("CUDA Available:", gpu_available)

# `num_gpus` is only defined when CUDA is available; the device count is not queried otherwise.
if gpu_available:
    # Number of CUDA devices visible to this process
    num_gpus = torch.cuda.device_count()
    print("Number of GPUs available:", num_gpus)

CUDA Available: True
Number of GPUs available: 1


In [5]:
import gc
import torch

# Run a Python garbage-collection pass first, then ask PyTorch to release
# its cached (unused) CUDA memory.
gc.collect()
torch.cuda.empty_cache()


In [6]:
import torch
import pytorch_lightning as pla
import polars as pl
import pandas as pd
import numpy as np



In [7]:
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.models import TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import RMSE

  from tqdm.autonotebook import tqdm


In [8]:
file_path = "processed_sales_data_optimized_dtypes.parquet"

# Read the preprocessed sales table with polars and make sure `date` is a
# calendar Date column (no time-of-day component).
data = pl.read_parquet(file_path).with_columns(pl.col("date").cast(pl.Date))



In [9]:
# Confirm which DataFrame implementation `data` currently is (polars at this point).
type(data)

polars.dataframe.frame.DataFrame

In [10]:
# Preview the first rows together with the column dtypes.
data.head()

id,date,store_nbr,item_nbr,unit_sales,is_onpromotion,is_holiday,transactions,day,month,weekday,year,log_unit_sales
u32,date,u8,u32,f32,u8,u8,u16,u8,u8,u8,u16,f32
0,2013-01-01,25,103665,7.0,0,1,770,1,1,2,2013,2.079442
1,2013-01-01,25,105574,1.0,0,1,770,1,1,2,2013,0.693147
2,2013-01-01,25,105575,2.0,0,1,770,1,1,2,2013,1.098612
3,2013-01-01,25,108079,1.0,0,1,770,1,1,2,2013,0.693147
4,2013-01-01,25,108701,1.0,0,1,770,1,1,2,2013,0.693147


In [11]:
# List the available column names.
data.columns

['id',
 'date',
 'store_nbr',
 'item_nbr',
 'unit_sales',
 'is_onpromotion',
 'is_holiday',
 'transactions',
 'day',
 'month',
 'weekday',
 'year',
 'log_unit_sales']

In [12]:
# Window sizes intended for the TimeSeriesDataSet, expressed in `time_idx` steps
# (`time_idx` is a day offset, so these are days).
max_encoder_length = 30  # history window: the past 30 days
max_prediction_length = 7  # forecast horizon: the next 7 days

In [13]:
# Integer time index: number of days between each row's date and the earliest
# date in the table (so the first day gets time_idx == 0).
min_date = data["date"].min()
day_offset = pl.col("date").cast(pl.Int32) - pl.lit(min_date).cast(pl.Int32)
data = data.with_columns(day_offset.alias("time_idx"))

In [14]:
# # Convert date to an integer index
# data['time_idx'] = (data['date'] - data['date'].min()).dt.days


In [15]:
# Display the frame to check the new `time_idx` column (polars truncates the output).
data

id,date,store_nbr,item_nbr,unit_sales,is_onpromotion,is_holiday,transactions,day,month,weekday,year,log_unit_sales,time_idx
u32,date,u8,u32,f32,u8,u8,u16,u8,u8,u8,u16,f32,i32
0,2013-01-01,25,103665,7.0,0,1,770,1,1,2,2013,2.079442,0
1,2013-01-01,25,105574,1.0,0,1,770,1,1,2,2013,0.693147,0
2,2013-01-01,25,105575,2.0,0,1,770,1,1,2,2013,1.098612,0
3,2013-01-01,25,108079,1.0,0,1,770,1,1,2,2013,0.693147,0
4,2013-01-01,25,108701,1.0,0,1,770,1,1,2,2013,0.693147,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…
125497035,2017-08-15,54,2089339,4.0,0,1,802,15,8,2,2017,1.609438,1687
125497036,2017-08-15,54,2106464,1.0,1,1,802,15,8,2,2017,0.693147,1687
125497037,2017-08-15,54,2110456,192.0,0,1,802,15,8,2,2017,5.262691,1687
125497038,2017-08-15,54,2113914,198.0,1,1,802,15,8,2,2017,5.293305,1687


In [16]:
# Store, item and the two flag columns are labels rather than quantities,
# so they are stored as strings.
categorical_columns = ["store_nbr", "item_nbr", "is_onpromotion", "is_holiday"]
data = data.with_columns(pl.col(categorical_columns).cast(pl.Utf8))


In [17]:
# Display the frame again to confirm the label columns are now `str`.
data

id,date,store_nbr,item_nbr,unit_sales,is_onpromotion,is_holiday,transactions,day,month,weekday,year,log_unit_sales,time_idx
u32,date,str,str,f32,str,str,u16,u8,u8,u8,u16,f32,i32
0,2013-01-01,"""25""","""103665""",7.0,"""0""","""1""",770,1,1,2,2013,2.079442,0
1,2013-01-01,"""25""","""105574""",1.0,"""0""","""1""",770,1,1,2,2013,0.693147,0
2,2013-01-01,"""25""","""105575""",2.0,"""0""","""1""",770,1,1,2,2013,1.098612,0
3,2013-01-01,"""25""","""108079""",1.0,"""0""","""1""",770,1,1,2,2013,0.693147,0
4,2013-01-01,"""25""","""108701""",1.0,"""0""","""1""",770,1,1,2,2013,0.693147,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…
125497035,2017-08-15,"""54""","""2089339""",4.0,"""0""","""1""",802,15,8,2,2017,1.609438,1687
125497036,2017-08-15,"""54""","""2106464""",1.0,"""1""","""1""",802,15,8,2,2017,0.693147,1687
125497037,2017-08-15,"""54""","""2110456""",192.0,"""0""","""1""",802,15,8,2,2017,5.262691,1687
125497038,2017-08-15,"""54""","""2113914""",198.0,"""1""","""1""",802,15,8,2,2017,5.293305,1687


In [18]:
# print(data['is_holiday'].dtype)
# print(data['is_holiday'].unique())
# print(data['is_holiday'].isna().sum())  # Check for NaNs

In [19]:
# print(data.memory_usage(deep=True))

In [20]:
# Replace +/-inf with null in every float column before converting to pandas
# (a null in a float column becomes NaN on the pandas side, where dropna() removes it).
# Float32 is selected as well as Float64: the float columns in this table are f32,
# so selecting only Float64 would match nothing and leave the data unchecked.
float_columns = data.select(pl.col(pl.Float32, pl.Float64)).columns
data = data.with_columns([
    pl.when(pl.col(col).is_infinite())
    .then(None)
    .otherwise(pl.col(col))
    .alias(col)  # keep the column name; otherwise the result is named after the `then` literal
    for col in float_columns
])

In [21]:
# Rebind `data` from a polars DataFrame to a pandas DataFrame;
# every later cell uses the pandas API.
data = data.to_pandas()


In [22]:
# Confirm the conversion: `data` should now be a pandas DataFrame.
type(data)

pandas.core.frame.DataFrame

In [23]:
# Remove every row that has a missing value in any column (mutates `data` in place).
data.dropna(inplace=True)


In [24]:
# Total number of missing cells across the whole frame after the cleanup above.
print("Remaining NaN count after cleanup:", data.isna().sum().sum())  # Should be 0


Remaining NaN count after cleanup: 0


In [25]:
# Report how many rows carry non-positive sales, then keep only the strictly
# positive ones (as an independent copy, not a view of the old frame).
print("Zero/Negative unit_sales:", data["unit_sales"].le(0).sum())
data = data.loc[data["unit_sales"].gt(0)].copy()



Zero/Negative unit_sales: 4155


In [26]:
# Recompute the target as log(1 + unit_sales), overwriting the `log_unit_sales`
# column that was loaded from the parquet file.
data["log_unit_sales"] = np.log1p(data["unit_sales"])


In [27]:
# Drop rows whose transformed target is missing (in place).
data.dropna(subset=["log_unit_sales"], inplace=True)


In [28]:
# Confirm the target column has no missing values left.
print("log_unit_sales NaN count:", data["log_unit_sales"].isna().sum())  # Should be 0


log_unit_sales NaN count: 0


In [29]:
# Training cutoff as a calendar date: the latest date in the data minus the forecast horizon.
# NOTE(review): this is a Timestamp, not a `time_idx` value, and it is not used in the
# visible cells — confirm how downstream code consumes it.
training_cutoff = data["date"].max() - pd.Timedelta(days=max_prediction_length)


In [30]:
# Switch the label columns to pandas' `category` dtype in a single astype call.
categorical_cols = ["store_nbr", "item_nbr", "is_onpromotion", "is_holiday"]
data = data.astype(dict.fromkeys(categorical_cols, "category"))

print("Conversion completed successfully!")


Conversion completed successfully!


In [31]:
# Print the plain-text representation of the cleaned frame (pandas truncates the middle rows).
print(data)


                  id       date store_nbr item_nbr  unit_sales is_onpromotion  \
0                  0 2013-01-01        25   103665         7.0              0   
1                  1 2013-01-01        25   105574         1.0              0   
2                  2 2013-01-01        25   105575         2.0              0   
3                  3 2013-01-01        25   108079         1.0              0   
4                  4 2013-01-01        25   108701         1.0              0   
...              ...        ...       ...      ...         ...            ...   
127970252  125497035 2017-08-15        54  2089339         4.0              0   
127970253  125497036 2017-08-15        54  2106464         1.0              1   
127970254  125497037 2017-08-15        54  2110456       192.0              0   
127970255  125497038 2017-08-15        54  2113914       198.0              1   
127970256  125497039 2017-08-15        54  2116416         2.0              0   

          is_holiday  trans

In [32]:
# Show the dtype of every column after the pandas conversion and category cast.
print(data.dtypes)

id                        uint32
date              datetime64[ms]
store_nbr               category
item_nbr                category
unit_sales               float32
is_onpromotion          category
is_holiday              category
transactions              uint16
day                        uint8
month                      uint8
weekday                    uint8
year                      uint16
log_unit_sales           float32
time_idx                   int32
dtype: object


In [33]:
# Count missing values column by column.
print("NaN counts per column:\n", data.isna().sum())

NaN counts per column:
 id                0
date              0
store_nbr         0
item_nbr          0
unit_sales        0
is_onpromotion    0
is_holiday        0
transactions      0
day               0
month             0
weekday           0
year              0
log_unit_sales    0
time_idx          0
dtype: int64


In [34]:
# Count infinite values in each float32/float64 column.
float_cols = data.select_dtypes(include=[np.float32, np.float64]).columns
# Despite its name, `inf_mask` holds the per-column *counts* of +/-inf (a Series), not a boolean mask.
inf_mask = np.isinf(data[float_cols]).sum()
print("\nInf counts per float column:\n", inf_mask)


Inf counts per float column:
 unit_sales        0
log_unit_sales    0
dtype: int64


In [35]:
# If the previous cell counted any infinite values, turn them into NaN (in place).
# This cell only replaces; dropping the resulting NaN rows happens in the next cell.
if inf_mask.sum() > 0:
    print("\nWarning: Found Inf values, replacing with NaN.")
    data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [36]:
# Drop any row that still contains a missing value (in place); does nothing when the frame is clean.
if data.isna().sum().sum() > 0:
    print("\nWarning: Found NaN values, dropping rows with NaNs.")
    data.dropna(inplace=True)

In [37]:
# Sanity report for each label column: number of distinct values, plus a
# warning when the column contains missing entries.
cat_cols = ["store_nbr", "item_nbr", "is_onpromotion", "is_holiday"]
for col in cat_cols:
    column = data[col]
    print(f"\nChecking categorical column {col}:")
    print("Unique values:", column.nunique())
    if column.isna().any():
        print(f"Warning: {col} contains NaNs!")


Checking categorical column store_nbr:
Unique values: 54

Checking categorical column item_nbr:
Unique values: 4036

Checking categorical column is_onpromotion:
Unique values: 2

Checking categorical column is_holiday:
Unique values: 2


In [38]:
# Make sure rows are ordered by `time_idx`; sort them if they are not.
# `is_monotonic_increasing` is a non-strict check (equal neighbours are allowed), which is
# what is needed here because many rows share the same day — the word "strictly" in the
# message below is therefore inaccurate.
if not data["time_idx"].is_monotonic_increasing:
    print("\nWarning: time_idx is not strictly increasing. Sorting data.")
    data = data.sort_values(by=["time_idx"])

In [39]:
# Detect fully identical rows (all columns equal) and drop the repeats in place.
# NOTE(review): the comparison includes `id`, so rows that repeat the same
# (store_nbr, item_nbr, date) under different ids are not caught — confirm whether
# a key-based check is needed before building the TimeSeriesDataSet.
duplicate_count = data.duplicated().sum()
if duplicate_count > 0:
    print(f"\nWarning: Found {duplicate_count} duplicate rows. Dropping duplicates.")
    data.drop_duplicates(inplace=True)




In [40]:
# Total in-memory size of the frame in bytes; deep=True also measures object contents.
print("\nMemory usage (bytes):", data.memory_usage(deep=True).sum())


Memory usage (bytes): 5558069503


In [42]:
# Persist the cleaned frame for later notebooks; index=False leaves the pandas row index out of the file.
data.to_parquet("further_processed_sales_data_optimized_dtypes.parquet",index=False)