DATA PREPROCEESSING AND GENERATION HAS BEEN DONE LOCALLY

**FEATURE ENGINEERING AND MODEL CONFIGURATION**

In [4]:
import numpy as np
import pandas as pd
import torch
!pip install --upgrade darts
!pip install pytorch_lightning
import pytorch_lightning as pl
from darts import TimeSeries
from darts.models import TFTModel
from darts.utils.likelihood_models import QuantileRegression # Darts has its own QuantileLoss
from darts.dataprocessing.transformers import Scaler # Darts scaler wrapper
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

# dataset importation
filepath = "/content/drive/MyDrive/synthetic_product_data.csv"


# Model
input_chunk_length = 168  # History length (similar to max_encoder_length) - 1 week
output_chunk_length = 24 * 7 # Prediction horizon (similar to max_prediction_length) - 1 week
n_epochs = 50
batch_size = 32
learning_rate = 0.02
hidden_size = 32
attention_head_size = 4
dropout = 0.1
hidden_continuous_size = 16 # Used for processing continuous covs

validation_split_percentage = 0.8
random_seed = 42 # for reproducibility

# Set seed for reproducibility
pl.seed_everything(random_seed, workers=True)
np.random.seed(random_seed)
torch.manual_seed(random_seed)


# Load Data
print(f"Loading data from: {filepath}")
try:
    df = pd.read_csv(filepath)
except FileNotFoundError:
    print(f"Error: File not found at {filepath}")
    exit()

print("Data loaded successfully. Shape:", df.shape)

# Feature Engineering

start_date = pd.Timestamp("2023-01-01") # 1. Create a proper Timestamp Index
df['timestamp'] = start_date + pd.to_timedelta(df.index, unit='h')
df = df.set_index('timestamp')

# Define Target and Covariate Columns
target_cols = ["Kero Product Flow Value", "Jet Fuel Product Flow Value"]

# Covariates known in the future (time features)
# Darts  one-hot covariant encode 'hour'
df['hour'] = df.index.hour
hour_encoder = OneHotEncoder(sparse_output=False, drop='first') # drop='first' to avoid multicollinearity
hour_encoded = hour_encoder.fit_transform(df[['hour']])
hour_cols = [f"hour_{cat}" for cat in hour_encoder.categories_[0][1:]] # Get column names
df[hour_cols] = hour_encoded

future_covariate_cols = hour_cols

# Covariates NOT known in the future (sensor readings, etc.)
past_covariate_cols = [
    "Kero Feed Flow Value", "Jet Fuel Feed Flow Value",
    "Kero Feed Temperature Value", "Jet Fuel Feed Temperature Value",
    "Kero Column Pressure Value", "Jet Fuel Column Pressure Value",
    "Kero Yield Value", "Jet Fuel Yield Value"
]

print("Feature engineering done.")
print(df.head())


# Data spliting
split_index = int(validation_split_percentage * len(df))
train_df = df.iloc[:split_index]
val_df = df.iloc[split_index:] # Darts needs a validation set *beyond* the training set end

print(f"Training data shape: {train_df.shape}")
print(f"Validation data shape: {val_df.shape}")


# Data scaling

# Target Scaler
target_scaler = Scaler(StandardScaler())
train_target_scaled = target_scaler.fit_transform(TimeSeries.from_dataframe(train_df, value_cols=target_cols))
val_target_scaled = target_scaler.transform(TimeSeries.from_dataframe(val_df, value_cols=target_cols))

# Keeping full target series for validation
full_target_scaled = target_scaler.transform(TimeSeries.from_dataframe(df, value_cols=target_cols))


# Past Covariates Scaler (if they exist)
if past_covariate_cols:
    past_cov_scaler = Scaler(StandardScaler())
    train_past_cov_scaled = past_cov_scaler.fit_transform(TimeSeries.from_dataframe(train_df, value_cols=past_covariate_cols))
    val_past_cov_scaled = past_cov_scaler.transform(TimeSeries.from_dataframe(val_df, value_cols=past_covariate_cols))

    # Keeping full series for validation
    full_past_cov_scaled = past_cov_scaler.transform(TimeSeries.from_dataframe(df, value_cols=past_covariate_cols))

else:
    train_past_cov_scaled = None
    val_past_cov_scaled = None
    full_past_cov_scaled = None


# Future Covariates Scaler

if future_covariate_cols:
    # If you had other *continuous* future covariates, scale them here
    # For OHE 'hour', no scaling needed
    train_fut_cov = TimeSeries.from_dataframe(train_df, value_cols=future_covariate_cols)
    val_fut_cov = TimeSeries.from_dataframe(val_df, value_cols=future_covariate_cols)
    full_fut_cov = TimeSeries.from_dataframe(df, value_cols=future_covariate_cols)

else:
    train_fut_cov = None
    val_fut_cov = None
    full_fut_cov = None

print("Scaling complete.")


# Optional: Define a specific validation cutoff for clearer evaluation later if needed
# val_cutoff = train_df.index[-1] # Last timestamp in training data
# print(f"Validation cutoff: {val_cutoff}")



INFO:lightning_fabric.utilities.seed:Global seed set to 42


Loading data from: /content/drive/MyDrive/synthetic_product_data.csv
Data loaded successfully. Shape: (131040, 11)
Feature engineering done.
                     Time (hours)  Kero Feed Flow Value  \
timestamp                                                 
2023-01-01 00:00:00             0            154.666967   
2023-01-01 01:00:00             1            213.735971   
2023-01-01 02:00:00             2            -64.133038   
2023-01-01 03:00:00             3            -81.101067   
2023-01-01 04:00:00             4            -95.007405   

                     Kero Feed Temperature Value  Kero Column Pressure Value  \
timestamp                                                                      
2023-01-01 00:00:00                   -29.283706                    0.168244   
2023-01-01 01:00:00                   -15.501893                    0.159867   
2023-01-01 02:00:00                    12.317711                    0.242521   
2023-01-01 03:00:00                    18.655

**MODEL TRAINING**

In [7]:
# Defining Darts TFT Model

# Define optimizing parameters
optimizer_kwargs = {"lr": learning_rate}

# Define PyTorch Lightning Trainer arguments
lr_scheduler_cls = torch.optim.lr_scheduler.ReduceLROnPlateau
pl_trainer_kwargs = {
    "accelerator": "auto",
    "gradient_clip_val": 0.1,
    "max_epochs": n_epochs,
    "callbacks": [
        EarlyStopping(monitor="val_loss", patience=15, mode="min", verbose=True),
        LearningRateMonitor(logging_interval="epoch"),
    ],
    # Enable checkpointing (Darts handles the default checkpoint callback)
    "enable_checkpointing": True,
}

# Defining the loss function
# Default quantiles: [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]
# loss_fn = QuantileLoss()
likelihood=QuantileRegression() # using likelihood as alternative

# Model definition
print("Defining TFT Model...")

PO_tft_darts = TFTModel(
    input_chunk_length=input_chunk_length,
    output_chunk_length=output_chunk_length,
    hidden_size=hidden_size,
    lstm_layers=1,
    num_attention_heads=attention_head_size,
    dropout=dropout,
    batch_size=batch_size,
    n_epochs=n_epochs, # Max epochs passed to trainer
    optimizer_cls=torch.optim.Adam,
    optimizer_kwargs=optimizer_kwargs,
    lr_scheduler_cls=lr_scheduler_cls, # Use ReduceLROnPlateau like in PTF setup
    lr_scheduler_kwargs={"patience": 5, "factor": 0.1},
    # loss_fn=loss_fn,
    likelihood=QuantileRegression(),
    random_state=random_seed,
    pl_trainer_kwargs=pl_trainer_kwargs,
    model_name="EIA_TFT_darts", # Name for logging/checkpoints
    save_checkpoints=True, # Ensure checkpoints are saved (needed for EarlyStopping best model)
    log_tensorboard=True, # Enable tensorboard logging
)

print("Model defined.")


# Training the Model
print("Starting model training...")
PO_tft_darts.fit(
    series=train_target_scaled, # The scaled training target series
    past_covariates=train_past_cov_scaled,
    future_covariates=train_fut_cov,
    val_series=val_target_scaled, # The scaled validation target series
    val_past_covariates=val_past_cov_scaled,
    val_future_covariates=val_fut_cov,
    verbose=True,
)

print("Training finished.")

Defining TFT Model...
Model defined.
Starting model training...


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
   | Name                              | Type                             | Params
----------------------------------------------------------------------------------------
0  | train_metrics                     | MetricCollection                 | 0     
1  | val_metrics                       | MetricCollection                 | 0     
2  | input_embeddings                  | _MultiEmbedding                  | 0     
3  | static_covariates_vsn             | _VariableSelectionNetwork        | 0     
4  | encoder_vsn                       | _VariableSelectionNetwork        | 42.5 K
5  | decoder_vsn                       | _Vari

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved. New best score: 3.941


Validation: 0it [00:00, ?it/s]

Training finished.


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


'\n# --- Optional: Make Predictions & Evaluate ---\nprint("\n--- Predicting on Validation Set ---")\n\n# Load the best model checkpoint saved by the trainer\n# Checkpoint path might vary based on logger defaults (e.g., lightning_logs/version_X/checkpoints)\n# Or use the model object directly if training just finished\n# best_model = TFTModel.load_from_checkpoint(model_name="EIA_TFT_darts", best=True) # Requires finding path\nbest_model = PO_tft_darts # Use the model trained in this session\n\n# We need to provide the history leading up to the prediction start point\n# Predict on the validation set horizon\n# The model needs `input_chunk_length` history before the start of val_target_scaled\n# And it needs `output_chunk_length` of future covariates for the prediction period\n\n# Prepare data for prediction:\n# Need history from the *full* dataset ending just before validation starts\nn_predict = len(val_target_scaled) # Predict for the length of the validation set\npredict_start_time = 

MODEL EVALUATION

In [None]:
print("\n Predicting on Validation Set ")

# Load the best model checkpoint saved by the trainer
best_model = PO_tft_darts # Use the model trained in this session


# Prepare data for prediction:

n_predict = len(val_target_scaled) # Predict for the length of the validation set
predict_start_time = val_target_scaled.start_time()

# History for prediction comes from the full scaled series before the validation start
pred_input_target = full_target_scaled.drop_after(predict_start_time - full_target_scaled.freq)
pred_input_past_cov = full_past_cov_scaled.drop_after(predict_start_time - full_target_scaled.freq) if full_past_cov_scaled else None

# Future covariates needed for the prediction horizon
pred_future_cov = full_fut_cov[predict_start_time:predict_start_time + (n_predict - 1)*full_fut_cov.freq] if full_fut_cov else None


print(f"Predicting {n_predict} steps starting from {predict_start_time}")

# Ensure future_covariates length == n_predict
if pred_future_cov is not None:
    print(f"Length of provided future_covariates for prediction: {len(pred_future_cov)}")
    assert len(pred_future_cov) >= n_predict, f"Future covariates needed: {n_predict}, Provided: {len(pred_future_cov)}"
    # Trim if too long (model only needs n=n_predict)
    pred_future_cov = pred_future_cov[:n_predict]


predictions_scaled = best_model.predict(
    n=n_predict,
    series=pred_input_target,
    past_covariates=pred_input_past_cov,
    future_covariates=pred_future_cov,
    num_samples=100 # For probabilistic forecasts (if using QuantileLoss/Likelihood)
)

print(f"Prediction output type: {type(predictions_scaled)}")
print(f"Prediction output components: {predictions_scaled.components}") # Should match target_cols
print(f"Prediction output length: {len(predictions_scaled)}")
print(f"Prediction output quantiles: {predictions_scaled.quantiles}") # If probabilistic


# Inverse transform the predictions (usually the median/0.5 quantile for point forecast)
# Extract the median (0.5 quantile) prediction for scaled down evaluation
median_prediction_scaled = predictions_scaled.quantile_timeseries(quantile=0.5)

predictions_inv = target_scaler.inverse_transform(median_prediction_scaled)

print("\nSample of Inverse-Transformed Predictions (Median):")
print(predictions_inv.pd_dataframe().head())

# comparing `predictions_inv` with the actual validation data `val_df[target_cols]`

from darts.metrics import mae, mape

# Get actual validation data as TimeSeries
actual_val_ts = TimeSeries.from_dataframe(val_df, value_cols=target_cols)

# Calculate MAE for the median prediction
mae_val = mae(actual_val_ts, predictions_inv)
mape_val = mape(actual_val_ts, predictions_inv)

print(f"\nValidation MAE (Median Forecast): {mae_val:.4f}")
print(f"Validation MAPE (Median Forecast): {mape_val:.4f}%")

# You can also evaluate quantile metrics if needed using the full predictions_scaled object""