MOUNTING STORAGE DEVICE

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


AUTOFORMER GIT REPO CLONING

In [26]:

# Install dependencies for Autoformer (assuming requirements.txt in repo or manual install)
!pip install pandas numpy matplotlib scikit-learn
!git clone https://github.com/thuml/Autoformer.git
%cd Autoformer
!pip install -r requirements.txt


Cloning into 'Autoformer'...
remote: Enumerating objects: 376, done.[K
remote: Counting objects: 100% (212/212), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 376 (delta 157), reused 143 (delta 143), pack-reused 164 (from 2)[K
Receiving objects: 100% (376/376), 2.20 MiB | 31.78 MiB/s, done.
Resolving deltas: 100% (221/221), done.
/content/Autoformer/Autoformer/Autoformer/Autoformer/Autoformer


FEATURE ENGINEERING

In [None]:
import pandas as pd
import numpy as np
import os

# Define the path to the MICE interpolated and concatenated data
file_path = '/content/drive/MyDrive/final_concatenated_data_mice_imputed.csv' # Ensure this is the correct file path

# List of specified target columns
target_cols_list = [
    'Total_Stabilized_Naphtha_Product_Flowrate',
    'Total_Kerosene_Product_Flowrate',
    'Jet_Fuel_Product_Train1_Flowrate',
    'Total_Light_Diesel_Product_Flowrate',
    'Total_Heavy_Diesel_Product_Flowrate',
    'Total_Atmospheric_Residue_Flowrate',
    'Blend_Yield_Gas & LPG',
    'Blend_Yield_Kerosene',
    'Blend_Yield_Light Diesel',
    'Blend_Yield_Heavy Diesel',
    'Blend_Yield_RCO'
]

# Load the combined CSV file with robust error handling
if os.path.exists(file_path):
    try:
        df = pd.read_csv(file_path)

        # Check if 'date' column exists and parse dates
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df = df.dropna(subset=['date'])
            if not df.empty:
                print(f"Successfully loaded {file_path}: {len(df)} rows")
            else:
                raise ValueError(f"Error: {file_path} loaded but had no valid date entries after parsing.")
        else:
            raise ValueError(f"Error: File {file_path} does not contain a 'date' column.")

    except pd.errors.EmptyDataError:
        raise ValueError(f"Error: File {file_path} is empty.")
    except Exception as e:
        raise RuntimeError(f"Error loading file {file_path}: {e}")
else:
    raise FileNotFoundError(f"Error: File not found - {file_path}.")

# Sort by date and reset index
df = df.sort_values('date').reset_index(drop=True)

# dirty column drop
if "Tag" in df.columns:
    df = df.drop(columns=["Tag"])
    print("Tag column dropped")

# Display date range
if not df.empty:
    print(f"Date range of combined data: {df['date'].min()} to {df['date'].max()}")
else:
    print("Combined dataframe is empty.")
    exit()

# Use the entire MICE dataset
df_filtered = df.copy()

print(f"\nFiltered data (using full MICE dataset): {len(df_filtered)} rows")
if not df_filtered.empty:
    print(f"Filtered date range: {df_filtered['date'].min()} to {df_filtered['date'].max()}")
else:
    print("Filtered dataframe is empty.")
    exit()

# Identify feature columns (all columns except 'date' and the specified target columns)
feature_cols = [col for col in df_filtered.columns if col != 'date' and col not in target_cols_list]

# Check if all specified target columns exist in the DataFrame
missing_targets = [col for col in target_cols_list if col not in df_filtered.columns]
if missing_targets:
    raise ValueError(f"❌ The following specified target columns were not found in the dataset: {missing_targets}")

print(f"\n🎯 Target columns identified: {target_cols_list}")
print(f"📈 Feature columns identified: {feature_cols[:10]} ... ({len(feature_cols)} total)")

# CRITICAL FIX: Create TWO different datasets for Autoformer

# 1. Dataset with date column (for time-based operations and splitting)
df_with_date = df_filtered[['date'] + target_cols_list + feature_cols].copy()

# 2. Dataset WITHOUT date column (for model training - this is what gets fed to the neural network)
# The Autoformer library expects the data file to contain ONLY the numeric columns
# The date handling is done separately in the data loader
df_auto = df_filtered[target_cols_list + feature_cols].copy()

# CRITICAL FIX: Convert all columns to numeric types BEFORE any other processing
print(f"\n🔧 Converting all columns to numeric types...")
non_numeric_cols = []

# Convert all columns except 'date' to numeric
for col in df_auto.columns:
    if not np.issubdtype(df_auto[col].dtype, np.number):
        try:
            # Convert to numeric, coercing errors to NaN
            df_auto[col] = pd.to_numeric(df_auto[col], errors='coerce')
            df_with_date[col] = pd.to_numeric(df_with_date[col], errors='coerce')
            print(f"✅ Converted {col} to numeric")
        except Exception as e:
            non_numeric_cols.append(col)
            print(f"❌ Failed to convert {col} to numeric: {e}")

# Drop columns that couldn't be converted
if non_numeric_cols:
    print(f"\n⚠️  Dropping {len(non_numeric_cols)} columns that couldn't be converted to numeric:")
    for col in non_numeric_cols:
        print(f"   - {col}")
    df_auto = df_auto.drop(columns=non_numeric_cols)
    df_with_date = df_with_date.drop(columns=non_numeric_cols)

# Handle missing values (forward/backward fill) - This is a safeguard, MICE should have handled most
df_with_date = df_with_date.fillna(method='ffill').fillna(method='bfill')
df_auto = df_auto.fillna(method='ffill').fillna(method='bfill')

# Verify all columns are numeric (except date in the first dataset)
print(f"\nVerifying data types:")
print(f"Date column dtype: {df_with_date['date'].dtype}")

# Final verification - check for any remaining non-numeric columns
remaining_non_numeric = []
for col in df_auto.columns:
    if not np.issubdtype(df_auto[col].dtype, np.number):
        remaining_non_numeric.append(col)

if remaining_non_numeric:
    print(f"❌ Still have non-numeric columns: {remaining_non_numeric}")
    # Force conversion for any remaining non-numeric columns
    for col in remaining_non_numeric:
        df_auto[col] = pd.to_numeric(df_auto[col], errors='coerce')
        df_with_date[col] = pd.to_numeric(df_with_date[col], errors='coerce')

# Final cleanup - remove any rows with NaN values that might have been introduced
initial_rows = len(df_auto)
df_auto = df_auto.dropna()
df_with_date = df_with_date.dropna()
final_rows = len(df_auto)

if initial_rows != final_rows:
    print(f"⚠️  Removed {initial_rows - final_rows} rows with NaN values after conversion")

print(f"\n✅ All columns in training dataset are now numeric")
print(f"Training dataset shape (no date): {df_auto.shape}")
print(f"Full dataset shape (with date): {df_with_date.shape}")

# Final verification
print(f"\n🔍 Final data type verification:")
print(f"   - All columns are numeric: {all(df_auto.dtypes.apply(lambda x: np.issubdtype(x, np.number)))}")
print(f"   - No NaN values: {not df_auto.isnull().any().any()}")
print(f"   - Data types: {df_auto.dtypes.value_counts()}")

# Display summary statistics for target columns
print("\nSummary statistics for target columns:")
display(df_auto[target_cols_list].describe())

# Create output directory and save BOTH datasets
os.makedirs('./dataset/custom/', exist_ok=True)

# CRITICAL FIX: Create a dataset that Autoformer expects
# Autoformer expects: ['date', ...other_columns..., target_column]
# We need to create a proper date column and reorder columns

# Create a date range for the dataset
print(f"\n🔧 Creating proper date column for Autoformer...")
date_range = pd.date_range(start='2024-06-01', periods=len(df_auto), freq='D')
df_auto_with_date = df_auto.copy()
df_auto_with_date.insert(0, 'date', date_range)

# Reorder columns to match Autoformer's expected format: ['date', ...features..., targets]
# Put ALL target columns at the end for multivariate prediction
target_cols = [col for col in target_cols_list if col in df_auto_with_date.columns]
if target_cols:
    # Get all columns except date and targets
    feature_cols = [col for col in df_auto_with_date.columns if col not in ['date'] + target_cols]
    # Reorder: date + features + all targets
    df_auto_with_date = df_auto_with_date[['date'] + feature_cols + target_cols]
    print(f"✅ Reordered columns: date + {len(feature_cols)} features + {len(target_cols)} targets")
    print(f"   - Target columns: {target_cols}")
else:
    print(f"⚠️  No target columns found, using original order")
    target_cols = [df_auto_with_date.columns[-1]]  # Use last column as target

# Save the dataset WITH proper date column and column order (this is what Autoformer will use)
output_path = './dataset/custom/custom.csv'
df_auto_with_date.to_csv(output_path, index=False)

# Save the dataset WITHOUT date column (for reference)
output_path_no_date = './dataset/custom/custom_no_date.csv'
df_auto.to_csv(output_path_no_date, index=False)

print(f"\n✅ Saved training dataset (WITH DATE, PROPER ORDER) to {output_path}")
print(f"✅ Saved reference dataset (NO DATE) to {output_path_no_date}")
print(f"Final training dataset shape: {df_auto_with_date.shape}")
print(f"Training dataset columns: {list(df_auto_with_date.columns)[:15]} ... (total: {len(df_auto_with_date.columns)})")
print(f"Date range: {df_auto_with_date['date'].min()} to {df_auto_with_date['date'].max()}")

# Verify the training dataset has no string columns
print(f"\nFinal verification - Data types in training dataset:")
print(df_auto.dtypes.value_counts())

# Display first few rows of TRAINING dataset (without date)
print(f"\nFirst 5 rows of the TRAINING dataset (no date column):")
display(df_auto.head())

# Display last few rows of TRAINING dataset (without date)
print(f"\nLast 5 rows of the TRAINING dataset (no date column):")
display(df_auto.tail())

# Create a summary file with dataset information
summary_info = {
    'total_rows': len(df_auto),
    'total_columns': len(df_auto.columns),
    'target_columns': target_cols_list,
    'num_target_columns': len(target_cols_list),
    'feature_columns': feature_cols[:20],  # First 20 feature columns
    'num_feature_columns': len(feature_cols),
    'date_range_start': str(df_with_date['date'].min()),
    'date_range_end': str(df_with_date['date'].max()),
    'data_types': str(df_auto.dtypes.value_counts().to_dict())
}

summary_path = './dataset/custom/dataset_summary.txt'
with open(summary_path, 'w') as f:
    f.write("Autoformer Dataset Summary\n")
    f.write("=" * 30 + "\n")
    for key, value in summary_info.items():
        f.write(f"{key}: {value}\n")

print(f"\n📊 Dataset summary saved to: {summary_path}")
print(f"\n🎉 Data preparation completed successfully!")
print(f"   - Use 'custom.csv' for Autoformer training (contains {df_auto.shape[1]} numeric columns)")
print(f"   - Use 'custom_with_date.csv' for reference (contains date + {df_auto.shape[1]} numeric columns)")

Successfully loaded /content/drive/MyDrive/final_concatenated_data_mice_imputed.csv: 618 rows
Tag column dropped
Date range of combined data: 2024-06-01 00:00:00 to 2025-07-31 00:00:00

Filtered data (using full MICE dataset): 618 rows
Filtered date range: 2024-06-01 00:00:00 to 2025-07-31 00:00:00

🎯 Target columns identified: ['Total_Stabilized_Naphtha_Product_Flowrate', 'Total_Kerosene_Product_Flowrate', 'Jet_Fuel_Product_Train1_Flowrate', 'Total_Light_Diesel_Product_Flowrate', 'Total_Heavy_Diesel_Product_Flowrate', 'Total_Atmospheric_Residue_Flowrate', 'Blend_Yield_Gas & LPG', 'Blend_Yield_Kerosene', 'Blend_Yield_Light Diesel', 'Blend_Yield_Heavy Diesel', 'Blend_Yield_RCO']
📈 Feature columns identified: ['Light_Diesel_to_DHT_Unit_Flowrate', 'Kerosene_to_Light_Diesel_DHT_Flowrate', 'Atm_Residue_to_RFCC_Unit_Flowrate', 'Atm_Residue_to_Storage_Flowrate', 'Crude_Column_Naphtha_to_SGCU_Flowrate', 'Heavy_Diesel_to_MHC_Flowrate', 'Atm_Residue_to_Storage_EE1027_Flowrate', 'blend_id', 'API'

Unnamed: 0,Total_Stabilized_Naphtha_Product_Flowrate,Total_Kerosene_Product_Flowrate,Jet_Fuel_Product_Train1_Flowrate,Total_Light_Diesel_Product_Flowrate,Total_Heavy_Diesel_Product_Flowrate,Total_Atmospheric_Residue_Flowrate,Blend_Yield_Gas & LPG,Blend_Yield_Kerosene,Blend_Yield_Light Diesel,Blend_Yield_Heavy Diesel,Blend_Yield_RCO
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,,,,,
std,,,,,,,,,,,
min,,,,,,,,,,,
25%,,,,,,,,,,,
50%,,,,,,,,,,,
75%,,,,,,,,,,,
max,,,,,,,,,,,



✅ Saved training dataset (NO DATE) to ./dataset/custom/custom.csv
✅ Saved reference dataset (WITH DATE) to ./dataset/custom/custom_with_date.csv
Final training dataset shape: (0, 667)
Training dataset columns: ['Total_Stabilized_Naphtha_Product_Flowrate', 'Total_Kerosene_Product_Flowrate', 'Jet_Fuel_Product_Train1_Flowrate', 'Total_Light_Diesel_Product_Flowrate', 'Total_Heavy_Diesel_Product_Flowrate', 'Total_Atmospheric_Residue_Flowrate', 'Blend_Yield_Gas & LPG', 'Blend_Yield_Kerosene', 'Blend_Yield_Light Diesel', 'Blend_Yield_Heavy Diesel', 'Blend_Yield_RCO', 'Light_Diesel_to_DHT_Unit_Flowrate', 'Kerosene_to_Light_Diesel_DHT_Flowrate', 'Atm_Residue_to_RFCC_Unit_Flowrate', 'Atm_Residue_to_Storage_Flowrate'] ... (total: 667)

Final verification - Data types in training dataset:
float64    667
Name: count, dtype: int64

First 5 rows of the TRAINING dataset (no date column):


Unnamed: 0,Total_Stabilized_Naphtha_Product_Flowrate,Total_Kerosene_Product_Flowrate,Jet_Fuel_Product_Train1_Flowrate,Total_Light_Diesel_Product_Flowrate,Total_Heavy_Diesel_Product_Flowrate,Total_Atmospheric_Residue_Flowrate,Blend_Yield_Gas & LPG,Blend_Yield_Kerosene,Blend_Yield_Light Diesel,Blend_Yield_Heavy Diesel,...,101FI8301,101FI8401,101FI8801,101FI8901,crude_WTI Midland,crude_MERO,101FIC3802,101FIC5801,101FIC6702,MW



Last 5 rows of the TRAINING dataset (no date column):


Unnamed: 0,Total_Stabilized_Naphtha_Product_Flowrate,Total_Kerosene_Product_Flowrate,Jet_Fuel_Product_Train1_Flowrate,Total_Light_Diesel_Product_Flowrate,Total_Heavy_Diesel_Product_Flowrate,Total_Atmospheric_Residue_Flowrate,Blend_Yield_Gas & LPG,Blend_Yield_Kerosene,Blend_Yield_Light Diesel,Blend_Yield_Heavy Diesel,...,101FI8301,101FI8401,101FI8801,101FI8901,crude_WTI Midland,crude_MERO,101FIC3802,101FIC5801,101FIC6702,MW



📊 Dataset summary saved to: ./dataset/custom/dataset_summary.txt

🎉 Data preparation completed successfully!
   - Use 'custom.csv' for Autoformer training (contains 667 numeric columns)
   - Use 'custom_with_date.csv' for reference (contains date + 667 numeric columns)


MODEL TRAINING

In [None]:
from exp.exp_main import Exp_Main
import argparse
import torch
import os
import pickle
import json
from datetime import datetime
import pandas as pd # Import pandas to read the CSV and get column count
import numpy as np # Import numpy for numeric check

# First, let's check the dataset size to determine appropriate parameters
print("🔍 Checking dataset size and determining optimal parameters...")
df_size_check = pd.read_csv('./dataset/custom/custom.csv')
dataset_size = len(df_size_check)
print(f"Dataset size: {dataset_size} rows")

# Calculate optimal parameters based on dataset size
# Rule of thumb: seq_len + label_len + pred_len should be < dataset_size
# Leave some buffer for train/val/test split
max_sequence_length = max(1, min(60, dataset_size // 4))  # Use 1/4 of dataset or max 60
max_label_length = max(1, min(24, dataset_size // 8))     # Use 1/8 of dataset or max 24
max_pred_length = max(1, min(7, dataset_size // 12))      # Use 1/12 of dataset or max 7

# Ensure we have enough data for training
min_required_samples = max_sequence_length + max_label_length + max_pred_length + 10  # +10 buffer
if dataset_size < min_required_samples:
    print(f"⚠️  Dataset size ({dataset_size}) is too small for the calculated parameters.")
    print(f"   Minimum required: {min_required_samples}")
    print(f"   Adjusting parameters to fit available data...")
    
    # Use more conservative parameters
    max_sequence_length = max(1, dataset_size // 6)
    max_label_length = max(1, dataset_size // 12)
    max_pred_length = max(1, dataset_size // 20)
    
    print(f"   Adjusted seq_len: {max_sequence_length}")
    print(f"   Adjusted label_len: {max_label_length}")
    print(f"   Adjusted pred_len: {max_pred_length}")

print(f"✅ Using parameters:")
print(f"   - seq_len: {max_sequence_length}")
print(f"   - label_len: {max_label_length}")
print(f"   - pred_len: {max_pred_length}")
print(f"   - Total sequence requirement: {max_sequence_length + max_label_length + max_pred_length}")

# Arguments for Autoformer training and prediction
args = argparse.Namespace(
    is_training=1,
    model_id='autoformer_custom_multivariate_targets', # Changed model_id for clarity
    model='Autoformer',
    data='custom',
    root_path='./dataset/custom/',
    data_path='custom.csv',
    features='M', # Multivariate features
    seq_len=max_sequence_length,         # Dynamically calculated based on dataset size
    label_len=max_label_length,          # Dynamically calculated based on dataset size
    pred_len=max_pred_length,           # Dynamically calculated based on dataset size
    e_layers=1,
    d_layers=1,
    factor=3,
    # enc_in, dec_in will be determined dynamically based on all columns except date and non-numeric
    # c_out will be set to the number of target columns for multivariate output
    d_model=64,
    n_heads=8,
    d_ff=128,
    activation='gelu',
    description='autoformer multivariate target forecasting', # Updated description
    itr=1,
    train_epochs=10,     # keep small for demo
    batch_size=min(16, dataset_size // 10),  # Adjust batch size based on dataset size
    learning_rate=0.001,
    moving_avg=25,
    freq='d',
    dropout=0.3,
    embed='timeF',
    patience=3,
    checkpoint='./checkpoints/',
    output_attention=False,
    do_predict=True,
    # Add GPU arguments
    use_gpu=torch.cuda.is_available(), # Check if GPU is available
    gpu=0, # Specify GPU id, 0 by default
    use_multi_gpu=False, # Set to True if using multiple GPUs
    devices=[0], # Specify device ids if using multi-gpu
    # Add the 'target' argument back, set to the first target column name.
    # The library's data_provider might still need this, even for multivariate features.
    target='Total_Stabilized_Naphtha_Product_Flowrate',
    num_workers=0, # Add num_workers argument
    # Add checkpoints attribute
    checkpoints= './checkpoints/',
    # Add use_amp attribute
    use_amp=False,
    # Add lradj attribute
    lradj='type1'
)

# Determine input/output dimensions dynamically
try:
    # First, check if the file exists and is readable
    file_path = os.path.join(args.root_path, args.data_path)
    print(f"🔍 Checking dataset file: {file_path}")
    
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Dataset file not found: {file_path}")
    
    # Check file size
    file_size = os.path.getsize(file_path)
    print(f"   - File size: {file_size} bytes")
    
    if file_size == 0:
        raise ValueError(f"Dataset file is empty: {file_path}")
    
    # Try to read the file
    print(f"   - Reading dataset...")
    df_temp = pd.read_csv(file_path)
    print(f"   - Successfully loaded dataset with {len(df_temp)} rows and {len(df_temp.columns)} columns")

    # Print all column names to debug
    print("All columns in the dataset:")
    for i, col in enumerate(df_temp.columns):
        print(f"{i}: '{col}' (type: {df_temp[col].dtype})")

    # Verify that the dataset has the expected structure
    if 'date' not in df_temp.columns:
        raise ValueError("❌ Dataset must contain a 'date' column for Autoformer to work properly!")
    
    print(f"✅ Found 'date' column in dataset")

    # Identify target columns from the list used in the data preparation cell
    target_cols_list = [
        'Total_Stabilized_Naphtha_Product_Flowrate',
        'Total_Kerosene_Product_Flowrate',
        'Jet_Fuel_Product_Train1_Flowrate',
        'Total_Light_Diesel_Product_Flowrate',
        'Total_Heavy_Diesel_Product_Flowrate',
        'Total_Atmospheric_Residue_Flowrate',
        'Blend_Yield_Gas & LPG',
        'Blend_Yield_Kerosene',
        'Blend_Yield_Light Diesel',
        'Blend_Yield_Heavy Diesel',
        'Blend_Yield_RCO'
    ]

    # Check if all specified target columns exist in the DataFrame
    missing_targets = [col for col in target_cols_list if col not in df_temp.columns]
    if missing_targets:
        print(f"❌ The following specified target columns were not found in the dataset: {missing_targets}")
        print("Available columns that might be similar:")
        for missing_col in missing_targets:
            similar_cols = [col for col in df_temp.columns if missing_col.lower().replace('_', '').replace(' ', '') in col.lower().replace('_', '').replace(' ', '')]
            if similar_cols:
                print(f"  For '{missing_col}', similar columns found: {similar_cols}")

        # Use only the target columns that exist
        existing_targets = [col for col in target_cols_list if col in df_temp.columns]
        if not existing_targets:
            raise ValueError("No target columns found in the dataset!")
        target_cols_list = existing_targets
        print(f"Using existing target columns: {target_cols_list}")

    # CRITICAL FIX: Convert all columns to numeric types (except date)
    print("\n🔧 Converting all columns to numeric types...")
    non_numeric_cols = []
    for col in df_temp.columns:
        if col != 'date':  # Skip date column
            try:
                # Convert to numeric, coercing errors to NaN
                df_temp[col] = pd.to_numeric(df_temp[col], errors='coerce')
            except Exception as e:
                non_numeric_cols.append(col)
                print(f"⚠️  Could not convert column '{col}' to numeric: {e}")

    if non_numeric_cols:
        print(f"⚠️  Columns that could not be converted to numeric: {non_numeric_cols}")
        # Drop these columns if they can't be converted
        df_temp = df_temp.drop(columns=non_numeric_cols)
        print(f"✅ Dropped {len(non_numeric_cols)} non-numeric columns")

    # Handle missing values after conversion
    print("🔧 Handling missing values...")
    df_temp = df_temp.fillna(method='ffill').fillna(method='bfill')
    
    # Remove any rows that still have NaN values
    initial_rows = len(df_temp)
    df_temp = df_temp.dropna()
    final_rows = len(df_temp)
    if initial_rows != final_rows:
        print(f"⚠️  Removed {initial_rows - final_rows} rows with NaN values")

    # Verify that the primary target column exists and set it correctly
    if args.target not in df_temp.columns:
        print(f"Warning: Primary target column '{args.target}' not found in dataset.")
        if target_cols_list:
            args.target = target_cols_list[0]
            print(f"Setting primary target to: '{args.target}'")
        else:
            # Find the first numeric column as fallback
            numeric_cols = df_temp.select_dtypes(include=np.number).columns.tolist()
            if 'date' in numeric_cols:
                numeric_cols.remove('date')
            if numeric_cols:
                args.target = numeric_cols[0]
                print(f"Using first numeric column as target: '{args.target}'")
            else:
                raise ValueError("No suitable target column found!")

    # Identify columns to be used as features and targets (excluding 'date' and non-numeric, non-target columns)
    # Also ensure target columns are numeric
    numeric_cols = df_temp.select_dtypes(include=np.number).columns.tolist()

    # Remove 'date' from numeric columns if it exists there
    if 'date' in numeric_cols:
        numeric_cols.remove('date')

    # Create list of all relevant columns: date + all numeric columns
    all_relevant_cols = ['date'] + numeric_cols

    # Filter df_temp to only include relevant columns that exist
    existing_cols = [col for col in all_relevant_cols if col in df_temp.columns]
    df_temp_filtered = df_temp[existing_cols].copy()

    # The number of features (enc_in, dec_in) is the total number of numeric columns
    num_features = len(numeric_cols)

    args.enc_in = num_features
    args.dec_in = num_features # Decoder input size matches encoder input size for multivariate features

    # CRITICAL FIX: Configure for multivariate target prediction
    # For multivariate targets, c_out should be the number of target columns
    num_target_columns = len(target_cols_list)
    args.c_out = num_target_columns  # Set to number of target columns for multivariate output
    
    print(f"🔧 Configuring for multivariate prediction:")
    print(f"   - Number of target columns: {num_target_columns}")
    print(f"   - Target columns: {target_cols_list}")
    print(f"   - c_out (output dimension): {args.c_out}")

    print(f"✅ Successfully configured:")
    print(f"   - enc_in, dec_in: {num_features}")
    print(f"   - c_out (output dimension): {args.c_out}")
    print(f"   - Primary target column: '{args.target}'")
    print(f"   - All numeric columns ({len(numeric_cols)}): {numeric_cols[:10]}... (showing first 10)")
    print(f"   - Dataset shape: {df_temp_filtered.shape}")
    
    # Verify data types are correct
    print(f"\n🔍 Final data type verification:")
    print(f"   - All columns are numeric: {all(df_temp[numeric_cols].dtypes.apply(lambda x: np.issubdtype(x, np.number)))}")
    print(f"   - No NaN values: {not df_temp[numeric_cols].isnull().any().any()}")
    print(f"   - Sample values from target column '{args.target}': {df_temp[args.target].head().tolist()}")
    
    # CRITICAL: Validate that we have enough data for the sequence parameters
    print(f"\n🔍 Validating sequence parameters against dataset size:")
    total_sequence_need = args.seq_len + args.label_len + args.pred_len
    available_samples = len(df_temp_filtered)
    
    print(f"   - Required total sequence length: {total_sequence_need}")
    print(f"   - Available samples: {available_samples}")
    print(f"   - Buffer remaining: {available_samples - total_sequence_need}")
    
    # Debug: Check if df_temp_filtered is empty
    if available_samples == 0:
        print(f"❌ CRITICAL ERROR: df_temp_filtered is empty!")
        print(f"   - Original df_temp shape: {df_temp.shape}")
        print(f"   - df_temp columns: {list(df_temp.columns)}")
        print(f"   - df_temp_filtered shape: {df_temp_filtered.shape}")
        print(f"   - existing_cols: {existing_cols}")
        print(f"   - numeric_cols: {numeric_cols}")
        
        # Try to identify the issue
        if len(existing_cols) == 0:
            print(f"   - Issue: No existing columns found!")
        if len(numeric_cols) == 0:
            print(f"   - Issue: No numeric columns found!")
        
        # Try to recover by using all columns except date
        print(f"   - Attempting recovery by using all columns except date...")
        all_cols_except_date = [col for col in df_temp.columns if col != 'date']
        if len(all_cols_except_date) > 0:
            df_temp_filtered = df_temp[['date'] + all_cols_except_date].copy()
            available_samples = len(df_temp_filtered)
            print(f"   - Recovery successful: {available_samples} samples available")
        else:
            raise ValueError("Cannot recover: No columns available for training!")
    
    if available_samples < total_sequence_need:
        print(f"❌ ERROR: Not enough data for the specified sequence parameters!")
        print(f"   Need at least {total_sequence_need} samples, but only have {available_samples}")
        print(f"   Please reduce seq_len, label_len, or pred_len parameters")
        raise ValueError(f"Insufficient data: need {total_sequence_need} samples, have {available_samples}")
    
    # Additional validation for train/val/test split
    # Autoformer typically uses 70% train, 20% val, 10% test
    train_samples = int(available_samples * 0.7)
    val_samples = int(available_samples * 0.2)
    test_samples = available_samples - train_samples - val_samples
    
    print(f"   - Expected train samples: {train_samples}")
    print(f"   - Expected val samples: {val_samples}")
    print(f"   - Expected test samples: {test_samples}")
    
    # Ensure we have enough samples for training after splitting
    min_train_samples = total_sequence_need + 10  # +10 buffer
    if train_samples < min_train_samples:
        print(f"⚠️  WARNING: Training set may be too small after splitting!")
        print(f"   Expected train samples: {train_samples}")
        print(f"   Minimum recommended: {min_train_samples}")
        print(f"   Consider reducing sequence parameters or using more data")

except Exception as e:
    print(f"❌ Error determining input/output dimensions: {e}")
    # Fallback to default or raise error if necessary
    args.enc_in = 1 # Fallback to 1 if unable to read file
    args.dec_in = 1
    args.c_out = 1
    args.target = 'Unknown' # Set target to a placeholder
    print(f"Using default enc_in, dec_in, c_out: {args.enc_in}, {args.dec_in}, {args.c_out}")
    raise e  # Re-raise the error to stop execution


def save_model_to_drive(exp, model_name="autoformer_model", save_path="./saved_models/"):
    """
    Save the trained model and related information to drive

    Args:
        exp: Experiment object containing the trained model
        model_name: Name for the saved model
        save_path: Directory path to save the model
    """
    # Create save directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)

    # Create timestamped filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_filename = f"{model_name}_{timestamp}"

    # Save model state dict
    model_path = os.path.join(save_path, f"{model_filename}.pth")
    torch.save(exp.model.state_dict(), model_path)
    print(f"Model state dict saved to: {model_path}")

    # Save complete model (including architecture)
    complete_model_path = os.path.join(save_path, f"{model_filename}_complete.pth")
    torch.save(exp.model, complete_model_path)
    print(f"Complete model saved to: {complete_model_path}")

    # Save model configuration (args)
    config_path = os.path.join(save_path, f"{model_filename}_config.json")
    config_dict = vars(args)
    # Convert non-serializable objects to strings
    for key, value in config_dict.items():
        if not isinstance(value, (int, float, str, bool, list, dict, type(None))):
            config_dict[key] = str(value)

    with open(config_path, 'w') as f:
        json.dump(config_dict, f, indent=4)
    print(f"Model configuration saved to: {config_path}")

    # Save experiment object (if needed for later use)
    exp_path = os.path.join(save_path, f"{model_filename}_experiment.pkl")
    try:
        with open(exp_path, 'wb') as f:
            pickle.dump(exp, f)
        print(f"Experiment object saved to: {exp_path}")
    except Exception as e:
        print(f"Warning: Could not save experiment object: {e}")

    # Create a summary file
    summary_path = os.path.join(save_path, f"{model_filename}_summary.txt")
    with open(summary_path, 'w') as f:
        f.write(f"Autoformer Model Summary\n")
        f.write(f"========================\n")
        f.write(f"Training Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Model ID: {args.model_id}\n")
        f.write(f"Model Type: {args.model}\n")
        f.write(f"Training Epochs: {args.train_epochs}\n")
        f.write(f"Sequence Length: {args.seq_len}\n")
        f.write(f"Prediction Length: {args.pred_len}\n")
        f.write(f"Learning Rate: {args.learning_rate}\n")
        f.write(f"Batch Size: {args.batch_size}\n")
        f.write(f"GPU Used: {args.use_gpu}\n")
        f.write(f"Primary Target: {args.target}\n")
        f.write(f"Input Features: {args.enc_in}\n")
        f.write(f"Output Dimension: {args.c_out}\n")
        f.write(f"\nFiles saved:\n")
        f.write(f"- Model state dict: {model_filename}.pth\n")
        f.write(f"- Complete model: {model_filename}_complete.pth\n")
        f.write(f"- Configuration: {model_filename}_config.json\n")
        f.write(f"- Summary: {model_filename}_summary.txt\n")

    print(f"Model summary saved to: {summary_path}")
    print(f"\nAll model files saved successfully in: {save_path}")

    return {
        'model_path': model_path,
        'complete_model_path': complete_model_path,
        'config_path': config_path,
        'summary_path': summary_path
    }

def load_saved_model(model_path, config_path, device=None):
    """
    Load a previously saved model

    Args:
        model_path: Path to the saved model state dict
        config_path: Path to the saved configuration
        device: Device to load the model on

    Returns:
        Loaded model and configuration
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load configuration
    with open(config_path, 'r') as f:
        config = json.load(f)

    # Recreate args namespace
    loaded_args = argparse.Namespace(**config)

    # Create experiment object
    exp = Exp_Main(loaded_args)

    # Load model state dict
    model_state = torch.load(model_path, map_location=device)
    exp.model.load_state_dict(model_state)
    exp.model.eval()

    print(f"Model loaded successfully from: {model_path}")
    return exp, loaded_args

# Verify the target column exists before proceeding
try:
    print("🔍 Verifying data before training...")
    df_verify = pd.read_csv(os.path.join(args.root_path, args.data_path))
    if args.target not in df_verify.columns:
        raise ValueError(f"Target column '{args.target}' not found in the dataset columns: {list(df_verify.columns)}")
    print(f"✅ Target column '{args.target}' verified in dataset")

    # Check if there's a 'date' column
    if 'date' not in df_verify.columns:
        print("⚠️  Warning: No 'date' column found. Available columns:")
        print(list(df_verify.columns))
    else:
        print("✅ Date column found in dataset")

except Exception as e:
    print(f"❌ Error verifying data: {e}")
    raise e

# Run training and prediction
print("\n🚀 Starting Autoformer experiment...")
exp = Exp_Main(args)
print("📊 Start training...")
exp.train(setting='autoformer_custom_multivariate_targets') # Updated setting name

print("💾 Training completed! Saving model to drive...")
saved_files = save_model_to_drive(exp, model_name="autoformer_custom_multivariate_targets", save_path="./saved_models/") # Updated model name

print("🔮 Start predicting...")
# Modify the predict call if needed based on Autoformer's prediction output
exp.predict(setting='autoformer_custom_multivariate_targets', load=True) # Updated setting name

print("🎉 Training and prediction completed!")
print(f"📁 Model saved to: {saved_files['model_path']}")

# Example of how to load the model later:
# exp_loaded, args_loaded = load_saved_model(
#     saved_files['model_path'],
#     saved_files['config_path']
# )
# print("Model loaded successfully for future use!")

All columns in the dataset:
0: 'Total_Stabilized_Naphtha_Product_Flowrate' (type: object)
1: 'Total_Kerosene_Product_Flowrate' (type: object)
2: 'Jet_Fuel_Product_Train1_Flowrate' (type: object)
3: 'Total_Light_Diesel_Product_Flowrate' (type: object)
4: 'Total_Heavy_Diesel_Product_Flowrate' (type: object)
5: 'Total_Atmospheric_Residue_Flowrate' (type: object)
6: 'Blend_Yield_Gas & LPG' (type: object)
7: 'Blend_Yield_Kerosene' (type: object)
8: 'Blend_Yield_Light Diesel' (type: object)
9: 'Blend_Yield_Heavy Diesel' (type: object)
10: 'Blend_Yield_RCO' (type: object)
11: 'Light_Diesel_to_DHT_Unit_Flowrate' (type: object)
12: 'Kerosene_to_Light_Diesel_DHT_Flowrate' (type: object)
13: 'Atm_Residue_to_RFCC_Unit_Flowrate' (type: object)
14: 'Atm_Residue_to_Storage_Flowrate' (type: object)
15: 'Crude_Column_Naphtha_to_SGCU_Flowrate' (type: object)
16: 'Heavy_Diesel_to_MHC_Flowrate' (type: object)
17: 'Atm_Residue_to_Storage_EE1027_Flowrate' (type: object)
18: 'blend_id' (type: object)
19: 'AP

ValueError: list.remove(x): x not in list

MODEL EVAL

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch
import os
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

def get_test_predictions_manual(exp, args):
    """
    Manual method to get test predictions when data_provider is not available
    """
    try:
        # Create a simple data loader for the test data
        import pandas as pd
        from torch.utils.data import DataLoader, Dataset

        # Load the dataset
        df = pd.read_csv(os.path.join(args.root_path, args.data_path))

        # Simple train/test split (you may need to adjust this based on your data)
        train_size = int(len(df) * 0.8)
        test_df = df[train_size:]

        print(f"✓ Manual data loading: {len(test_df)} test samples")

        # Simple dataset class
        class SimpleDataset(Dataset):
            def __init__(self, data, seq_len, pred_len):
                self.data = data.values
                self.seq_len = seq_len
                self.pred_len = pred_len

            def __len__(self):
                return len(self.data) - self.seq_len - self.pred_len + 1

            def __getitem__(self, idx):
                seq_x = self.data[idx:idx + self.seq_len]
                seq_y = self.data[idx + self.seq_len:idx + self.seq_len + self.pred_len]
                return torch.FloatTensor(seq_x), torch.FloatTensor(seq_y)

        # Create dataset and dataloader
        test_dataset = SimpleDataset(test_df, args.seq_len, args.pred_len)
        test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)

        # Get predictions
        predictions = []
        true_values = []

        exp.model.eval()
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                if args.use_gpu:
                    batch_x = batch_x.cuda()
                    batch_y = batch_y.cuda()

                # Simple prediction (you may need to adjust based on your model)
                pred = exp.model(batch_x)

                predictions.append(pred.cpu().numpy())
                true_values.append(batch_y.cpu().numpy())

        if predictions and true_values:
            y_pred = np.concatenate(predictions, axis=0)
            y_true = np.concatenate(true_values, axis=0)
            return y_pred, y_true
        else:
            return None, None

    except Exception as e:
        print(f"Manual prediction method failed: {e}")
        return None, None

def calculate_metrics(y_true, y_pred):
    """
    Calculate comprehensive evaluation metrics

    Args:
        y_true: True values
        y_pred: Predicted values

    Returns:
        Dictionary of metrics
    """
    # Convert to numpy arrays if they're tensors
    if torch.is_tensor(y_true):
        y_true = y_true.cpu().numpy()
    if torch.is_tensor(y_pred):
        y_pred = y_pred.cpu().numpy()

    # Flatten arrays if multidimensional
    y_true = y_true.flatten()
    y_pred = y_pred.flatten()

    # Remove any NaN values
    mask = ~(np.isnan(y_true) | np.isnan(y_pred))
    y_true = y_true[mask]
    y_pred = y_pred[mask]

    # Calculate metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)

    # MAPE (Mean Absolute Percentage Error)
    mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1e-8))) * 100

    # MSPE (Mean Squared Percentage Error)
    mspe = np.mean(((y_true - y_pred) / (y_true + 1e-8)) ** 2) * 100

    # R² Score
    r2 = r2_score(y_true, y_pred)

    # Directional Accuracy (for time series)
    if len(y_true) > 1:
        true_direction = np.sign(np.diff(y_true))
        pred_direction = np.sign(np.diff(y_pred))
        directional_accuracy = np.mean(true_direction == pred_direction) * 100
    else:
        directional_accuracy = 0

    return {
        'MSE': float(mse),
        'RMSE': float(rmse),
        'MAE': float(mae),
        'MAPE': float(mape),
        'MSPE': float(mspe),
        'R2': float(r2),
        'Directional_Accuracy': float(directional_accuracy)
    }

def create_evaluation_plots(y_true, y_pred, save_path="./evaluation_plots/"):
    """
    Create comprehensive evaluation plots

    Args:
        y_true: True values
        y_pred: Predicted values
        save_path: Directory to save plots
    """
    os.makedirs(save_path, exist_ok=True)

    # Convert to numpy arrays if they're tensors
    if torch.is_tensor(y_true):
        y_true = y_true.cpu().numpy()
    if torch.is_tensor(y_pred):
        y_pred = y_pred.cpu().numpy()

    # Flatten arrays if multidimensional
    y_true = y_true.flatten()
    y_pred = y_pred.flatten()

    # Create subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Model Evaluation Results', fontsize=16, fontweight='bold')

    # Plot 1: Actual vs Predicted (Time Series)
    axes[0, 0].plot(y_true, label='Actual', color='blue', alpha=0.7)
    axes[0, 0].plot(y_pred, label='Predicted', color='red', alpha=0.7)
    axes[0, 0].set_title('Actual vs Predicted Values (Time Series)')
    axes[0, 0].set_xlabel('Time Steps')
    axes[0, 0].set_ylabel('Values')
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)

    # Plot 2: Scatter Plot
    axes[0, 1].scatter(y_true, y_pred, alpha=0.6, color='green')
    axes[0, 1].plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()],
                    'r--', lw=2, label='Perfect Prediction')
    axes[0, 1].set_title('Scatter Plot: Actual vs Predicted')
    axes[0, 1].set_xlabel('Actual Values')
    axes[0, 1].set_ylabel('Predicted Values')
    axes[0, 1].legend()
    axes[0, 1].grid(True, alpha=0.3)

    # Plot 3: Residuals
    residuals = y_true - y_pred
    axes[1, 0].scatter(y_pred, residuals, alpha=0.6, color='orange')
    axes[1, 0].axhline(y=0, color='r', linestyle='--')
    axes[1, 0].set_title('Residual Plot')
    axes[1, 0].set_xlabel('Predicted Values')
    axes[1, 0].set_ylabel('Residuals')
    axes[1, 0].grid(True, alpha=0.3)

    # Plot 4: Residuals Distribution
    axes[1, 1].hist(residuals, bins=50, alpha=0.7, color='purple', edgecolor='black')
    axes[1, 1].set_title('Distribution of Residuals')
    axes[1, 1].set_xlabel('Residuals')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the plot
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    plot_filename = os.path.join(save_path, f"evaluation_plots_{timestamp}.png")
    plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Evaluation plots saved to: {plot_filename}")
    return plot_filename

def comprehensive_model_evaluation(exp, args, save_results=True):
    """
    Perform comprehensive evaluation of the trained model

    Args:
        exp: Experiment object
        args: Arguments namespace
        save_results: Whether to save results to files

    Returns:
        Dictionary containing evaluation results
    """
    print("\n" + "="*50)
    print("COMPREHENSIVE MODEL EVALUATION")
    print("="*50)

    # Set model to evaluation mode
    exp.model.eval()

    # Set is_training to 0 for evaluation
    args.is_training = 0

    # Create evaluation results directory
    eval_save_path = './evaluation_results/'
    os.makedirs(eval_save_path, exist_ok=True)

    # Initialize results dictionary
    eval_results = {
        'model_id': args.model_id,
        'model_type': args.model,
        'setting': 'autoformer_custom_autoformer_custom',
        'timestamp': datetime.now().strftime("%Y%m%d_%H%M%S"),
        'evaluation_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    print("1. Running test method for standard evaluation...")
    # Run the standard test method
    try:
        exp.test(setting='autoformer_custom_autoformer_custom')
        print("✓ Standard test method completed successfully")
    except Exception as e:
        print(f"⚠ Warning: Standard test method failed: {e}")

    print("\n2. Performing detailed evaluation...")

    # Get predictions and true values for detailed analysis
    try:
        # Method 1: Use the built-in data provider from Exp_Main
        from data_provider.data_factory import data_provider

        # Get test data loader
        test_data, test_loader = data_provider(args, 'test')

        print(f"✓ Test data loaded successfully. Dataset size: {len(test_data)}")

        # Get predictions using the test data loader
        with torch.no_grad():
            predictions = []
            true_values = []

            exp.model.eval()

            for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(test_loader):
                # Move to device if GPU is available
                if args.use_gpu:
                    batch_x = batch_x.float().cuda()
                    batch_y = batch_y.float().cuda()
                    batch_x_mark = batch_x_mark.float().cuda()
                    batch_y_mark = batch_y_mark.float().cuda()
                else:
                    batch_x = batch_x.float()
                    batch_y = batch_y.float()
                    batch_x_mark = batch_x_mark.float()
                    batch_y_mark = batch_y_mark.float()

                # Decoder input
                dec_inp = torch.zeros_like(batch_y[:, -args.pred_len:, :]).float()
                dec_inp = torch.cat([batch_y[:, :args.label_len, :], dec_inp], dim=1).float()

                if args.use_gpu:
                    dec_inp = dec_inp.cuda()

                # Get prediction
                if args.output_attention:
                    outputs = exp.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)[0]
                else:
                    outputs = exp.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)

                # Extract the prediction part (last pred_len time steps)
                pred = outputs[:, -args.pred_len:, :]
                true = batch_y[:, -args.pred_len:, :]

                predictions.append(pred.cpu().numpy())
                true_values.append(true.cpu().numpy())

                # Print progress for large datasets
                if i > 0 and i % 50 == 0:
                    print(f"   Processed {i}/{len(test_loader)} batches...")

            if predictions and true_values:
                y_pred = np.concatenate(predictions, axis=0)
                y_true = np.concatenate(true_values, axis=0)

                print(f"✓ Predictions generated. Shape: {y_pred.shape}")

                # Calculate detailed metrics
                metrics = calculate_metrics(y_true, y_pred)
                eval_results.update(metrics)

                # Print metrics
                print("\n3. Detailed Evaluation Metrics:")
                print("-" * 40)
                for metric, value in metrics.items():
                    print(f"{metric:<20}: {value:.6f}")

                # Create plots
                print("\n4. Creating evaluation plots...")
                plot_filename = create_evaluation_plots(y_true, y_pred)
                eval_results['plot_filename'] = plot_filename

                print("✓ Detailed evaluation completed successfully")

                # Save predictions for further analysis
                pred_save_path = './evaluation_results/'
                os.makedirs(pred_save_path, exist_ok=True)
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

                # Save predictions as numpy arrays
                np.save(os.path.join(pred_save_path, f"predictions_{timestamp}.npy"), y_pred)
                np.save(os.path.join(pred_save_path, f"true_values_{timestamp}.npy"), y_true)

                print(f"✓ Predictions saved to: {pred_save_path}")

            else:
                print("⚠ Warning: Could not extract predictions for detailed analysis")

    except ImportError as e:
        print(f"⚠ Warning: Could not import data_provider: {e}")
        print("Trying alternative method...")

        # Method 2: Alternative approach using exp object's data loading
        try:
            # Set up data loader manually
            args.is_training = 0  # Ensure we're in test mode

            # Try to recreate the experiment for testing
            from exp.exp_main import Exp_Main
            exp_test = Exp_Main(args)

            # Get test predictions using the vali method (which is often used for testing)
            print("Using validation method for evaluation...")

            # The vali method in Exp_Main typically returns metrics
            # We'll capture the model's predictions during validation
            vali_results = exp_test.vali(test_loader=None, criterion=None)

            print("✓ Validation-based evaluation completed")

        except Exception as e2:
            print(f"⚠ Warning: Alternative evaluation method failed: {e2}")
            print("Trying manual prediction method...")

            # Method 3: Manual prediction method
            y_pred, y_true = get_test_predictions_manual(exp, args)

            if y_pred is not None and y_true is not None:
                print("✓ Manual prediction method succeeded")

                # Calculate detailed metrics
                metrics = calculate_metrics(y_true, y_pred)
                eval_results.update(metrics)

                # Print metrics
                print("\n3. Detailed Evaluation Metrics:")
                print("-" * 40)
                for metric, value in metrics.items():
                    print(f"{metric:<20}: {value:.6f}")

                # Create plots
                print("\n4. Creating evaluation plots...")
                plot_filename = create_evaluation_plots(y_true, y_pred)
                eval_results['plot_filename'] = plot_filename

                print("✓ Detailed evaluation completed successfully")
            else:
                print("⚠ All prediction methods failed")

    except Exception as e:
        print(f"⚠ Warning: Detailed evaluation failed: {e}")
        print("Please ensure your data_provider module is properly configured.")

        # Add basic model information even if detailed evaluation fails
        eval_results.update({
            'MSE': 'N/A',
            'RMSE': 'N/A',
            'MAE': 'N/A',
            'MAPE': 'N/A',
            'MSPE': 'N/A',
            'R2': 'N/A',
            'Directional_Accuracy': 'N/A'
        })

    # Add model configuration to results
    eval_results['model_config'] = {
        'seq_len': args.seq_len,
        'pred_len': args.pred_len,
        'train_epochs': args.train_epochs,
        'batch_size': args.batch_size,
        'learning_rate': args.learning_rate,
        'model_layers': f"e_layers: {args.e_layers}, d_layers: {args.d_layers}",
        'model_dim': args.d_model,
        'n_heads': args.n_heads
    }

    # Save results if requested
    if save_results:
        print("\n5. Saving evaluation results...")
        eval_filename = os.path.join(eval_save_path, f"eval_results_{eval_results['timestamp']}.json")

        with open(eval_filename, 'w') as f:
            json.dump(eval_results, f, indent=4)

        print(f"✓ Evaluation results saved to: {eval_filename}")

        # Create summary report
        summary_filename = os.path.join(eval_save_path, f"eval_summary_{eval_results['timestamp']}.txt")
        with open(summary_filename, 'w') as f:
            f.write("AUTOFORMER MODEL EVALUATION SUMMARY\n")
            f.write("="*50 + "\n")
            f.write(f"Evaluation Date: {eval_results['evaluation_date']}\n")
            f.write(f"Model ID: {eval_results['model_id']}\n")
            f.write(f"Model Type: {eval_results['model_type']}\n")
            f.write(f"Setting: {eval_results['setting']}\n\n")

            f.write("MODEL CONFIGURATION:\n")
            f.write("-" * 20 + "\n")
            for key, value in eval_results['model_config'].items():
                f.write(f"{key}: {value}\n")

            f.write("\nEVALUATION METRICS:\n")
            f.write("-" * 20 + "\n")
            metrics = ['MSE', 'RMSE', 'MAE', 'MAPE', 'MSPE', 'R2', 'Directional_Accuracy']
            for metric in metrics:
                if metric in eval_results:
                    f.write(f"{metric}: {eval_results[metric]}\n")

        print(f"✓ Evaluation summary saved to: {summary_filename}")
        eval_results['summary_filename'] = summary_filename

    print("\n" + "="*50)
    print("EVALUATION COMPLETED SUCCESSFULLY!")
    print("="*50)

    return eval_results

# MAIN EVALUATION EXECUTION
if __name__ == "__main__":
    # This assumes the 'exp' and 'args' objects are available from the training script
    # If running separately, you would need to load the model first:
    # exp, args = load_saved_model(model_path, config_path)

    print("\nStarting comprehensive model evaluation...")

    # Perform comprehensive evaluation
    evaluation_results = comprehensive_model_evaluation(exp, args, save_results=True)

    # Print final summary
    print("\nFINAL EVALUATION SUMMARY:")
    print("-" * 30)
    if 'MSE' in evaluation_results and evaluation_results['MSE'] != 'N/A':
        print(f"Mean Squared Error (MSE): {evaluation_results['MSE']:.6f}")
        print(f"Root Mean Squared Error (RMSE): {evaluation_results['RMSE']:.6f}")
        print(f"Mean Absolute Error (MAE): {evaluation_results['MAE']:.6f}")
        print(f"Mean Absolute Percentage Error (MAPE): {evaluation_results['MAPE']:.2f}%")
        print(f"R² Score: {evaluation_results['R2']:.6f}")

    print(f"\nAll evaluation files saved in: ./evaluation_results/")
    print("Evaluation completed successfully!")

In [None]:
# Fix the numpy.Inf deprecation error in utils/tools.py
!sed -i 's/np.Inf/np.inf/g' ./utils/tools.py

print("Fixed np.Inf in utils/tools.py")

## 🔍 Post-Hoc Interpretation with Explainable Boosting Machine (EBM)

Expalainable Data Prep

In [None]:

# Step 1: Prepare Static Covariates and Forecast Averages
import pandas as pd

# Assuming original dataset is loaded as 'df' and forecast saved to 'forecast_df'
static_columns= [col for col in df.columns if col.startswith('static_') or col.startswith('')]  # Replace with your static features
target_column = "RCO_Yield"  # Example target; modify as needed

# Extract static features (latest available per batch)
df['crude_batch_id'] = df['future_Blend_Num'].astype(str)
latest_static = df.groupby("crude_batch_id").last()
static_covariates = latest_static[static_columns]

# Compute average predictions per batch
forecast_df['crude_batch_id'] = forecast_df['future_Blend_Num'].astype(str)
target_avg = forecast_df.groupby("crude_batch_id")[[target_column]].mean()

# Merge into EBM dataset
ebm_data = static_covariates.join(target_avg, how="inner").dropna()
X = ebm_data[static_columns]
y = ebm_data[target_column]


EBM Fitting

In [None]:

# Step 2: Train the EBM Model
from interpret.glassbox import ExplainableBoostingRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ebm = ExplainableBoostingRegressor(interactions=10)
ebm.fit(X_train, y_train)


Global Interpretation

In [None]:

# Step 3: Interpret Feature Importance and Sample Explanation
from interpret import show

# Global explanation
ebm_global = ebm.explain_global()
show(ebm_global)


Local Interpretation

In [None]:

# Optional: Local explanation for one prediction
sample = X_test.iloc[[0]]
ebm_local = ebm.explain_local(sample, y_test.iloc[[0]])
show(ebm_local)


Feature Attribution

In [None]:

# Optional: Bar Plot of Feature Importances
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 5))
plt.barh(X.columns, ebm.feature_importances_)
plt.title("EBM Feature Importances on RCO_Yield")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()


In [None]:
print("Columns available in the df DataFrame:")
print(df.columns.tolist())

In [None]:
import pandas as pd

file_path = '/content/drive/MyDrive/processed/final_concatenated_data_mice_imputed.csv'

try:
    df_columns = pd.read_csv(file_path, nrows=0).columns.tolist()
    print("Column names in the file:")
    for col in df_columns:
        print(col)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")