Connected to .venv (Python 3.10.16)

In [None]:
#Import Pypots Library
from pypots.optim import Adam
from pypots.imputation import SAITS
#from pypots.utils.metrics import calc_mae
from pypots.nn.functional import calc_mae


import argparse
import hashlib
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import mlflow.pytorch
import shap
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.utils.data
import data_insight
from data_insight import setup_duckdb
from duckdb import DuckDBPyConnection as DuckDB
from duckdb import DuckDBPyRelation as Relation
from pathlib import Path
import hashlib
from duckdb import DuckDBPyConnection as DuckDB
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import optuna 
from optuna.visualization import plot_optimization_history




from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import TensorDataset, Dataset
from pygrinder.missing_completely_at_random import mcar
from tqdm.auto import tqdm

import sensor_imputation_thesis.shared.load_data as load

torch.cuda.empty_cache()
#PatchTST might be an ideal choise if SAITS is too slow 

##Drop columns with different indexes while loading data.. Or the mean values 

df=pd.read_parquet("/home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/nadire/Newdataframeforpypots.parquet")

len(df)

#current length of the dataframe is 119439

# Check nan values in each column
for col in df.columns:
    print(f"Column {col} has {df[col].isna().sum()} NaN values")
    missing_rate=df[col].isna().sum()/len(df[col])
    print(f"Column {col} has {missing_rate} Missing_rate")


#Try with smaller dataset, size 4000
##SAMPLE the percengtage of the dataset, df.sample (averagely pick samples)
#not df.sample cuz it will randomly select 
original_size=len(df)
desired_fraction=0.3 #Select data every 3 minutes 
step=int(1/desired_fraction) #step_size=10 (sample every 10th (3/10) minute)

#Systematic sampling: Start at a random offset to avoid bias 
start=np.random.randint(0,step) #Random start between 0-9
df1=df.iloc[start::step].reset_index(drop=True)

print(f"Original size:{len(df)}, Sampled size: {len(df1)}")



# Custom Dataset class
class Dataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Data processing code
sensor_cols = [col for col in df1.columns if col != "time"]
data = df1[sensor_cols].values

#¤get feature names for printing mae later 
feature_names=df1[sensor_cols].columns.tolist()

## Convert data to 3D arrays of shape n_samples, n_timesteps, n_features, X_ori refers to the original data without missing values 
## Reconstruct all columns simultaneously  #num_features: 119
n_features = data.shape[1]  # exclude the time column
n_steps = 20 #60 (was 60 previously) #(TRY TO CHANGE HERE)  # # window length, 1440 steps = 24 hours of 1-minute data, but here is revised to 60 again
#total_elements = data.shape[0] * data.shape[1]
n_samples = data.shape[0] // n_steps 



# Reshape to (n_samples // n_steps, n_steps, n_features)
#data_reshaped = data.reshape((n_samples, n_steps, n_features))
data_reshaped=data[:n_samples*n_steps].reshape(n_samples,n_steps,n_features)
print(f"Reshaped data:{data.shape}")

#Split into train, test, val, fit scaler only on the train set (prevent data leakage)

#train_size = int(0.6 * len(data))
#val_size = int(0.2 * len(data))
#test_size = len(data) - train_size - val_size

#train_data = data_reshaped[:train_size]
#val_data = data_reshaped[train_size:train_size + val_size]
#test_data= data_reshaped[train_size + val_size:]


#Apply time series split 
#Split into train(60%), val(20%), and test (20%)
train_data, temp_data=train_test_split(data_reshaped,test_size=0.4,shuffle=True)
val_data, test_data=train_test_split(temp_data, test_size=0.5, shuffle=False)

##Normalization is important because of the nature of mse calculation of saits, columns with large 
#values dominate the loss, making metrics meaningless. SAITS computes MSE/MAE column-wise and averages 
#them across all columns 
#  Apply minmax scaler here 
#normalize each feature independently
scalers={}


#train_scaled = np.zeros_like(data_reshaped[train_size])  # Initialize the normalized data array
#val_scaled=np.zeros_like(data_reshaped[train_size:train_size + val_size])
#test_scaled=np.zeros_like(data_reshaped[train_size + val_size:])

train_scaled = np.zeros_like(train_data)
val_scaled = np.zeros_like(val_data)
test_scaled = np.zeros_like(test_data)


for i in range(data_reshaped.shape[2]):
    scaler = MinMaxScaler(feature_range=(-1, 1)) #changed to -1,1
    # Flatten timesteps and samples for scaling
    train_scaled[:, :, i] = scaler.fit_transform(train_data[:, :, i].reshape(-1, 1)).reshape(train_data.shape[0], train_data.shape[1])
    val_scaled[:, :, i] = scaler.transform(val_data[:, :, i].reshape(-1, 1)).reshape(val_data.shape[0], val_data.shape[1])
    test_scaled[:, :, i] = scaler.transform(test_data[:, :, i].reshape(-1, 1)).reshape(test_data.shape[0], test_data.shape[1])
    scalers[i] = scaler  # Save scalers to inverse-transform later

#Inverse Scale
def inverse_scale(imputation, scalers):
    n_features = imputation.shape[2]
    imputation_denorm = np.empty_like(imputation)
    
    for i in range(n_features):
        imputation_denorm[:, :, i] = scalers[i].inverse_transform(imputation[:, :, i].reshape(-1, 1)).reshape(imputation.shape[0], imputation.shape[1])
    
    return imputation_denorm  


#Optional: Artificially mask. Mask 20% of the data (MIT part)
def mcar_f(X, mask_ratio=0.2):
    """Apply MCAR only to observed values."""
    observed_mask=~np.isnan(X) #find observed positions
    artificial_mask=mcar(X,mask_ratio).astype(bool) #generate MCAR mask, cast to boolean
    #combine masks 
    combined_mask=observed_mask & artificial_mask

    #Apply masking
    X_masked=X.copy()
    X_masked[combined_mask]=np.nan
    return X_masked,combined_mask


#Use mcar on validation data 
val_X_masked, val_mask =mcar_f(val_scaled)
val_X_ori=val_scaled.copy() 

test_X_masked, test_mask =mcar_f(test_scaled)
test_X_ori=test_scaled.copy() 

#?? Problem: Can't have the best input for testing
#1.Create synthetic test_data cuz if I drop nan values for test set, there's basically nothing left
#synthetic_data=np.random.randn(n_samples,n_steps,n_features)
#test_X_masked,test_mask=mcar_f(synthetic_data)
#test_X_ori=synthetic_data.copy() #Ground truth

# 2, Ensure no NaN values in synthetic data
#test_X_masked = np.nan_to_num(test_X_masked, nan=np.nanmean(test_X_masked))
#test_X_ori = np.nan_to_num(test_X_ori, nan=np.nanmean(test_X_ori))



class Config:
    no_cuda = False
    no_mps = False
    seed = 1

args=Config()

torch.manual_seed(args.seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(args.seed)
np.random.seed(args.seed)


args.cuda = not args.no_cuda and torch.cuda.is_available()
use_mps = not args.no_mps and torch.backends.mps.is_available()

if args.cuda:
    device = torch.device("cuda")
    print("Using CUDA")
elif use_mps:
    device = torch.device("mps")
    print("Using MPS")
else:
    device = torch.device("cpu")
    print("Using CPU")


#MLflow set up
mlflow.set_tracking_uri("http:127.0.0.1:5001")
client = mlflow.tracking.MlflowClient()
mlflow.set_experiment("SAITS_3")
SAITS_run_name = "SAITS_1"



with mlflow.start_run(run_name=SAITS_run_name) as run:
# initialize the model (from example)
    saits = SAITS(
        n_steps=data_reshaped.shape[1],
        n_features=data_reshaped.shape[2],
        #n_layers=2, #deep network(4) for long sequences, here is revised for 2 since it always shows cuda error 
        n_layers=3,
        d_model=512,
        optimizer=Adam(lr=1e-3),
        #lr=params["lr"],
        ORT_weight=1,
        MIT_weight=1,
        #d_model=512,  #d_modle must equal n_heads * d_k
        d_ffn=512,
        n_heads=8,
        d_k=64,
        d_v=64,
        dropout=0.1,
        attn_dropout=0.1,
        diagonal_attention_mask=True,  # otherwise the original self-attention mechanism will be applied
        #ORT_weight=1,  # you can adjust the weight values of arguments ORT_weight
       # and MIT_weight to make the SAITS model focus more on one task. Usually you can just leave them to the default values, i.e. 1.
        #MIT_weight=1,
        batch_size=5, #try with 5 to see if it works, was 4. 
        # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
        epochs=10, #try with 10 to see if it runs
        # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
        # You can leave it to defualt as None to disable early stopping.
        patience=6, #initially was 3, tried to increase to see more possibilities 
        # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
        # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
        #optimizer=Adam(lr=1e-3),
        # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
        # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
        # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
        num_workers=0,
        # just leave it to default as None, PyPOTS will automatically assign the best device for you.
        # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
        device="CUDA", 
        # set the path for saving tensorboard and trained model files 
        saving_path="/home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/nadire/best_model",
        # only save the best model after training finished.
        # You can also set it as "better" to save models performing better ever during training.
        model_saving_strategy="best",
)





# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
#use original missigness for trainig, use the mcar masked points as ground truth
saits.fit(train_set={"X": train_scaled}, val_set={"X": val_X_masked , "X_ori":val_X_ori })


##drop null values in test set 

# Check for NaN values across the entire array
#nan_mask = np.isnan(test_X).any(axis=(1, 2))

# Filter out samples (rows) that contain NaN values
#test_X_clean = test_X[~nan_mask]
#test_X_ori = test_X_clean
#test_X_masked, test_mask = mcar_f(test_X_clean)

# the testing stage, impute the originally-missing values and artificially-missing values in the test set
# Convert the numpy array to a dictionary


# Ensure saits_imputation and test_X are numpy arrays
#if not isinstance(saits_imputation, np.ndarray):
    #saits_imputation = np.array(saits_imputation)
#if not isinstance(test_X, np.ndarray):
     #test_X = np.array(test_X)

#Inverse Scale
def inverse_scale(imputation, scalers):
    n_features = imputation.shape[2]
    imputation_denorm = np.empty_like(imputation)
    
    for i in range(n_features):
        imputation_denorm[:, :, i] = scalers[i].inverse_transform(imputation[:, :, i].reshape(-1, 1)).reshape(imputation.shape[0], imputation.shape[1])
    
    return imputation_denorm  

    
#Apply function to the dataset 
test_set_dict = {"X": test_X_masked}
test_imputation = saits.predict(test_set_dict)
test_imputation_array = test_imputation["imputation"]
test_imputation_denorm = inverse_scale(test_imputation_array, scalers)
test_ori_denorm=inverse_scale(test_X_ori, scalers)

if args.cuda:
    torch.cuda.empty_cache() 


#Calculate metrics per-feature: mean absolute error on the ground truth (artificially-missing values)
mae_per_feature=[]
percentage_mae_per_feature=[]

for i in range(n_features):
    #Extract imputation and ground truth for feature i
    imputation_i=test_imputation_denorm[:,:,i]
    ground_truth_i=test_ori_denorm[:,:,i]
    mask_i=test_mask[:,:,i]
    # Check for NaN values
    if np.isnan(imputation_i).any() or np.isnan(ground_truth_i).any():
        print(f"NaN values detected in feature {i}")
        continue  # Skip this feature if NaN values are found
    #Filter only artificially masked positions
    mae_i=calc_mae(imputation_i,ground_truth_i,mask_i)
    mae_per_feature.append(mae_i)
    #Calculate the original standard deviation for the feature
    std_dev_i = np.std(ground_truth_i[mask_i == 1])
    # Calculate the percentage of MAE relative to the standard deviation   
    if std_dev_i != 0:
        percentage_mae_i = (mae_i / std_dev_i) * 100
        percentage_mae_per_feature.append(percentage_mae_i)
    else:
        percentage_mae_i = float('inf')
            
    mlflow.log_metric(f"MAE_{feature_names[i]}", mae_i)
    mlflow.log_metric(f"Percentage_MAE_{feature_names[i]}", percentage_mae_i)

    print(f"MAE for {feature_names[i]}: {mae_i:.4f}")
    mlflow.log_metric(f"MAE_{feature_names[i]}", mae_i)

# calculate average MAE 
avg_mae=np.mean(mae_per_feature)
print(f"Testing mean absolute error: {avg_mae:.4f}")
mlflow.log_metric("avg_mae", avg_mae)

mlflow.end_run()

  from .autonotebook import tqdm as notebook_tqdm


[34m
████████╗██╗███╗   ███╗███████╗    ███████╗███████╗██████╗ ██╗███████╗███████╗    █████╗ ██╗
╚══██╔══╝██║████╗ ████║██╔════╝    ██╔════╝██╔════╝██╔══██╗██║██╔════╝██╔════╝   ██╔══██╗██║
   ██║   ██║██╔████╔██║█████╗█████╗███████╗█████╗  ██████╔╝██║█████╗  ███████╗   ███████║██║
   ██║   ██║██║╚██╔╝██║██╔══╝╚════╝╚════██║██╔══╝  ██╔══██╗██║██╔══╝  ╚════██║   ██╔══██║██║
   ██║   ██║██║ ╚═╝ ██║███████╗    ███████║███████╗██║  ██║██║███████╗███████║██╗██║  ██║██║
   ╚═╝   ╚═╝╚═╝     ╚═╝╚══════╝    ╚══════╝╚══════╝╚═╝  ╚═╝╚═╝╚══════╝╚══════╝╚═╝╚═╝  ╚═╝╚═╝
ai4ts v0.0.3 - building AI for unified time-series analysis, https://time-series.ai [0m

Column time has 0 NaN values
Column time has 0.0 Missing_rate
Column fr_eng has 0 NaN values
Column fr_eng has 0.0 Missing_rate
Column te_exh_cyl_out__0 has 73 NaN values
Column te_exh_cyl_out__0 has 0.0006111906496203082 Missing_rate
Column pd_air_ic__0 has 73 NaN values
Column pd_air_ic__0 has 0.0006111906496203082 Missing_rate
Column pr_exh_

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


Using CPU


InvalidUrlException: Invalid url: http:127.0.0.1:5001/api/2.0/mlflow/experiments/get-by-name

In [None]:
#Import Pypots Library
from pypots.optim import Adam
from pypots.imputation import SAITS
#from pypots.utils.metrics import calc_mae
from pypots.nn.functional import calc_mae


import argparse
import hashlib
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import mlflow.pytorch
import shap
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.utils.data
import data_insight
from data_insight import setup_duckdb
from duckdb import DuckDBPyConnection as DuckDB
from duckdb import DuckDBPyRelation as Relation
from pathlib import Path
import hashlib
from duckdb import DuckDBPyConnection as DuckDB
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import optuna 
from optuna.visualization import plot_optimization_history




from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import TensorDataset, Dataset
from pygrinder.missing_completely_at_random import mcar
from tqdm.auto import tqdm

import sensor_imputation_thesis.shared.load_data as load

torch.cuda.empty_cache()
#PatchTST might be an ideal choise if SAITS is too slow 

##Drop columns with different indexes while loading data.. Or the mean values 

df=pd.read_parquet("/home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/nadire/Newdataframeforpypots.parquet")

len(df)

#current length of the dataframe is 119439

# Check nan values in each column
for col in df.columns:
    print(f"Column {col} has {df[col].isna().sum()} NaN values")
    missing_rate=df[col].isna().sum()/len(df[col])
    print(f"Column {col} has {missing_rate} Missing_rate")


#Try with smaller dataset, size 4000
##SAMPLE the percengtage of the dataset, df.sample (averagely pick samples)
#not df.sample cuz it will randomly select 
original_size=len(df)
desired_fraction=0.3 #Select data every 3 minutes 
step=int(1/desired_fraction) #step_size=10 (sample every 10th (3/10) minute)

#Systematic sampling: Start at a random offset to avoid bias 
start=np.random.randint(0,step) #Random start between 0-9
df1=df.iloc[start::step].reset_index(drop=True)

print(f"Original size:{len(df)}, Sampled size: {len(df1)}")



# Custom Dataset class
class Dataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Data processing code
sensor_cols = [col for col in df1.columns if col != "time"]
data = df1[sensor_cols].values

#¤get feature names for printing mae later 
feature_names=df1[sensor_cols].columns.tolist()

## Convert data to 3D arrays of shape n_samples, n_timesteps, n_features, X_ori refers to the original data without missing values 
## Reconstruct all columns simultaneously  #num_features: 119
n_features = data.shape[1]  # exclude the time column
n_steps = 20 #60 (was 60 previously) #(TRY TO CHANGE HERE)  # # window length, 1440 steps = 24 hours of 1-minute data, but here is revised to 60 again
#total_elements = data.shape[0] * data.shape[1]
n_samples = data.shape[0] // n_steps 



# Reshape to (n_samples // n_steps, n_steps, n_features)
#data_reshaped = data.reshape((n_samples, n_steps, n_features))
data_reshaped=data[:n_samples*n_steps].reshape(n_samples,n_steps,n_features)
print(f"Reshaped data:{data.shape}")

#Split into train, test, val, fit scaler only on the train set (prevent data leakage)

#train_size = int(0.6 * len(data))
#val_size = int(0.2 * len(data))
#test_size = len(data) - train_size - val_size

#train_data = data_reshaped[:train_size]
#val_data = data_reshaped[train_size:train_size + val_size]
#test_data= data_reshaped[train_size + val_size:]


#Apply time series split 
#Split into train(60%), val(20%), and test (20%)
train_data, temp_data=train_test_split(data_reshaped,test_size=0.4,shuffle=True)
val_data, test_data=train_test_split(temp_data, test_size=0.5, shuffle=False)

##Normalization is important because of the nature of mse calculation of saits, columns with large 
#values dominate the loss, making metrics meaningless. SAITS computes MSE/MAE column-wise and averages 
#them across all columns 
#  Apply minmax scaler here 
#normalize each feature independently
scalers={}


#train_scaled = np.zeros_like(data_reshaped[train_size])  # Initialize the normalized data array
#val_scaled=np.zeros_like(data_reshaped[train_size:train_size + val_size])
#test_scaled=np.zeros_like(data_reshaped[train_size + val_size:])

train_scaled = np.zeros_like(train_data)
val_scaled = np.zeros_like(val_data)
test_scaled = np.zeros_like(test_data)


for i in range(data_reshaped.shape[2]):
    scaler = MinMaxScaler(feature_range=(-1, 1)) #changed to -1,1
    # Flatten timesteps and samples for scaling
    train_scaled[:, :, i] = scaler.fit_transform(train_data[:, :, i].reshape(-1, 1)).reshape(train_data.shape[0], train_data.shape[1])
    val_scaled[:, :, i] = scaler.transform(val_data[:, :, i].reshape(-1, 1)).reshape(val_data.shape[0], val_data.shape[1])
    test_scaled[:, :, i] = scaler.transform(test_data[:, :, i].reshape(-1, 1)).reshape(test_data.shape[0], test_data.shape[1])
    scalers[i] = scaler  # Save scalers to inverse-transform later

#Inverse Scale
def inverse_scale(imputation, scalers):
    n_features = imputation.shape[2]
    imputation_denorm = np.empty_like(imputation)
    
    for i in range(n_features):
        imputation_denorm[:, :, i] = scalers[i].inverse_transform(imputation[:, :, i].reshape(-1, 1)).reshape(imputation.shape[0], imputation.shape[1])
    
    return imputation_denorm  


#Optional: Artificially mask. Mask 20% of the data (MIT part)
def mcar_f(X, mask_ratio=0.2):
    """Apply MCAR only to observed values."""
    observed_mask=~np.isnan(X) #find observed positions
    artificial_mask=mcar(X,mask_ratio).astype(bool) #generate MCAR mask, cast to boolean
    #combine masks 
    combined_mask=observed_mask & artificial_mask

    #Apply masking
    X_masked=X.copy()
    X_masked[combined_mask]=np.nan
    return X_masked,combined_mask


#Use mcar on validation data 
val_X_masked, val_mask =mcar_f(val_scaled)
val_X_ori=val_scaled.copy() 

test_X_masked, test_mask =mcar_f(test_scaled)
test_X_ori=test_scaled.copy() 

#?? Problem: Can't have the best input for testing
#1.Create synthetic test_data cuz if I drop nan values for test set, there's basically nothing left
#synthetic_data=np.random.randn(n_samples,n_steps,n_features)
#test_X_masked,test_mask=mcar_f(synthetic_data)
#test_X_ori=synthetic_data.copy() #Ground truth

# 2, Ensure no NaN values in synthetic data
#test_X_masked = np.nan_to_num(test_X_masked, nan=np.nanmean(test_X_masked))
#test_X_ori = np.nan_to_num(test_X_ori, nan=np.nanmean(test_X_ori))



class Config:
    no_cuda = False
    no_mps = False
    seed = 1

args=Config()

torch.manual_seed(args.seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(args.seed)
np.random.seed(args.seed)


args.cuda = not args.no_cuda and torch.cuda.is_available()
use_mps = not args.no_mps and torch.backends.mps.is_available()

if args.cuda:
    device = torch.device("cuda")
    print("Using CUDA")
elif use_mps:
    device = torch.device("mps")
    print("Using MPS")
else:
    device = torch.device("cpu")
    print("Using CPU")


#MLflow set up
mlflow.set_tracking_uri("http:127.0.0.1:5001")
client = mlflow.tracking.MlflowClient()
mlflow.set_experiment("SAITS_3")
SAITS_run_name = "SAITS_1"



with mlflow.start_run(run_name=SAITS_run_name) as run:
# initialize the model (from example)
    saits = SAITS(
        n_steps=data_reshaped.shape[1],
        n_features=data_reshaped.shape[2],
        #n_layers=2, #deep network(4) for long sequences, here is revised for 2 since it always shows cuda error 
        n_layers=3,
        d_model=512,
        optimizer=Adam(lr=1e-3),
        #lr=params["lr"],
        ORT_weight=1,
        MIT_weight=1,
        #d_model=512,  #d_modle must equal n_heads * d_k
        d_ffn=512,
        n_heads=8,
        d_k=64,
        d_v=64,
        dropout=0.1,
        attn_dropout=0.1,
        diagonal_attention_mask=True,  # otherwise the original self-attention mechanism will be applied
        #ORT_weight=1,  # you can adjust the weight values of arguments ORT_weight
       # and MIT_weight to make the SAITS model focus more on one task. Usually you can just leave them to the default values, i.e. 1.
        #MIT_weight=1,
        batch_size=5, #try with 5 to see if it works, was 4. 
        # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
        epochs=10, #try with 10 to see if it runs
        # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
        # You can leave it to defualt as None to disable early stopping.
        patience=6, #initially was 3, tried to increase to see more possibilities 
        # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
        # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
        #optimizer=Adam(lr=1e-3),
        # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
        # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
        # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
        num_workers=0,
        # just leave it to default as None, PyPOTS will automatically assign the best device for you.
        # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
        device="CUDA", 
        # set the path for saving tensorboard and trained model files 
        saving_path="/home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/nadire/best_model",
        # only save the best model after training finished.
        # You can also set it as "better" to save models performing better ever during training.
        model_saving_strategy="best",
)





# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
#use original missigness for trainig, use the mcar masked points as ground truth
saits.fit(train_set={"X": train_scaled}, val_set={"X": val_X_masked , "X_ori":val_X_ori })


##drop null values in test set 

# Check for NaN values across the entire array
#nan_mask = np.isnan(test_X).any(axis=(1, 2))

# Filter out samples (rows) that contain NaN values
#test_X_clean = test_X[~nan_mask]
#test_X_ori = test_X_clean
#test_X_masked, test_mask = mcar_f(test_X_clean)

# the testing stage, impute the originally-missing values and artificially-missing values in the test set
# Convert the numpy array to a dictionary


# Ensure saits_imputation and test_X are numpy arrays
#if not isinstance(saits_imputation, np.ndarray):
    #saits_imputation = np.array(saits_imputation)
#if not isinstance(test_X, np.ndarray):
     #test_X = np.array(test_X)

#Inverse Scale
def inverse_scale(imputation, scalers):
    n_features = imputation.shape[2]
    imputation_denorm = np.empty_like(imputation)
    
    for i in range(n_features):
        imputation_denorm[:, :, i] = scalers[i].inverse_transform(imputation[:, :, i].reshape(-1, 1)).reshape(imputation.shape[0], imputation.shape[1])
    
    return imputation_denorm  

    
#Apply function to the dataset 
test_set_dict = {"X": test_X_masked}
test_imputation = saits.predict(test_set_dict)
test_imputation_array = test_imputation["imputation"]
test_imputation_denorm = inverse_scale(test_imputation_array, scalers)
test_ori_denorm=inverse_scale(test_X_ori, scalers)

if args.cuda:
    torch.cuda.empty_cache() 


#Calculate metrics per-feature: mean absolute error on the ground truth (artificially-missing values)
mae_per_feature=[]
percentage_mae_per_feature=[]

for i in range(n_features):
    #Extract imputation and ground truth for feature i
    imputation_i=test_imputation_denorm[:,:,i]
    ground_truth_i=test_ori_denorm[:,:,i]
    mask_i=test_mask[:,:,i]
    # Check for NaN values
    if np.isnan(imputation_i).any() or np.isnan(ground_truth_i).any():
        print(f"NaN values detected in feature {i}")
        continue  # Skip this feature if NaN values are found
    #Filter only artificially masked positions
    mae_i=calc_mae(imputation_i,ground_truth_i,mask_i)
    mae_per_feature.append(mae_i)
    #Calculate the original standard deviation for the feature
    std_dev_i = np.std(ground_truth_i[mask_i == 1])
    # Calculate the percentage of MAE relative to the standard deviation   
    if std_dev_i != 0:
        percentage_mae_i = (mae_i / std_dev_i) * 100
        percentage_mae_per_feature.append(percentage_mae_i)
    else:
        percentage_mae_i = float('inf')
            
    mlflow.log_metric(f"MAE_{feature_names[i]}", mae_i)
    mlflow.log_metric(f"Percentage_MAE_{feature_names[i]}", percentage_mae_i)

    print(f"MAE for {feature_names[i]}: {mae_i:.4f}")
    mlflow.log_metric(f"MAE_{feature_names[i]}", mae_i)

# calculate average MAE 
avg_mae=np.mean(mae_per_feature)
print(f"Testing mean absolute error: {avg_mae:.4f}")
mlflow.log_metric("avg_mae", avg_mae)

mlflow.end_run()

Column time has 0 NaN values
Column time has 0.0 Missing_rate
Column fr_eng has 0 NaN values
Column fr_eng has 0.0 Missing_rate
Column te_exh_cyl_out__0 has 73 NaN values
Column te_exh_cyl_out__0 has 0.0006111906496203082 Missing_rate
Column pd_air_ic__0 has 73 NaN values
Column pd_air_ic__0 has 0.0006111906496203082 Missing_rate
Column pr_exh_turb_out__0 has 119439 NaN values
Column pr_exh_turb_out__0 has 1.0 Missing_rate
Column te_air_ic_out__0 has 73 NaN values
Column te_air_ic_out__0 has 0.0006111906496203082 Missing_rate
Column te_seawater has 73 NaN values
Column te_seawater has 0.0006111906496203082 Missing_rate
Column te_air_comp_in_a__0 has 119439 NaN values
Column te_air_comp_in_a__0 has 1.0 Missing_rate
Column te_air_comp_in_b__0 has 119439 NaN values
Column te_air_comp_in_b__0 has 1.0 Missing_rate
Column fr_tc__0 has 119439 NaN values
Column fr_tc__0 has 1.0 Missing_rate
Column pr_baro has 73 NaN values
Column pr_baro has 0.0006111906496203082 Missing_rate
Column pd_air_ic_

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


InvalidUrlException: Invalid url: http:127.0.0.1:5001/api/2.0/mlflow/experiments/get-by-name

In [None]:
#Import Pypots Library
from pypots.optim import Adam
from pypots.imputation import SAITS
#from pypots.utils.metrics import calc_mae
from pypots.nn.functional import calc_mae


import argparse
import hashlib
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import mlflow.pytorch
import shap
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.utils.data
import data_insight
from data_insight import setup_duckdb
from duckdb import DuckDBPyConnection as DuckDB
from duckdb import DuckDBPyRelation as Relation
from pathlib import Path
import hashlib
from duckdb import DuckDBPyConnection as DuckDB
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import optuna 
from optuna.visualization import plot_optimization_history




from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import TensorDataset, Dataset
from pygrinder.missing_completely_at_random import mcar
from tqdm.auto import tqdm

import sensor_imputation_thesis.shared.load_data as load

torch.cuda.empty_cache()
#PatchTST might be an ideal choise if SAITS is too slow 

##Drop columns with different indexes while loading data.. Or the mean values 

df=pd.read_parquet("/home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/nadire/Newdataframeforpypots.parquet")

len(df)

#current length of the dataframe is 119439

# Check nan values in each column
for col in df.columns:
    print(f"Column {col} has {df[col].isna().sum()} NaN values")
    missing_rate=df[col].isna().sum()/len(df[col])
    print(f"Column {col} has {missing_rate} Missing_rate")


#Try with smaller dataset, size 4000
##SAMPLE the percengtage of the dataset, df.sample (averagely pick samples)
#not df.sample cuz it will randomly select 
original_size=len(df)
desired_fraction=0.3 #Select data every 3 minutes 
step=int(1/desired_fraction) #step_size=10 (sample every 10th (3/10) minute)

#Systematic sampling: Start at a random offset to avoid bias 
start=np.random.randint(0,step) #Random start between 0-9
df1=df.iloc[start::step].reset_index(drop=True)

print(f"Original size:{len(df)}, Sampled size: {len(df1)}")



# Custom Dataset class
class Dataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Data processing code
sensor_cols = [col for col in df1.columns if col != "time"]
data = df1[sensor_cols].values

#¤get feature names for printing mae later 
feature_names=df1[sensor_cols].columns.tolist()

## Convert data to 3D arrays of shape n_samples, n_timesteps, n_features, X_ori refers to the original data without missing values 
## Reconstruct all columns simultaneously  #num_features: 119
n_features = data.shape[1]  # exclude the time column
n_steps = 20 #60 (was 60 previously) #(TRY TO CHANGE HERE)  # # window length, 1440 steps = 24 hours of 1-minute data, but here is revised to 60 again
#total_elements = data.shape[0] * data.shape[1]
n_samples = data.shape[0] // n_steps 



# Reshape to (n_samples // n_steps, n_steps, n_features)
#data_reshaped = data.reshape((n_samples, n_steps, n_features))
data_reshaped=data[:n_samples*n_steps].reshape(n_samples,n_steps,n_features)
print(f"Reshaped data:{data.shape}")

#Split into train, test, val, fit scaler only on the train set (prevent data leakage)

#train_size = int(0.6 * len(data))
#val_size = int(0.2 * len(data))
#test_size = len(data) - train_size - val_size

#train_data = data_reshaped[:train_size]
#val_data = data_reshaped[train_size:train_size + val_size]
#test_data= data_reshaped[train_size + val_size:]


#Apply time series split 
#Split into train(60%), val(20%), and test (20%)
train_data, temp_data=train_test_split(data_reshaped,test_size=0.4,shuffle=True)
val_data, test_data=train_test_split(temp_data, test_size=0.5, shuffle=False)

##Normalization is important because of the nature of mse calculation of saits, columns with large 
#values dominate the loss, making metrics meaningless. SAITS computes MSE/MAE column-wise and averages 
#them across all columns 
#  Apply minmax scaler here 
#normalize each feature independently
scalers={}


#train_scaled = np.zeros_like(data_reshaped[train_size])  # Initialize the normalized data array
#val_scaled=np.zeros_like(data_reshaped[train_size:train_size + val_size])
#test_scaled=np.zeros_like(data_reshaped[train_size + val_size:])

train_scaled = np.zeros_like(train_data)
val_scaled = np.zeros_like(val_data)
test_scaled = np.zeros_like(test_data)


for i in range(data_reshaped.shape[2]):
    scaler = MinMaxScaler(feature_range=(-1, 1)) #changed to -1,1
    # Flatten timesteps and samples for scaling
    train_scaled[:, :, i] = scaler.fit_transform(train_data[:, :, i].reshape(-1, 1)).reshape(train_data.shape[0], train_data.shape[1])
    val_scaled[:, :, i] = scaler.transform(val_data[:, :, i].reshape(-1, 1)).reshape(val_data.shape[0], val_data.shape[1])
    test_scaled[:, :, i] = scaler.transform(test_data[:, :, i].reshape(-1, 1)).reshape(test_data.shape[0], test_data.shape[1])
    scalers[i] = scaler  # Save scalers to inverse-transform later

#Inverse Scale
def inverse_scale(imputation, scalers):
    n_features = imputation.shape[2]
    imputation_denorm = np.empty_like(imputation)
    
    for i in range(n_features):
        imputation_denorm[:, :, i] = scalers[i].inverse_transform(imputation[:, :, i].reshape(-1, 1)).reshape(imputation.shape[0], imputation.shape[1])
    
    return imputation_denorm  


#Optional: Artificially mask. Mask 20% of the data (MIT part)
def mcar_f(X, mask_ratio=0.2):
    """Apply MCAR only to observed values."""
    observed_mask=~np.isnan(X) #find observed positions
    artificial_mask=mcar(X,mask_ratio).astype(bool) #generate MCAR mask, cast to boolean
    #combine masks 
    combined_mask=observed_mask & artificial_mask

    #Apply masking
    X_masked=X.copy()
    X_masked[combined_mask]=np.nan
    return X_masked,combined_mask


#Use mcar on validation data 
val_X_masked, val_mask =mcar_f(val_scaled)
val_X_ori=val_scaled.copy() 

test_X_masked, test_mask =mcar_f(test_scaled)
test_X_ori=test_scaled.copy() 

#?? Problem: Can't have the best input for testing
#1.Create synthetic test_data cuz if I drop nan values for test set, there's basically nothing left
#synthetic_data=np.random.randn(n_samples,n_steps,n_features)
#test_X_masked,test_mask=mcar_f(synthetic_data)
#test_X_ori=synthetic_data.copy() #Ground truth

# 2, Ensure no NaN values in synthetic data
#test_X_masked = np.nan_to_num(test_X_masked, nan=np.nanmean(test_X_masked))
#test_X_ori = np.nan_to_num(test_X_ori, nan=np.nanmean(test_X_ori))



class Config:
    no_cuda = False
    no_mps = False
    seed = 1

args=Config()

torch.manual_seed(args.seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(args.seed)
np.random.seed(args.seed)


args.cuda = not args.no_cuda and torch.cuda.is_available()
use_mps = not args.no_mps and torch.backends.mps.is_available()

if args.cuda:
    device = torch.device("cuda")
    print("Using CUDA")
elif use_mps:
    device = torch.device("mps")
    print("Using MPS")
else:
    device = torch.device("cpu")
    print("Using CPU")


#MLflow set up
mlflow.set_tracking_uri("http://localhost:5000")
client = mlflow.tracking.MlflowClient()
mlflow.set_experiment("SAITS_3")
SAITS_run_name = "SAITS_1"



with mlflow.start_run(run_name=SAITS_run_name) as run:
# initialize the model (from example)
    saits = SAITS(
        n_steps=data_reshaped.shape[1],
        n_features=data_reshaped.shape[2],
        #n_layers=2, #deep network(4) for long sequences, here is revised for 2 since it always shows cuda error 
        n_layers=3,
        d_model=512,
        optimizer=Adam(lr=1e-3),
        #lr=params["lr"],
        ORT_weight=1,
        MIT_weight=1,
        #d_model=512,  #d_modle must equal n_heads * d_k
        d_ffn=512,
        n_heads=8,
        d_k=64,
        d_v=64,
        dropout=0.1,
        attn_dropout=0.1,
        diagonal_attention_mask=True,  # otherwise the original self-attention mechanism will be applied
        #ORT_weight=1,  # you can adjust the weight values of arguments ORT_weight
       # and MIT_weight to make the SAITS model focus more on one task. Usually you can just leave them to the default values, i.e. 1.
        #MIT_weight=1,
        batch_size=5, #try with 5 to see if it works, was 4. 
        # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
        epochs=10, #try with 10 to see if it runs
        # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
        # You can leave it to defualt as None to disable early stopping.
        patience=6, #initially was 3, tried to increase to see more possibilities 
        # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
        # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
        #optimizer=Adam(lr=1e-3),
        # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
        # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
        # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
        num_workers=0,
        # just leave it to default as None, PyPOTS will automatically assign the best device for you.
        # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
        device="CUDA", 
        # set the path for saving tensorboard and trained model files 
        saving_path="/home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/nadire/best_model",
        # only save the best model after training finished.
        # You can also set it as "better" to save models performing better ever during training.
        model_saving_strategy="best",
)





# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
#use original missigness for trainig, use the mcar masked points as ground truth
saits.fit(train_set={"X": train_scaled}, val_set={"X": val_X_masked , "X_ori":val_X_ori })


##drop null values in test set 

# Check for NaN values across the entire array
#nan_mask = np.isnan(test_X).any(axis=(1, 2))

# Filter out samples (rows) that contain NaN values
#test_X_clean = test_X[~nan_mask]
#test_X_ori = test_X_clean
#test_X_masked, test_mask = mcar_f(test_X_clean)

# the testing stage, impute the originally-missing values and artificially-missing values in the test set
# Convert the numpy array to a dictionary


# Ensure saits_imputation and test_X are numpy arrays
#if not isinstance(saits_imputation, np.ndarray):
    #saits_imputation = np.array(saits_imputation)
#if not isinstance(test_X, np.ndarray):
     #test_X = np.array(test_X)

#Inverse Scale
def inverse_scale(imputation, scalers):
    n_features = imputation.shape[2]
    imputation_denorm = np.empty_like(imputation)
    
    for i in range(n_features):
        imputation_denorm[:, :, i] = scalers[i].inverse_transform(imputation[:, :, i].reshape(-1, 1)).reshape(imputation.shape[0], imputation.shape[1])
    
    return imputation_denorm  

    
#Apply function to the dataset 
test_set_dict = {"X": test_X_masked}
test_imputation = saits.predict(test_set_dict)
test_imputation_array = test_imputation["imputation"]
test_imputation_denorm = inverse_scale(test_imputation_array, scalers)
test_ori_denorm=inverse_scale(test_X_ori, scalers)

if args.cuda:
    torch.cuda.empty_cache() 


#Calculate metrics per-feature: mean absolute error on the ground truth (artificially-missing values)
mae_per_feature=[]
percentage_mae_per_feature=[]

for i in range(n_features):
    #Extract imputation and ground truth for feature i
    imputation_i=test_imputation_denorm[:,:,i]
    ground_truth_i=test_ori_denorm[:,:,i]
    mask_i=test_mask[:,:,i]
    # Check for NaN values
    if np.isnan(imputation_i).any() or np.isnan(ground_truth_i).any():
        print(f"NaN values detected in feature {i}")
        continue  # Skip this feature if NaN values are found
    #Filter only artificially masked positions
    mae_i=calc_mae(imputation_i,ground_truth_i,mask_i)
    mae_per_feature.append(mae_i)
    #Calculate the original standard deviation for the feature
    std_dev_i = np.std(ground_truth_i[mask_i == 1])
    # Calculate the percentage of MAE relative to the standard deviation   
    if std_dev_i != 0:
        percentage_mae_i = (mae_i / std_dev_i) * 100
        percentage_mae_per_feature.append(percentage_mae_i)
    else:
        percentage_mae_i = float('inf')
            
    mlflow.log_metric(f"MAE_{feature_names[i]}", mae_i)
    mlflow.log_metric(f"Percentage_MAE_{feature_names[i]}", percentage_mae_i)

    print(f"MAE for {feature_names[i]}: {mae_i:.4f}")
    mlflow.log_metric(f"MAE_{feature_names[i]}", mae_i)

# calculate average MAE 
avg_mae=np.mean(mae_per_feature)
print(f"Testing mean absolute error: {avg_mae:.4f}")
mlflow.log_metric("avg_mae", avg_mae)

mlflow.end_run()

Column time has 0 NaN values
Column time has 0.0 Missing_rate
Column fr_eng has 0 NaN values
Column fr_eng has 0.0 Missing_rate
Column te_exh_cyl_out__0 has 73 NaN values
Column te_exh_cyl_out__0 has 0.0006111906496203082 Missing_rate
Column pd_air_ic__0 has 73 NaN values
Column pd_air_ic__0 has 0.0006111906496203082 Missing_rate
Column pr_exh_turb_out__0 has 119439 NaN values
Column pr_exh_turb_out__0 has 1.0 Missing_rate
Column te_air_ic_out__0 has 73 NaN values
Column te_air_ic_out__0 has 0.0006111906496203082 Missing_rate
Column te_seawater has 73 NaN values
Column te_seawater has 0.0006111906496203082 Missing_rate
Column te_air_comp_in_a__0 has 119439 NaN values
Column te_air_comp_in_a__0 has 1.0 Missing_rate
Column te_air_comp_in_b__0 has 119439 NaN values
Column te_air_comp_in_b__0 has 1.0 Missing_rate
Column fr_tc__0 has 119439 NaN values
Column fr_tc__0 has 1.0 Missing_rate
Column pr_baro has 73 NaN values
Column pr_baro has 0.0006111906496203082 Missing_rate
Column pd_air_ic_

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


Using CPU


2025-05-14 09:33:21 [INFO]: Using the given device: cuda


🏃 View run SAITS_1 at: http://localhost:5000/#/experiments/4/runs/d12424341f064d51867433866c913569
🧪 View experiment at: http://localhost:5000/#/experiments/4


AssertionError: You are trying to use CUDA for model training, but CUDA is not available in your environment.

In [None]:
#Import Pypots Library
from pypots.optim import Adam
from pypots.imputation import SAITS
#from pypots.utils.metrics import calc_mae
from pypots.nn.functional import calc_mae


import argparse
import hashlib
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import mlflow.pytorch
import shap
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.utils.data
import data_insight
from data_insight import setup_duckdb
from duckdb import DuckDBPyConnection as DuckDB
from duckdb import DuckDBPyRelation as Relation
from pathlib import Path
import hashlib
from duckdb import DuckDBPyConnection as DuckDB
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import optuna 
from optuna.visualization import plot_optimization_history




from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import TensorDataset, Dataset
from pygrinder.missing_completely_at_random import mcar
from tqdm.auto import tqdm

import sensor_imputation_thesis.shared.load_data as load

torch.cuda.empty_cache()
#PatchTST might be an ideal choise if SAITS is too slow 

##Drop columns with different indexes while loading data.. Or the mean values 

df=pd.read_parquet("/home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/nadire/Newdataframeforpypots.parquet")

len(df)

#current length of the dataframe is 119439

# Check nan values in each column
for col in df.columns:
    print(f"Column {col} has {df[col].isna().sum()} NaN values")
    missing_rate=df[col].isna().sum()/len(df[col])
    print(f"Column {col} has {missing_rate} Missing_rate")


#Try with smaller dataset, size 4000
##SAMPLE the percengtage of the dataset, df.sample (averagely pick samples)
#not df.sample cuz it will randomly select 
original_size=len(df)
desired_fraction=0.3 #Select data every 3 minutes 
step=int(1/desired_fraction) #step_size=10 (sample every 10th (3/10) minute)

#Systematic sampling: Start at a random offset to avoid bias 
start=np.random.randint(0,step) #Random start between 0-9
df1=df.iloc[start::step].reset_index(drop=True)

print(f"Original size:{len(df)}, Sampled size: {len(df1)}")



# Custom Dataset class
class Dataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Data processing code
sensor_cols = [col for col in df1.columns if col != "time"]
data = df1[sensor_cols].values

#¤get feature names for printing mae later 
feature_names=df1[sensor_cols].columns.tolist()

## Convert data to 3D arrays of shape n_samples, n_timesteps, n_features, X_ori refers to the original data without missing values 
## Reconstruct all columns simultaneously  #num_features: 119
n_features = data.shape[1]  # exclude the time column
n_steps = 20 #60 (was 60 previously) #(TRY TO CHANGE HERE)  # # window length, 1440 steps = 24 hours of 1-minute data, but here is revised to 60 again
#total_elements = data.shape[0] * data.shape[1]
n_samples = data.shape[0] // n_steps 



# Reshape to (n_samples // n_steps, n_steps, n_features)
#data_reshaped = data.reshape((n_samples, n_steps, n_features))
data_reshaped=data[:n_samples*n_steps].reshape(n_samples,n_steps,n_features)
print(f"Reshaped data:{data.shape}")

#Split into train, test, val, fit scaler only on the train set (prevent data leakage)

#train_size = int(0.6 * len(data))
#val_size = int(0.2 * len(data))
#test_size = len(data) - train_size - val_size

#train_data = data_reshaped[:train_size]
#val_data = data_reshaped[train_size:train_size + val_size]
#test_data= data_reshaped[train_size + val_size:]


#Apply time series split 
#Split into train(60%), val(20%), and test (20%)
train_data, temp_data=train_test_split(data_reshaped,test_size=0.4,shuffle=True)
val_data, test_data=train_test_split(temp_data, test_size=0.5, shuffle=False)

##Normalization is important because of the nature of mse calculation of saits, columns with large 
#values dominate the loss, making metrics meaningless. SAITS computes MSE/MAE column-wise and averages 
#them across all columns 
#  Apply minmax scaler here 
#normalize each feature independently
scalers={}


#train_scaled = np.zeros_like(data_reshaped[train_size])  # Initialize the normalized data array
#val_scaled=np.zeros_like(data_reshaped[train_size:train_size + val_size])
#test_scaled=np.zeros_like(data_reshaped[train_size + val_size:])

train_scaled = np.zeros_like(train_data)
val_scaled = np.zeros_like(val_data)
test_scaled = np.zeros_like(test_data)


for i in range(data_reshaped.shape[2]):
    scaler = MinMaxScaler(feature_range=(-1, 1)) #changed to -1,1
    # Flatten timesteps and samples for scaling
    train_scaled[:, :, i] = scaler.fit_transform(train_data[:, :, i].reshape(-1, 1)).reshape(train_data.shape[0], train_data.shape[1])
    val_scaled[:, :, i] = scaler.transform(val_data[:, :, i].reshape(-1, 1)).reshape(val_data.shape[0], val_data.shape[1])
    test_scaled[:, :, i] = scaler.transform(test_data[:, :, i].reshape(-1, 1)).reshape(test_data.shape[0], test_data.shape[1])
    scalers[i] = scaler  # Save scalers to inverse-transform later

#Inverse Scale
def inverse_scale(imputation, scalers):
    n_features = imputation.shape[2]
    imputation_denorm = np.empty_like(imputation)
    
    for i in range(n_features):
        imputation_denorm[:, :, i] = scalers[i].inverse_transform(imputation[:, :, i].reshape(-1, 1)).reshape(imputation.shape[0], imputation.shape[1])
    
    return imputation_denorm  


#Optional: Artificially mask. Mask 20% of the data (MIT part)
def mcar_f(X, mask_ratio=0.2):
    """Apply MCAR only to observed values."""
    observed_mask=~np.isnan(X) #find observed positions
    artificial_mask=mcar(X,mask_ratio).astype(bool) #generate MCAR mask, cast to boolean
    #combine masks 
    combined_mask=observed_mask & artificial_mask

    #Apply masking
    X_masked=X.copy()
    X_masked[combined_mask]=np.nan
    return X_masked,combined_mask


#Use mcar on validation data 
val_X_masked, val_mask =mcar_f(val_scaled)
val_X_ori=val_scaled.copy() 

test_X_masked, test_mask =mcar_f(test_scaled)
test_X_ori=test_scaled.copy() 

#?? Problem: Can't have the best input for testing
#1.Create synthetic test_data cuz if I drop nan values for test set, there's basically nothing left
#synthetic_data=np.random.randn(n_samples,n_steps,n_features)
#test_X_masked,test_mask=mcar_f(synthetic_data)
#test_X_ori=synthetic_data.copy() #Ground truth

# 2, Ensure no NaN values in synthetic data
#test_X_masked = np.nan_to_num(test_X_masked, nan=np.nanmean(test_X_masked))
#test_X_ori = np.nan_to_num(test_X_ori, nan=np.nanmean(test_X_ori))



class Config:
    no_cuda = False
    no_mps = False
    seed = 1

args=Config()

torch.manual_seed(args.seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(args.seed)
np.random.seed(args.seed)


args.cuda = not args.no_cuda and torch.cuda.is_available()
use_mps = not args.no_mps and torch.backends.mps.is_available()

if args.cuda:
    device = torch.device("cuda")
    print("Using CUDA")
elif use_mps:
    device = torch.device("mps")
    print("Using MPS")
else:
    device = torch.device("cpu")
    print("Using CPU")


#MLflow set up
mlflow.set_tracking_uri("http://localhost:5000")
client = mlflow.tracking.MlflowClient()
mlflow.set_experiment("SAITS_3")
SAITS_run_name = "SAITS_1"



with mlflow.start_run(run_name=SAITS_run_name) as run:
# initialize the model (from example)
    saits = SAITS(
        n_steps=data_reshaped.shape[1],
        n_features=data_reshaped.shape[2],
        #n_layers=2, #deep network(4) for long sequences, here is revised for 2 since it always shows cuda error 
        n_layers=3,
        d_model=512,
        optimizer=Adam(lr=1e-3),
        #lr=params["lr"],
        ORT_weight=1,
        MIT_weight=1,
        #d_model=512,  #d_modle must equal n_heads * d_k
        d_ffn=512,
        n_heads=8,
        d_k=64,
        d_v=64,
        dropout=0.1,
        attn_dropout=0.1,
        diagonal_attention_mask=True,  # otherwise the original self-attention mechanism will be applied
        #ORT_weight=1,  # you can adjust the weight values of arguments ORT_weight
       # and MIT_weight to make the SAITS model focus more on one task. Usually you can just leave them to the default values, i.e. 1.
        #MIT_weight=1,
        batch_size=5, #try with 5 to see if it works, was 4. 
        # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
        epochs=10, #try with 10 to see if it runs
        # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
        # You can leave it to defualt as None to disable early stopping.
        patience=6, #initially was 3, tried to increase to see more possibilities 
        # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
        # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
        #optimizer=Adam(lr=1e-3),
        # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
        # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
        # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
        num_workers=0,
        # just leave it to default as None, PyPOTS will automatically assign the best device for you.
        # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
        device=device, 
        # set the path for saving tensorboard and trained model files 
        saving_path="/home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/nadire/best_model",
        # only save the best model after training finished.
        # You can also set it as "better" to save models performing better ever during training.
        model_saving_strategy="best",
)





# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
#use original missigness for trainig, use the mcar masked points as ground truth
saits.fit(train_set={"X": train_scaled}, val_set={"X": val_X_masked , "X_ori":val_X_ori })


##drop null values in test set 

# Check for NaN values across the entire array
#nan_mask = np.isnan(test_X).any(axis=(1, 2))

# Filter out samples (rows) that contain NaN values
#test_X_clean = test_X[~nan_mask]
#test_X_ori = test_X_clean
#test_X_masked, test_mask = mcar_f(test_X_clean)

# the testing stage, impute the originally-missing values and artificially-missing values in the test set
# Convert the numpy array to a dictionary


# Ensure saits_imputation and test_X are numpy arrays
#if not isinstance(saits_imputation, np.ndarray):
    #saits_imputation = np.array(saits_imputation)
#if not isinstance(test_X, np.ndarray):
     #test_X = np.array(test_X)

#Inverse Scale
def inverse_scale(imputation, scalers):
    n_features = imputation.shape[2]
    imputation_denorm = np.empty_like(imputation)
    
    for i in range(n_features):
        imputation_denorm[:, :, i] = scalers[i].inverse_transform(imputation[:, :, i].reshape(-1, 1)).reshape(imputation.shape[0], imputation.shape[1])
    
    return imputation_denorm  

    
#Apply function to the dataset 
test_set_dict = {"X": test_X_masked}
test_imputation = saits.predict(test_set_dict)
test_imputation_array = test_imputation["imputation"]
test_imputation_denorm = inverse_scale(test_imputation_array, scalers)
test_ori_denorm=inverse_scale(test_X_ori, scalers)

if args.cuda:
    torch.cuda.empty_cache() 


#Calculate metrics per-feature: mean absolute error on the ground truth (artificially-missing values)
mae_per_feature=[]
percentage_mae_per_feature=[]

for i in range(n_features):
    #Extract imputation and ground truth for feature i
    imputation_i=test_imputation_denorm[:,:,i]
    ground_truth_i=test_ori_denorm[:,:,i]
    mask_i=test_mask[:,:,i]
    # Check for NaN values
    if np.isnan(imputation_i).any() or np.isnan(ground_truth_i).any():
        print(f"NaN values detected in feature {i}")
        continue  # Skip this feature if NaN values are found
    #Filter only artificially masked positions
    mae_i=calc_mae(imputation_i,ground_truth_i,mask_i)
    mae_per_feature.append(mae_i)
    #Calculate the original standard deviation for the feature
    std_dev_i = np.std(ground_truth_i[mask_i == 1])
    # Calculate the percentage of MAE relative to the standard deviation   
    if std_dev_i != 0:
        percentage_mae_i = (mae_i / std_dev_i) * 100
        percentage_mae_per_feature.append(percentage_mae_i)
    else:
        percentage_mae_i = float('inf')
            
    mlflow.log_metric(f"MAE_{feature_names[i]}", mae_i)
    mlflow.log_metric(f"Percentage_MAE_{feature_names[i]}", percentage_mae_i)

    print(f"MAE for {feature_names[i]}: {mae_i:.4f}")
    mlflow.log_metric(f"MAE_{feature_names[i]}", mae_i)

# calculate average MAE 
avg_mae=np.mean(mae_per_feature)
print(f"Testing mean absolute error: {avg_mae:.4f}")
mlflow.log_metric("avg_mae", avg_mae)

mlflow.end_run()

Column time has 0 NaN values
Column time has 0.0 Missing_rate
Column fr_eng has 0 NaN values
Column fr_eng has 0.0 Missing_rate
Column te_exh_cyl_out__0 has 73 NaN values
Column te_exh_cyl_out__0 has 0.0006111906496203082 Missing_rate
Column pd_air_ic__0 has 73 NaN values
Column pd_air_ic__0 has 0.0006111906496203082 Missing_rate
Column pr_exh_turb_out__0 has 119439 NaN values
Column pr_exh_turb_out__0 has 1.0 Missing_rate
Column te_air_ic_out__0 has 73 NaN values
Column te_air_ic_out__0 has 0.0006111906496203082 Missing_rate
Column te_seawater has 73 NaN values
Column te_seawater has 0.0006111906496203082 Missing_rate
Column te_air_comp_in_a__0 has 119439 NaN values
Column te_air_comp_in_a__0 has 1.0 Missing_rate
Column te_air_comp_in_b__0 has 119439 NaN values
Column te_air_comp_in_b__0 has 1.0 Missing_rate
Column fr_tc__0 has 119439 NaN values
Column fr_tc__0 has 1.0 Missing_rate
Column pr_baro has 73 NaN values
Column pr_baro has 0.0006111906496203082 Missing_rate
Column pd_air_ic_

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
2025-05-14 09:33:59 [INFO]: Using the given device: cpu
2025-05-14 09:33:59 [INFO]: Model files will be saved to /home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/nadire/best_model/20250514_T093359
2025-05-14 09:33:59 [INFO]: Tensorboard file will be saved to /home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/nadire/best_model/20250514_T093359/tensorboard
2025-05-1

🏃 View run SAITS_1 at: http://localhost:5000/#/experiments/4/runs/f2158cd06733406bbaa47c820046834b
🧪 View experiment at: http://localhost:5000/#/experiments/4


2025-05-14 09:34:40 [INFO]: Epoch 001 - training loss (MAE): 0.5010, validation MSE: 0.2346
2025-05-14 09:35:21 [INFO]: Epoch 002 - training loss (MAE): 0.3689, validation MSE: 0.2470
2025-05-14 09:36:11 [INFO]: Epoch 003 - training loss (MAE): 0.3208, validation MSE: 0.1938
2025-05-14 09:37:10 [INFO]: Epoch 004 - training loss (MAE): 0.2752, validation MSE: 0.1970
2025-05-14 09:38:18 [INFO]: Epoch 005 - training loss (MAE): 0.2557, validation MSE: 0.1862
2025-05-14 09:39:34 [INFO]: Epoch 006 - training loss (MAE): 0.2496, validation MSE: 0.2211
2025-05-14 09:40:49 [INFO]: Epoch 007 - training loss (MAE): 0.2405, validation MSE: 0.2205
2025-05-14 09:42:02 [INFO]: Epoch 008 - training loss (MAE): 0.2364, validation MSE: 0.2109
2025-05-14 09:43:16 [INFO]: Epoch 009 - training loss (MAE): 0.2375, validation MSE: 0.2272
2025-05-14 09:44:30 [INFO]: Epoch 010 - training loss (MAE): 0.2254, validation MSE: 0.2232
2025-05-14 09:44:30 [INFO]: Finished training. The best model is from epoch#5.
2

MAE for fr_eng: 0.1834
NaN values detected in feature 1
NaN values detected in feature 2
NaN values detected in feature 3
NaN values detected in feature 4
NaN values detected in feature 5
NaN values detected in feature 6
NaN values detected in feature 7
NaN values detected in feature 8
NaN values detected in feature 9
NaN values detected in feature 10
NaN values detected in feature 11
NaN values detected in feature 12
NaN values detected in feature 13
MAE for bo_aux_blower_running: 0.4758
NaN values detected in feature 15
MAE for pr_air_scav_ecs: 59725.1059
MAE for pr_air_scav: 59225.6507
NaN values detected in feature 18
NaN values detected in feature 19
NaN values detected in feature 20
NaN values detected in feature 21
NaN values detected in feature 22
NaN values detected in feature 23
MAE for fr_eng_setpoint: 0.1787
NaN values detected in feature 25
NaN values detected in feature 26
NaN values detected in feature 27
MAE for fr_eng_ecs: 0.1799
NaN values detected in feature 29
MAE f

In [None]:
df1["pr_air_scav_ecs"].std()

64068.60854917041

In [None]:
df1["pr_air_scav"].std()

64068.60854917041

In [None]:
df1["fr_eng_ecs"].std()

0.2104741155094039

In [None]:
df1["fr_eng_setpoint"].std()

0.2128835870768723