In [1]:
import numpy as np
import pandas as pd
import os
import sys
import csv
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.image as mpimg
import optuna
import random
import joblib
import math
import ast
import logging
from msig import Motif, NullModel
from config import RESULTS_MOTIF_DIR, RESULTS_DIR, IMAGES_DIR, DATA_DIR, DATASET_PATH, VARIABLES, NORMALIZE_FLAGS, STUMPY_EXCL_ZONE_DENOM, TOP_K_MP, INCLUDE, NORMALIZE, SUBSQUENCES_LENGTHS, NTOP_MOTIFS, MOTIF_SIZE
from config import LOOKBACK_PERIOD, STEP, FORECAST_PERIOD

optuna.logging.set_verbosity(optuna.logging.WARNING)

print(f"Results will be saved in: {RESULTS_DIR}")
print(f"Images will be saved in: {IMAGES_DIR}")
print(f"Data will be accessed from: {DATA_DIR}")


if '__file__' in globals():
    # For standalone scripts
    base_dir = os.path.dirname(__file__)
else:
    # For Jupyter or interactive environments
    base_dir = os.getcwd()

# Add the parent directory of `utils` to the Python path
sys.path.append(os.path.abspath(os.path.join(base_dir, "../")))

2025-02-06 16:48:02,408 - INFO - Results will be saved in: /home/mgsilva/motifpred/results/household
2025-02-06 16:48:02,408 - INFO - Images will be saved in: /home/mgsilva/motifpred/images/household
2025-02-06 16:48:02,408 - INFO - Data will be accessed from: /home/mgsilva/motifpred/data/household


Results will be saved in: /home/mgsilva/motifpred/results/household
Images will be saved in: /home/mgsilva/motifpred/images/household
Data will be accessed from: /home/mgsilva/motifpred/data/household


In [2]:
mp_stats_table = pd.read_csv(
    RESULTS_DIR / f"mp_stats_table_normalized_{NORMALIZE}_top_{TOP_K_MP}.csv"
)
mp_stats_table = mp_stats_table[mp_stats_table["m"] == MOTIF_SIZE]
top_motifs = mp_stats_table.sort_values(by=["#Matches", "ID"], ascending=[False, True]).head(NTOP_MOTIFS)
top_motifs = top_motifs[["m", "Indices"]]

In [3]:
# Import shared setup
from utils.setup import seed, device, early_stopper, pipeline, test_tensor

# Example usage
print(f"Device: {device}")
test_tensor()

Device: cuda
tensor([[0.3126, 0.3791, 0.3087],
        [0.0736, 0.4216, 0.0691],
        [0.2332, 0.4047, 0.2162],
        [0.9927, 0.4128, 0.5938],
        [0.6128, 0.1519, 0.0453]])


In [4]:
#load data as 
data_df = pd.read_csv(DATASET_PATH, index_col=0).astype(float)
data_df = data_df[VARIABLES]
labels = pd.read_csv(DATA_DIR  / f"labels.csv", index_col=0).astype(float)
data = data_df.values.T
data

array([[1.2944, 1.3644, 1.4672, ..., 0.7808, 0.7676, 0.634 ],
       [0.    , 0.0776, 0.2188, ..., 0.1644, 0.1492, 0.0608]])

In [5]:
from utils.utils import create_dataset
from utils.train_pipeline import run_optuna_study
from utils.utils import get_best_model_results_traindevtest, plot_best_model_results_traindevtest
from models.baseline_pytorch import BaselineAverage, BaselineLastDifference
from utils.utils import plot_preds_vs_truevalues
from utils.train_pipeline import get_preds_best_config_train_val_test


test_losses_list, test_mae_list, test_rmse_list  = [], [], []
# Loop through each of the top 10 motifs
for i, top_motif in top_motifs.iterrows():
    motif_indexes = sorted(ast.literal_eval(top_motif["Indices"]))
    
    print(f"Evaluating motif {i} with size {MOTIF_SIZE} and {len(motif_indexes)} indexes ")
    
    # Create dataset for the current motif
    X_series, X_indices, X_mask, y = create_dataset(data, LOOKBACK_PERIOD, STEP, FORECAST_PERIOD, motif_indexes, MOTIF_SIZE)

    # X_series, X2, and y are now PyTorch tensors
    print("X_series shape:", X_series.shape)  # Expected shape: (num_samples, lookback_period, num_features)
    print("X_indices shape:", X_indices.shape)  # Expected shape: (num_samples, max_motif_length_in_window, 1)
    print("X_mask shape:", X_mask.shape)  # Expected shape: (num_samples, max_motif_length_in_window)
    print("y shape:", y.shape)    # Expected shape: (num_samples, 1)
    
    # Define the model and run the Optuna study
    n_trials = 1
    num_epochs = 1
    model_name = "BaselineAverage" 
    model_type = "Baseline"
    
    suggestion_dict = {
        "batch_size": {
            "type": "categorical",
            "args": [[4,8, 16, 32, 64, 128]]
        }
    }
    
    model_params_keys = []
    
    model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{i+1}")
    os.makedirs(model_results_dir, exist_ok=True)  
    
    X = {"X_series": X_series, "X_mask": X_mask, "X_indices": X_indices}
    normalize_flags = {"X_series": True, "X_mask": False, "X_indices": False}
    
    run_optuna_study(pipeline.run_train_val_test, eval(model_name), model_type, suggestion_dict,  model_params_keys, seed, X , y, normalize_flags, model_results_dir, n_trials=n_trials, num_epochs=num_epochs)
    
    study = joblib.load(os.path.join(model_results_dir, "study.pkl"))
    train_losses, val_losses, best_epoch, test_loss, test_mae, test_rmse = get_best_model_results_traindevtest(study)
    
    print(f"Best epoch: {best_epoch}")
    print(f"Test Loss: {test_loss}, Test MAE: {test_mae}, Test RMSE: {test_rmse}")

    test_losses_list.append(test_loss)
    test_mae_list.append(test_mae)
    test_rmse_list.append(test_rmse)
    
    # Plot predictions vs true values
    #epochs_train_losses, epochs_val_losses, val_losses, test_losses, test_mae, test_rmse, all_predictions, all_true_values = get_preds_best_config_train_val_test(study, pipeline, eval(model_name), model_type, model_params_keys, num_epochs=num_epochs, seed=seed, X=X, y=y, normalize_flags=normalize_flags)
    #plot_preds_vs_truevalues(np.ravel(all_true_values), np.ravel(all_predictions), fold=0, save_path=os.path.join(IMAGES_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{i+1}_fold_{0}_predictions.png"))


# Convert lists to numpy arrays for easier calculations
test_losses_array = np.array(test_losses_list)
test_mae_array = np.array(test_mae_list)
test_rmse_array = np.array(test_rmse_list)

# Calculate mean and standard deviation
mean_test_loss = np.mean(test_losses_array)
std_test_loss = np.std(test_losses_array)

mean_test_mae = np.mean(test_mae_array)
std_test_mae = np.std(test_mae_array)

mean_test_rmse = np.mean(test_rmse_array)
std_test_rmse = np.std(test_rmse_array)

# Print aggregated results
print(f"Aggregated Results Across Top 10 Motifs:")
print(f"Mean Test Loss: {mean_test_loss} ± {std_test_loss}")
print(f"Mean Test MAE: {mean_test_mae} ± {std_test_mae}")
print(f"Mean Test RMSE: {mean_test_rmse} ± {std_test_rmse}")

Evaluating motif 5 with size 24 and 359 indexes 
X_series shape: torch.Size([2977, 576, 2])
X_indices shape: torch.Size([2977, 22, 1])
X_mask shape: torch.Size([2977, 576])
y shape: torch.Size([2977, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 128}
Best epoch: 0
Test Loss: 1746.0968017578125, Test MAE: 31.11246681213379, Test RMSE: 41.78632354736328
Evaluating motif 17 with size 24 and 358 indexes 
X_series shape: torch.Size([2974, 576, 2])
X_indices shape: torch.Size([2974, 20, 1])
X_mask shape: torch.Size([2974, 576])
y shape: torch.Size([2974, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 128}
Best epoch: 0
Test Loss: 1394.7540283203125, Test MAE: 29.24692153930664, Test RMSE: 37.346405029296875
Evaluating motif 1 with size 24 and 298 indexes 
X_series shape: torch.Size([2987, 576, 2])
X_indices shape: torch.Size([2987, 17, 1])
X_mask shape: torch.Size([2987, 576])
y shape: torch.Size([2987, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 128}
Best epoch: 0
Test Loss: 1608.72607421875, Test MAE: 32.095664978027344, Test RMSE: 40.10892868041992
Evaluating motif 7 with size 24 and 279 indexes 


KeyboardInterrupt: 

In [12]:
# Loop through each of the top 10 motifs
for i, top_motif in top_motifs.iterrows():
    motif_indexes = sorted(ast.literal_eval(top_motif["Indices"]))
    
    print(f"Evaluating motif {i+1} with size {MOTIF_SIZE} and {len(motif_indexes)} indexes ")
    
    # Create dataset for the current motif
    X_series, X_indices, X_mask, y = create_dataset(data, LOOKBACK_PERIOD, STEP, FORECAST_PERIOD, motif_indexes, MOTIF_SIZE)

    # X_series, X2, and y are now PyTorch tensors
    print("X_series shape:", X_series.shape)  # Expected shape: (num_samples, LOOKBACK_PERIOD, num_features)
    print("X_indices shape:", X_indices.shape)  # Expected shape: (num_samples, max_motif_length_in_window, 1)
    print("X_mask shape:", X_mask.shape)  # Expected shape: (num_samples, max_motif_length_in_window)
    print("y shape:", y.shape)    # Expected shape: (num_samples, 1)
    
    # Define the model and run the Optuna study
    n_trials = 1
    num_epochs = 1
    model_name = "BaselineLastDifference" 
    model_type = "Baseline"
    
    suggestion_dict = {
        "batch_size": {
            "type": "categorical",
            "args": [[16, 32, 64, 128]]
        }
    }
    
    model_params_keys = []
    
    model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{i+1}")
    os.makedirs(model_results_dir, exist_ok=True)  
    
    X = {"X_series": X_series, "X_mask": X_mask, "X_indices": X_indices}
    normalize_flags = {"X_series": True, "X_mask": False, "X_indices": False}
    
    run_optuna_study(pipeline.run_train_val_test, eval(model_name), model_type, suggestion_dict,  model_params_keys, seed, X , y, normalize_flags, model_results_dir, n_trials=n_trials, num_epochs=num_epochs)
    
    study = joblib.load(os.path.join(model_results_dir, "study.pkl"))
    train_losses, val_losses, best_epoch, test_loss, test_mae, test_rmse = get_best_model_results_traindevtest(study)
    
    print(f"Best epoch: {best_epoch}")
    print(f"Test Loss: {test_loss}, Test MAE: {test_mae}, Test RMSE: {test_rmse}")

    test_losses_list.append(test_loss)
    test_mae_list.append(test_mae)
    test_rmse_list.append(test_rmse)
    
    # Plot predictions vs true values
    #epochs_train_losses, epochs_val_losses, val_losses, test_losses, test_mae, test_rmse, all_predictions, all_true_values = get_preds_best_config_train_val_test(study, pipeline, eval(model_name), model_type, model_params_keys, num_epochs=num_epochs, seed=seed, X=X, y=y, normalize_flags=normalize_flags)
    #plot_preds_vs_truevalues(np.ravel(all_true_values), np.ravel(all_predictions), fold=0, save_path=os.path.join(IMAGES_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{i+1}_fold_{0}_predictions.png"))


# Convert lists to numpy arrays for easier calculations
test_losses_array = np.array(test_losses_list)
test_mae_array = np.array(test_mae_list)
test_rmse_array = np.array(test_rmse_list)

# Calculate mean and standard deviation
mean_test_loss = np.mean(test_losses_array)
std_test_loss = np.std(test_losses_array)

mean_test_mae = np.mean(test_mae_array)
std_test_mae = np.std(test_mae_array)

mean_test_rmse = np.mean(test_rmse_array)
std_test_rmse = np.std(test_rmse_array)

# Print aggregated results
print(f"Aggregated Results Across Top 10 Motifs:")
print(f"Mean Test Loss: {mean_test_loss} ± {std_test_loss}")
print(f"Mean Test MAE: {mean_test_mae} ± {std_test_mae}")
print(f"Mean Test RMSE: {mean_test_rmse} ± {std_test_rmse}")

Evaluating motif 6 with size 24 and 359 indexes 
X_series shape: torch.Size([2977, 576, 2])
X_indices shape: torch.Size([2977, 22, 1])
X_mask shape: torch.Size([2977, 576])
y shape: torch.Size([2977, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 128}
Best epoch: 0
Test Loss: 2196.96435546875, Test MAE: 35.062782287597656, Test RMSE: 46.871788024902344
Evaluating motif 18 with size 24 and 358 indexes 
X_series shape: torch.Size([2974, 576, 2])
X_indices shape: torch.Size([2974, 20, 1])
X_mask shape: torch.Size([2974, 576])
y shape: torch.Size([2974, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 128}
Best epoch: 0
Test Loss: 2009.228759765625, Test MAE: 32.11659240722656, Test RMSE: 44.82442092895508
Evaluating motif 2 with size 24 and 298 indexes 
X_series shape: torch.Size([2987, 576, 2])
X_indices shape: torch.Size([2987, 17, 1])
X_mask shape: torch.Size([2987, 576])
y shape: torch.Size([2987, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 128}
Best epoch: 0
Test Loss: 2750.3974609375, Test MAE: 40.12946701049805, Test RMSE: 52.44423294067383
Evaluating motif 8 with size 24 and 279 indexes 
X_series shape: torch.Size([2984, 576, 2])
X_indices shape: torch.Size([2984, 22, 1])
X_mask shape: torch.Size([2984, 576])
y shape: torch.Size([2984, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 128}
Best epoch: 0
Test Loss: 5800.81640625, Test MAE: 59.326622009277344, Test RMSE: 76.16309356689453
Evaluating motif 7 with size 24 and 268 indexes 
X_series shape: torch.Size([2887, 576, 2])
X_indices shape: torch.Size([2887, 25, 1])
X_mask shape: torch.Size([2887, 576])
y shape: torch.Size([2887, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 128}
Best epoch: 0
Test Loss: 2349.140869140625, Test MAE: 35.6027717590332, Test RMSE: 48.46793746948242
Aggregated Results Across Top 10 Motifs:
Mean Test Loss: 2826.760205078125 ± 1827.806041792528
Mean Test MAE: 38.941161155700684 ± 12.550028840216694
Mean Test RMSE: 50.98032646179199 ± 15.091937194137751
