In [1]:
import numpy as np
import pandas as pd
import os
import sys
import csv
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.image as mpimg
import optuna
import random
import joblib
import math
import ast
import logging
from msig import Motif, NullModel
from config import RESULTS_MOTIF_DIR, RESULTS_DIR, IMAGES_DIR, DATA_DIR, DATASET_PATH, VARIABLES, NORMALIZE_FLAGS, STUMPY_EXCL_ZONE_DENOM, TOP_K_MP, INCLUDE, NORMALIZE, SUBSQUENCES_LENGTHS, NTOP_MOTIFS, MOTIF_SIZE

# Only let Optuna warnings (and above) through to the notebook output.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Echo the directories taken from `config` so the run is self-describing.
print(f"Results will be saved in: {RESULTS_DIR}")
print(f"Images will be saved in: {IMAGES_DIR}")
print(f"Data will be accessed from: {DATA_DIR}")

# A standalone script exposes `__file__`; a Jupyter/interactive session does
# not, so fall back to the current working directory there.
base_dir = os.path.dirname(__file__) if '__file__' in globals() else os.getcwd()

# Append the parent of `base_dir` to the import path so the `utils` package
# can be imported from this notebook.
parent_dir = os.path.abspath(os.path.join(base_dir, "../"))
sys.path.append(parent_dir)

2025-01-31 17:39:09,585 - INFO - Results will be saved in: /home/mgsilva/motifpred/results/household
2025-01-31 17:39:09,585 - INFO - Images will be saved in: /home/mgsilva/motifpred/images/household
2025-01-31 17:39:09,585 - INFO - Data will be accessed from: /home/mgsilva/motifpred/data/household


Results will be saved in: /home/mgsilva/motifpred/results/household
Images will be saved in: /home/mgsilva/motifpred/images/household
Data will be accessed from: /home/mgsilva/motifpred/data/household


In [2]:
# Load the motif statistics table and keep only motifs of the configured size.
mp_stats_table = pd.read_csv(
    RESULTS_DIR / f"mp_stats_table_normalized_{NORMALIZE}_top_{TOP_K_MP}.csv"
)
mp_stats_table = mp_stats_table.loc[mp_stats_table["m"] == MOTIF_SIZE]

# Rank by number of matches (descending), keep the first NTOP_MOTIFS rows, and
# retain only the columns needed downstream.
top_motifs = (
    mp_stats_table
    .sort_values(by="#Matches", ascending=False)
    .head(NTOP_MOTIFS)
    .loc[:, ["m", "Indices"]]
)

In [3]:
# Import the shared experiment objects (seed, device, early stopper, training
# pipeline) and the `test_tensor` helper from utils.setup.
from utils.setup import seed, device, early_stopper, pipeline, test_tensor

# Sanity check: report which device utils.setup selected, then call
# test_tensor() (judging by the cell output it prints a small tensor —
# presumably to confirm the tensor backend works; verify in utils.setup).
print(f"Device: {device}")
test_tensor()

Device: cuda
tensor([[0.3126, 0.3791, 0.3087],
        [0.0736, 0.4216, 0.0691],
        [0.2332, 0.4047, 0.2162],
        [0.9927, 0.4128, 0.5938],
        [0.6128, 0.1519, 0.0453]])


In [4]:
# Load the dataset as floats (first CSV column is the index) and keep only the
# configured variables.
data_df = pd.read_csv(DATASET_PATH, index_col=0).astype(float)[VARIABLES]

# Labels are stored next to the dataset, also indexed by the first column.
labels = pd.read_csv(DATA_DIR  / f"labels.csv", index_col=0).astype(float)

# Transpose so each row of `data` holds one variable's values.
data = data_df.to_numpy().T
data

array([[1.3  , 1.282, 1.302, ..., 0.62 , 0.62 , 0.618],
       [5.4  , 5.2  , 5.2  , ..., 2.6  , 2.6  , 2.6  ]])

In [5]:
def plot_motif_in_ts(data, motif_indexes, MOTIF_SIZE):
    """Plot each motif occurrence beside the full series, one figure row per variate.

    For every variate the left panel overlays the slices
    ``data[v, idx:idx + MOTIF_SIZE]`` for each start index in
    ``motif_indexes``; the right panel draws the whole series with every
    occurrence shaded in red.

    Parameters
    ----------
    data : array-like
        Either a 1-D series or an array whose first axis indexes the variates.
        A 1-D input is treated as a single variate.
    motif_indexes : sequence of int
        Start positions of the motif occurrences.
    MOTIF_SIZE : int
        Number of samples spanned by each occurrence.

    Returns
    -------
    None
        The figure is laid out and shown via ``plt.show()``.
    """
    series = np.asarray(data)
    if series.ndim == 1:
        # Promote a univariate series to a single-row 2-D array.
        series = series[np.newaxis, :]

    n_rows = series.shape[0]
    # squeeze=False keeps `axes` two-dimensional even for a single variate,
    # so every row can be addressed the same way.
    _, axes = plt.subplots(
        n_rows,
        2,
        figsize=(12, 5 * n_rows),
        gridspec_kw={'width_ratios': [1, 3]},
        sharex=False,
        squeeze=False,
    )

    for row, values in enumerate(series):
        pattern_ax, series_ax = axes[row]

        # Left panel: overlay every occurrence of the motif.
        for start in motif_indexes:
            pattern_ax.plot(values[start:start + MOTIF_SIZE], alpha=0.7)
        pattern_ax.set_title(f"Motif Pattern Variate {row + 1}")
        pattern_ax.set_ylabel("Value")

        # Right panel: full series with each occurrence highlighted. Only
        # spans starting at the first listed index carry the legend label;
        # the others get an empty label so the legend shows a single entry.
        series_ax.plot(values, label=f"Variate {row + 1}")
        for start in motif_indexes:
            span_label = "Motif" if start == motif_indexes[0] else ""
            series_ax.axvspan(start, start + MOTIF_SIZE, color='red', alpha=0.3, label=span_label)
        series_ax.legend()
        series_ax.set_ylabel("Value")
        series_ax.set_title(f"Variate {row + 1}")

    # Applies to pyplot's current axes only.
    plt.xlabel("Time Index")
    plt.tight_layout()
    plt.show()

#plot_motif_in_ts(data, motif_indexes, MOTIF_SIZE)



In [None]:
from utils.utils import create_dataset
from utils.train_pipeline import run_optuna_study
from utils.utils import get_best_model_results_traindevtest, plot_best_model_results_traindevtest
from models.ffnn_pytorch import FFNN
from utils.utils import plot_preds_vs_truevalues
from utils.train_pipeline import get_preds_best_config_train_val_test


# --- Sliding-window configuration (all values are counted in samples) ---
lookback_period = 60 * 24 * 3  # length of the input window
step = 5                       # stride of the sliding window
forecast_period = 60 * 24 * 2  # length of the forward (target) window

# --- Study configuration, identical for every motif ---
n_trials = 100
num_epochs = 500
model_type = "FFNN"
model_name = "FFNNSeries_Masking"
# Reference the imported class directly instead of eval(model_type): same
# object, no dynamic evaluation of a string.
model_class = FFNN

# Per-motif test metrics, aggregated after the loop.
test_losses_list = []
test_mae_list = []
test_rmse_list = []

# Loop through each of the selected top motifs (at most NTOP_MOTIFS rows).
# NOTE: `i` is the row label carried over from the stats table, not the rank
# within `top_motifs`, so "motif {i+1}" identifies the table row.
for i, top_motif in top_motifs.iterrows():
    # "Indices" is stored as the string form of a list; parse it and sort.
    motif_indexes = sorted(ast.literal_eval(top_motif["Indices"]))

    print(f"Evaluating motif {i+1} with size {MOTIF_SIZE} and {len(motif_indexes)} indexes ")

    # Create dataset for the current motif
    X_series, X_indices, X_mask, y = create_dataset(data, lookback_period, step, forecast_period, motif_indexes, MOTIF_SIZE)

    # X_series, X_indices, X_mask and y are PyTorch tensors
    print("X_series shape:", X_series.shape)  # Expected shape: (num_samples, lookback_period, num_features)
    print("X_indices shape:", X_indices.shape)  # Expected shape: (num_samples, max_motif_length_in_window, 1)
    print("X_mask shape:", X_mask.shape)  # Expected shape: (num_samples, lookback_period)
    print("y shape:", y.shape)    # Expected shape: (num_samples, 1)

    # Hyperparameter search space. Built inside the loop so every study gets
    # fresh objects (whether run_optuna_study mutates them is not visible here).
    suggestion_dict = {
        "learning_rate": {
            "type": "float",
            "args": [1e-5, 1e-3],
            "kwargs": {"log": True}
        },
        "num_layers": {
            "type": "categorical",
            "args": [[1, 2, 3, 4]]
        },
        "batch_size": {
            "type": "categorical",
            "args": [[16, 32, 64, 128]]
        }
    }
    model_params_keys = ["hidden_sizes_list"]

    # One tag per (model, budget, motif); used for the results directory and
    # for the names of the saved figures.
    run_tag = f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{i+1}"
    model_results_dir = os.path.join(RESULTS_DIR, run_tag)
    os.makedirs(model_results_dir, exist_ok=True)

    # This model variant is fed the raw series together with the motif mask.
    X = {"X_series": X_series, "X_mask": X_mask}
    run_optuna_study(pipeline.run_train_val_test, model_class, model_type, suggestion_dict, model_params_keys, seed, X, y, NORMALIZE_FLAGS, model_results_dir, n_trials=n_trials, num_epochs=num_epochs)

    # NOTE(review): study.pkl is presumably the file run_optuna_study just
    # wrote into model_results_dir; joblib.load unpickles, so only load
    # files produced by this pipeline.
    study = joblib.load(os.path.join(model_results_dir, "study.pkl"))
    train_losses, val_losses, best_epoch, test_loss, test_mae, test_rmse = get_best_model_results_traindevtest(study)

    print(f"Best epoch: {best_epoch}")
    print(f"Test Loss: {test_loss}, Test MAE: {test_mae}, Test RMSE: {test_rmse}")

    # Record the metrics before the names below are rebound by the re-fit.
    test_losses_list.append(test_loss)
    test_mae_list.append(test_mae)
    test_rmse_list.append(test_rmse)

    # Plot predictions vs true values
    epochs_train_losses, epochs_val_losses, val_losses, test_losses, test_mae, test_rmse, all_predictions, all_true_values = get_preds_best_config_train_val_test(study, pipeline, model_class, model_type, model_params_keys, num_epochs=num_epochs, seed=seed, X=X, y=y, normalize_flags=NORMALIZE_FLAGS)
    plot_best_model_results_traindevtest(
        study.trials_dataframe(),
        save_path=os.path.join(IMAGES_DIR, f"{run_tag}_best_results.png")
    )
    plot_preds_vs_truevalues(
        np.ravel(all_true_values),
        np.ravel(all_predictions),
        fold=0,
        save_path=os.path.join(IMAGES_DIR, f"{run_tag}_fold_0_predictions.png")
    )


# Convert lists to numpy arrays for easier calculations
test_losses_array = np.array(test_losses_list)
test_mae_array = np.array(test_mae_list)
test_rmse_array = np.array(test_rmse_list)

# Calculate mean and (population) standard deviation across motifs
mean_test_loss = np.mean(test_losses_array)
std_test_loss = np.std(test_losses_array)

mean_test_mae = np.mean(test_mae_array)
std_test_mae = np.std(test_mae_array)

mean_test_rmse = np.mean(test_rmse_array)
std_test_rmse = np.std(test_rmse_array)

# Print aggregated results; report the number of motifs actually evaluated
# rather than a hard-coded count.
print(f"Aggregated Results Across Top {len(test_losses_list)} Motifs:")
print(f"Mean Test Loss: {mean_test_loss} ± {std_test_loss}")
print(f"Mean Test MAE: {mean_test_mae} ± {std_test_mae}")
print(f"Mean Test RMSE: {mean_test_rmse} ± {std_test_rmse}")


Evaluating motif 30 with size 180 and 350 indexes 


X_series shape: torch.Size([14634, 4320, 2])
X_indices shape: torch.Size([14634, 26, 1])
X_mask shape: torch.Size([14634, 4320])
y shape: torch.Size([14634, 1])


  0%|          | 0/100 [00:00<?, ?it/s]

Early stopping at epoch 111, with best epoch being 66
Early stopping at epoch 101, with best epoch being 3
Early stopping at epoch 101, with best epoch being 0
Early stopping at epoch 101, with best epoch being 17
Early stopping at epoch 101, with best epoch being 9
Early stopping at epoch 101, with best epoch being 0
Early stopping at epoch 101, with best epoch being 0
Early stopping at epoch 101, with best epoch being 16
Early stopping at epoch 101, with best epoch being 0
Early stopping at epoch 101, with best epoch being 2
Early stopping at epoch 101, with best epoch being 14
Early stopping at epoch 101, with best epoch being 14
Early stopping at epoch 101, with best epoch being 14
Early stopping at epoch 101, with best epoch being 3
Early stopping at epoch 101, with best epoch being 4
Early stopping at epoch 101, with best epoch being 12
Early stopping at epoch 101, with best epoch being 6
Early stopping at epoch 101, with best epoch being 52
Early stopping at epoch 101, with best