In [1]:
import os
import sys
import yaml
import numpy as np
import pandas as pd
import torch
import csv
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import optuna
import random
import joblib
from pathlib import Path

# --- Experiment configuration -------------------------------------------------
# Every setting below is read from a YAML file that is opened relative to the
# current working directory.
config_path = "config.yaml"
with open(config_path, "r") as file:
    config = yaml.safe_load(file)

# The base directory is made absolute once; every other configured location is
# joined onto it.
BASE_DIR = Path(config["base_dir"]).resolve()
RESULTS_DIR, IMAGES_DIR, DATA_DIR, DATASET_PATH, MOTIF_INDEXES_PATH = (
    BASE_DIR / config[key]
    for key in ("results_dir", "images_dir", "data_dir", "dataset_path", "motif_indexes_path")
)

# Optional settings: each one is None when the key is missing from the YAML.
LOOKBACK_PERIOD, STEP, FORECAST_PERIOD = (
    config.get(key) for key in ("lookback_period", "step", "forecast_period")
)

# Mandatory settings: a missing key raises KeyError.
N, K, P = config["n"], config["k"], config["p"]
VARIABLES_PATTERN = config["variables_pattern"]
NORMALIZE_FLAGS = config["normalize_flags"]

# Echo the resolved locations so a wrong base directory is spotted immediately.
print(
    f"Results will be saved in: {RESULTS_DIR}",
    f"Images will be saved in: {IMAGES_DIR}",
    f"Data will be accessed from: {DATA_DIR}",
    f"Dataset path: {DATASET_PATH}",
    f"Motif indexes path: {MOTIF_INDEXES_PATH}",
    sep="\n",
)

# Only the results directory is created here; the other locations are left as-is.
os.makedirs(RESULTS_DIR, exist_ok=True)

# `__file__` exists when this code runs as a standalone script; in Jupyter or
# another interactive session it does not, so the working directory is used.
base_dir = (
    Path(__file__).parent.resolve()
    if '__file__' in globals()
    else Path(os.getcwd()).resolve()
)

# Make the parent of `base_dir` importable so the project packages used by the
# following cells (`utils`, `models`) can be found.
sys.path.append(str(base_dir / "../"))

Results will be saved in: /home/mgsilva/motifpred/results/syntheticdata1/variables=[0,2]
Images will be saved in: /home/mgsilva/motifpred/images/syntheticdata1/variables=[0,2]
Data will be accessed from: /home/mgsilva/motifpred/data/syntheticdata1/variables=[0,2]
Dataset path: /home/mgsilva/motifpred/data/syntheticdata1/variables=[0,2]/scenario1_n=100000_k=3_p=5_min_step=5_max_step=45_variables=[0,2].csv
Motif indexes path: /home/mgsilva/motifpred/data/syntheticdata1/variables=[0,2]/motif_indexes_scenario1_n=100000_k=3_p=5_min_step=5_max_step=45.csv


In [2]:
# Load the comma-separated dataset as integers and lay it out as a (K, N) array.
series_shape = (K, N)
data = np.genfromtxt(DATASET_PATH, delimiter=",").astype(int)
data = data.reshape(series_shape)

# The motif indexes are read the same way but keep the shape genfromtxt returns.
motif_indexes = np.genfromtxt(MOTIF_INDEXES_PATH, delimiter=",").astype(int)

print(motif_indexes)

[    0    10    22 ... 99922 99956 99992]


In [3]:
# Shared experiment objects provided by the project's `utils.setup` module.
# `device` and `test_tensor` are used right below; `seed` and `pipeline` are
# passed to the model-processing helpers in a later cell.
from utils.setup import seed, device, early_stopper, pipeline, test_tensor

# Sanity check of the environment: report the selected device and call the
# project's `test_tensor` helper (the recorded run shows a 5x3 tensor printed).
print(f"Device: {device}")
test_tensor()

Device: cuda
tensor([[0.3126, 0.3791, 0.3087],
        [0.0736, 0.4216, 0.0691],
        [0.2332, 0.4047, 0.2162],
        [0.9927, 0.4128, 0.5938],
        [0.6128, 0.1519, 0.0453]])


In [4]:
from utils.utils import create_dataset

# Build the model inputs and target from the raw series. Meaning of the
# returned values, per the original author's notes:
#   X_series  - past window of the series
#   X_indices - indexes of the motif inside that window
#   y         - next relative index of the motif
#   X_mask    - NOTE(review): presumably marks motif positions inside the
#               window; confirm against utils.utils.create_dataset
X_series, X_indices, X_mask, y = create_dataset(
    data,
    LOOKBACK_PERIOD,
    STEP,
    FORECAST_PERIOD,
    motif_indexes,
    P,
)

# Report the shape of every returned object. Shapes seen in the recorded run:
#   X_series  (num_samples, 100, 3)
#   X_mask    (num_samples, 100)
#   X_indices (num_samples, 6, 1)  - note the trailing singleton dimension
#   y         (num_samples, 1)
shape_report = {
    "X_series shape:": X_series,
    "X_mask shape:": X_mask,
    "X_indices shape:": X_indices,
    "y shape:": y,
}
print("\n".join(f"{label} {tensor.shape}" for label, tensor in shape_report.items()))


X_series shape: torch.Size([19979, 100, 3])
X_mask shape: torch.Size([19979, 100])
X_indices shape: torch.Size([19979, 6, 1])
y shape: torch.Size([19979, 1])


In [5]:
from models.ffnn_pytorch import FFNN
from models.lstm_pytorch import LSTM
from models.cnn_pytorch import CNN
from models.tcn_pytorch import TCN
from models.transformer_pytorch import Transformer
from models.baseline_pytorch import BaselineAverage, BaselineLastDifference
from utils.utils import print_study_results, get_best_model_results, plot_best_model_results, plot_preds_vs_truevalues
from utils.train_pipeline import get_preds_best_config

# Model families and input representations; the loop at the end of this cell
# visits every (model, input) combination. "Baseline" is expanded there into
# BaselineAverage and BaselineLastDifference and only runs with "Indexes".
models = ["FFNN", "LSTM", "CNN", "TCN", "Transformer", "Baseline"]
inputs = ["Series", "Series_Masking", "Indexes"]

# Both values are embedded in the name of each results directory
# ("<model><input>_<n_trials>_trials_<num_epochs>_epochs"), so they must match
# the directories that hold the saved studies. `num_epochs` is also forwarded
# to get_preds_best_config.
n_trials = 100
num_epochs = 500


def process_baseline_model(model_class, input_name, X, normalize_flags, n_trials, num_epochs, pipeline, seed, y):
    """Re-evaluate a baseline model's best configuration and save per-fold results.

    The run directory is
    ``RESULTS_DIR/<model_class.__name__><input_name>_<n_trials>_trials_<num_epochs>_epochs``.
    If it already contains ``best_model_results.csv`` the function only prints
    a notice and returns. Otherwise it loads ``study.pkl`` from that directory,
    re-runs the best configuration through ``get_preds_best_config`` and writes
    one CSV row per fold.

    Args:
        model_class: Baseline class; its ``__name__`` is part of the directory name.
        input_name: Name of the input representation (second part of the directory name).
        X: Inputs forwarded unchanged to ``get_preds_best_config``.
        normalize_flags: Forwarded unchanged to ``get_preds_best_config``.
        n_trials: Trial count used in the directory name.
        num_epochs: Epoch count used in the directory name and forwarded to the re-run.
        pipeline: Forwarded unchanged to ``get_preds_best_config``.
        seed: Forwarded unchanged to ``get_preds_best_config``.
        y: Targets forwarded unchanged to ``get_preds_best_config``.

    Returns:
        None. The side effect is ``best_model_results.csv`` in the run directory.

    Raises:
        ValueError: If the validation losses stored in the study are not close
            to those obtained by the re-run.
    """
    model_name = f"{model_class.__name__}{input_name}"
    model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs")
    results_path = os.path.join(model_results_dir, "best_model_results.csv")

    # Results are cached on disk: never recompute an existing file.
    if os.path.exists(results_path):
        print(f"Model {model_name} already exists")
        return

    study = joblib.load(os.path.join(model_results_dir, "study.pkl"))
    fold_val_losses, fold_test_losses = get_best_model_results(study)

    # Only the validation losses and the per-fold test metrics are needed here;
    # the other returned values (loss curves, predictions, ...) are discarded.
    (_, _, val_losses, _, test_mae_per_fold, test_rmse_per_fold, _, _) = get_preds_best_config(
        study, pipeline, model_class, "Baseline", [], num_epochs=num_epochs, seed=seed, X=X, y=y, normalize_flags=normalize_flags
    )

    # The re-run must reproduce the losses recorded in the study.
    if not np.allclose(fold_val_losses, val_losses):
        raise ValueError("Best model val losses are not close to val losses")

    # Number the folds from the data instead of assuming exactly five of them.
    results = pd.DataFrame({
        "fold": np.arange(1, len(fold_val_losses) + 1),
        "val_loss": fold_val_losses,
        "test_loss": fold_test_losses,
        "test_mae": test_mae_per_fold,
        "test_rmse": test_rmse_per_fold
    })
    results.to_csv(results_path, index=False, mode='w')


def process_non_baseline_model(model_type, model_params_keys, input_name, X, normalize_flags, num_epochs, seed, pipeline, y):
    """Re-evaluate a learned model's best configuration and save per-fold results.

    The run directory is
    ``RESULTS_DIR/<model_type><input_name>_<n_trials>_trials_<num_epochs>_epochs``.
    Note that ``n_trials`` is NOT a parameter: it is read from the module-level
    variable of the same name at call time. If the directory already contains
    ``best_model_results.csv`` the function only prints a notice and returns.
    Otherwise it loads ``study.pkl``, re-runs the best configuration through
    ``get_preds_best_config`` and writes one CSV row per fold.

    Args:
        model_type: Name of the model class (e.g. "FFNN"); the class is looked
            up by this name in the module namespace.
        model_params_keys: Hyper-parameter names forwarded to ``get_preds_best_config``.
        input_name: Name of the input representation (second part of the directory name).
        X: Inputs forwarded unchanged to ``get_preds_best_config``.
        normalize_flags: Forwarded unchanged to ``get_preds_best_config``.
        num_epochs: Epoch count used in the directory name and forwarded to the re-run.
        seed: Forwarded unchanged to ``get_preds_best_config``.
        pipeline: Forwarded unchanged to ``get_preds_best_config``.
        y: Targets forwarded unchanged to ``get_preds_best_config``.

    Returns:
        None. The side effect is ``best_model_results.csv`` in the run directory.

    Raises:
        ValueError: If ``model_type`` does not name an object in the module namespace.
    """
    model_name = f"{model_type}{input_name}"
    model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs")
    results_path = os.path.join(model_results_dir, "best_model_results.csv")

    # Results are cached on disk: never recompute an existing file.
    if os.path.exists(results_path):
        print(f"Model {model_name} already exists")
        return

    # Resolve the class by name with a plain namespace lookup rather than
    # eval(), which would execute arbitrary expressions.
    model_class = globals().get(model_type)
    if model_class is None:
        raise ValueError(f"Unknown model type: {model_type!r}")

    study = joblib.load(os.path.join(model_results_dir, "study.pkl"))
    fold_val_losses, fold_test_losses = get_best_model_results(study)

    # Only the per-fold test metrics are used; the remaining returned values
    # (loss curves, validation losses, predictions, ...) are discarded.
    (_, _, _, _, test_mae_per_fold, test_rmse_per_fold, _, _) = get_preds_best_config(
        study, pipeline, model_class, model_type, model_params_keys, num_epochs=num_epochs, seed=seed, X=X, y=y, normalize_flags=normalize_flags
    )

    # NOTE(review): unlike process_baseline_model, the re-run's validation
    # losses are not compared with the ones stored in the study; that check
    # was already disabled (commented out) in the original code.

    # Number the folds from the data instead of assuming exactly five of them.
    results = pd.DataFrame({
        "fold": np.arange(1, len(fold_val_losses) + 1),
        "val_loss": fold_val_losses,
        "test_loss": fold_test_losses,
        "test_mae": test_mae_per_fold,
        "test_rmse": test_rmse_per_fold
    })
    results.to_csv(results_path, index=False, mode='w')


# Hyper-parameter names forwarded to get_preds_best_config for each model
# family. The mapping never changes, so it is built once, outside the loops.
model_params_map = {
    "FFNN": ["hidden_sizes_list"],
    "LSTM": ["hidden_sizes_list"],
    "CNN": ["kernel_size", "num_filters_list", "pool_size"],
    "TCN": ["kernel_size", "num_channels_list", "dropout"],
    "Transformer": ["d_model", "n_heads", "e_layers", "dim_feedforward", "dropout"]
}

# Baseline results live in "<name>_1_trials_1_epochs" directories. Dedicated
# names are used so that the notebook-level `n_trials` / `num_epochs` are not
# overwritten: overwriting them made the loop depend on "Baseline" being the
# last entry of `models` and left both variables at 1 for every later cell.
baseline_n_trials, baseline_num_epochs = 1, 1

for model_type in models:
    for input_name in inputs:
        if model_type == "Baseline":
            # Baselines are only evaluated for the "Indexes" input name.
            if input_name != "Indexes":
                continue

            # Baselines receive all three inputs and use fixed normalisation
            # flags instead of the ones from the configuration file.
            X = {"X_series": X_series, "X_mask": X_mask, "X_indices": X_indices}
            normalize_flags = {"X_series": True, "X_mask": False, "X_indices": False}

            for model_class in [BaselineAverage, BaselineLastDifference]:
                process_baseline_model(
                    model_class, input_name, X, normalize_flags,
                    baseline_n_trials, baseline_num_epochs, pipeline, seed, y
                )

        else:
            normalize_flags = NORMALIZE_FLAGS

            # Select the tensors that make up this input representation.
            if input_name == "Series":
                X = {"X_series": X_series}
            elif input_name == "Series_Masking":
                X = {"X_series": X_series, "X_mask": X_mask}
            else:
                X = {"X_indices": X_indices}

            process_non_baseline_model(
                model_type, model_params_map[model_type], input_name, X, normalize_flags, num_epochs, seed, pipeline, y
            )


Model FFNNSeries already exists
Model FFNNSeries_Masking already exists
Model FFNNIndexes already exists
Model LSTMSeries already exists
Model LSTMSeries_Masking already exists
Model LSTMIndexes already exists
Model CNNSeries already exists
Model CNNSeries_Masking already exists
Model CNNIndexes already exists
Model TCNSeries already exists
Model TCNSeries_Masking already exists
Model TCNIndexes already exists
Model TransformerSeries already exists
Model TransformerSeries_Masking already exists
Model TransformerIndexes already exists
Model BaselineAverageIndexes already exists
Model BaselineLastDifferenceIndexes already exists


In [6]:
import numpy as np
from scipy import stats
rng = np.random.default_rng()
from scipy.stats import ttest_rel

models = ["FFNN", "LSTM", "CNN", "TCN", "Transformer", "Baseline"]
inputs = ["Series", "Series_Masking", "Indexes"]

# Column order of the collected per-fold results.
RESULT_COLUMNS = ["model", "input", "fold", "mae", "rmse"]

# (n_trials, num_epochs) pairs that appear in the results directory names.
# They are kept local to this cell so it does not depend on, or overwrite,
# the notebook-level `n_trials` / `num_epochs`.
LEARNED_MODEL_BUDGET = (100, 500)
BASELINE_BUDGET = (1, 1)


def load_fold_records(model_label, input_name, n_trials, num_epochs):
    """Read one model's ``best_model_results.csv`` and return one record per fold.

    Args:
        model_label: Model name used both in the directory name and in the
            "model" field of the returned records.
        input_name: Input representation name (directory name and "input" field).
        n_trials: Trial count that appears in the directory name.
        num_epochs: Epoch count that appears in the directory name.

    Returns:
        A list of dicts with the keys in ``RESULT_COLUMNS``; folds are numbered
        from 1 in file order. The list is empty when the results file is missing.
    """
    model_name = f"{model_label}{input_name}"
    print(f"Processing Model: {model_name}")

    model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs")
    results_file = os.path.join(model_results_dir, "best_model_results.csv")

    # A missing file is reported and skipped rather than treated as an error.
    if not os.path.exists(results_file):
        print(f"Results file for {model_name} not found. Skipping.")
        return []

    fold_results = pd.read_csv(results_file)
    maes = fold_results["test_mae"].values
    rmses = fold_results["test_rmse"].values

    return [
        {"model": model_label, "input": input_name, "fold": fold, "mae": mae, "rmse": rmse}
        for fold, (mae, rmse) in enumerate(zip(maes, rmses), start=1)
    ]


# Gather plain records first and build the DataFrame once. Growing a frame
# with pd.concat inside the loop is quadratic and, starting from an empty
# frame, triggers a pandas FutureWarning.
records = []
for model_type in models:
    for input_name in inputs:
        if model_type == "Baseline":
            # Baselines only exist for the "Indexes" input name.
            if input_name != "Indexes":
                continue

            # "Baseline" stands for two concrete baseline models.
            for baseline_type in ["BaselineAverage", "BaselineLastDifference"]:
                records.extend(load_fold_records(baseline_type, input_name, *BASELINE_BUDGET))
        else:
            records.extend(load_fold_records(model_type, input_name, *LEARNED_MODEL_BUDGET))

# Passing the columns explicitly keeps the expected schema even when no
# results file was found.
results_df = pd.DataFrame(records, columns=RESULT_COLUMNS)

# Display the results
print(results_df)

Processing Model: FFNNSeries
Processing Model: FFNNSeries_Masking
Processing Model: FFNNIndexes
Processing Model: LSTMSeries
Processing Model: LSTMSeries_Masking
Processing Model: LSTMIndexes
Processing Model: CNNSeries
Processing Model: CNNSeries_Masking
Processing Model: CNNIndexes
Processing Model: TCNSeries
Processing Model: TCNSeries_Masking
Processing Model: TCNIndexes
Processing Model: TransformerSeries
Processing Model: TransformerSeries_Masking
Processing Model: TransformerIndexes
Processing Model: BaselineAverageIndexes
Processing Model: BaselineLastDifferenceIndexes
                     model    input fold       mae       rmse
0                     FFNN   Series    1  9.389045  11.669245
1                     FFNN   Series    2  8.793365  10.960507
2                     FFNN   Series    3  9.180077  11.325398
3                     FFNN   Series    4  8.737524  11.029864
4                     FFNN   Series    5  8.585275  10.890684
..                     ...      ...  ...    

  results_df = pd.concat([


In [7]:
# Average every remaining column over the folds of each (model, input) pair,
# then turn the group keys back into ordinary columns.
grouped_results = results_df.groupby(["model", "input"])
avg_results_df = grouped_results.mean().reset_index()
print(avg_results_df)

                     model           input fold       mae       rmse
0          BaselineAverage         Indexes  3.0  7.061335   9.466859
1   BaselineLastDifference         Indexes  3.0  4.363204   7.537955
2                      CNN         Indexes  3.0  0.345493   0.555321
3                      CNN          Series  3.0  6.417948   8.568752
4                      CNN  Series_Masking  3.0  0.701390   0.967899
5                     FFNN         Indexes  3.0  0.549793   1.042796
6                     FFNN          Series  3.0  8.937057  11.175140
7                     FFNN  Series_Masking  3.0  1.012417   1.686616
8                     LSTM         Indexes  3.0  0.148808   0.186667
9                     LSTM          Series  3.0  4.428348   7.396018
10                    LSTM  Series_Masking  3.0  1.099881   2.904646
11                     TCN         Indexes  3.0  1.084515   1.616984
12                     TCN          Series  3.0  1.152880   1.830841
13                     TCN  Series

In [8]:
# Models and input types to compare: each model trained on an input type from
# `input_types_1` is compared with the SAME model trained on an input type
# from `input_types_2`.
models_1 = ["FFNN", "TCN", "Transformer", "LSTM", "CNN" ]
input_types_1 = ["Series"]
models_2 = ["FFNN", "TCN", "Transformer", "LSTM", "CNN" ]
input_types_2 = ["Series_Masking"]

results = []
for model1 in models_1:
    for model2 in models_2:
        # Only same-model pairs are compared; check this before doing any
        # filtering work.
        if model1 != model2:
            continue

        for input_1 in input_types_1:
            for input_2 in input_types_2:
                # Rows are ordered by fold so that position i of both samples
                # refers to the i-th fold (assumes both share the same folds).
                folds_1 = results_df[(results_df['model'] == model1) & (results_df['input'] == input_1)].sort_values('fold')
                folds_2 = results_df[(results_df['model'] == model2) & (results_df['input'] == input_2)].sort_values('fold')

                # A paired test needs two non-empty samples of equal length;
                # report the pair instead of dropping it silently.
                if len(folds_1) == 0 or len(folds_1) != len(folds_2):
                    print(
                        f"Skipping {model1}: {input_1} has {len(folds_1)} folds, "
                        f"{input_2} has {len(folds_2)} folds"
                    )
                    continue

                for metric in ["mae", "rmse"]:
                    # Cast to float so the test also works if the column was
                    # stored with object dtype.
                    data1 = folds_1[metric].astype(float)
                    data2 = folds_2[metric].astype(float)

                    # One-sided paired t-test; the alternative hypothesis is
                    # that the mean error for input_1 is greater than the mean
                    # error for input_2.
                    t_stat, p_value = ttest_rel(data1, data2, alternative='greater')
                    results.append({
                        "Model_1": model1,
                        "InputType_1": input_1,
                        "Model_2": model2,
                        "InputType_2": input_2,
                        "Metric": metric,
                        "P-Value": p_value
                    })

# Convert results to DataFrame and display
pval_results_df = pd.DataFrame(results)
pval_results_df

Unnamed: 0,Model_1,InputType_1,Model_2,InputType_2,Metric,P-Value
0,FFNN,Series,FFNN,Series_Masking,mae,4.524174e-07
1,FFNN,Series,FFNN,Series_Masking,rmse,3.219293e-07
2,TCN,Series,TCN,Series_Masking,mae,0.003540011
3,TCN,Series,TCN,Series_Masking,rmse,0.003328977
4,Transformer,Series,Transformer,Series_Masking,mae,0.031401
5,Transformer,Series,Transformer,Series_Masking,rmse,0.006535836
6,LSTM,Series,LSTM,Series_Masking,mae,0.008084342
7,LSTM,Series,LSTM,Series_Masking,rmse,0.002631434
8,CNN,Series,CNN,Series_Masking,mae,4.928297e-06
9,CNN,Series,CNN,Series_Masking,rmse,4.813531e-06
