In [1]:
import os
import sys
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.image as mpimg
import optuna
import joblib
import math
import ast
import logging
from pathlib import Path
from msig import Motif, NullModel

optuna.logging.set_verbosity(optuna.logging.WARNING)

# Load YAML configuration
config_path = "config.yaml"  # Ensure this path is correct
with open(config_path, "r") as file:
    config = yaml.safe_load(file)

# Convert base_dir to absolute
BASE_DIR = Path(config["base_dir"]).resolve()

# Convert other paths to absolute using BASE_DIR
RESULTS_DIR = BASE_DIR / config["results_dir"]
RESULTS_MOTIF_DIR = BASE_DIR / config["results_motif_dir"]
IMAGES_DIR = BASE_DIR / config["images_dir"]
DATA_DIR = BASE_DIR / config["data_dir"]
DATASET_PATH = BASE_DIR / config["dataset_path"]

# Extract remaining parameters from YAML
TOWNSHIP_NAME = config["township_name"]
VARIABLES = config["variables"]
NORMALIZE_FLAGS = config["normalize_flags"]
STUMPY_EXCL_ZONE_DENOM = config["stumpy_excl_zone_denom"]
TOP_K_MP = config["top_k_mp"]
INCLUDE = config["include"]
NORMALIZE = config["normalize"]
SUBSEQUENCES_LENGTHS = config["subsequences_lengths"]
NTOP_MOTIFS = config["ntop_motifs"]
MOTIF_SIZE = config["motif_size"]
LOOKBACK_PERIOD = config["lookback_period"]
STEP = config["step"]
FORECAST_PERIOD = config["forecast_period"]

# Print resolved paths for debugging
print(f"Results will be saved in: {RESULTS_DIR}")
print(f"Images will be saved in: {IMAGES_DIR}")
print(f"Data will be accessed from: {DATA_DIR}")
print(f"Dataset path: {DATASET_PATH}")

# Handling different environments
if '__file__' in globals():
    # For standalone scripts
    base_dir = Path(__file__).parent.resolve()
else:
    # For Jupyter or interactive environments
    base_dir = Path(os.getcwd()).resolve()

# Add the parent directory of `utils` to the Python path
sys.path.append(str(base_dir / "../"))

Results will be saved in: /home/mgsilva/motifpred/results/populationdensity
Images will be saved in: /home/mgsilva/motifpred/images/populationdensity
Data will be accessed from: /home/mgsilva/motifpred/data/populationdensity
Dataset path: /home/mgsilva/motifpred/data/populationdensity/hourly_township.csv


In [2]:
mp_stats_table = pd.read_csv(
    RESULTS_DIR / f"mp_stats_table_normalized_{NORMALIZE}_top_{TOP_K_MP}.csv"
)
mp_stats_table = mp_stats_table[mp_stats_table["m"] == MOTIF_SIZE]
top_motifs = mp_stats_table.sort_values(by=["#Matches", "ID"], ascending=[False, True]).head(NTOP_MOTIFS)
top_motifs = top_motifs[["m", "Indices"]]

In [3]:
# read csv
data_df = pd.read_csv(
    DATASET_PATH,
    parse_dates=["one_time"],
    date_format="%Y-%m-%d %H:%M:%S",
    index_col=0,
)

data_df = data_df[data_df["township_name"] == TOWNSHIP_NAME]
#set index to one_time and township_name
data_df = data_df.set_index(["one_time", "township_name"]).sort_index()[VARIABLES]
data = data_df.to_numpy().T
data_univar = data[0]
data_df

Unnamed: 0_level_0,Unnamed: 1_level_0,sum_terminals
one_time,township_name,Unnamed: 2_level_1
2021-09-15 00:00:00,Avenidas Novas,260700.0
2021-09-15 01:00:00,Avenidas Novas,276675.0
2021-09-15 02:00:00,Avenidas Novas,284563.0
2021-09-15 03:00:00,Avenidas Novas,279563.0
2021-09-15 04:00:00,Avenidas Novas,281460.0
...,...,...
2021-11-30 19:00:00,Avenidas Novas,391367.0
2021-11-30 20:00:00,Avenidas Novas,352361.0
2021-11-30 21:00:00,Avenidas Novas,388246.0
2021-11-30 22:00:00,Avenidas Novas,360169.0


In [4]:
# Import shared setup
from utils.setup import seed, device, early_stopper, pipeline, test_tensor

# Example usage
print(f"Device: {device}")
test_tensor()

Device: cuda
tensor([[0.3126, 0.3791, 0.3087],
        [0.0736, 0.4216, 0.0691],
        [0.2332, 0.4047, 0.2162],
        [0.9927, 0.4128, 0.5938],
        [0.6128, 0.1519, 0.0453]])


In [5]:
import os
import numpy as np
import joblib
import ast
import pandas as pd
from utils.utils import create_dataset, get_best_model_results_traindevtest, plot_best_model_results_traindevtest, plot_preds_vs_truevalues
from utils.train_pipeline import get_preds_best_config_train_val_test
from models.ffnn_pytorch import FFNN
from models.lstm_pytorch import LSTM
from models.cnn_pytorch import CNN
from models.tcn_pytorch import TCN
from models.transformer_pytorch import Transformer
from models.baseline_pytorch import BaselineAverage, BaselineLastDifference

models = [ "FFNN", "LSTM", "CNN", "TCN", "Transformer", "Baseline"]
inputs = ["Series", "Series_Masking", "Indexes"]

n_trials = 100
num_epochs = 500

def process_baseline_model(model_class, input_name, X, normalize_flags, n_trials, num_epochs, seed, pipeline, y, motif_id):
    """Process baseline models."""
    model_name = f"{model_class.__name__}{input_name}"
    model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{motif_id}")
    os.makedirs(model_results_dir, exist_ok=True)
    
    if os.path.exists(os.path.join(model_results_dir, "best_model_results.csv")):
        print(f"Model {model_name} already exists for motif {motif_id}")
        return
    
    study = joblib.load(os.path.join(model_results_dir, "study.pkl"))
    train_losses, val_losses, best_epoch, test_loss, test_mae, test_rmse = get_best_model_results_traindevtest(study)
    
    _, _, _, test_losses, test_mae, test_rmse, all_predictions, all_true_values = get_preds_best_config_train_val_test(
        study, pipeline, model_class, "Baseline", [], num_epochs=num_epochs, seed=seed, X=X, y=y, normalize_flags=normalize_flags
    )

    if not np.allclose(test_loss, test_losses, atol=0.1):
        print(f"Test loss: {test_loss}")
        print(f"Retrained test loss: {test_losses}")
        raise Exception("Best model test loss does not match the one obtained from the study")

    
    results = pd.DataFrame({
        "test_loss": [test_loss],
        "test_mae": [test_mae],
        "test_rmse": [test_rmse]
    })
    results.to_csv(os.path.join(model_results_dir, "best_model_results.csv"), index=False)

def process_non_baseline_model(model_type, model_params_keys, input_name, X, normalize_flags, num_epochs, seed, pipeline, y, motif_id):
    """Process non-baseline models."""
    model_name = f"{model_type}{input_name}"
    model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{motif_id}")
    os.makedirs(model_results_dir, exist_ok=True)
    
    if os.path.exists(os.path.join(model_results_dir, "best_model_results.csv")):
        print(f"Model {model_name} already exists for motif {motif_id}")
        return
    
    print(f"Processing model {model_name} for motif {motif_id}")
    study = joblib.load(os.path.join(model_results_dir, "study.pkl"))
    train_losses, val_losses, best_epoch, test_loss, test_mae, test_rmse  = get_best_model_results_traindevtest(study)
    
    _, _, _, retrained_test_losses, retrained_test_mae, retrained_test_rmse, retrained_all_predictions, retrained_all_true_values = get_preds_best_config_train_val_test(
        study, pipeline, eval(model_type), model_type, model_params_keys, num_epochs=num_epochs, seed=seed, X=X, y=y, normalize_flags=normalize_flags
    )
    
    if not np.allclose(test_loss, retrained_test_losses, atol=0.1):
        print(f"Test loss: {test_loss}")
        print(f"Retrained test loss: {retrained_test_losses}")
        raise Exception("Best model test loss does not match the one obtained from the study")
    
    results = pd.DataFrame({
        "test_loss": [test_loss],
        "test_mae": [test_mae],
        "test_rmse": [test_rmse]
    })
    results.to_csv(os.path.join(model_results_dir, "best_model_results.csv"), index=False)

# Loop through each motif
for i, top_motif in top_motifs.iterrows():
    motif_indexes = sorted(ast.literal_eval(top_motif["Indices"]))
    print(f"Evaluating motif {i+1} with size {MOTIF_SIZE} and {len(motif_indexes)} indexes")
    
    X_series, X_indices, X_mask, y = create_dataset(data, LOOKBACK_PERIOD, STEP, FORECAST_PERIOD, motif_indexes, MOTIF_SIZE)

    
    for model_type in models:
        for input_name in inputs:
            normalize_flags = NORMALIZE_FLAGS
            model_params_map = {
                "FFNN": ["hidden_sizes_list"],
                "LSTM": ["hidden_sizes_list"],
                "CNN": ["kernel_size", "num_filters_list", "pool_size"],
                "TCN": ["kernel_size", "num_channels_list", "dropout"],
                "Transformer": ["d_model", "n_heads", "e_layers", "dim_feedforward", "dropout"]
            }
            
            if model_type == "Baseline":
                if input_name != "Indexes":
                    continue

                X = {"X_series": X_series, "X_mask": X_mask, "X_indices": X_indices}
                normalize_flags = {"X_series": True, "X_mask": False, "X_indices": False}

                for model_class in [BaselineAverage, BaselineLastDifference]:
                    process_baseline_model(model_class, input_name, X, normalize_flags, 1, 1, seed, pipeline, y, i+1)
            else:
                if input_name == "Series":
                    X = {"X_series": X_series}
                elif input_name == "Series_Masking":
                    X = {"X_series": X_series, "X_mask": X_mask}
                else:
                    X = {"X_indices": X_indices}


                model_params_map = {
                    "FFNN": ["hidden_sizes_list"],
                    "LSTM": ["hidden_sizes_list"],
                    "CNN": ["kernel_size", "num_filters_list", "pool_size"],
                    "TCN": ["kernel_size", "num_channels_list", "dropout"],
                    "Transformer": ["d_model", "n_heads", "e_layers", "dim_feedforward", "dropout"]
                }

                process_non_baseline_model(
                    model_type, model_params_map[model_type], input_name, X, normalize_flags, num_epochs, seed, pipeline, y, i+1
                )


Evaluating motif 27 with size 12 and 58 indexes
Model FFNNSeries already exists for motif 27
Model FFNNSeries_Masking already exists for motif 27
Model FFNNIndexes already exists for motif 27
Model LSTMSeries already exists for motif 27
Model LSTMSeries_Masking already exists for motif 27
Model LSTMIndexes already exists for motif 27
Model CNNSeries already exists for motif 27
Model CNNSeries_Masking already exists for motif 27
Model CNNIndexes already exists for motif 27
Model TCNSeries already exists for motif 27
Model TCNSeries_Masking already exists for motif 27
Model TCNIndexes already exists for motif 27
Model TransformerSeries already exists for motif 27
Model TransformerSeries_Masking already exists for motif 27
Model TransformerIndexes already exists for motif 27
Model BaselineAverageIndexes already exists for motif 27
Model BaselineLastDifferenceIndexes already exists for motif 27
Evaluating motif 6 with size 12 and 47 indexes
Model FFNNSeries already exists for motif 6
Model

In [6]:
import numpy as np
from scipy import stats
rng = np.random.default_rng()
from scipy.stats import ttest_rel


models = [ "FFNN", "LSTM", "CNN", "TCN", "Transformer", "Baseline"]
inputs = ["Series", "Series_Masking", "Indexes"]


results_df = pd.DataFrame(columns=["model", "input", "motif", "mae", "rmse"])

# Loop through each motif
for i, top_motif in top_motifs.iterrows():
    motif_indexes = sorted(ast.literal_eval(top_motif["Indices"]))
    motif_id = i + 1
    print(f"Processing motif {motif_id} with {len(motif_indexes)} indexes")
    
    for model_type in models:
        for input_name in inputs:
            # Handle baseline-specific logic
            if model_type == "Baseline":
                n_trials, num_epochs = (1, 1)
                if input_name != "Indexes":
                    continue
                
                # Process both BaselineAverage and BaselineLastDifference
                baseline_variants = ["BaselineAverage", "BaselineLastDifference"]
                for baseline_type in baseline_variants:
                    model_name = f"{baseline_type}{input_name}"
                    print(f"Processing Model: {model_name}")

                    # Construct the results directory path
                    model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{motif_id}")
                    results_file = os.path.join(model_results_dir, "best_model_results.csv")

                    # Skip if results file doesn't exist
                    if not os.path.exists(results_file):
                        print(f"Results file for {model_name} not found. Skipping.")
                        continue

                    # Load results from CSV
                    results = pd.read_csv(results_file)
                    maes = results["test_mae"].values
                    rmses = results["test_rmse"].values

                    # Add results to the dataframe
                    for i in range(len(maes)):  # Assuming results have folds
                        results_df = pd.concat([
                            results_df,
                            pd.DataFrame([{
                                "model": baseline_type,
                                "input": input_name,
                                "motif": i + 1,
                                "mae": maes[i],
                                "rmse": rmses[i]
                            }])
                        ], ignore_index=True)

            else:
                n_trials = 100
                num_epochs = 500

                model_name = f"{model_type}{input_name}"
                print(f"Processing Model: {model_name}")

                # Construct the results directory path
                model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{motif_id}")
                results_file = os.path.join(model_results_dir, "best_model_results.csv")

                # Skip if results file doesn't exist
                if not os.path.exists(results_file):
                    print(f"Results file for {model_name} not found. Skipping.")
                    continue

                # Load results from CSV
                results = pd.read_csv(results_file)
                maes = results["test_mae"].values
                rmses = results["test_rmse"].values

                # Add results to the dataframe
                for i in range(len(maes)):  # Assuming results have folds
                    results_df = pd.concat([
                        results_df,
                        pd.DataFrame([{
                            "model": model_type,
                            "input": input_name,
                            "motif": i + 1,
                            "mae": maes[i],
                            "rmse": rmses[i]
                        }])
                    ], ignore_index=True)

# Display the results
print(results_df)

Processing motif 27 with 58 indexes
Processing Model: FFNNSeries
Processing Model: FFNNSeries_Masking
Processing Model: FFNNIndexes
Processing Model: LSTMSeries
Processing Model: LSTMSeries_Masking
Processing Model: LSTMIndexes
Processing Model: CNNSeries
Processing Model: CNNSeries_Masking
Processing Model: CNNIndexes
Processing Model: TCNSeries
Processing Model: TCNSeries_Masking
Processing Model: TCNIndexes
Processing Model: TransformerSeries
Processing Model: TransformerSeries_Masking
Processing Model: TransformerIndexes
Processing Model: BaselineAverageIndexes
Processing Model: BaselineLastDifferenceIndexes
Processing motif 6 with 47 indexes
Processing Model: FFNNSeries
Processing Model: FFNNSeries_Masking
Processing Model: FFNNIndexes
Processing Model: LSTMSeries
Processing Model: LSTMSeries_Masking
Processing Model: LSTMIndexes
Processing Model: CNNSeries
Processing Model: CNNSeries_Masking
Processing Model: CNNIndexes
Processing Model: TCNSeries
Processing Model: TCNSeries_Mask

Processing Model: TCNSeries
Processing Model: TCNSeries_Masking
Processing Model: TCNIndexes
Processing Model: TransformerSeries
Processing Model: TransformerSeries_Masking
Processing Model: TransformerIndexes
Processing Model: BaselineAverageIndexes
Processing Model: BaselineLastDifferenceIndexes
Processing motif 12 with 44 indexes
Processing Model: FFNNSeries
Processing Model: FFNNSeries_Masking
Processing Model: FFNNIndexes
Processing Model: LSTMSeries
Processing Model: LSTMSeries_Masking
Processing Model: LSTMIndexes
Processing Model: CNNSeries
Processing Model: CNNSeries_Masking
Processing Model: CNNIndexes
Processing Model: TCNSeries
Processing Model: TCNSeries_Masking
Processing Model: TCNIndexes
Processing Model: TransformerSeries
Processing Model: TransformerSeries_Masking
Processing Model: TransformerIndexes
Processing Model: BaselineAverageIndexes
Processing Model: BaselineLastDifferenceIndexes
Processing motif 7 with 38 indexes
Processing Model: FFNNSeries
Processing Model:

  results_df = pd.concat([


In [7]:
#average fold results for each model and input
avg_results_df = results_df.groupby(["model", "input"]).mean().reset_index()
print(avg_results_df)

                     model           input motif        mae       rmse
0          BaselineAverage         Indexes   1.0  12.951233  15.588457
1   BaselineLastDifference         Indexes   1.0  18.528498  29.364649
2                      CNN         Indexes   1.0  10.261559  12.555540
3                      CNN          Series   1.0   9.594777  11.600351
4                      CNN  Series_Masking   1.0  11.497563  13.672227
5                     FFNN         Indexes   1.0  11.138082  13.242543
6                     FFNN          Series   1.0   9.336295  11.407887
7                     FFNN  Series_Masking   1.0   9.164855  11.873960
8                     LSTM         Indexes   1.0   9.783668  12.221509
9                     LSTM          Series   1.0   9.792237  12.216532
10                    LSTM  Series_Masking   1.0   9.555272  11.882629
11                     TCN         Indexes   1.0   9.896350  12.022695
12                     TCN          Series   1.0   9.065550  11.506223
13    

In [11]:
# Define parameters
models_1 = ["TCN" ]
input_types_1 = ["Series"]
models_2 = ["BaselineAverage" ]
input_types_2 = ["Indexes"]

# Filter data for the selected input types

results = []
for model1 in models_1:
    for model2 in models_2:
        for input_1 in input_types_1:
            for input_2 in input_types_2:
                for metric in ["mae", "rmse"]:

                    data1 = results_df[(results_df['model'] == model1) & (results_df['input'] == input_1)].sort_values('motif')[metric]
                    data2 = results_df[(results_df['model'] == model2) & (results_df['input'] == input_2)].sort_values('motif')[metric]

                    if len(data1) == len(data2):
                        t_stat, p_value = ttest_rel(data1, data2, alternative='less')
                        results.append({
                            "Model_1": model1,
                            "InputType_1": input_1,
                            "Model_2": model2,
                            "InputType_2": input_2,
                            "Metric": metric,
                            "P-Value": p_value
                        })

# Convert results to DataFrame and display
pval_results_df = pd.DataFrame(results)
pval_results_df

Unnamed: 0,Model_1,InputType_1,Model_2,InputType_2,Metric,P-Value
0,TCN,Series,BaselineAverage,Indexes,mae,0.041331
1,TCN,Series,BaselineAverage,Indexes,rmse,0.045609
