In [24]:
import numpy as np
import pandas as pd
import os
import sys
import csv
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.image as mpimg
import optuna
import random
import joblib
import math
import ast
import logging
from msig import Motif, NullModel
from config import RESULTS_MOTIF_DIR, RESULTS_DIR, IMAGES_DIR, DATA_DIR, DATASET_PATH, TOWNSHIP_NAME, VARIABLES, NORMALIZE_FLAGS, STUMPY_EXCL_ZONE_DENOM, TOP_K_MP, INCLUDE, NORMALIZE, SUBSQUENCES_LENGTHS, NTOP_MOTIFS, MOTIF_SIZE

optuna.logging.set_verbosity(optuna.logging.WARNING)

print(f"Results will be saved in: {RESULTS_DIR}")
print(f"Images will be saved in: {IMAGES_DIR}")
print(f"Data will be accessed from: {DATA_DIR}")


if '__file__' in globals():
    # For standalone scripts
    base_dir = os.path.dirname(__file__)
else:
    # For Jupyter or interactive environments
    base_dir = os.getcwd()

# Add the parent directory of `utils` to the Python path
sys.path.append(os.path.abspath(os.path.join(base_dir, "../")))

Results will be saved in: /home/mgsilva/motifpred/results/populationdensity
Images will be saved in: /home/mgsilva/motifpred/images/populationdensity
Data will be accessed from: /home/mgsilva/motifpred/data/populationdensity


In [25]:
mp_stats_table = pd.read_csv(
    RESULTS_DIR / f"mp_stats_table_normalized_{NORMALIZE}_top_{TOP_K_MP}.csv"
)
mp_stats_table = mp_stats_table[mp_stats_table["m"] == MOTIF_SIZE]
top_motifs = mp_stats_table.sort_values(by="#Matches", ascending=False).head(NTOP_MOTIFS)
top_motifs = top_motifs[["m", "Indices"]]

In [26]:
# read csv
data_df = pd.read_csv(
    DATASET_PATH,
    parse_dates=["one_time"],
    date_format="%Y-%m-%d %H:%M:%S",
    index_col=0,
)

data_df = data_df[data_df["township_name"] == TOWNSHIP_NAME]
#set index to one_time and township_name
data_df = data_df.set_index(["one_time", "township_name"]).sort_index()[VARIABLES]
data = data_df.to_numpy().T
data_univar = data[0]
data_df

Unnamed: 0_level_0,Unnamed: 1_level_0,sum_terminals
one_time,township_name,Unnamed: 2_level_1
2021-09-15 00:00:00,Avenidas Novas,260700.0
2021-09-15 01:00:00,Avenidas Novas,276675.0
2021-09-15 02:00:00,Avenidas Novas,284563.0
2021-09-15 03:00:00,Avenidas Novas,279563.0
2021-09-15 04:00:00,Avenidas Novas,281460.0
...,...,...
2021-11-30 19:00:00,Avenidas Novas,391367.0
2021-11-30 20:00:00,Avenidas Novas,352361.0
2021-11-30 21:00:00,Avenidas Novas,388246.0
2021-11-30 22:00:00,Avenidas Novas,360169.0


In [27]:
# Import shared setup
from utils.setup import seed, device, early_stopper, pipeline, test_tensor

# Example usage
print(f"Device: {device}")
test_tensor()

Device: cuda
tensor([[0.3026, 0.8867, 0.8263],
        [0.3065, 0.8353, 0.5215],
        [0.6699, 0.6728, 0.7548],
        [0.2313, 0.5837, 0.6572],
        [0.8870, 0.4751, 0.6564]])


In [28]:
from utils.utils import create_dataset, get_best_model_results_traindevtest, plot_best_model_results_traindevtest, plot_preds_vs_truevalues
from utils.train_pipeline import run_optuna_study, get_preds_best_config_train_val_test
from models.ffnn_pytorch import FFNN
from models.lstm_pytorch import LSTM
from models.cnn_pytorch import CNN
from models.tcn_pytorch import TemporalConvNet
from models.transformer_pytorch import TimeSeriesTransformer
from models.baseline_pytorch import BaselineAverage, BaselineLastDifference

models = ["Baseline"]
inputs = ["Indexes"]

lookback_period = 24*7*3 #window size
step = 1 #step size for the sliding window
forecast_period = 24*2 #forward window size
n_trials = 100
num_epochs = 500

import os
import numpy as np
import joblib
import ast
import pandas as pd
from utils.utils import create_dataset, get_best_model_results_traindevtest, plot_best_model_results_traindevtest, plot_preds_vs_truevalues
from utils.train_pipeline import get_preds_best_config_train_val_test
from models.ffnn_pytorch import FFNN
from models.lstm_pytorch import LSTM
from models.cnn_pytorch import CNN
from models.tcn_pytorch import TemporalConvNet
from models.transformer_pytorch import TimeSeriesTransformer
from models.baseline_pytorch import BaselineAverage, BaselineLastDifference

models = ["FFNN"]
inputs = ["Series", "Series_Masking"]

n_trials = 100
num_epochs = 500

def process_baseline_model(model_class, input_name, X, normalize_flags, num_epochs, seed, pipeline, y, motif_id):
    """Process baseline models."""
    model_name = f"{model_class.__name__}{input_name}"
    model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{motif_id}")
    os.makedirs(model_results_dir, exist_ok=True)
    
    if os.path.exists(os.path.join(model_results_dir, "best_model_results.csv")):
        print(f"Model {model_name} already exists")
        return
    
    study = joblib.load(os.path.join(model_results_dir, "study.pkl"))
    train_losses, val_losses, best_epoch, test_loss, test_mae, test_rmse = get_best_model_results_traindevtest(study)
    print(f"{model_name} - Motif {motif_id}: Best epoch: {best_epoch}, Test Loss: {test_loss}, Test MAE: {test_mae}, Test RMSE: {test_rmse}")
    
    _, _, _, test_losses, test_mae, test_rmse, all_predictions, all_true_values = get_preds_best_config_train_val_test(
        study, pipeline, model_class, "Baseline", [], num_epochs=num_epochs, seed=seed, X=X, y=y, normalize_flags=normalize_flags
    )
    
    #plot_best_model_results_traindevtest(
    #    study.trials_dataframe(),
    #    save_path=os.path.join(IMAGES_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{motif_id}_best_results.png")
    #)
    
    #plot_preds_vs_truevalues(
    #    np.ravel(all_true_values), np.ravel(all_predictions), fold=0,
    #    save_path=os.path.join(IMAGES_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{motif_id}_fold_{0}_predictions.png")
    #)

    results = pd.DataFrame({
        "test_loss": [test_loss],
        "test_mae": [test_mae],
        "test_rmse": [test_rmse]
    })
    results.to_csv(os.path.join(model_results_dir, "best_model_results.csv"), index=False)


def process_non_baseline_model(model_type, model_params_keys, input_name, X, normalize_flags, num_epochs, seed, pipeline, y, motif_id):
    """Process non-baseline models."""
    model_name = f"{model_type}{input_name}"
    model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{motif_id}")
    os.makedirs(model_results_dir, exist_ok=True)
    
    if os.path.exists(os.path.join(model_results_dir, "best_model_results.csv")):
        print(f"Model {model_name} already exists")
        return
    
    study = joblib.load(os.path.join(model_results_dir, "study.pkl"))
    train_losses, val_losses, best_epoch, test_loss, test_mae, test_rmse  = get_best_model_results_traindevtest(study)
    print(f"{model_name} - Motif {motif_id}: Best epoch: {best_epoch}, Test Loss: {best_model_test_loss}, Test MAE: {best_model_test_mae}, Test RMSE: {best_model_test_rmse}")
    
    _, _, _, retrained_test_losses, retrained_test_mae, retrained_test_rmse, retrained_all_predictions, retrained_all_true_values = get_preds_best_config_train_val_test(
        study, pipeline, eval(model_type), model_type, model_params_keys, num_epochs=num_epochs, seed=seed, X=X, y=y, normalize_flags=normalize_flags
    )

    if not np.allclose(test_loss, retrained_test_losses, atol=1):
        raise Exception("Best model test loss does not match the one obtained from the study")
    
    #plot_best_model_results_traindevtest(
    #    study.trials_dataframe(),
    #    save_path=os.path.join(IMAGES_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{motif_id}_best_results.png")
    #)
    
    #plot_preds_vs_truevalues(
    #    np.ravel(all_true_values), np.ravel(all_predictions), fold=0,
    #    save_path=os.path.join(IMAGES_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{motif_id}_fold_{0}_predictions.png")
    #)

    results = pd.DataFrame({
        "test_loss": [test_loss],
        "test_mae": [test_mae],
        "test_rmse": [test_rmse]
    })
    results.to_csv(os.path.join(model_results_dir, "best_model_results.csv"), index=False)


# Loop through each motif
for i, top_motif in top_motifs.iterrows():
    motif_indexes = sorted(ast.literal_eval(top_motif["Indices"]))
    print(f"Evaluating motif {i+1} with size {MOTIF_SIZE} and {len(motif_indexes)} indexes")
    
    X_series, X_indices, X_mask, y = create_dataset(data, lookback_period, step, forecast_period, motif_indexes, MOTIF_SIZE)
    
    for model_type in models:
        for input_name in inputs:
            if model_type == "Baseline" and input_name != "Indexes":
                continue
            
            X = {"X_series": X_series} if input_name == "Series" else {"X_series": X_series, "X_mask": X_mask} if input_name == "Series_Masking" else {"X_indices": X_indices}
            
            model_params_map = {
                "FFNN": ["hidden_sizes_list"],
                "LSTM": ["hidden_sizes_list"],
                "CNN": ["kernel_size", "num_filters_list", "pool_size"],
                "TCN": ["kernel_size", "num_channels_list", "dropout"],
                "Transformer": ["d_model", "n_heads", "e_layers", "dim_feedforward", "dropout"]
            }
            
            if model_type == "Baseline":
                for model_class in [BaselineAverage, BaselineLastDifference]:
                    process_baseline_model(model_class, input_name, X, NORMALIZE_FLAGS, num_epochs, seed, pipeline, y, i+1)
            else:
                process_non_baseline_model(model_type, model_params_map[model_type], input_name, X, NORMALIZE_FLAGS, num_epochs, seed, pipeline, y, i+1)


Evaluating motif 27 with size 12 and 58 indexes
Model FFNNSeries already exists
Model FFNNSeries_Masking already exists
Evaluating motif 6 with size 12 and 47 indexes
Model FFNNSeries already exists
Model FFNNSeries_Masking already exists
Evaluating motif 24 with size 12 and 46 indexes
Model FFNNSeries already exists
Model FFNNSeries_Masking already exists
Evaluating motif 12 with size 12 and 44 indexes
Model FFNNSeries already exists
Model FFNNSeries_Masking already exists
Evaluating motif 7 with size 12 and 38 indexes
Model FFNNSeries already exists
Model FFNNSeries_Masking already exists


In [29]:
import numpy as np
from scipy import stats
rng = np.random.default_rng()
from scipy.stats import ttest_rel

models = ["FFNN", "LSTM", "CNN", "TCN", "Transformer", "Baseline"]
inputs = ["Series", "Series_Masking", "Indexes"]

results_df = pd.DataFrame(columns=["model", "input", "motif", "mae", "rmse"])

# Loop through each motif
for i, top_motif in top_motifs.iterrows():
    motif_indexes = sorted(ast.literal_eval(top_motif["Indices"]))
    motif_id = i + 1
    print(f"Processing motif {motif_id} with {len(motif_indexes)} indexes")
    
    for model_type in models:
        for input_name in inputs:
            if model_type == "Baseline" and input_name != "Indexes":
                continue
            
            model_name = f"{model_type}{input_name}"
            model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{motif_id}")
            results_file = os.path.join(model_results_dir, "best_model_results.csv")
            
            if not os.path.exists(results_file):
                print(f"Results file for {model_name} motif {motif_id} not found. Skipping.")
                continue
            
            results = pd.read_csv(results_file)
            test_mae = results["test_mae"].values[0]
            test_rmse = results["test_rmse"].values[0]
            
            results_df = pd.concat([
                results_df,
                pd.DataFrame([{
                    "model": model_type,
                    "input": input_name,
                    "motif": motif_id,
                    "mae": test_mae,
                    "rmse": test_rmse
                }])
            ], ignore_index=True)

# Display the results
print(results_df)

Processing motif 27 with 58 indexes
Results file for FFNNIndexes motif 27 not found. Skipping.
Results file for LSTMSeries motif 27 not found. Skipping.
Results file for LSTMSeries_Masking motif 27 not found. Skipping.
Results file for LSTMIndexes motif 27 not found. Skipping.
Results file for CNNSeries motif 27 not found. Skipping.
Results file for CNNSeries_Masking motif 27 not found. Skipping.
Results file for CNNIndexes motif 27 not found. Skipping.
Results file for TCNSeries motif 27 not found. Skipping.
Results file for TCNSeries_Masking motif 27 not found. Skipping.
Results file for TCNIndexes motif 27 not found. Skipping.
Results file for TransformerSeries motif 27 not found. Skipping.
Results file for TransformerSeries_Masking motif 27 not found. Skipping.
Results file for TransformerIndexes motif 27 not found. Skipping.
Results file for BaselineIndexes motif 27 not found. Skipping.
Processing motif 6 with 47 indexes
Results file for FFNNIndexes motif 6 not found. Skipping.
Re


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



In [30]:
#average fold results for each model and input
avg_results_df = results_df.groupby(["model", "input"]).mean().reset_index()
print(avg_results_df)

  model           input motif        mae       rmse
0  FFNN          Series  15.2   9.529744  13.467987
1  FFNN  Series_Masking  15.2  10.069367  13.113577
