In [7]:
import numpy as np
import pandas as pd
import os
import sys
import csv
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.image as mpimg
import optuna
import random
import joblib
import math
import ast
import logging
from msig import Motif, NullModel
from config import RESULTS_MOTIF_DIR, RESULTS_DIR, IMAGES_DIR, DATA_DIR, DATASET_PATH, TOWNSHIP_NAME, VARIABLES, NORMALIZE_FLAGS, STUMPY_EXCL_ZONE_DENOM, TOP_K_MP, INCLUDE, NORMALIZE, SUBSQUENCES_LENGTHS, NTOP_MOTIFS, MOTIF_SIZE

optuna.logging.set_verbosity(optuna.logging.WARNING)

print(f"Results will be saved in: {RESULTS_DIR}")
print(f"Images will be saved in: {IMAGES_DIR}")
print(f"Data will be accessed from: {DATA_DIR}")


if '__file__' in globals():
    # For standalone scripts
    base_dir = os.path.dirname(__file__)
else:
    # For Jupyter or interactive environments
    base_dir = os.getcwd()

# Add the parent directory of `utils` to the Python path
sys.path.append(os.path.abspath(os.path.join(base_dir, "../")))

Results will be saved in: /home/mgsilva/motifpred/results/populationdensity
Images will be saved in: /home/mgsilva/motifpred/images/populationdensity
Data will be accessed from: /home/mgsilva/motifpred/data/populationdensity


In [8]:
mp_stats_table = pd.read_csv(
    RESULTS_DIR / f"mp_stats_table_normalized_{NORMALIZE}_top_{TOP_K_MP}.csv"
)
mp_stats_table = mp_stats_table[mp_stats_table["m"] == MOTIF_SIZE]
top_motifs = mp_stats_table.sort_values(by="#Matches", ascending=False).head(NTOP_MOTIFS)
top_motifs = top_motifs[["m", "Indices"]]


In [9]:
# read csv
data_df = pd.read_csv(
    DATASET_PATH,
    parse_dates=["one_time"],
    date_format="%Y-%m-%d %H:%M:%S",
    index_col=0,
)

data_df = data_df[data_df["township_name"] == TOWNSHIP_NAME]
#set index to one_time and township_name
data_df = data_df.set_index(["one_time", "township_name"]).sort_index()[VARIABLES]
data = data_df.to_numpy().T
data_univar = data[0]
data_df

Unnamed: 0_level_0,Unnamed: 1_level_0,sum_terminals
one_time,township_name,Unnamed: 2_level_1
2021-09-15 00:00:00,Avenidas Novas,260700.0
2021-09-15 01:00:00,Avenidas Novas,276675.0
2021-09-15 02:00:00,Avenidas Novas,284563.0
2021-09-15 03:00:00,Avenidas Novas,279563.0
2021-09-15 04:00:00,Avenidas Novas,281460.0
...,...,...
2021-11-30 19:00:00,Avenidas Novas,391367.0
2021-11-30 20:00:00,Avenidas Novas,352361.0
2021-11-30 21:00:00,Avenidas Novas,388246.0
2021-11-30 22:00:00,Avenidas Novas,360169.0


In [10]:
# Import shared setup
from utils.setup import seed, device, early_stopper, pipeline, test_tensor

# Example usage
print(f"Device: {device}")
test_tensor()

Device: cuda
tensor([[0.3087, 0.0736, 0.4216],
        [0.0691, 0.2332, 0.4047],
        [0.2162, 0.9927, 0.4128],
        [0.5938, 0.6128, 0.1519],
        [0.0453, 0.5035, 0.9978]])


In [11]:
from utils.utils import create_dataset
from utils.train_pipeline import run_optuna_study
from utils.utils import get_best_model_results_traindevtest, plot_best_model_results_traindevtest
from models.baseline_pytorch import BaselineAverage, BaselineLastDifference
from utils.utils import plot_preds_vs_truevalues
from utils.train_pipeline import get_preds_best_config_train_val_test

lookback_period = 24*7*3 #window size
step = 1 #step size for the sliding window
forecast_period = 24*2 #forward window size
test_losses_list = []
test_mae_list = []
test_rmse_list = []

# Loop through each of the top 10 motifs
for i, top_motif in top_motifs.iterrows():
    motif_indexes = sorted(ast.literal_eval(top_motif["Indices"]))
    
    print(f"Evaluating motif {i+1} with size {MOTIF_SIZE} and {len(motif_indexes)}  indexes ")
    
    # Create dataset for the current motif
    X_series, X_indices, X_mask, y = create_dataset(data, lookback_period, step, forecast_period, motif_indexes, MOTIF_SIZE)

    # X_series, X2, and y are now PyTorch tensors
    print("X_series shape:", X_series.shape)  # Expected shape: (num_samples, lookback_period, num_features)
    print("X_indices shape:", X_indices.shape)  # Expected shape: (num_samples, max_motif_length_in_window, 1)
    print("X_mask shape:", X_mask.shape)  # Expected shape: (num_samples, max_motif_length_in_window)
    print("y shape:", y.shape)    # Expected shape: (num_samples, 1)
    
    # Define the model and run the Optuna study
    n_trials = 1
    num_epochs = 1
    model_name = "BaselineAverage" 
    model_type = "Baseline"
    
    suggestion_dict = {
        "batch_size": {
            "type": "categorical",
            "args": [[4, 8, 16, 32]]
        }
    }
    
    model_params_keys = []
    
    model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{i+1}")
    os.makedirs(model_results_dir, exist_ok=True)  
    
    X = {"X_series": X_series, "X_mask": X_mask, "X_indices": X_indices}
    normalize_flags = {"X_series": True, "X_mask": False, "X_indices": False}
    
    run_optuna_study(pipeline.run_train_val_test, eval(model_name), model_type, suggestion_dict,  model_params_keys, seed, X , y, normalize_flags, model_results_dir, n_trials=n_trials, num_epochs=num_epochs)
    
    study = joblib.load(os.path.join(model_results_dir, "study.pkl"))
    train_losses, val_losses, best_epoch, test_loss, test_mae, test_rmse = get_best_model_results_traindevtest(study)
    
    print(f"Best epoch: {best_epoch}")
    print(f"Test Loss: {test_loss}, Test MAE: {test_mae}, Test RMSE: {test_rmse}")

    test_losses_list.append(test_loss)
    test_mae_list.append(test_mae)
    test_rmse_list.append(test_rmse)
    
    # Plot predictions vs true values
    epochs_train_losses, epochs_val_losses, val_losses, test_losses, test_mae, test_rmse, all_predictions, all_true_values = get_preds_best_config_train_val_test(study, pipeline, eval(model_name), model_type, model_params_keys, num_epochs=num_epochs, seed=seed, X=X, y=y, normalize_flags=normalize_flags)
    #plot_preds_vs_truevalues(np.ravel(all_true_values), np.ravel(all_predictions), fold=0, save_path=os.path.join(IMAGES_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{i+1}_fold_{0}_predictions.png"))


# Convert lists to numpy arrays for easier calculations
test_losses_array = np.array(test_losses_list)
test_mae_array = np.array(test_mae_list)
test_rmse_array = np.array(test_rmse_list)

# Calculate mean and standard deviation
mean_test_loss = np.mean(test_losses_array)
std_test_loss = np.std(test_losses_array)

mean_test_mae = np.mean(test_mae_array)
std_test_mae = np.std(test_mae_array)

mean_test_rmse = np.mean(test_rmse_array)
std_test_rmse = np.std(test_rmse_array)

# Print aggregated results
print(f"Aggregated Results Across Top 10 Motifs:")
print(f"Mean Test Loss: {mean_test_loss} ± {std_test_loss}")
print(f"Mean Test MAE: {mean_test_mae} ± {std_test_mae}")
print(f"Mean Test RMSE: {mean_test_rmse} ± {std_test_rmse}")



Evaluating motif 27 with size 12 and 58  indexes 
X_series shape: torch.Size([1173, 504, 1])
X_indices shape: torch.Size([1173, 18, 1])
X_mask shape: torch.Size([1173, 504])
y shape: torch.Size([1173, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 32}
Best epoch: 0
Test Loss: 250.05662536621094, Test MAE: 12.019815444946289, Test RMSE: 15.813179016113281
Best hyperparameters: {'batch_size': 32}
Evaluating motif 6 with size 12 and 47  indexes 
X_series shape: torch.Size([1032, 504, 1])
X_indices shape: torch.Size([1032, 16, 1])
X_mask shape: torch.Size([1032, 504])
y shape: torch.Size([1032, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 32}
Best epoch: 0
Test Loss: 348.3819885253906, Test MAE: 16.235063552856445, Test RMSE: 18.664993286132812
Best hyperparameters: {'batch_size': 32}
Evaluating motif 24 with size 12 and 46  indexes 
X_series shape: torch.Size([1122, 504, 1])
X_indices shape: torch.Size([1122, 16, 1])
X_mask shape: torch.Size([1122, 504])
y shape: torch.Size([1122, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 32}
Best epoch: 0
Test Loss: 88.6906967163086, Test MAE: 8.207314491271973, Test RMSE: 9.417573928833008
Best hyperparameters: {'batch_size': 32}
Evaluating motif 12 with size 12 and 44  indexes 
X_series shape: torch.Size([1050, 504, 1])
X_indices shape: torch.Size([1050, 15, 1])
X_mask shape: torch.Size([1050, 504])
y shape: torch.Size([1050, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 32}
Best epoch: 0
Test Loss: 225.731689453125, Test MAE: 12.406604766845703, Test RMSE: 15.024370193481445
Best hyperparameters: {'batch_size': 32}
Evaluating motif 7 with size 12 and 38  indexes 
X_series shape: torch.Size([979, 504, 1])
X_indices shape: torch.Size([979, 12, 1])
X_mask shape: torch.Size([979, 504])
y shape: torch.Size([979, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 32}
Best epoch: 0
Test Loss: 361.84283447265625, Test MAE: 15.887364387512207, Test RMSE: 19.022167205810547
Best hyperparameters: {'batch_size': 32}
Aggregated Results Across Top 10 Motifs:
Mean Test Loss: 254.94076690673828 ± 98.66653417451744
Mean Test MAE: 12.951232528686523 ± 2.935075195502496
Mean Test RMSE: 15.588456726074218 ± 3.4555443300646194


In [12]:
# Loop through each of the top 10 motifs
for i, top_motif in top_motifs.iterrows():

    motif_indexes = sorted(ast.literal_eval(top_motif["Indices"]))
    
    print(f"Evaluating motif {i+1} with size {MOTIF_SIZE} and {len(motif_indexes)}indexes")
    
    # Create dataset for the current motif
    X_series, X_indices, X_mask, y = create_dataset(data, lookback_period, step, forecast_period, motif_indexes, MOTIF_SIZE)

    # X_series, X2, and y are now PyTorch tensors
    print("X_series shape:", X_series.shape)  # Expected shape: (num_samples, lookback_period, num_features)
    print("X_indices shape:", X_indices.shape)  # Expected shape: (num_samples, max_motif_length_in_window, 1)
    print("X_mask shape:", X_mask.shape)  # Expected shape: (num_samples, max_motif_length_in_window)
    print("y shape:", y.shape)    # Expected shape: (num_samples, 1)
    
    # Define the model and run the Optuna study
    n_trials = 1
    num_epochs = 1
    model_name = "BaselineLastDifference" 
    model_type = "Baseline"
    
    suggestion_dict = {
        "batch_size": {
            "type": "categorical",
            "args": [[4, 8, 16, 32]]
        }
    }
    
    model_params_keys = []
    
    model_results_dir = os.path.join(RESULTS_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{i+1}")
    os.makedirs(model_results_dir, exist_ok=True)  
    
    X = {"X_series": X_series, "X_mask": X_mask, "X_indices": X_indices}
    normalize_flags = {"X_series": True, "X_mask": False, "X_indices": False}
    
    run_optuna_study(pipeline.run_train_val_test, eval(model_name), model_type, suggestion_dict,  model_params_keys, seed, X , y, normalize_flags, model_results_dir, n_trials=n_trials, num_epochs=num_epochs)
    
    study = joblib.load(os.path.join(model_results_dir, "study.pkl"))
    train_losses, val_losses, best_epoch, test_loss, test_mae, test_rmse = get_best_model_results_traindevtest(study)
    
    print(f"Best epoch: {best_epoch}")
    print(f"Test Loss: {test_loss}, Test MAE: {test_mae}, Test RMSE: {test_rmse}")

    test_losses_list.append(test_loss)
    test_mae_list.append(test_mae)
    test_rmse_list.append(test_rmse)
    
    # Plot predictions vs true values
    epochs_train_losses, epochs_val_losses, val_losses, test_losses, test_mae, test_rmse, all_predictions, all_true_values = get_preds_best_config_train_val_test(study, pipeline, eval(model_name), model_type, model_params_keys, num_epochs=num_epochs, seed=seed, X=X, y=y, normalize_flags=normalize_flags)
    #plot_preds_vs_truevalues(np.ravel(all_true_values), np.ravel(all_predictions), fold=0, save_path=os.path.join(IMAGES_DIR, f"{model_name}_{n_trials}_trials_{num_epochs}_epochs_motif_{i+1}_fold_{0}_predictions.png"))


# Convert lists to numpy arrays for easier calculations
test_losses_array = np.array(test_losses_list)
test_mae_array = np.array(test_mae_list)
test_rmse_array = np.array(test_rmse_list)

# Calculate mean and standard deviation
mean_test_loss = np.mean(test_losses_array)
std_test_loss = np.std(test_losses_array)

mean_test_mae = np.mean(test_mae_array)
std_test_mae = np.std(test_mae_array)

mean_test_rmse = np.mean(test_rmse_array)
std_test_rmse = np.std(test_rmse_array)

# Print aggregated results
print(f"Aggregated Results Across Top 10 Motifs:")
print(f"Mean Test Loss: {mean_test_loss} ± {std_test_loss}")
print(f"Mean Test MAE: {mean_test_mae} ± {std_test_mae}")
print(f"Mean Test RMSE: {mean_test_rmse} ± {std_test_rmse}")

Evaluating motif 27 with size 12 and 58indexes
X_series shape: torch.Size([1173, 504, 1])
X_indices shape: torch.Size([1173, 18, 1])
X_mask shape: torch.Size([1173, 504])
y shape: torch.Size([1173, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 32}
Best epoch: 0
Test Loss: 1443.6171875, Test MAE: 20.3028564453125, Test RMSE: 37.99496078491211
Best hyperparameters: {'batch_size': 32}
Evaluating motif 6 with size 12 and 47indexes
X_series shape: torch.Size([1032, 504, 1])
X_indices shape: torch.Size([1032, 16, 1])
X_mask shape: torch.Size([1032, 504])
y shape: torch.Size([1032, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 32}
Best epoch: 0
Test Loss: 231.94805908203125, Test MAE: 7.3246750831604, Test RMSE: 15.229841232299805
Best hyperparameters: {'batch_size': 32}
Evaluating motif 24 with size 12 and 46indexes
X_series shape: torch.Size([1122, 504, 1])
X_indices shape: torch.Size([1122, 16, 1])
X_mask shape: torch.Size([1122, 504])
y shape: torch.Size([1122, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 32}
Best epoch: 0
Test Loss: 2102.571533203125, Test MAE: 27.71428680419922, Test RMSE: 45.85380554199219
Best hyperparameters: {'batch_size': 32}
Evaluating motif 12 with size 12 and 44indexes
X_series shape: torch.Size([1050, 504, 1])
X_indices shape: torch.Size([1050, 15, 1])
X_mask shape: torch.Size([1050, 504])
y shape: torch.Size([1050, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 32}
Best epoch: 0
Test Loss: 134.91082763671875, Test MAE: 7.191082954406738, Test RMSE: 11.6151123046875
Best hyperparameters: {'batch_size': 32}
Evaluating motif 7 with size 12 and 38indexes
X_series shape: torch.Size([979, 504, 1])
X_indices shape: torch.Size([979, 12, 1])
X_mask shape: torch.Size([979, 504])
y shape: torch.Size([979, 1])


  0%|          | 0/1 [00:00<?, ?it/s]

Best hyperparameters: {'batch_size': 32}
Best epoch: 0
Test Loss: 1305.3424072265625, Test MAE: 30.109588623046875, Test RMSE: 36.12952423095703
Best hyperparameters: {'batch_size': 32}
Aggregated Results Across Top 10 Motifs:
Mean Test Loss: 649.3093849182129 ± 666.2116701520237
Mean Test MAE: 15.739865255355834 ± 7.723661104643016
Mean Test RMSE: 22.476552772521973 ± 12.004747185699978
