In [133]:
import numpy as np
import pandas as pd
import os
import sys
import csv
import torch
from torch.nn.utils.rnn import pad_sequence
from config import RESULTS_MOTIF_DIR, RESULTS_DIR, IMAGES_DIR, DATA_DIR, DATASET_PATH, VARIABLES, NORMALIZE_FLAGS, STUMPY_EXCL_ZONE_DENOM, TOP_K_MP, INCLUDE, NORMALIZE, SUBSQUENCES_LENGTHS


# Echo the locations configured in `config` so the run is self-describing
print(f"Results will be saved in: {RESULTS_DIR}")
print(f"Images will be saved in: {IMAGES_DIR}")
print(f"Data will be accessed from: {DATA_DIR}")

# Standalone scripts define __file__; Jupyter / interactive sessions do not,
# so fall back to the current working directory there.
base_dir = os.path.dirname(__file__) if '__file__' in globals() else os.getcwd()

# Make the parent directory of `base_dir` (where `utils` lives) importable
sys.path.append(os.path.abspath(os.path.join(base_dir, "../")))

def create_multi_motif_dataset(data, lookback_period, step, forecast_period, motif_indexes_list, motif_sizes_list):
    """Build sliding-window samples for predicting the next occurrence of each motif.

    A window of ``lookback_period`` time steps is slid over ``data`` with stride
    ``step``. For every window, each motif that starts at least twice inside the
    window and at least once inside the following ``forecast_period`` time steps
    yields one sample.

    Args:
        data: 2-D array indexed as ``data[feature, time]``.
        lookback_period: Number of time steps in the input window.
        step: Stride between consecutive window starts.
        forecast_period: Number of time steps after the window that are searched
            for the next motif occurrence.
        motif_indexes_list: One iterable of start indexes (absolute positions on
            the time axis) per motif.
        motif_sizes_list: Length of each motif, aligned with ``motif_indexes_list``.

    Returns:
        Tuple ``(X1, X2_padded, mask, y)`` of float32 tensors:

        * ``X1`` -- ``(num_samples, lookback_period, num_features)``, the data window.
        * ``X2_padded`` -- ``(num_samples, max_starts_in_window, 1)``, the motif start
          positions relative to the window start, padded with ``-1``.
        * ``mask`` -- ``(num_samples, lookback_period)``, 1 where any motif that
          qualified for this window covers the time step, 0 elsewhere. All samples
          generated from the same window share the same mask.
        * ``y`` -- ``(num_samples, 1)``, the 1-based distance from the end of the
          window to the first start of the motif in the forecast period.

        When no window produces a sample, empty tensors with the shapes above
        (``num_samples == 0``, ``max_starts_in_window == 0``) are returned.
    """
    X1, X2, mask, y = [], [], [], []

    # NOTE(review): the exclusive stop skips the window starting at
    # len - lookback_period - forecast_period, which still fits in the series.
    # Left unchanged so previously generated datasets stay reproducible.
    for idx in range(0, len(data[0]) - lookback_period - forecast_period, step):
        window_end_idx = idx + lookback_period
        forecast_period_end = window_end_idx + forecast_period

        motif_indexes_in_window = []  # per qualifying motif: starts relative to the window
        forecast_distances = []       # per qualifying motif: distance to its next start
        mask_window = torch.zeros(lookback_period, dtype=torch.float32)

        for motif_indexes, motif_size in zip(motif_indexes_list, motif_sizes_list):
            # Starts that fall inside the lookback window, relative to its first step
            motif_in_lookback = sorted(
                int(motif_idx) - idx
                for motif_idx in motif_indexes
                if idx <= motif_idx < window_end_idx
            )
            # Starts that fall inside the forecast period (absolute positions)
            motif_in_forecast = [
                int(motif_idx)
                for motif_idx in motif_indexes
                if window_end_idx <= motif_idx < forecast_period_end
            ]

            # A motif qualifies only with >= 2 starts in the window and >= 1 in the forecast
            if len(motif_in_lookback) < 2 or not motif_in_forecast:
                continue

            motif_indexes_in_window.append(motif_in_lookback)
            forecast_distances.append(min(motif_in_forecast) - window_end_idx + 1)

            # Mark every time step covered by this motif, including occurrences that
            # began before the window but still overlap it.
            for motif_idx in motif_indexes:
                motif_start = int(motif_idx) - idx
                motif_end = motif_start + motif_size
                if motif_end > 0 and motif_start < lookback_period:
                    mask_window[max(0, motif_start):min(lookback_period, motif_end)] = 1

        if not motif_indexes_in_window:
            continue  # no motif qualified for this window

        # Convert the window once: (num_features, lookback) -> (lookback, num_features)
        window_tensor = torch.tensor(data[:, idx:window_end_idx].T, dtype=torch.float32)

        for relative_starts, distance in zip(motif_indexes_in_window, forecast_distances):
            X1.append(window_tensor)
            X2.append(relative_starts)
            y.append(torch.tensor(distance, dtype=torch.float32))
            mask.append(mask_window)

    if not X1:
        # torch.stack / pad_sequence reject empty lists, so build the empty result explicitly
        num_features = len(data)
        return (
            torch.empty((0, lookback_period, num_features), dtype=torch.float32),
            torch.empty((0, 0, 1), dtype=torch.float32),
            torch.empty((0, lookback_period), dtype=torch.float32),
            torch.empty((0, 1), dtype=torch.float32),
        )

    X1 = torch.stack(X1)  # (num_samples, lookback_period, num_features)
    X2_padded = pad_sequence(
        [torch.tensor(starts, dtype=torch.float32) for starts in X2],
        batch_first=True,
        padding_value=-1,
    ).unsqueeze(-1)  # (num_samples, max_starts_in_window, 1)
    y = torch.stack(y).unsqueeze(1)  # (num_samples, 1)
    mask = torch.stack(mask)  # (num_samples, lookback_period)

    return X1, X2_padded, mask, y

Results will be saved in: /home/mgsilva/motifpred/results/populationdensity
Images will be saved in: /home/mgsilva/motifpred/images/populationdensity
Data will be accessed from: /home/mgsilva/motifpred/data/populationdensity


In [134]:
# Toy series with 3 features over 100 time steps: feature k holds t + 100*k at time t
data = (np.arange(100)[:, None] + np.array([0, 100, 200])).T
data.shape

(3, 100)

In [135]:
# Ground truth for the toy example: start positions of each planted motif
# (one inner list per motif) and the length of each motif.
motifs_indexes = [[0, 12, 21, 35, 39], [16, 50, 60]]
motifs_sizes = [3, 3]

#plant the motifs in the data
# The first motif is written as the constant 111 across all features; every
# following motif uses 999 (the value is switched at the end of the first pass).
pattern = 111
for motif_indexes, motif_size in zip(motifs_indexes, motifs_sizes):

    for motif_idx in motif_indexes:
        # Overwrite `motif_size` consecutive time steps in every feature row (mutates `data` in place)
        data[:, motif_idx:motif_idx + motif_size] = pattern
    pattern = 999
# Cast to float64 before handing the series to stumpy
data = data.astype(np.float64)
#find the motifs in the data
# NOTE(review): this import would conventionally live with the other imports at the top of the notebook.
import stumpy
# Subsequence length used for the matrix profile; equal to the planted motif length
m = 3
# Multi-dimensional matrix profile and its index array for `data`
# NOTE(review): mstump is called with its default normalization setting while
# mmotifs below is given normalize=False -- confirm the mismatch is intended.
mp, mp_indices = stumpy.mstump(data, m)
# Multi-dimensional motif discovery on top of the matrix profile.
# NOTE(review): k=2 presumably selects the full 3-feature subspace (zero-based) --
# confirm against the stumpy documentation.
motif_distances, motif_indices, motif_subspaces, motif_mdls = stumpy.mmotifs(
        data,
        mp,
        mp_indices,
        min_neighbors=2,
        max_distance=1.0,
        cutoffs=None,
        max_matches=99,
        max_motifs=99,
        k=2,
        include=None,
        normalize=False,
    )


In [136]:
# Discard the -1 entries (apparently filler values in the stumpy.mmotifs output)
# and sort the remaining start indexes of every motif.
motif_indices = [
    sorted(start for start in occurrences if start != -1)
    for occurrences in motif_indices
]
motif_indices

[[0, 12, 21, 35, 39], [16, 50, 60]]

In [137]:
lookback_period = 20  # length of the past (input) window
step = 1              # stride of the sliding window
forecast_period = 10  # length of the look-ahead window

# X_series: past window, X_indices: motif start positions inside the window,
# X_mask: 1 where a qualifying motif covers the time step,
# y: 1-based distance from the window end to the motif's next start
X_series, X_indices, X_mask, y = create_multi_motif_dataset(
    data=data,
    lookback_period=lookback_period,
    step=step,
    forecast_period=forecast_period,
    motif_indexes_list=motifs_indexes,
    motif_sizes_list=motifs_sizes,
)

# X_series, X_mask, X_indices and y are all PyTorch tensors
print("X_series shape:", X_series.shape)  # Expected shape: (num_samples, lookback_period, num_features)
print("X_mask shape:", X_mask.shape)  # Expected shape: (num_samples, lookback_period)
print("X_indices shape:", X_indices.shape)  # Expected shape: (num_samples, max_motif_starts_in_window, 1)
print("y shape:", y.shape)    # Expected shape: (num_samples, 1)

X_series shape: torch.Size([12, 20, 3])
X_mask shape: torch.Size([12, 20])
X_indices shape: torch.Size([12, 2, 1])
y shape: torch.Size([12, 1])


In [139]:
# For reference: motifs_indexes = [[0, 12, 21, 35, 39], [16, 50, 60]]

# Inspect at most the first 12 samples. Capping by the actual sample count avoids
# an IndexError when the dataset holds fewer samples than the display limit.
for i in range(min(12, len(X_series))):
    print("Sample", i)
    print("X_series:", X_series[i, :, 0])  # first feature of the window only
    print("X_mask:", X_mask[i])
    print("X_indices:", X_indices[i])
    print("y:", y[i])

Sample 0
X_series: tensor([111., 111., 111.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        111., 111., 111.,  15., 999., 999., 999.,  19.])
X_mask: tensor([1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0.,
        0., 0.])
X_indices: tensor([[ 0.],
        [12.]])
y: tensor([2.])
Sample 1
X_series: tensor([  6.,   7.,   8.,   9.,  10.,  11., 111., 111., 111.,  15., 999., 999.,
        999.,  19.,  20., 111., 111., 111.,  24.,  25.])
X_mask: tensor([0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
        0., 0.])
X_indices: tensor([[ 6.],
        [15.]])
y: tensor([10.])
Sample 2
X_series: tensor([  7.,   8.,   9.,  10.,  11., 111., 111., 111.,  15., 999., 999., 999.,
         19.,  20., 111., 111., 111.,  24.,  25.,  26.])
X_mask: tensor([0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0.,
        0., 0.])
X_indices: tensor([[ 5.],
        [14.]])
y: tensor([9.])
Sample 3
X_series: tensor([  8.,   9.,  10